#!/usr/bin/perl -w
use strict;
# vim: sw=4 et ts=4
# how to use:
#
# google "some words, and more words"
# will google this: "some words" "and more words"
#
$|=1;
use HTTP::Request::Common qw(POST GET);
use LWP::Simple;
use LWP::UserAgent;
use Getopt::Long;
# Build the command-line help text.
# Takes no arguments and returns the whole multi-line message as one string;
# the caller passes it to die() when option parsing fails.
sub usage {
    my $help_text = <<'__EOF__';
Usage: google [-c] [-a] "search string1,word2" "search string2,word3"
will do 2 google queries: ( quotes are always added )
"search string1" "word2"
"search string2" "word3"
options:
-a return all result pages, instead of only the first 100 results.
-g include google cache links
-G only google cache links
-F dont filter - 'with omitted results included'
-c just print the count
-v verbose: print query
__EOF__
    return $help_text;
}
# ---------------------------------------------------------------------------
# Main program: parse options, then run every remaining command-line argument
# as one query. These lexicals are file-scoped on purpose: the subs below
# (QueryGoogle, PrintResult) read them directly.
# ---------------------------------------------------------------------------
my $allpages;       # -a : keep requesting further result pages
my $countonly;      # -c : print only the hit count for each query
my $verbose;        # -v : echo the query and the HTTP request
my $dontfilter;     # -F : add filter=0 to the request
my $urlselector=1;  # 1 = page, 2=cache ( 3 = both )

Getopt::Long::Configure(qw(no_ignorecase));     # -g and -G are different options
GetOptions(
    "a" => \$allpages,
    "c" => \$countonly,
    "g" => sub { $urlselector=3; },
    "G" => sub { $urlselector=2; },
    "F" => \$dontfilter,
    "v" => \$verbose,
) or die usage();

my $ua= LWP::UserAgent->new();
$ua->agent("Mozilla/4.75 [en] (X11; U; Linux 2.2.17 i686)");
binmode STDOUT, ":utf8";

for my $query (@ARGV) {
    my $pagenr=0;   # zero-based index of the result page being fetched
    my $empty=0;    # number of consecutive pages that added no new url
    my %urls;       # every url seen so far for this query

    # Always handle the first page; with -a continue until two pages in a row
    # contribute nothing new.
    while ($pagenr==0 || ($allpages && $empty<2)) {
        my $googlehtml;
        if ($query eq "-") {
            # "-" means: parse HTML supplied on standard input instead of
            # querying. Read STDIN explicitly: the bare <> operator would
            # shift entries off @ARGV (the list this loop is iterating) and
            # try to open the other queries as files.
            if ($pagenr==0) {
                $googlehtml= join "\n", <STDIN>;
            }
            else {
                $googlehtml= "";
            }
        }
        else {
            $googlehtml = QueryGoogle($query, $pagenr);
        }

        if ($countonly) {
            printf("%12d %s\n", ParseCount($googlehtml), $query);
            # One request is enough for a count. Without this, -c combined
            # with -a never terminated because $empty is only updated in the
            # result-listing branch below.
            last;
        }

        # NOTE(review): this pattern begins with a literal newline and the page
        # number found in the HTML is compared with the zero-based $pagenr;
        # verify both against real result pages.
        if ($googlehtml =~ /
Page (\d+) of \d+ results/) {
            my $curpage= $1;
            printf("curpage= %d : %d\n", $pagenr, $curpage);
            if ($curpage != $pagenr) {
                last;
            }
        }

        my $result= ParseGoogle($googlehtml);
        my $oldcount=scalar keys %urls;
        $urls{$_->{url}}++ for @$result;
        my $newcount=scalar keys %urls;

        # Print a page only when it contributed at least one unseen url.
        PrintResult($result) if ($oldcount!=$newcount);
        $empty= ($oldcount!=$newcount) ? 0 : $empty+1;

        $pagenr++;
    }
    if (!$countonly) {
        printf("\n\ntotal %d pages\n", $pagenr);
    }
}
exit(0);
# Remove duplicates from a list.
#   $list - array ref of values (compared by their string form)
# Returns a new array ref holding each distinct value once; the order of the
# result is whatever order the hash yields its keys in.
sub uniq {
    my ($list) = @_;
    my %seen;
    foreach my $item (@$list) {
        $seen{$item} = 1;
    }
    my @distinct = keys %seen;
    return \@distinct;
}
# Fetch one page of search results.
#   $query  - raw query as typed on the command line; comma-separated terms.
#             Every term is wrapped in double quotes unless it starts with
#             "word:" (an operator prefix), which is passed through as is.
#   $pagenr - zero-based page index; converted to the start= offset.
# Reads the file-level settings $countonly (page size 10 instead of 100),
# $verbose, $dontfilter and the shared user agent $ua.
# Returns the response body, or "" (after a warning on STDERR) when the HTTP
# request did not succeed, so callers never parse an error page as results.
sub QueryGoogle {
    my ($query, $pagenr)= @_;

    $query= join " ", map { /^\w+:/ ? $_ : "\"$_\"" } split /\s*,\s*/, $query;

    # Escape the characters that would otherwise change the meaning of the
    # URL: parameter separators (& ;), '?', '%', '+' (means space) and '#'
    # (would cut the query off as a fragment). Remaining unsafe characters,
    # such as the double quotes, are left for the request URI to escape.
    $query =~ s/([&?%+#;])/sprintf("%%%02x", ord($1))/eg;
    $query =~ s/ /+/g;

    my $num= $countonly ? 10 : 100;
    if ($pagenr==0 && $verbose) {
        print "query: $query\n";
    }

    my $url= sprintf("http://www.google.com/search?q=%s&num=%d&hl=en&safe=off&btnG=Search&start=%d%s",
        $query, $num, $pagenr*$num, $dontfilter?"&filter=0":"");
    my $pagerq= GET($url);
    print $pagerq->as_string if ($verbose);

    my $rp= $ua->request($pagerq);
    if (!$rp->is_success) {
        warn sprintf("google: request failed: %s\n", $rp->status_line);
        return "";
    }
    return $rp->content;
}
# Assemble one query string by picking a single alternative from every part.
#   $parts - array ref of array refs; each inner array lists the alternatives
#            for that position
#   $qid   - selector, read as a mixed-radix number: each part consumes one
#            digit (least significant first) whose base is the number of
#            alternatives in that part
# Returns the chosen alternatives concatenated in part order.
sub GetQuery {
    my ($parts, $qid) = @_;
    my @chosen;
    foreach my $alternatives (@$parts) {
        my $radix = scalar @$alternatives;
        my $digit = $qid % $radix;
        $qid = int($qid / $radix);
        push @chosen, $alternatives->[$digit];
    }
    return join "", @chosen;
}
# Count how many distinct selections can be made from $parts.
#   $parts - array ref of array refs (one inner array of alternatives per part)
# Returns the product of the inner array sizes; 1 when there are no parts.
sub getTotalChoices {
    my ($parts) = @_;
    my $product = 1;
    $product *= scalar @$_ for @$parts;
    return $product;
}
# Extract the individual hits from one page of result HTML.
#   $html - text of a result page
# Returns an array ref with one hash per hit, keys: url, cache, title, desc.
# title and desc are passed through decodehtml(); cache is the first entry of
# @googleurls that matches /search.q=cache/. An empty array ref is returned as
# soon as a chunk matches the "did not match any documents" notice.
#
# NOTE(review): several patterns in this sub look damaged. The pattern that
# fills $url/@googleurls contains an unbalanced ')', and $title is used without
# being declared although the file enables `use strict`. Presumably markup that
# used to be part of these regexes has been lost - TODO: restore them from a
# pristine copy before relying on this sub.
sub ParseGoogle {
my ($html)= @_;
my @result;
# Cut the page into chunks; the separator pattern is a single literal newline.
my @parts= split(/
/, $html);
for my $part (@parts) {
# A chunk that contains no link is skipped.
next if ($part !~ /href/);
#next if ($part =~ /Did you mean(?: to search for)?:.*\/search/);
#next if ($part =~ /Category:.*href=.*directory.google.com/);
#next if ($part =~ /Result.*?Search took.*?second/i);
# "No results" notice: give up on the whole page, discarding anything collected.
return [] if ($part =~ /
Your search -
.*?<\/b> - did not match any documents/is);
# NOTE(review): pattern below has an unbalanced ')' - see header note.
my ($url, @googleurls)= ($part =~ /|]+>)(.*?)<\/a>/is);
# NOTE(review): $title is never declared or assigned in this sub.
$title =~ s/\s+/ /gs if ($title);
$title =~ s/\s+\s+/>/gs if ($title);
my ($desc)= ($part =~ /\s*(.*?)(?:|)/is);
$desc =~ s/\s+/ /gs if ($desc);
$desc =~ s/\s+\s+/>/gs if ($desc);
# The cache link is picked out of the remaining captured urls.
my ($cache)= grep { /search.q=cache/ } @googleurls;
push @result, {url=>$url, cache=>$cache, title=>decodehtml($title), desc=>decodehtml($desc)};
}
return \@result;
}
# Extract the reported number of hits from a result page.
#   $html - text of a result page (undef is treated like an empty page)
# Looks first for a swrnum=<digits>" parameter, then for the text between
# "of about" and the next </b>. Returns the count as a string of digits, or 0
# when neither form is found or the captured text holds no digits.
sub ParseCount {
    my ($html)= @_;
    return 0 unless defined $html;

    if ($html =~ /swrnum=(\d+)"/i) {
        return $1;
    }
    if ($html =~ /of about\s*(.*?)<\/b>/) {
        my $n= $1;
        # The captured text can start with markup (e.g. the opening <b>) and
        # contains thousands separators; keep only the digits so the caller's
        # %d format receives a real number.
        $n =~ s/<[^>]*>//g;
        $n =~ tr/0-9//cd;
        return length($n) ? $n : 0;
    }
    return 0;
}
# Turn a fragment of result HTML into plain text for printing.
#   $str - HTML fragment, or undef
# Returns "" for undef. Otherwise, in this order:
#   - drops <b> / </b> tags,
#   - turns each <br> (plus trailing CRs) into newline + tab, so continuation
#     lines line up with the tab-indented output of PrintResult,
#   - removes the first <a ...>...</a> element (the 'view as html' link),
#   - removes a tab left at the very end by a final <br>,
#   - decodes &lt; &gt; &quot; and decimal &#NNN; entities, and finally &amp;.
# &amp; is decoded last so that text such as "&amp;lt;" ends up as the literal
# "&lt;" rather than being decoded twice.
sub decodehtml {
    my $str= shift;
    return "" unless defined $str;

    $str =~ s/<\/?b>//gi;
    $str =~ s/<br\s*\/?>\r*/\n\t/gi;
    $str =~ s/<a\b[^>]*>.*?<\/a>//is;    # strips 'view as html'
    $str =~ s/\t$//;

    $str =~ s/&lt;/</g;
    $str =~ s/&gt;/>/g;
    $str =~ s/&quot;/"/g;
    $str =~ s/&#(\d+);/chr($1)/ge;
    $str =~ s/&amp;/&/g;
    return $str;
}
# Write a list of hits to standard output.
#   $results - array ref of hit hashes (keys url, cache, title, desc)
# The file-level $urlselector decides which links are shown: bit 1 enables the
# page url, bit 2 the cache url; a link is printed only when it is non-empty.
# Title and description always follow, each on its own tab-indented line.
sub PrintResult {
    my ($results) = @_;
    my $show_page  = $urlselector & 1;
    my $show_cache = $urlselector & 2;
    foreach my $hit (@$results) {
        if ($show_page && $hit->{url}) {
            print "$hit->{url}\n";
        }
        if ($show_cache && $hit->{cache}) {
            print "$hit->{cache}\n";
        }
        print "\t$hit->{title}\n\t$hit->{desc}\n";
    }
}
|