#!/usr/bin/perl -w
use strict;
use warnings;
# vim: sw=4 et ts=4
#
# google - query google from the command line and print the result list.
#
# how to use:
#
#    google "some words, and more words"
#
# will google this:  "some words" "and more words"
#
# Every command line argument is one query.  The special query "-" does
# no network access: it parses google result html read from stdin.
#
use Getopt::Long;

# Option flags and the shared user agent are file-scoped because
# QueryGoogle() reads them in addition to main().
my $allpages;     # -a : keep fetching result pages until no new urls show up
my $countonly;    # -c : only print the estimated hit count
my $verbose;      # -v : print the encoded query before the first request
my $ua;           # LWP::UserAgent, created on first use by UserAgent()

# Run the command line flow only when executed as a script, so that the
# helper subs can be loaded with require/do without side effects.
main() unless caller;

# Returns the usage text shown when option parsing fails.
sub usage {
    return <<"__EOF__";
Usage: google [-c] [-a] [-v] "search string1,word2" "search string2,word3"
    will do 2 google queries: ( quotes are added around each term,
    except for 'prefix:' terms like site:example.com )
         "search string1" "word2"
         "search string2" "word3"
    a query of '-' parses google result html from stdin instead.
   options:
     -a   return all result pages, instead of only the first 100 results.
     -c   just print the count
     -v   verbose: print query
__EOF__
}

# Entry point: parses the options, then runs every remaining argument as
# a query.  Returns 0.
sub main {
    $| = 1;

    GetOptions(
        "a" => \$allpages,
        "c" => \$countonly,
        "v" => \$verbose,
    ) or die usage();

    binmode STDOUT, ":utf8";

    # Iterate over a copy so nothing that touches @ARGV can disturb the loop.
    my @queries = @ARGV;
    RunQuery($_) for @queries;

    return 0;
}

# Runs one query and prints its results.
#   $query - comma separated search terms, or "-" to parse html from stdin.
# With -c a single page is fetched and only the hit count is printed.
# With -a pages are fetched until two consecutive pages add no new url.
sub RunQuery {
    my ($query) = @_;

    my $from_stdin = ($query eq "-");
    my $pagenr     = 0;
    my $empty      = 0;    # consecutive pages that yielded no new url
    my %urls;

    while ($pagenr == 0 || ($allpages && $empty < 2)) {
        my $googlehtml = $from_stdin
                       ? ReadStdin()
                       : QueryGoogle($query, $pagenr);
        $pagenr++;

        if ($countonly) {
            printf("%12d %s\n", ParseCount($googlehtml), $query);
            # The count is on the first page; without this -a -c never ends.
            last;
        }

        my $result   = ParseGoogle($googlehtml);
        my $oldcount = scalar keys %urls;
        $urls{ $_->{url} }++ for @$result;
        my $newcount = scalar keys %urls;

        if ($newcount != $oldcount) {
            PrintResult($result);
            $empty = 0;
        }
        else {
            $empty++;
        }

        # stdin holds exactly one page.
        last if $from_stdin;
    }

    printf("\n\ntotal %d pages\n", $pagenr) unless $countonly;
    return;
}

# Slurps all of stdin and returns it as one string ("" when stdin is empty).
sub ReadStdin {
    local $/;
    my $html = <STDIN>;
    return defined $html ? $html : "";
}

# Returns an array ref with the distinct elements of @$list, in order of
# first appearance.
sub uniq {
    my ($list) = @_;
    my %seen;
    return [ grep { !$seen{$_}++ } @$list ];
}

# Turns a comma separated term list into the value of google's q= parameter.
# Terms are trimmed, empty terms are dropped, every term is double-quoted
# unless it starts with a 'prefix:' operator, and the characters that would
# break the url ( & ? % + # ) are percent-escaped.  Spaces become '+'.
sub BuildQuery {
    my ($query) = @_;

    my @terms = grep { length }
                map  { my $t = $_; $t =~ s/^\s+//; $t =~ s/\s+$//; $t }
                split /,/, $query;

    my $q = join " ", map { /^\w+:/ ? $_ : "\"$_\"" } @terms;
    $q =~ s/([&?%+#])/sprintf("%%%02x", ord($1))/ge;
    $q =~ tr/ /+/;
    return $q;
}

# Returns the shared LWP::UserAgent, creating it on first use.  The network
# modules are loaded here rather than at compile time, so stdin mode and the
# parsing helpers do not need them.
sub UserAgent {
    if (!$ua) {
        require HTTP::Request::Common;
        # NOTE(review): nothing visible in this file calls LWP::Simple; it is
        # loaded only because the script has always depended on it.
        require LWP::Simple;
        require LWP::UserAgent;

        $ua = LWP::UserAgent->new();
        $ua->agent("Mozilla/4.75 [en] (X11; U; Linux 2.2.17 i686)");
    }
    return $ua;
}

# Fetches one google result page.
#   $query  - comma separated search terms ( see BuildQuery )
#   $pagenr - zero based page number
# Returns the decoded html, or "" ( after a warning ) when the request fails.
sub QueryGoogle {
    my ($query, $pagenr) = @_;

    my $q   = BuildQuery($query);
    my $num = $countonly ? 10 : 100;

    if ($pagenr == 0 && $verbose) {
        print "query: $q\n";
    }

    # start is counted in results, so it has to advance by the page size.
    my $url = sprintf(
        "http://www.google.com/search?q=%s&num=%d&hl=en&safe=off&btnG=Search&start=%d",
        $q, $num, $pagenr * $num);

    my $agent = UserAgent();
    my $rp    = $agent->request(HTTP::Request::Common::GET($url));
    if (!$rp->is_success) {
        warn sprintf("google: request for '%s' failed: %s\n",
                     $query, $rp->status_line);
        return "";
    }

    # decoded_content gives characters, which is what the :utf8 layer on
    # STDOUT expects; fall back to the raw body if decoding fails.
    my $html = $rp->decoded_content;
    return defined $html ? $html : $rp->content;
}

# Builds the query with index $qid out of a list of alternatives.
#   $parts - array ref of array refs; each inner list holds the alternative
#            strings for one position of the query.
#   $qid   - number in 0 .. getTotalChoices($parts)-1, read as a mixed radix
#            number whose least significant digit selects from the first part.
# Returns the concatenation of the selected alternatives.
sub GetQuery {
    my ($parts, $qid) = @_;

    my $query = "";
    for my $part (@$parts) {
        my $n_choices = scalar @$part;
        die "GetQuery: empty list of choices\n" unless $n_choices;

        $query .= $part->[ $qid % $n_choices ];
        $qid = int($qid / $n_choices);
    }
    return $query;
}

# Returns the number of distinct queries GetQuery can build from $parts:
# the product of the sizes of the inner lists.
sub getTotalChoices {
    my ($parts) = @_;

    my $total = 1;
    $total *= scalar @$_ for @$parts;
    return $total;
}

# Extracts the search results from a google result page.
# Returns an array ref of { url, title, desc } hashes; title and desc are
# plain text ( see decodehtml ).  Returns [] for empty input and for the
# "did not match any documents" page.
#
# NOTE(review): the patterns target the classic result markup
# ( <p class=g><a href=..>title</a><br><font size=-1>desc<br><font color=.. )
# - verify them against current google html.
sub ParseGoogle {
    my ($html) = @_;
    return [] unless defined $html && length $html;
    return [] if $html =~ m{Your search - <b>.*?</b> - did not match any documents}is;

    # Each result starts with a class=g paragraph/div.  The text before the
    # first one is page chrome ( navigation links ), not a result.
    my @parts = split /<(?:p|div)\s+class="?g"?(?=[\s>])[^>]*>/i, $html;
    shift @parts;

    my @result;
    for my $part (@parts) {
        next if $part !~ /href/;

        my ($url, $title) =
            $part =~ m{<a\s[^>]*?href="?([^"\s>]+)"?[^>]*>(.*?)</a>}is;
        next unless defined $url;

        my ($desc) =
            $part =~ m{<font size="?-1"?>\s*(.*?)(?:<br><font color=|</font>|</td>)}is;

        # Collapse runs of whitespace before the tags are decoded.
        for my $text ($title, $desc) {
            $text =~ s/\s+/ /g if defined $text;
        }

        push @result, {
            url   => $url,
            title => decodehtml($title),
            desc  => decodehtml($desc),
        };
    }
    return \@result;
}

# Extracts the estimated number of hits from a google result page.
# Returns the number without thousands separators, or 0 when none is found.
sub ParseCount {
    my ($html) = @_;
    return 0 unless defined $html;

    if ($html =~ /swrnum=(\d+)"/i) {
        return $1;
    }
    if ($html =~ m{of about\s*(?:<b>\s*)?(\d[\d,]*)}) {
        my $n = $1;
        $n =~ tr/,//d;
        return $n;
    }
    return 0;
}

# Converts a html fragment from a result to plain text: removes <b> markup,
# turns <br> into a newline plus tab ( so continuation lines stay indented
# in PrintResult's output ), drops the first embedded link, and decodes the
# basic entities.  Returns "" for undef.
sub decodehtml {
    my $str = shift;
    return "" unless defined $str;

    $str =~ s{</?b>}{}gi;
    $str =~ s{<br\s*/?>\r*}{\n\t}gi;
    $str =~ s{<a\s.*?</a>}{}is;     # strips 'view as html'
    $str =~ s/\t$//;

    $str =~ s/&lt;/</g;
    $str =~ s/&gt;/>/g;
    $str =~ s/&quot;/"/g;
    $str =~ s/&#(\d+);/chr($1)/ge;
    # &amp; goes last, so that '&amp;lt;' ends up as '&lt;' and not '<'.
    $str =~ s/&amp;/&/g;
    return $str;
}

# Prints every result as its url followed by the tab indented title and
# description.
sub PrintResult {
    my ($results) = @_;

    for my $r (@$results) {
        my ($url, $title, $desc) =
            map { defined $_ ? $_ : "" } @{$r}{qw(url title desc)};
        print "$url\n\t$title\n\t$desc\n";
    }
    return;
}

1;