#!/usr/bin/perl -w
# (C) 2003-2007 Willem Jan Hengeveld <itsme@xs4all.nl>
# Web: http://www.xs4all.nl/~itsme/
#      http://wiki.xda-developers.com/
#
# $Id$
#

use strict;

# this script searches all files specified for the given pattern.
# both unicode and ascii strings are found.

# done: add verbose switch, which lists all matching strings.
# todo: add workaround for matches at 16M block boundary ( which currently are not found )
# todo: add option to list rva instead of fileoffset
# todo: add option to search for x86/arm/thumb-relative references
# todo: in verbose/hex mode, dump result as hex.


# examples:
# search current dir for ascii or unicode strings RDO1: PER1: SPK1:
#   findstr -v "\x00(RDO|PER|SPK)1:\x00" .
#
# search current dir for hex dword 0x12345678
#   findstr -x 12345678 .
#
# search current dir for byte sequence 01 02
#   findstr -b "\x01\x02" .
#

# BUG:  somehow a string with '0' in it fails to convert.
#
use IO::File;
use Getopt::Long;

my $g_matchword= 0;
my $g_matchbinary= 0;
my $g_pattern_is_hex= 0;
my $g_pattern_is_guid= 0;
my $g_verbose= 0;
my $g_listonly= 0;
my $bRecursive = 0;
my $pattern;
my $g_matchcase= 0;
my $g_flags= "(?i)";

sub usage {
return <<__EOF__
Usage: findstr [-r] [-b] [-x | -g] [-w] pattern [files]
      -r   recurse into directories
      -b   binary match mode
      -v   verbose - list matched content
      -I   match case ( default: case insensitive )
      -x   pattern is specified as comma separated list of 32bit dwords
      -g   pattern is a GUID, implies binary
      -w   only find whole words
__EOF__
}
GetOptions(
    "w" => \$g_matchword,
    "b" => \$g_matchbinary,
    "I" => \$g_matchcase,
    "x" => \$g_pattern_is_hex,
    "g" => \$g_pattern_is_guid,
    "v" => \$g_verbose,
    "r" => \$bRecursive,
    "l" => \$g_listonly,
) or die usage();

die usage() if (!@ARGV);

$pattern= shift;
if ($g_pattern_is_guid) {
    $pattern= parse_guid_pattern($pattern);
    $g_matchbinary= 1;
}
if ($g_pattern_is_hex) {
    $pattern= parse_hex_pattern($pattern);
    $g_matchbinary= 1;
}
if ($g_matchbinary || $g_matchcase) {
    $g_flags= "";
}
if (@ARGV) {
    for (@ARGV) {
        if (-f $_) {
            SearchFile($_, $pattern);
        }
        elsif (-d $_) {
            SearchDirectory($_, $pattern);
        }
        else {
            for (glob($_)) {
                if (-f $_) {
                    SearchFile($_, $pattern);
                }
                elsif (-d $_) {
                    SearchDirectory($_, $pattern);
                }
            }
        }
    }
}
else {
    binmode STDIN;
    $/=undef;
    my $data= <>;
    SearchString("STDIN", $data, $pattern);
}
sub parse_guid_pattern {
    my @guidstrs= 
    my @guidpatterns;
    for my $guidstr (split /\|/, shift) {
        if ($guidstr =~ /(\w{8}-\w{4}-\w{4}-\w{4}-\w{12})/) {
            my @w= map { length($_)<=8 ? hex($_) : (hex(substr($_,0,8)), hex(substr($_,8))) } split /-/, $1;
            my $guid= pack("VvvnNn", @w);
            my $pattern= unpack("H*", $guid);
            $pattern =~ s/\w\w/\\x$&/g;
            push @guidpatterns, $pattern;
        }
    }
    return join("|", @guidpatterns);
}
sub findhexwordsize {
    my %size;
    $size{length($_)}++ for map { if (length($_)&1) { s/^0// } $_ }  split /[^0-9a-fA-F]+/, $_[0];

    my @lengths= sort { $a<=>$b} keys %size;
    if ($lengths[-1]>8) {
        return 0;
    }
    return 4 if ($size{8} || $size{7} || $size{6} || $size{5});
    return 2 if ($size{4} || $size{3});
    return 1 if ($size{2} || $size{1});
    return 1;
}
sub parse_hex_pattern {
    my $hexpattern= shift;

    my $wordsize= findhexwordsize($hexpattern);
    my @dw;
    for my $hexword (split /[^0-9A-Fa-f?]+/, $hexpattern) {
        if ($hexword =~ /^\?+$/) {
            push @dw, ".{$wordsize}";
        }
        elsif ($wordsize==0) {
            #pattern is long hex string
            $hexword =~ s/../\\x$&/g;

            push @dw, $hexword;
        }
        elsif ($hexword =~ /^[0-9a-f?]+$/i) {
            $hexword =~ s/^0+//;
            $hexword = ("0" x (2*$wordsize-length($hexword))) . $hexword;
            my @bytes;
            for my $hexbyte ($hexword =~ /(..)/g) {
                push @bytes, byte_to_regex($hexbyte);
            }
            push @dw, join("", reverse @bytes);
        }
        else {
            die "unrecognised hex pattern: either ? or 8 digits: '$hexword'\n";
        }
    }
    return join("", @dw);
}
sub byte_to_regex {
    my $bytepattern= shift;

    if ($bytepattern eq "??") {
        return ".";
    }
    else {
        return "\\x$bytepattern";
    }
}
sub SearchDirectory {
    my ($path, $pattern)= @_;
    #print "search $bRecursive path '$path' for '$pattern'\n";

    opendir(DIR, $path) or warn "$!: reading $path\n";
    my @files= readdir DIR;
    closedir DIR;

    for (@files) {
        next if ($_ eq "." || $_ eq "..");
        my $file= makeFullPath($path, $_);
        if (-f $file) {
            SearchFile($file, $pattern);
        }
        elsif ($bRecursive && -d $file)
        {
            SearchDirectory($file, $pattern);
        }
    }
}
sub SearchFile {
    my ($path, $pattern)= @_;
    #print "search file '$path' for '$pattern'\n";

    my $fh= IO::File->new("$path", "r") or do { warn "$path: $!\n"; return; };
    binmode $fh;
    my $data;
    my $ofs= 0;
    while (!$fh->eof()) {
        $fh->read($data, 0x1000000);

        last if (SearchString($path, $ofs, $data, $pattern) && $g_listonly);

        $ofs += length($data);
    }
    $fh->close();
}
sub SearchString {
    my ($name, $startofs, $data, $pattern)= @_;
    #print "search string '$name' of ", length($data), " bytes for '$pattern'\n";

    my ($pre, $post)= ("", "");
    if ($g_matchword) {
        ($pre, $post)= ('\b', '\b');
    }

    my %offsets;
    my $pos= 0;
    while (defined $pos && $pos<length($data)) {
        pos($data)= $pos;
        if ($data =~ /\G.*?$g_flags$pre($pattern)$post/gs) {
            $pos= pos($data);
            if ($pos) {
                $offsets{$pos-length($1)}= {l=>length($1), d=>$1, t=>"a"};
                last if ($g_listonly);
            }
        }
        else {
            last;
        }
    }

    if (!$g_matchbinary) {
        if ($g_matchword) {
            ($pre, $post)= ('(?:\W\0|^)', '(?:\W\0|$)');
        }
        $pattern = ToUnicodePattern($pattern);
        $pos= 0;
        while (defined $pos && $pos<length($data)) {
            pos($data)= $pos;
            if ($data =~ /\G.*?$g_flags$pre($pattern)$post/gs) {
                $pos= pos($data);
                if ($pos) {
                    $offsets{$pos-length($1)}= {l=>length($1), d=>$1, t=>"u"};
                    last if ($g_listonly);
                }
            }
            else {
                last;
            }
        }
    }
    if ($g_listonly) {
        if (keys %offsets) {
            printf("%s\n", $name);
            return 1;
        }
        return 0;
    }
    if (!$g_verbose) {
        if (keys %offsets) {
            print "$name\n\t", join(", ", map { sprintf("%08lx", $_+$startofs); } sort {$a<=>$b} keys %offsets), "\n";
        }
    }
    elsif ($g_pattern_is_guid) {
        printf("%08lx %-20s %s\n", $_+$startofs, $name, MatchAsGuid($offsets{$_}{d}, $offsets{$_}{t})) for (sort {$a<=>$b} keys %offsets);
    }
    else {
        printf("%08lx %-20s %s\n", $_+$startofs, $name, MatchAsString($offsets{$_}{d}, $offsets{$_}{t})) for (sort {$a<=>$b} keys %offsets);
    }
}
sub MatchAsGuid {
    my ($d, $t)= @_;
    my @w= unpack("VvvnNn", $d);
    return sprintf("{%08lx-%04x-%04x-%04x-%08x%04x}", @w);
}
sub MatchAsString {
    my ($d, $t)= @_;
    $d= substr($d, 0, 256);
    if ($g_pattern_is_hex && (length($d)%4==0)) {
        return join(" ", map { sprintf("%08lx", $_) } unpack("V*", $d));
    }
    elsif ($g_matchbinary) {
        return join(" ", map { sprintf("%02x", $_) } unpack("C*", $d))
    }
    elsif ($t eq "a") {
        return "\"$d\"";
    }
    elsif ($t eq "u") {
        $d =~ s/(.)\x00/$1/g;
        return "L\"$d\"";
    }
    else {
        warn "unexpected match type $t\n";
    }
}
sub makeFullPath {
    my ($full, @parts)= @_;

    $full ||="";
    for my $path (@parts) {
        next if (!defined $path);
        $path =~ s{^/}{};         # remove leading slash
        $full =~ s{/?$}{/$path};  # remove trailing slash, append path
    }

    return $full;
}

sub ToUnicodePattern {
    my ($pat)= @_;

    my $upat= "";
    my $pos= 0;

    # !!! somehow this pattern fails to match  '0'
    while ($pat =~ 
            /
                (
                  \[[^\]]+\]    # match [...]
                |
                  \([^)]+\)     # match (...)
                |
                  [^*+?.{()^\$[\]\\]  # match normal + |
                |
                  \\[^x]           # match escapes
                |
                  \\x[0-9a-f]{2}
                |
                  \.
                )
                (          # quantifier
                  (?:
                    \*
                  |
                    \+
                  |
                    \?
                  |
                    {\d*,\d*}
                  )
                  \??          # followed by optional minimal quant.
                |              # match no quantifier
                )
            /gx
    
    ) {  # .. for vim: /
        my ($data, $quant)= ($1, $2);
        #printf("dat=%s q=%s\n", $data, $quant);
        if ($pos != pos($pat)-length($&)) {
            printf("WARN: missed regex chars %d-%d : %s\n", $pos, pos($pat)-length($&), substr($pat, $pos, pos($pat)-length($&)-$pos));
        }
        $pos= pos($pat);
        if (my ($spec, $content)= $data =~ /^\((\?[#:=!>]|\?<[=!]|)(.*)\)$/) {
            $content =~ s/./$&\x00/g;
            $data= "($spec$content)";
        }
        elsif ($data =~ /^\[|^\\/) {
            $data .= "\x00";
        }
        else {
            $data =~ s/./$&\x00/g;
        }
        if ($quant eq "") {
            $upat .= $data;
        }
        elsif ($data =~ /^\(/) {
            $upat .= "$data$quant";
        }
        else {
            $upat .= "(?:$data)$quant";
        }
    }
    #print "ascii   pattern: $pat\n";
    #print "unicode pattern: $upat\n";
    if ($pos!=length($pat)) {
        printf("WARN: missed regex chars %d-%d : %s\n", $pos, length($pat), substr($pat, $pos));
    }
    return $upat;
}
