use strict; use warnings; # decoder for lex/yacc tables # # # short int yy_acclist[83] = lp -> act # short int yy_accept[60] = state -> lp # int yy_ec[256] = char -> c # int yy_meta[24] = c -> c # short int yy_base[61] = state -> chkidx # short int yy_def[61] = state -> state # short int yy_nxt[93] = chkidx:state+c -> state # short int yy_chk[93] = chkidx:state+c -> state # books: # Compiler Design in C by Holub # Principles of compiler Design, by Aho, A. V. and Ullman, J. D # Compilers: Principles, Techniques and Tools, by A. V. Aho, R. Sethi and J. D. Ullman # http://linux.thai.net/~thep/datrie/datrie.html # http://compilers.iecc.com/comparch/article/94-10-131 # http://compilers.iecc.com/comparch/article/94-10-091 # http://compilers.iecc.com/comparch/article/01-11-035 # # http://www.scribd.com/doc/5880/Parsing-Techniques # # "Table-Compression Methods" # # There is a scheme in the Aho/Sethi/Ullman compiler book for compressing # scanner tables. It involves creating two pairs of tables. The first has # "base" and "default" entries, the second has "next" and "check" entries. # The "base" entry is indexed by the current state and yields an index into # the next/check table. The "default" entry gives what to do if the state # transition isn't found in next/check. The "next" entry gives the next # state to enter, but only if the "check" entry verifies that this entry is # correct for the current state. Flex creates templates of series of # next/check entries and then encodes differences from these templates as a # way to compress the tables. use Getopt::Long; my $create_dot; my $dump_statestats; my $dump_chk_nxt; my $dump_txn; my $dump_tablestats; my $dump_charclasses; my %info; my %skipstate; GetOptions( "dot"=>\$create_dot, "tstats"=>\$dump_tablestats, "sstats"=>\$dump_statestats, "cclass"=>\$dump_charclasses, "chknxt"=>\$dump_chk_nxt, "txn"=>\$dump_txn, "cname=s"=>sub { my ($c, $name)= $_[1]=~/(\d+)=(\w+)/; $info{cname}{$c}=$name; }, "skip=s"=>sub { $skipstate{$_}++ for split /,/,$_[1]; }, ) or die usage(); sub usage { return <<__EOF__; Usage: yydecode [options] def.txt [words...] -dot create dot -tstats dump tablestats -sstats dump statestats -cclass dump charclasses -chknxt dump chk+nxt tables -txn dump transitions -cname add name for charclass -skip specify states to exclude in the .dot file __EOF__ } my %tabs; my %rtabs; my $lxfile=shift || die usage(); open FH, "<$lxfile" or die "$lxfile: $!\n"; print "dump of $lxfile\n" if $dump_tablestats||$dump_statestats||$dump_charclasses||$dump_chk_nxt||$dump_txn; my $section; while () { if (/^(yy\w*)\s+DC[DW]\s+(.*?)\s*$/) { my ($name, $values)= ($1,$2); my @values= map { $_>0x8000? $_-0x10000 : $_ } map { eval($_) } split /,\s*/, $values; my %r; $r{$_}++ for @values; my @k=sort {$a<=>$b} keys %r; if ($dump_tablestats) { printf("%04x <=%2d %s\n", scalar @values, $k[-1], join(",", map { $_<0?sprintf("%3d:-%03x", $r{$_}, -$_):sprintf("%3d:%04x", $r{$_}, $_) } @k)); } #printf("%-10s %04x %04x .. %04x %04x\n", $name, @values[0,1,-2,-1]); $tabs{$name}=\@values; push @{$rtabs{$name}{$values[$_]}}, $_ for 0..$#values; if ($dump_tablestats) { print "$_"; } } elsif (/\[(\w+)\]/) { $section= (exists $info{$1}) ? $info{$1} : ($info{$1}= { }); } elsif ($section && /^(\w+)\s*=\s*(.*)/) { my ($k, $v)= ($1, $2); $v=~ s/\s+$//; $section->{$k}= $v; } } close FH; # dump reverse of yy_ec if ($dump_charclasses) { printf(" c: charset\n"); print map { sprintf("%2d: %s\n", $_, join(" ",map { $_<32 || $_>126 ? sprintf("%02x", $_) : chr($_).' ' } @{$rtabs{yy_ec}{$_}})) } sort {$a<=>$b} keys %{$rtabs{yy_ec}}; print "\n"; } # create names for 'c' create_c_names(); dump_as_dot () if ($create_dot); state_stats () if ($dump_statestats); dump_nxt_chk_tabs () if ($dump_chk_nxt); my $txn= calc_transitions(); dump_transitions($txn) if ($dump_txn); #find_tokens(); # process commandline args for my $str (@ARGV) { printf("-------\n"); yymatch(unpack 'C*', $str); } ############################################################## sub escape_char { my $ch=shift; return $ch==0 ? "\\0" : $ch==10? "\\n" : $ch==13? "\\r" : $ch==9 ? "\\t" : $ch==ord('\\') ? "\\\\" : $ch==ord('"') ? "\\\"" : $ch<32 || $ch>=127 ? sprintf("\\x%02X", $ch) : chr($ch); } sub create_c_names { for my $c (keys %{$rtabs{yy_ec}}) { my $n= @{$rtabs{yy_ec}{$c}}; my $ch= $n>=1 ? $rtabs{yy_ec}{$c}[0] : undef; my $ch2= $n>=2 ? $rtabs{yy_ec}{$c}[1] : undef; if ($n==1 && (($ch>=32 && $ch<127)|| $ch==10 || $ch==13 || $ch==9)) { $info{cname}{$c}= escape_char($ch); } if ($n==2 && (($ch>=32 && $ch<127)|| $ch==10 || $ch==13 || $ch==9) && (($ch2^$ch) == 0x20)) { $info{cname}{$c}= sprintf("[%c%c]", $ch, $ch2); } elsif (!exists $info{cname}{$c}) { $info{cname}{$c}= "#$c"; } } } # print nxt + chk tables sub dump_nxt_chk_tabs { my $mdiff= @{$tabs{yy_meta}}; my @rb= sort {$a<=>$b} keys %{$rtabs{yy_base}}; my $hdr= join("",map { sprintf("%3d",$_) } 0..$mdiff-1); my $namhdr= join("",map { sprintf("%3s",$info{cname}{$_}) } 0..$mdiff-1); printf("%s |%s\n", $hdr, $hdr); printf("%s |%s\n", $namhdr, $namhdr); for my $i (0..$#rb-1) { my $end= $rb[$i]+$mdiff-1; #my $end= $rb[$i+1]-1; printf(" %-*s | %-*s | %s\n", 3*$mdiff, join(" ", map { sprintf("%2d", $_) } @{$tabs{yy_chk}}[$rb[$i]..$end]), 3*$mdiff, join(" ", map { sprintf("%2d", $_) } @{$tabs{yy_nxt}}[$rb[$i]..$end]), join(",",@{$rtabs{yy_base}{$rb[$i]}})); } } # calc from/to state stats sub state_stats { my %from; my %to; printf(" state<=%d c<=%d\n", $#{$tabs{yy_def}}, $#{$tabs{yy_meta}}); for my $state (0..$#{$tabs{yy_def}}) { for my $c (0..$#{$tabs{yy_meta}}) { my $next= nextstate($state, $c); $to{$next}++; } } printf("%4d %5d\n", $to{$_}||0, $_) for sort { ($to{$a}||0) <=> ($to{$b}||0) || $a<=>$b } 0..$#{$tabs{yy_def}}; } # dump in .dot format sub dump_as_dot { print "digraph lexer {\n"; for my $state (0..$#{$tabs{yy_def}}) { for my $c (0..$#{$tabs{yy_meta}}) { my $next= nextstate($state, $c); # last state = usually the error state: don't include in .dot next if (exists $skipstate{$next}); printf("%d -> %d [label=\"%s\"]\n", $state, $next, $info{cname}{$c}); } } print "}\n"; } sub calc_transitions { my %transition; for my $state (0..$#{$tabs{yy_def}}) { for my $c (0..$#{$tabs{yy_meta}}) { my $next= nextstate($state, $c); push @{$transition{$state}{$next}}, $c; } } return \%transition; } sub dump_transitions { my $txn= shift; my $n= scalar keys %$txn; my %cref; for my $y (0..$n-1) { for my $x (0..$n-1) { if ($txn->{$x}{$y}) { my $r= rangestring(@{$txn->{$x}{$y}}); printf(" %14s", $r); $cref{$r}++; } else { printf((" " x 14)."."); } } print "\n"; } printf("---------\n"); print map { sprintf("%5d : %s\n", $cref{$_}, $_) } sort keys %cref; } sub rangestring { my @num=@_; my $p0; my @strs; my $i; my $add= sub { if (!defined $p0) { push @strs, $num[$i]; } elsif ($num[$i]-$p0==1) { push @strs, $p0; push @strs, $p0+1; } else { push @strs, $p0.'-'.$num[$i]; } }; # find ranges for ($i=0 ; $i+1<@_ ; $i++) { if ($num[$i+1]-$num[$i]==1) { if (!defined $p0) { $p0= $num[$i]; } } else { $add->(); undef $p0; } } $add->(); return join ",", @strs; } sub walk_tree { my ($txn, %states)= @_; my $n= scalar keys %states; my $prev= grep { $states{$_} == $n-1 } keys %states; my @next= keys %{$txn->{$prev}}; if (@next) { my $x=0; for my $next (@next) { next if (exists $states{$next}); walk_tree($txn, %states, $next=>$n+1); $x++; } if ($x==0) { printf("!%s\n", join(",", sort { $states{$b}<=>$states{$a} } keys %states)); } } } sub find_tokens { walk_tree($txn, 1=>0); } sub checkstate { my ($s, $a)= @_; return $tabs{yy_chk}[$tabs{yy_base}[$s]+$a]==$s; } sub nextstate { my ($s, $a)= @_; while (!checkstate($s, $a)) { my ($ns, $na); #print "->$s"; $ns= $tabs{yy_def}[$s]; if ($s>=@{$tabs{yy_accept}}) { # todo: use >=accept or >=def here ? #print "+"; $na= $tabs{yy_meta}[$a]; } else { $na= $a; } if ($ns==$s && $na==$a) { #print "->$s!"; last; } ($s,$a)=($ns,$na); } $s= $tabs{yy_nxt}[$tabs{yy_base}[$s]+$a]; #print "->$s\n"; return $s; } sub yymatch { my (@chars)= @_; my $state= 1; for my $char (@chars) { my $c= $tabs{yy_ec}[$char]; if ($tabs{yy_accept}[$state]) { printf("%02x c=%2d stat=%2d accept\n", $char, $c, $state); } $state= nextstate($state, $c); } my $act= $tabs{yy_accept}[$state]; printf("state=%d act=%d\n", $state, $act); }