#!perl -w use strict; use IO::File; use Compress::Zlib; use Digest::SHA qw(sha1 sha1_hex); use Getopt::Long; $|=1; my $verbose= 0; use constant { PACKTYPE_NONE=>0, PACKTYPE_COMMIT=>1, PACKTYPE_TREE=>2, PACKTYPE_BLOB=>3, PACKTYPE_TAG=>4, # marked as future in cache.h PACKTYPE_FUTURE=>5, PACKTYPE_OFS_DELTA=>6, PACKTYPE_REF_DELTA=>7, }; my %typename= ( +PACKTYPE_NONE()=> 'none', +PACKTYPE_COMMIT()=> 'commit', +PACKTYPE_TREE()=> 'tree', +PACKTYPE_BLOB()=> 'blob', +PACKTYPE_TAG()=> 'tag', +PACKTYPE_FUTURE()=> 'future', +PACKTYPE_OFS_DELTA()=> 'ofs_delta', +PACKTYPE_REF_DELTA()=> 'ref_delta', ); my $dumpblobs; GetOptions( "v+"=>\$verbose, "b"=>\$dumpblobs, ); $|=1; for my $gitdir (@ARGV) { $gitdir .= '/.git' if (-d "$gitdir/.git"); my $git= processgitdir($gitdir); #dump_as_dot($git); analyze_graph($git); #my $graph= simplify_commitgraph($git); } sub b2h { return unpack("H*", $_[0]); } sub gittype { my ($git, $sha)= @_; return exists $git->{$sha} ? $git->{$sha}{_type} : "?"; } sub analyze_graph { my ($git)= @_; my %stats; while (my ($sha, $obj) = each %$git) { if ($obj->{_type} eq 'commit') { # { parent=>[sha1], tree=>sha1, committer, author, message } $stats{cpnt}{gittype($git,$_)}++ for @{$obj->{parent}}; $stats{ctre}{gittype($git, $obj->{tree})}++; } elsif ($obj->{_type} eq 'tag') { # { object=>sha1, type, tagger, tag, message } $stats{gobj}{gittype($git, $obj->{object})}++; if ($git->{$obj->{object}}{_type} ne $obj->{type}) { warn sprintf("warn: tag %s obj %s != %s\n", b2h($sha), gittype($git,$obj->{object}), $obj->{type}); } } elsif ($obj->{_type} eq 'tree') { # { tree=>{sha1=>[file]} } for my $file (keys %{$obj->{tree}}) { $stats{tfle}{gittype($git,$file)}++; } } elsif ($obj->{_type} eq 'blob') { # { size } } } my %x; $x{$_}++ for map { keys %$_ } values %stats; printf(" : %s\n", join("",map {sprintf("%7s", $_)} sort keys %x)); for my $y (keys %stats) { printf("%s: ", $y); for my $x (sort keys %x) { printf(" %6d", $stats{$y}{$x}||0); } printf("\n"); } print "}\n"; } sub dump_as_dot { my ($git)= @_; print "digraph git {\n"; while (my ($sha, $obj) = each %$git) { if ($obj->{_type} eq 'commit') { # { parent=>[sha1], tree=>sha1, committer, author, message } # the parent->commit link printf("\"%s\" -> \"%s\" [color=red]\n", b2h($_), b2h($sha)) for @{$obj->{parent}}; # the commit -> tree link printf("\"%s\" -> \"%s\"\n", b2h($sha), b2h($obj->{tree})); printf("\"%s\" [shape=trapezium, color=red]\n", b2h($sha)); } elsif ($obj->{_type} eq 'tag') { # { object=>sha1, type, tagger, tag, message } printf("\"%s\" <- \"%s\" [color=blue]\n", b2h($obj->{object}), b2h($sha)); printf("\"%s\" [shape=parallelogram, color=blue]\n", b2h($sha)); } elsif ($obj->{_type} eq 'tree') { # { tree=>{sha1=>[file]} } for my $file (keys %{$obj->{tree}}) { # tree -> file links printf("\"%s\" -> \"%s\"\n", b2h($sha), b2h($file)); # the tree printf("\"%s\" [shape=triangle]\n", b2h($sha)); # the file printf("\"%s\" [shape=box label=\"%s%s\"]\n", b2h($file), exists $git->{$file} && exists $git->{$file}{size} ? sprintf("size=%d\\n", $git->{$file}{size}) : "", join("\\n", @{$obj->{tree}{$file}})); #printf("\"%s-f\" -> \"%s\"\n", b2h($file), b2h($file)); } } elsif ($obj->{_type} eq 'blob') { # { size } #printf("\"%s\" [shape=box, label=\"size %d\"]\n", b2h($sha), $obj->{size}); } } print "}\n"; } sub processgitdir { my ($gitdir)=@_; my $git= {}; processfile("$gitdir/index", $git) if -e "$gitdir/index"; my $packdir= "$gitdir/objects/pack"; if (-d $packdir) { opendir DIR, $packdir or die "fatal: $packdir: $!\n"; my @files= readdir DIR; closedir DIR; for my $fn (@files) { next if $fn eq '.' || $fn eq '..'; processfile("$packdir/$fn", $git); } } for my $odir (glob "$gitdir/objects/[0-9a-f][0-9a-f]") { opendir DIR, $odir or die "fatal: $odir: $!\n"; my @files= readdir DIR; closedir DIR; for my $fn (@files) { next if $fn eq '.' || $fn eq '..'; processfile("$odir/$fn", $git); } } return $git; } sub processfile { my ($fn, $git)= @_; printf("---%s\n", $fn); my $fh= IO::File->new($fn, "r") or die "fatal: $fn: $!\n"; binmode $fh; my $data; $fh->read($data, -s $fh); $fh->close(); processdata($data, $fn, $git); } sub processdata { my $fn= $_[1]; my $git= $_[2]; if ($_[0] =~ /^DIR/) { parseindexdata($_[0], $fn); } elsif ($_[0] =~ /^PACK/) { parsepackdata($_[0], $fn, $git); } elsif ($_[0] =~ /^\x78\x01/) { parsecompressedobjectdata($_[0], $fn, $git); } elsif ($_[0] =~ /^\xff\x74\x4f\x63/) { parsepackidxdata($_[0], $fn); } else { warn sprintf("warn: unknown file format: %s : %s\n", $fn, unpack("H*", substr($_[0], 0,16))); } } # 44 49 52 43 00 00 00 # 02, 00 00 00 02, 49 92 a3 00, 00 00 00 00, 49 92 a3 00, 00 00 00 00, 0e 00 00 05, 00 1b f1 fe, 00 00 81 a4, 00 00 01 f5, 00 00 00 14, 00 00 00 08 # e5 a0 c3 3b 87 05 66 0c f6 32 96 be 96 f6 bb f6 41 5d 68 1c # 00 0d : 70 72 6a 31 2f 61 62 63 64 2e 74 78 74 # 00, 00 00 00 00, 49 92 a3 00, 00 00 00 00, 49 92 a3 00, 00 00 00 00, 0e 00 00 05, 00 1b f1 ff, 00 00 81 a4, 00 00 01 f5, 00 00 00 14, 00 00 00 08 # 1a 74 af 79 08 dc 24 5d cf 5d d4 24 de 7d 1a 07 a9 3b fa a9 # 00 0d : 70 72 6a 32 2f 6b 6c 6d 6e 2e 74 78 74 # 00 00 00 00 00 # 37 96 94 50 da e2 c6 38 70 21 6f a9 b7 8c f3 a8 e1 e6 31 c0 sub parseindexdata { my ($data, $fn)= @_; printf("indexdata %s\n", $fn); } sub decodepackheader { my $type; my $size; my $shift=0; while ($_[1]>4)&7; $size= $byte&15; $shift= 4; } else { $size |= ($byte&127)<<$shift; $shift += 7; } last unless $byte&0x80; } return ($type, $size); } sub decodedeltasrcoffset { my $size=0; my $i=0; while ($_[1]0); #parseobjectdata($ddata); my $ofs= 0; my $obj; if ($type == PACKTYPE_COMMIT) { $obj= objparse_commit($ddata, $ofs); } elsif ($type == PACKTYPE_TREE) { $obj= objparse_tree($ddata, $ofs); } elsif ($type == PACKTYPE_BLOB) { $obj= objparse_blob($ddata, $ofs); } elsif ($type == PACKTYPE_TAG) { $obj= objparse_tag($ddata, $ofs); } elsif ($type == PACKTYPE_OFS_DELTA) { $obj= objparse_delta($ddata, $ofs); } else { warn sprintf("warn: unhandled type %d : %s\n", $type, unpack("H*",substr($ddata, 0, 32))); } $obj->{sha}= $objsha; $obj->{raw}= $ddata; if ($type==PACKTYPE_OFS_DELTA) { $obj->{srcdofs}= $deltasrcofs; #printf("delta source= %08lx\n", $hdrofs-$deltasrcofs) if ($verbose>1); if (!exists $objsbyofs{$hdrofs-$deltasrcofs}) { die sprintf("fatal: delta source not found: %08lx\n", $hdrofs-$deltasrcofs); } my $src= $objsbyofs{$hdrofs-$deltasrcofs}; my $newdata= applydelta($src->{raw}, $obj->{cmds}); my $type= $src->{_type}; my $nofs= 0; printf("%08lx: %-10s %6d %s\n", $hdrofs, $type, length($newdata), b2h($objsha)) if ($verbose>1); if ($type eq "commit") { $obj= objparse_commit($newdata, $nofs); } elsif ($type eq "tree") { $obj= objparse_tree($newdata, $nofs); } elsif ($type eq "blob") { $obj= objparse_blob($newdata, $nofs); } elsif ($type eq "tag") { $obj= objparse_tag($newdata, $nofs); } else { warn "warn: unhandled type $type\n"; } my $objsha= sha1(sprintf("%s %d\x00%s", $type, length($newdata), $newdata)); $obj->{sha}= $objsha; $obj->{raw}= $newdata; } if ($obj) { add_to_git($git, $obj); $objsbyofs{$hdrofs}= $obj; } } if ($ofs!=length($data)-20) { warn sprintf("warn: expected sha1 at 0x%x instead of 0x%x in .pack\n", length($data)-20, $ofs); } my $packsha1= substr($data, $ofs, 20); } sub applydelta { my ($src, $deltacmds)= @_; my $dst=""; for my $d (@$deltacmds) { if ($d->{type} eq 'insert') { $dst .= $d->{data}; } elsif ($d->{type} eq 'copy') { $dst .= substr($src, $d->{offset}, $d->{size}); } } return $dst; } sub parsecompressedobjectdata { my $fn= $_[1]; my $git= $_[2]; my $cofs= 0; my $ddata= uncompressdata($_[0], $cofs); if ($cofs!=length($_[0])) { warn sprintf("warn: compressed objfile contains more data at 0x%x\n", $cofs); } if ($fn =~ m{objects/(\w\w)/(\w+)$} && sha1_hex($ddata) ne "$1$2") { warn sprintf("warn: filename does not match sha1: %s .. %s\n", $fn, sha1_hex($ddata)); } my $objsha= pack("H*", "$1$2"); if ($ddata =~ /^(\w+)\s(\d+)\x00/) { my $ofs= length($&); my ($type, $size)= ($1,$2); if ($size!=length($ddata)-$ofs) { warn sprintf("warn: object size mismatch: stored=0x%x, size=0x%x\n", $size, length($ddata)-$ofs); } printf("%-10s %6d %s\n", $type, $size, b2h($objsha)) if ($verbose>0); my $obj; if ($type eq "commit") { $obj= objparse_commit($ddata, $ofs); } elsif ($type eq "tree") { $obj= objparse_tree($ddata, $ofs); } elsif ($type eq "blob") { $obj= objparse_blob($ddata, $ofs); } elsif ($type eq "tag") { $obj= objparse_tag($ddata, $ofs); } else { warn "warn: unhandled type $type\n"; } $obj->{sha}= $objsha; if ($obj) { add_to_git($git, $obj); } } else { warn sprintf("warn: unknown object format, %s\n", unpack("H*", substr($ddata, 0, 32))); } } sub parsepackidxdata { my ($data, $fn)= @_; my ($magic, $version)= unpack("NN", $data); my @fanout= unpack("N256", substr($data, 8, 4*256)); my $n= $fanout[-1]; if (length($data) != 0x430+28*$n) { warn sprintf("warn: index size = 0x%x, expected 0x%x ( n=0x%x )\n", length($data), 0x430+28*$n, $n); } my @sha1list= map { substr($data, 0x408+20*$_, 20) } 0..$n-1; my @crclist= unpack("N*", substr($data, 0x408+20*$n, 4*$n)); my @ofslist= unpack("N*", substr($data, 0x408+20*$n+4*$n, 4*$n)); my $packsha1= substr($data, 0x408+28*$n, 20); my $idxsha1= substr($data, 0x408+28*$n+20, 20); } sub uncompressdata { # 0 = data, 1 = offset my $datalen= length($_[0]); my $d= inflateInit(); my $output=""; while (1) { my $data= substr($_[0], $_[1], 0x8000); my $datalen= length($data); my ($outdata, $err)= $d->inflate($data); if ($err!=Z_STREAM_END && $err!=Z_OK) { die "fatal: inflate: $err\n"; } $_[1] += $datalen-length($data); $output .= $outdata; last if ($err==Z_STREAM_END); } return $output; } # returns { size } sub objparse_blob { #my $data= ; printf(" size=%d\n", length($_[0])-$_[1]) if ($verbose>1); printf("-----\n%s\n-----", substr($_[0], $_[1])) if ($dumpblobs); return { _type=>'blob', size=>length($_[0])-$_[1] }; } sub objparse_keyvallist { my $data= substr($_[0], $_[1]); if ($data =~ /^(.*?)\n\n(.*)$/s) { my ($headers, $message)= ($1,$2); my @lines= split /\n/, $headers; my @keys= map { /\w+/; $& } @lines; my %nkeys; $nkeys{$_}++ for @keys; # note: commit keys found: committer, parent, tree, author # note: tag keys found: object, type, tag, tagger # can have multiple parents if ( grep { $_ ne 'parent' && $nkeys{$_}!=1 } keys %nkeys ) { warn sprintf("warn: dupkeys: %s\n", join(",", map { sprintf("%s:%d", $_, $nkeys{$_}) } keys %nkeys)); } my %keys; for (@lines) { if (/(\w+)\s+(.*)/) { if (exists $keys{$1}) { $keys{$1} .= ",".$2; } else { $keys{$1}= $2; } } } #printf(" tree %s\n", $keys{tree}) if exists $keys{tree}; #printf(" parents %s\n", $keys{parent}) if exists $keys{parent}; #printf(" object %s\n", $keys{object}) if exists $keys{object}; # convert to binary sha's $keys{tree}=pack("H*", $keys{tree}) if exists $keys{tree}; if (exists $keys{parent}) { $keys{parent}=[ map { pack("H*", $_) } split /,/,$keys{parent} ] } else { $keys{parent}=[]; } $keys{object}=pack("H*", $keys{object}) if exists $keys{object}; $keys{message}= $message; return \%keys; } else { warn "warn: unknown commit file format\n"; } # tree # author .. # \n\n # comment } # returns { tree=>{sha1=>[file]} } sub objparse_tree { my $data= substr($_[0], $_[1]); my $pos=0; my %tree; while ($data =~ /([^\x00]+)\x00(.{20})/gs) { my ($file, $sha)= ($1, $2); my $npos= pos($data); if ($pos+length($&)!=$npos) { warn "warn: tree pos mismatch: $pos != $npos\n"; } #printf(" %s %s\n", b2h($sha), $file); push @{$tree{$sha}}, $file; $pos = $npos; } my $npos= length($data); if ($pos!=$npos) { warn "warn: tree pos mismatch: $pos != $npos\n"; } printf(" %s\n", join("\n ", map { sprintf("%s %s", b2h($_), join(";", @{$tree{$_}})) } keys %tree)) if ($verbose>1); return {tree=>\%tree, _type=>'tree'}; } # returns { parent=>[sha1], tree=>sha1, committer, author, message } sub objparse_commit { my $obj= objparse_keyvallist(@_); $obj->{_type}= "commit"; printf(" t=%s p:%s\n", b2h($obj->{tree}), join(",",map { b2h($_) } @{$obj->{parent}})) if ($verbose>1); printf("%s\n", join '\t', split /\n/, $obj->{message}) if ($verbose>1); return $obj; } # returns { object=>sha1, type, tagger, tag, message } sub objparse_tag { my $obj= objparse_keyvallist(@_); $obj->{_type}= "tag"; printf(" %s:%s %s\n", b2h($obj->{object}), $obj->{type}, $obj->{tag}) if ($verbose>1); printf("%s\n", join '\t', split /\n/, $obj->{message}) if ($verbose>1); return $obj; } # returns { cmds=[ {type=insert,data}, {type=copy,size,offset} ] } sub objparse_delta { my $data= substr($_[0], $_[1]); my @delta; my $srcsize= decodedeltasize(@_); my $dstsize= decodedeltasize(@_); my $patchofs=0; while ($_[1]$srcsize || $cpoff+$cpsize>$srcsize) { warn "warn: delta src > srcsize\n"; } if ($patchofs > $dstsize || $patchofs+$cpsize > $dstsize) { warn "warn: delta dst > dstsize\n"; } push @delta, { type=>'copy', size=>$cpsize, offset=>$cpoff }; $patchofs += $cpsize; } elsif ($cmd) { if ($_[1]+$cmd>length($_[0])) { die "fatal: delta ofs > size\n"; } if ($patchofs > $dstsize || $patchofs+$cmd > $dstsize) { warn "warn: delta dst > dstsize\n"; } push @delta, { type=>'insert', data=>substr($_[0], $_[1], $cmd) }; $_[1] += $cmd; $patchofs += $cmd; } else { # marked as 'future encoding' in patch-delta.c die "fatal: unexpected delta opcode 0\n"; } } if ($patchofs != $dstsize) { warn "warn: delta patchsize != dstsize\n"; } if ($verbose>0) { my $ofs=0; for my $d (@delta) { if ($d->{type} eq 'insert') { printf(" @%08lx: insert %s\n", $ofs, unpack("H*", $d->{data})); $ofs += length($d->{data}); } elsif ($d->{type} eq 'copy') { printf(" @%08lx: copy from %08lx, %08lx bytes\n", $ofs, $d->{offset}, $d->{size}); $ofs += $d->{size}; } } printf(" delta total: src=%08lx, dst=%08lx, calcdst=%08lx\n", $srcsize, $dstsize, $ofs); } return { _type=>'delta', cmds=>\@delta }; } sub add_to_git { my ($git, $obj)= @_; if (exists $git->{$obj->{sha}}) { warn sprintf("warn: duplicate objid: %s\n", b2h($obj->{sha})); } $git->{$obj->{sha}}= $obj; }