#!/usr/local/bin/perl
# Give this script one or more directories,
# and it will warn if any of them or their
# children contain files that have the same MD5sum.
#
# This is done in a very IO friendly way,
# but uses a fair piece of RAM.
#
# -Craig Reyenga
#
# How it works: every regular file below the given directories is grouped
# by size first; only files that share their size with at least one other
# file are checksummed.  Checksums can be cached between runs in a log file
# (-L to read, -l to write) keyed by real path and modification time.
#
# Output format (stdout):
#   ##Empty:                      header before the zero-byte files
#   ##Duplicate: s=SIZE md5=SUM   header before each group of duplicates
#   #PATH                         first file of a duplicate group
#   PATH                          every further file of that group
# Lines starting with "###" are progress messages (only with -v).

use strict;
use warnings;

use Digest::MD5;
use File::Find;
use File::Basename;
use Getopt::Std;
use Time::HiRes qw(gettimeofday);
use Cwd qw(realpath cwd);

my %opt;

# NOTE: "r:" and "X:" declare options that take a value.  -r only tests
# for presence, so its value is ignored; the spec is kept unchanged for
# compatibility with existing invocations.
getopts("zv0W:L:s:r:l:X:", \%opt);

my %whites;    # real path => 1 for every whitelisted file
my %logdb;     # real path => "mtime,md5" (checksum cache)
my %sizes;     # file size => [ paths having that size ]
my $debug = 0;
my ($allbytes, $scanbytes, $scanprog, $scanrate, $timeleft) = (0, 0, 0, 0, 0);
my $dig = Digest::MD5->new;
my @xs;        # literal substrings; paths containing one are skipped

if (defined($opt{X})) {
    # The separator must be escaped: a bare '|' is an alternation of two
    # empty patterns and would split the argument into single characters.
    # Empty pieces are dropped because an empty substring matches every path.
    @xs = grep { length } split(/\|/, $opt{X});
}

if (defined($opt{v})) {
    $debug = 1;
}

if (defined($opt{z})) {
    $| = 1;
}

# Load the whitelist: one path per line, lines starting with '#' are comments.
if (defined($opt{W})) {
    if (open(my $white_fh, '<', $opt{W})) {
        while (my $wlent = <$white_fh>) {
            chomp $wlent;
            next if (index($wlent, '#') == 0);
            next unless (length $wlent);
            my $real = realpath($wlent);
            $whites{$real} = 1 if (defined $real);
        }
        close($white_fh);
    }
    else {
        print STDERR "Can't open whitelist: $opt{W}: $!.\n";
    }
}

# Load the checksum log of a previous run: "MTIME MD5 PATH" per line.
if (defined($opt{L})) {
    if (open(my $oldlog_fh, '<', $opt{L})) {
        while (my $logline = <$oldlog_fh>) {
            chomp $logline;
            next if (index($logline, '#') == 0);

            # Limit the split to three fields so that whitespace inside
            # the file name survives unchanged.
            my ($mtime, $cksum, $fname) = split(' ', $logline, 3);
            next unless (defined($fname) && length($fname));
            next unless ($mtime =~ /^\d+$/);    # skip malformed entries

            my $real = realpath($fname);
            next unless (defined $real);
            $logdb{$real} = "$mtime,$cksum";
        }
        close($oldlog_fh);
    }
    else {
        print STDERR "Can't open source logfile: $opt{L}: $!.\n";
    }
}

# Without directories there is nothing to scan: show the usage text.
# Execution deliberately continues (as it always did); every loop below is
# then a no-op, and a log given with -L is still copied to -l.
unless (@ARGV) {
    my $app = basename($0);
    print <<EOF;
Usage: $app [options] <dir> [<dir> ...]

  -z          Unbuffered output.
  -v          Verbose: print progress information.
  -W <file>   File containing a list of files to not check.
              This is done by comparing the 'realpath' of
              the names in the list, and the realpath of
              each encountered element.
  -L <file>   File containing a log to read from a previous run.
  -l <file>   File to write log to. May be the same as the input logfile.
  -0          Display files that are empty (0 bytes).
  -s mtime,ctime,alpha,beta
              Sort duplicates by:
                mtime  File modification time
                ctime  File inode change time
                alpha  Alphabetically (case insensitive)
                beta   Alphabetically (case sensitive)
  -r <any>    Reverse the sorting (the value is ignored).
  -X <a|b>    Don't use -X quite yet :-)
EOF
}

print "### Getting list of files\n" if ($debug);

# Pass 1: walk the directories and group every regular file by its size.
my $startdir = cwd();
foreach my $dir (@ARGV) {
    my $real = realpath($dir);
    unless (defined($real) && chdir($real)) {
        print STDERR "Can't open " . (defined($real) ? $real : $dir) . "\n";
        next;
    }
    find(
        sub {
            my $path = $File::Find::name;

            foreach my $x (@xs) {
                return if ($path =~ /\Q$x\E/);
            }

            # -l performs an lstat; "_" reuses its result, so symlinks are
            # rejected and only plain files pass.
            return unless (!-l $path && -f _);
            return if (defined($opt{W}) && $whites{$path});

            my $size = -s _;
            $allbytes += $size;
            if (defined($sizes{$size})) {
                push(@{ $sizes{$size} }, $path);

                # The first file of a size only has to be read once a
                # second one shows up, so count it retroactively.
                $scanbytes += (scalar @{ $sizes{$size} } == 2) ? ($size * 2) : $size;
            }
            else {
                $sizes{$size} = [$path];
            }
        },
        $real
    );
    chdir($startdir);
}

my $startime = gettimeofday();    # start of the checksum pass
my $inittime = gettimeofday();    # time of the last progress report
my $thistime = $inittime;

if ($debug) {
    my $skipped = $allbytes - $scanbytes;

    # Guard against division by zero when no files were found at all.
    my $skippct = $allbytes ? percentify($skipped / $allbytes) : 0;
    print "### " . getfsizex($allbytes) . " in total.\n";
    print "### " . getfsizex($scanbytes) . " to be scanned.\n";
    print "### " . getfsizex($skipped) . " skipped (" . $skippct . "%).\n";
}

# Pass 2: checksum the files of every size that occurs more than once,
# smallest sizes first.
foreach my $size (sort { $a <=> $b } keys %sizes) {
    my @aos = @{ $sizes{$size} };
    delete $sizes{$size};    # release memory as we go

    if ($size == 0) {
        print "##Empty:\n";
        if (defined($opt{0})) {
            print "$_\n" for (@aos);
        }
        else {
            print "#$_\n" for (@aos);
        }
        next;
    }

    next unless (scalar @aos >= 2);

    if (defined($opt{s})) {
        if ($opt{s} eq 'mtime') {
            @aos = _sort_by_stat_field(9, @aos);
        }
        elsif ($opt{s} eq 'ctime') {
            @aos = _sort_by_stat_field(10, @aos);
        }
        elsif ($opt{s} eq 'alpha') {
            @aos = sort { lc($a) cmp lc($b) } @aos;
        }
        elsif ($opt{s} eq 'beta') {
            @aos = sort @aos;
        }
        @aos = reverse(@aos) if (defined($opt{r}));
    }

    my %sums;    # md5 => [ paths ] for the current size
    foreach my $i (@aos) {
        my @stat = stat($i);
        unless (@stat) {
            print STDERR "##Can't open $i: $!\n";
            next;
        }

        # Reuse the logged checksum only while the mtime is unchanged.
        my $sum;
        if (defined($logdb{$i})) {
            my ($mtime, $cksum) = split(',', $logdb{$i}, 2);
            if (   defined($cksum)
                && length($cksum)
                && $mtime =~ /^\d+$/
                && $mtime == $stat[9])
            {
                $sum = $cksum;
            }
        }

        # No usable log entry (missing or stale): hash the file.  A stale
        # entry must fall through to here as well, otherwise the file ends
        # up with an undefined checksum.
        unless (defined $sum) {
            my $fh;

            # Three-argument open: the name is never parsed for a mode.
            unless (open($fh, '<', $i)) {
                print STDERR "##Can't open $i: $!\n";
                next;
            }
            binmode($fh);
            $dig->addfile($fh);
            $sum = $dig->hexdigest;    # also resets the digest object
            close($fh);
        }

        $scanprog += $size;
        $logdb{$i} = join(',', $stat[9], $sum);
        push(@{ $sums{$sum} }, $i);
    }

    # Sorted so the report order is stable between runs.
    foreach my $cursum (sort keys %sums) {
        my @files = @{ $sums{$cursum} };
        next unless (scalar @files >= 2);
        print "##Duplicate: s=$size md5=$cursum\n";
        print "#" . shift(@files) . "\n";
        print join("\n", @files);
        print "\n";
    }

    # Progress report, at most once every five seconds.
    $thistime = gettimeofday();
    if ($thistime > ($inittime + 5) && $debug) {
        my $elapsed = $thistime - $startime;
        $scanrate = ($elapsed > 0) ? int($scanprog / $elapsed) : 0;

        # A rate of zero would divide by zero; report "now" instead.
        $timeleft = ($scanrate > 0) ? ($scanbytes - $scanprog) / $scanrate : 0;
        my $donepct = $scanbytes ? percentify($scanprog / $scanbytes) : 0;
        print "### "
            . getfsizex($scanprog)
            . ' scanned so far ('
            . $donepct
            . "%) at "
            . getfsizex($scanrate) . "/s. "
            . getfsizex($scanbytes - $scanprog)
            . " to go.\n";
        print "### Estimate finish at "
            . (scalar localtime(gettimeofday() + $timeleft)) . ".\n";
        $inittime = $thistime;
    }
}

print "### Done scan at " . (scalar localtime()) . "\n" if ($debug);

# Write the checksum log: old entries plus everything hashed in this run.
if (defined($opt{l})) {
    if (open(my $newlog_fh, '>', $opt{l})) {
        foreach my $i (sort keys %logdb) {
            my ($mtime, $cksum) = split(',', $logdb{$i}, 2);
            print {$newlog_fh} "$mtime $cksum $i\n";
        }
        close($newlog_fh)
            or print STDERR "Can't write output log file: $!\n";
    }
    else {
        print STDERR "Can't open output log file: $!\n";
    }
}

# Sort paths numerically by one field of their stat() record.
#   $field - index into the stat list (9 = mtime, 10 = ctime)
#   @paths - the paths to sort
# Each path is stat'ed once; paths that cannot be stat'ed sort as 0.
# Returns the sorted list (call in list context).
sub _sort_by_stat_field {
    my ($field, @paths) = @_;
    my %key_of;
    foreach my $path (@paths) {
        my $value = (stat($path))[$field];
        $key_of{$path} = defined($value) ? $value : 0;
    }
    return sort { $key_of{$a} <=> $key_of{$b} } @paths;
}

# Convert a fraction to a whole percentage, truncating: 0.256 -> 25.
sub percentify {
    my ($fraction) = @_;
    return int($fraction * 100);
}

# Format a byte count for humans, e.g. 2048 -> "2K", 5242880 -> "5.0M".
#   $bytes - the size in bytes (rounded to an integer first)
#   $units - optional unit letter (B K M G T P E Z, any case) to force;
#            unknown or false values select the unit automatically
#   $long  - optional; when true and the short form differs from the byte
#            count, " (N bytes)" with thousands separators is appended
# Returns '0B' for zero or non-numeric input.
sub getfsizex {
    my ($bytes, $units, $long) = @_;
    my $fsize_o = round(defined($bytes) ? $bytes : 0);
    $units = defined($units) ? uc($units) : 0;
    $long  = 0 unless (defined $long);

    return '0B' unless (($fsize_o) && ($fsize_o =~ /^-?\d+\.?\d*$/));

    my @r_unit = qw(B K M G T P E Z);
    my $fsize  = $fsize_o;
    my $unit_t = 0;

    # Look the requested unit up; an unknown letter falls back to automatic
    # selection instead of indexing past the end of the table.
    my $forced = 0;
    if ($units) {
        foreach my $idx (0 .. $#r_unit) {
            next unless ($r_unit[$idx] eq $units);
            $unit_t = $idx;
            $forced = 1;
            last;
        }
    }

    if ($forced) {
        $fsize /= (1024**$unit_t);
    }
    else {
        while ($fsize > 1000) {
            last if ($unit_t >= $#r_unit);
            $fsize /= 1024;
            $unit_t++;
        }
    }

    # One decimal for values below 99.99 from megabytes upwards, none
    # otherwise.
    my $digits = ($unit_t >= 2 && $fsize < 99.99) ? 1 : 0;

    $fsize = round($fsize, $digits);
    my $out = "${fsize}$r_unit[$unit_t]";
    $out .= " (" . commify($fsize_o) . " bytes)"
        unless (($fsize == $fsize_o) || (!$long));
    return $out;
}

# Insert thousands separators into an integer: 1234567 -> "1,234,567".
sub commify {
    local $_ = shift;
    1 while s/^([-+]?\d+)(\d{3})/$1,$2/;
    return $_;
}

# Round $n to $d decimal places (default 0) and return it as a string.
sub round {
    my $n = shift;
    my $d = shift || 0;
    return sprintf("%.${d}f", $n);
}

__END__