Advertisement
Guest User

find_duplicates.pl

a guest
Apr 7th, 2012
181
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 3.77 KB | None | 0 0
  1. use strict;
  2. use warnings;
  3. use File::Compare;
  4. use File::Spec;
  5. use Digest::SHA1;
  6. use Getopt::Long qw(:config pass_through);
  7. use Set::Scalar;
  8.  
  9. my $search_subdir=0; #flag to determine whether or not subdirectories should be searched.
  10.  
  11. #flag to determine whether or not we do a line-by-line comparison.
  12. #If not enabled (default), then the SHA1 hashes of each file will be used for comparison.
  13. my $line_by_line=0;
  14.  
  15. GetOptions('recursive|r'=>\$search_subdir,'line_by_line|l'=>\$line_by_line);
  16.  
  17. my $dir=".";
  18.  
  19. warn "WARNING: All arguments except " . $ARGV[0] . " will be ignored.\n" if @ARGV>1;
  20.  
  21. $dir=$ARGV[0] if @ARGV;
  22.  
  23. die "Argument $dir is not a directory" unless (-d $dir);
  24.  
  25. #For reasons I don't understand, File::Find doesn't seem to like relative directories...
  26. my $abs_dir=File::Spec->rel2abs($dir);
  27.  
  28. my @files=();
  29.  
  30. if($search_subdir)
  31. {
  32.     #Do a depth-first grab of files in $dir and all subdirectories.
  33.     use File::Find;
  34.     find(\&grab_files,$abs_dir);
  35. }
  36. else #Only grab files from $dir.
  37. {
  38.     opendir(my $dh,$abs_dir) or die $!;
  39.  
  40.     @files=map{File::Spec->catfile($abs_dir,$_)}grep{-f $_}readdir($dh);
  41.  
  42.     closedir($dh);
  43. }
  44.  
  45. unless(@files)
  46. {
  47.     print "No files found in directory $dir\n";
  48.     exit 0;
  49. }
  50.  
  51. #Array of Set::Scalar objects,
  52. #each of which represent files that are (pairwise) duplicate.
  53. #So, this forms a partition of the subset of @files that has a duplicate.
  54.  
  55. my @duplicates=();
  56.  
  57. #We now compare all distinct pairs of files in @files.
  58. #The comparison function (given below) depends on whether or not -l is enabled.
  59.  
  60. foreach my $i(0..($#files-2))
  61. {
  62.     my $file1=$files[$i];
  63.    
  64.     foreach my $j (($i+1)..($#files-1))
  65.     {
  66.         my $file2=$files[$j];
  67.  
  68.         if(compare_files($file1,$file2)) #If they're the same...
  69.         {
  70.             #first, see if $file1 is in any element of @duplicates.
  71.             my $found=0; #flag to see if we found $file1 or $file2
  72.  
  73.             foreach my $set (@duplicates)
  74.             {
  75.                 if($set->has($file1))
  76.                 {
  77.                     $set->insert($file2);
  78.                     $found=1;
  79.                     last;
  80.                 }
  81.                 elsif($set->has($file2))
  82.                 {
  83.                     $set->insert($file1);
  84.                     $found=1;
  85.                     last;
  86.                 }
  87.             }
  88.  
  89.             unless($found) #If we didn't find $file1 or $file2 in @duplicates, add a new set!
  90.             {
  91.                 push @duplicates,Set::Scalar->new($file1,$file2);
  92.             }
  93.         }
  94.     }
  95. }
  96.  
  97. #Now we print out the results.
  98.  
  99. unless(@duplicates)
  100. {
  101.     print "No duplicate files found!\n";
  102.     exit 0;
  103. }
  104.  
  105. my $hl="\n\n" . ('~' x 20) . "\n\n"; #Horizontal "line" to keep duplicate sets nice and separated.
  106.  
  107. print "Duplicates:\n";
  108.  
  109. foreach my $set (@duplicates)
  110. {
  111.     print $hl;
  112.     my @elements=$set->elements;
  113.     foreach(sort @elements)
  114.     {
  115.         print "$_\n";
  116.     }
  117.     print $hl;
  118. }
  119.  
  120. sub compare_files
  121. {
  122.     my ($file1,$file2)=@_;
  123.  
  124.     if($line_by_line) #using File::Compare::compare
  125.     {
  126.         my $ret_val=eval{compare($file1,$file2)};
  127.  
  128.         die "File::Compare::compare encountered an error: " . $@ if $@;
  129.  
  130.         return 1 if $ret_val==0; #compare() returns 0 if the files are the same...
  131.  
  132.         return undef;
  133.     }
  134.     else #Otherwise, we use Digest::SHA1.
  135.     {
  136.         open(my $fh1,"< ",$file1) or die $!;
  137.         open(my $fh2,"<",$file2) or die $!;
  138.  
  139.         my $sha1=Digest::SHA1->new;
  140.  
  141.         $sha1->addfile($fh1); #Reads file.
  142.         my $hex1=$sha1->hexdigest; #40 byte hex string.
  143.  
  144.         $sha1->reset;
  145.         $sha1->addfile($fh2);
  146.         my $hex2=$sha1->hexdigest;
  147.  
  148.         close($fh1);
  149.         close($fh2);
  150.  
  151.         return $hex1 eq $hex2;
  152.     }
  153. }
  154.  
  155. sub grab_files
  156. {
  157.     my $file=$File::Find::name; #/relative/path/to/file/filename
  158.     if((-r $file) and (-f $file))
  159.     {
  160.         push @files,$file;
  161.     }
  162.     elsif((-f $file) and !(-r $file))
  163.     {
  164.         #Customizing the warning message in case we don't have a LOGNAME value in %ENV (eg in Windows)
  165.         my $warning_msg="WARNING: File $file is not readable";
  166.         $warning_msg.=" by user " . $ENV{LOGNAME} if exists $ENV{LOGNAME};
  167.         $warning_msg .="\n";
  168.         warn $warning_msg;
  169.     }
  170. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement