Advertisement
googiek

detect_duplicates.pl

Aug 4th, 2013
327
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 4.16 KB | None | 0 0
  1. # detect_duplicates.pl
  2. # Usage: perl detect_duplicates.pl <DIRECTORY>
  3. # --------------------
  4. # By googiek, 2013
  5. # --------------------
  6. # This script searches recursively through a given directory and finds
  7. # duplicate files. Excellent for a large, shared dropbox folder.
  8. #
  9. # The script sorts the files by size, and finds files that have the same size.
  10. # Then, for files with the same size (ie potential duplicates) the script compares
  11. # the md5 checksum of the files to ensure that they are identical. (Checking by size first
  12. # saves time, since md5 is a slower operation than a size check.)
  13. #
  14. # NOTE: Be careful when working with old iWork files, or other files that are packages with lots
  15. # of tiny system files in them. Often they'll show up as identical to the system files in other
  16. # Pages, Numbers, etc files, and bring up a lot of duplicates.
  17. #
  18. # Running this script with no argument, "perl detect_duplicates.pl" brings up this help
  19. #
  20. # ----------------------------------------------------------------------------------------
  21. #
  22. # This script only detects files with exactly the same content. It does not find images that
  23. # are resized / different aspect ratio, GIFs in different frame numbers, watermarks, etc.
  24. # Adding this would be a cool exercise though, and would probably involve some Mad Math,
  25. # ie taking into account color profiles, finding the average color of certain areas in the images,
  26. # or even affine transformations.
  27.  
  28. use strict;
  29. use Data::Dumper;
  30. use Digest::MD5;
  31.  
  32. unless(scalar(@ARGV)){print "
  33. # detect_duplicates.pl
  34. # Usage: perl detect_duplicates.pl <DIRECTORY>
  35. # --------------------
  36. # This script searches recursively through a given directory and finds
  37. # duplicate files. Excellent for a large, shared dropbox folder.
  38. #
  39. # The script sorts the files by size, and finds files that have the same size.
  40. # Then, for files with the same size (ie potential duplicates) the script compares
  41. # the md5 checksum of the files to ensure that they are identical. (Checking by size first
  42. # saves time, since md5 is a slower operation than a size check
  43. #
  44. # NOTE: Be careful when working with old iWork files, or other files that are packages with lots
  45. # of tiny system files in them. Often they'll show up as identical to the system files in other
  46. # Pages, Numbers, etc files, and bring up a lot of duplicates.
  47. #
  48. # Running this script with no argument, \"perl detect_duplicates.pl\" brings up this help
  49. ";
  50. die;
  51. }
  52.  
  53. my (@files, $filetemp, @filesizes);
  54. my $dir = $ARGV[0];
  55. my @todelete;
  56.  
  57. #get files
  58. push(@files, @{AddFiles($dir)});
  59. print scalar(@files) . " files found...\n";
  60.  
  61. #sort files by size
  62. @files = sort{-s $b <=> -s $a} @files;
  63. print "Files have been sorted...\n";
  64.  
  65. my $file1_digest = Digest::MD5->new;
  66. my $file2_digest = Digest::MD5->new;
  67.        
  68. for (0 .. scalar(@files)-2){
  69.     if(-s $files[$_] == -s $files[$_+1]){ #if two files have the same size...
  70.         open (FILE1, $files[$_]) or die "Can't open $files[$_]\n$!\n";
  71.         open (FILE2, $files[$_+1]) or die "Can't open $files[$_+1]\n$!\n";
  72.         $file1_digest -> addfile(*FILE1);
  73.         $file2_digest -> addfile(*FILE2);
  74.         if($file1_digest->hexdigest eq $file2_digest->hexdigest and !($files[$_] =~ m/\.DS_Store/)){ #compare the md5 of each file
  75.             print "Duplicates found:\t$files[$_]\t$files[$_+1]\n";
  76.             push(@todelete, $files[$_]);
  77.         }
  78.     }
  79. }
  80.  
  81. print "\nDelete ".scalar(@todelete)." files? (y/n)";
  82. chomp(my $flag = <STDIN>);
  83. if(lc($flag) eq "y"){
  84.     foreach (@todelete){unlink($_);}
  85.     print "Files deleted.\n";
  86. }
  87. else {print"Files not deleted.\n";}
  88. print "\n\nDone!\n\n";
  89.  
  90. # This function recursively searches through a folder, and adds files to an array
  91. sub AddFiles
  92. {
  93.     my $dir = $_[0];
  94.     my $dirhandle;
  95.     opendir ($dirhandle, $dir) or die "Ouch!\n$!\n\n";
  96.     my $filetemp;
  97.     my @files; 
  98.    
  99.     while($filetemp = readdir($dirhandle)){
  100.         if(-f "$dir/$filetemp" and $filetemp ne "Icon\r"){push(@files, "$dir/$filetemp");}
  101.        
  102.         #checks if the file is a directory, not . or .., and not a symbolic link. This prevents infinite loops
  103.         elsif(-d "$dir/$filetemp" and !(-l "$dir/$filetemp") and !($filetemp =~ /^\.+$/)){push(@files, @{AddFiles("$dir/$filetemp")});}
  104.     }
  105.     return \@files;
  106.    
  107.     close $dirhandle;
  108. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement