This week only. Pastebin PRO Accounts Christmas Special! Don't miss out! Want more features on Pastebin? Sign Up, it's FREE!
Guest

detect_duplicates.pl

By: googiek on Aug 4th, 2013  |  syntax: Perl  |  size: 4.16 KB  |  views: 187  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. # detect_duplicates.pl
  2. # Usage: perl detect_duplicates.pl <DIRECTORY>
  3. # --------------------
  4. # By googiek, 2013
  5. # --------------------
  6. # This script searches recursively through a given directory and finds
  7. # duplicate files. Excellent for a large, shared dropbox folder.
  8. #
  9. # The script sorts the files by size, and finds files that have the same size.
  10. # Then, for files with the same size (ie potential duplicates) the script compares
  11. # the md5 checksum of the files to ensure that they are identical. (Checking by size first
  12. # saves time, since md5 is a slower operation than a size check.)
  13. #
  14. # NOTE: Be careful when working with old iWork files, or other files that are packages with lots
  15. # of tiny system files in them. Often they'll show up as identical to the system files in other
  16. # Pages, Numbers, etc files, and bring up a lot of duplicates.
  17. #
  18. # Running this script with no argument, "perl detect_duplicates.pl" brings up this help
  19. #
  20. # ----------------------------------------------------------------------------------------
  21. #
  22. # This script only detects files with exactly the same content. It does not find images that
  23. # are resized / different aspect ratio, GIFs in different frame numbers, watermarks, etc.
  24. # Adding this would be a cool exercise though, and would probably involve some Mad Math,
  25. # ie taking into account color profiles, finding the average color of certain areas in the images,
  26. # or even affine transformations.
  27.  
  28. use strict;
  29. use Data::Dumper;
  30. use Digest::MD5;
  31.  
  32. unless(scalar(@ARGV)){print "
  33. # detect_duplicates.pl
  34. # Usage: perl detect_duplicates.pl <DIRECTORY>
  35. # --------------------
  36. # This script searches recursively through a given directory and finds
  37. # duplicate files. Excellent for a large, shared dropbox folder.
  38. #
  39. # The script sorts the files by size, and finds files that have the same size.
  40. # Then, for files with the same size (ie potential duplicates) the script compares
  41. # the md5 checksum of the files to ensure that they are identical. (Checking by size first
  42. # saves time, since md5 is a slower operation than a size check
  43. #
  44. # NOTE: Be careful when working with old iWork files, or other files that are packages with lots
  45. # of tiny system files in them. Often they'll show up as identical to the system files in other
  46. # Pages, Numbers, etc files, and bring up a lot of duplicates.
  47. #
  48. # Running this script with no argument, \"perl detect_duplicates.pl\" brings up this help
  49. ";
  50. die;
  51. }
  52.  
  53. my (@files, $filetemp, @filesizes);
  54. my $dir = $ARGV[0];
  55. my @todelete;
  56.  
  57. #get files
  58. push(@files, @{AddFiles($dir)});
  59. print scalar(@files) . " files found...\n";
  60.  
  61. #sort files by size
  62. @files = sort{-s $b <=> -s $a} @files;
  63. print "Files have been sorted...\n";
  64.  
  65. my $file1_digest = Digest::MD5->new;
  66. my $file2_digest = Digest::MD5->new;
  67.                
  68. for (0 .. scalar(@files)-2){
  69.         if(-s $files[$_] == -s $files[$_+1]){ #if two files have the same size...
  70.                 open (FILE1, $files[$_]) or die "Can't open $files[$_]\n$!\n";
  71.                 open (FILE2, $files[$_+1]) or die "Can't open $files[$_+1]\n$!\n";
  72.                 $file1_digest -> addfile(*FILE1);
  73.                 $file2_digest -> addfile(*FILE2);
  74.                 if($file1_digest->hexdigest eq $file2_digest->hexdigest and !($files[$_] =~ m/\.DS_Store/)){ #compare the md5 of each file
  75.                         print "Duplicates found:\t$files[$_]\t$files[$_+1]\n";
  76.                         push(@todelete, $files[$_]);
  77.                 }
  78.         }
  79. }
  80.  
  81. print "\nDelete ".scalar(@todelete)." files? (y/n)";
  82. chomp(my $flag = <STDIN>);
  83. if(lc($flag) eq "y"){
  84.         foreach (@todelete){unlink($_);}
  85.         print "Files deleted.\n";
  86. }
  87. else {print"Files not deleted.\n";}
  88. print "\n\nDone!\n\n";
  89.  
  90. # This function recursively searches through a folder, and adds files to an array
  91. sub AddFiles
  92. {
  93.         my $dir = $_[0];
  94.         my $dirhandle;
  95.         opendir ($dirhandle, $dir) or die "Ouch!\n$!\n\n";
  96.         my $filetemp;
  97.         my @files;     
  98.        
  99.         while($filetemp = readdir($dirhandle)){
  100.                 if(-f "$dir/$filetemp" and $filetemp ne "Icon\r"){push(@files, "$dir/$filetemp");}
  101.                
  102.                 #checks if the file is a directory, not . or .., and not a symbolic link. This prevents infinite loops
  103.                 elsif(-d "$dir/$filetemp" and !(-l "$dir/$filetemp") and !($filetemp =~ /^\.+$/)){push(@files, @{AddFiles("$dir/$filetemp")});}
  104.         }
  105.         return \@files;
  106.        
  107.         close $dirhandle;
  108. }
clone this paste RAW Paste Data