# detect_duplicates.pl
# Usage: perl detect_duplicates.pl <directory>
# --------------------
# By googiek, 2013
# --------------------
# This script searches recursively through a given directory and finds
# duplicate files. Excellent for a large, shared Dropbox folder.
#
# The script sorts the files by size and finds files that have the same size.
# Then, for files with the same size (i.e. potential duplicates), the script compares
# the MD5 checksums of the files to ensure that they are identical. (Checking by size first
# saves time, since MD5 is a slower operation than a size check.)
#
# NOTE: Be careful when working with old iWork files, or other files that are packages with lots
# of tiny system files in them. Often they'll show up as identical to the system files in other
# Pages, Numbers, etc. files, and bring up a lot of duplicates.
#
# Running this script with no argument, "perl detect_duplicates.pl", brings up this help.
#
# ----------------------------------------------------------------------------------------
#
# This script only detects files with exactly the same content. It does not find images that
# are resized or have a different aspect ratio, GIFs with different frame counts, watermarked
# copies, etc. Adding this would be a cool exercise though, and would probably involve some
# Mad Math, i.e. taking color profiles into account, finding the average color of certain
# areas in the images, or even affine transformations.

use strict;
use warnings;
use Digest::MD5;

unless (scalar(@ARGV)) {
    print "
# detect_duplicates.pl
# Usage: perl detect_duplicates.pl <directory>
# --------------------
# This script searches recursively through a given directory and finds
# duplicate files. Excellent for a large, shared Dropbox folder.
#
# The script sorts the files by size and finds files that have the same size.
# Then, for files with the same size (i.e. potential duplicates), the script compares
# the MD5 checksums of the files to ensure that they are identical. (Checking by size first
# saves time, since MD5 is a slower operation than a size check.)
#
# NOTE: Be careful when working with old iWork files, or other files that are packages with lots
# of tiny system files in them. Often they'll show up as identical to the system files in other
# Pages, Numbers, etc. files, and bring up a lot of duplicates.
#
# Running this script with no argument, \"perl detect_duplicates.pl\", brings up this help.
";
    exit;
}

my @files;
my $dir = $ARGV[0];
my @todelete;

# Collect files recursively
push(@files, @{ AddFiles($dir) });
print scalar(@files) . " files found...\n";

# Sort files by size so that potential duplicates end up next to each other
@files = sort { -s $b <=> -s $a } @files;
print "Files have been sorted...\n";

my $file1_digest = Digest::MD5->new;
my $file2_digest = Digest::MD5->new;

for (0 .. scalar(@files) - 2) {
    if (-s $files[$_] == -s $files[ $_ + 1 ]) {    # if two adjacent files have the same size...
        open(FILE1, $files[$_])     or die "Can't open $files[$_]\n$!\n";
        open(FILE2, $files[$_ + 1]) or die "Can't open $files[$_+1]\n$!\n";
        binmode(FILE1);    # hash raw bytes so binary files are read correctly
        binmode(FILE2);
        $file1_digest->addfile(*FILE1);
        $file2_digest->addfile(*FILE2);
        close(FILE1);
        close(FILE2);
        # ...compare the MD5 of each file (hexdigest also resets the digest object
        # for the next iteration). Skip the ubiquitous .DS_Store Finder files.
        if (    $file1_digest->hexdigest eq $file2_digest->hexdigest
            and !($files[$_] =~ m/\.DS_Store/)) {
            print "Duplicates found:\t$files[$_]\t$files[$_+1]\n";
            push(@todelete, $files[$_]);
        }
    }
}

print "\nDelete " . scalar(@todelete) . " files? (y/n) ";
chomp(my $flag = <STDIN>);
if (lc($flag) eq "y") {
    foreach (@todelete) { unlink($_); }
    print "Files deleted.\n";
}
else {
    print "Files not deleted.\n";
}

print "\n\nDone!\n\n";

# This function recursively searches through a folder and adds files to an array
sub AddFiles {
    my $dir = $_[0];
    my $dirhandle;
    opendir($dirhandle, $dir) or die "Ouch!\n$!\n\n";
    my @files;
    while (defined(my $filetemp = readdir($dirhandle))) {
        # Plain file (skip the "Icon\r" files that OS X puts in some folders)
        if (-f "$dir/$filetemp" and $filetemp ne "Icon\r") {
            push(@files, "$dir/$filetemp");
        }
        # Recurse only if the entry is a directory, not "." or "..", and not a
        # symbolic link. This prevents infinite loops.
        elsif (-d "$dir/$filetemp" and !(-l "$dir/$filetemp") and !($filetemp =~ /^\.+$/)) {
            push(@files, @{ AddFiles("$dir/$filetemp") });
        }
    }
    closedir($dirhandle);
    return \@files;
}