# detect_duplicates.pl
# Usage: perl detect_duplicates.pl <DIRECTORY>
# --------------------
# By googiek, 2013
# --------------------
# This script searches recursively through a given directory and finds
# duplicate files. Excellent for a large, shared dropbox folder.
#
# The script sorts the files by size, and finds files that have the same size.
# Then, for files with the same size (ie potential duplicates) the script compares
# the md5 checksum of the files to ensure that they are identical. (Checking by size first
# saves time, since md5 is a slower operation than a size check.)
#
# NOTE: Be careful when working with old iWork files, or other files that are packages with lots
# of tiny system files in them. Often they'll show up as identical to the system files in other
# Pages, Numbers, etc files, and bring up a lot of duplicates.
#
# Running this script with no argument, "perl detect_duplicates.pl" brings up this help
#
# ----------------------------------------------------------------------------------------
#
# This script only detects files with exactly the same content. It does not find images that
# are resized / different aspect ratio, GIFs in different frame numbers, watermarks, etc.
# Adding this would be a cool exercise though, and would probably involve some Mad Math,
# ie taking into account color profiles, finding the average color of certain areas in the images,
# or even affine transformations.
use strict;
use warnings;

use Data::Dumper;
use Digest::MD5;
# With no directory argument on the command line, print the usage/help
# text and stop. The heredoc is single-quoted: nothing interpolates.
unless (@ARGV) {
    print <<'USAGE';

# detect_duplicates.pl
# Usage: perl detect_duplicates.pl <DIRECTORY>
# --------------------
# This script searches recursively through a given directory and finds
# duplicate files. Excellent for a large, shared dropbox folder.
#
# The script sorts the files by size, and finds files that have the same size.
# Then, for files with the same size (ie potential duplicates) the script compares
# the md5 checksum of the files to ensure that they are identical. (Checking by size first
# saves time, since md5 is a slower operation than a size check
#
# NOTE: Be careful when working with old iWork files, or other files that are packages with lots
# of tiny system files in them. Often they'll show up as identical to the system files in other
# Pages, Numbers, etc files, and bring up a lot of duplicates.
#
# Running this script with no argument, "perl detect_duplicates.pl" brings up this help
USAGE
    die;
}
# ---------------------------------------------------------------------------
# Main: collect files, sort by size, MD5-compare same-size neighbours,
# then (after confirmation) delete the detected duplicates.
# ---------------------------------------------------------------------------
my $dir = $ARGV[0];

# Collect every regular file under $dir, recursively.
my @files = @{ AddFiles($dir) };
print scalar(@files) . " files found...\n";

# Sort files largest-first by size.  A Schwartzian transform caches each
# file's size so stat() runs once per file instead of once per comparison
# inside sort.
@files = map  { $_->[1] }
         sort { $b->[0] <=> $a->[0] }
         map  { [ -s $_, $_ ] } @files;
print "Files have been sorted...\n";

# Return the MD5 hex digest of a file's raw bytes.
sub file_md5 {
    my ($path) = @_;
    # Three-arg open with a lexical handle; binmode so the digest sees raw
    # bytes (matters on platforms with CRLF translation layers).
    open my $fh, '<', $path or die "Can't open $path\n$!\n";
    binmode $fh;
    my $hex = Digest::MD5->new->addfile($fh)->hexdigest;
    close $fh;
    return $hex;
}

# Equal size is a cheap necessary condition for being duplicates; only
# when two neighbours match in size is the (slow) MD5 computed.
# NOTE(review): like the original, this only compares *adjacent* files in
# the size-sorted list, so if three files share a size (A B C) and A == C
# while B differs from both, that pair is missed — confirm whether full
# per-size-group comparison is wanted.
my @todelete;
for my $i (0 .. $#files - 1) {
    next unless -s $files[$i] == -s $files[$i + 1];
    next if $files[$i] =~ m/\.DS_Store/;    # ignore Finder metadata files
    if (file_md5($files[$i]) eq file_md5($files[$i + 1])) {
        print "Duplicates found:\t$files[$i]\t$files[$i + 1]\n";
        push @todelete, $files[$i];
    }
}

# Confirm before deleting anything.
print "\nDelete " . scalar(@todelete) . " files? (y/n)";
my $answer = <STDIN>;
$answer = '' unless defined $answer;    # EOF on STDIN counts as "no"
chomp $answer;
if (lc($answer) eq "y") {
    foreach my $victim (@todelete) {
        # Report (rather than silently ignore) files that fail to unlink.
        unlink $victim or warn "Could not delete $victim: $!\n";
    }
    print "Files deleted.\n";
}
else {
    print "Files not deleted.\n";
}
print "\n\nDone!\n\n";
# Recursively collect every regular file under a directory.
#
# Argument: a directory path.
# Returns:  an array reference of "dir/entry" paths.
# Dies if the directory cannot be opened.
sub AddFiles
{
    my ($dir) = @_;
    opendir(my $dirhandle, $dir) or die "Ouch!\n$!\n\n";
    my @found;
    # Explicit defined(): a bare truth test would end the loop early on an
    # entry literally named "0".
    while (defined(my $entry = readdir($dirhandle))) {
        my $path = "$dir/$entry";
        # Keep regular files, except macOS custom-icon files ("Icon\r").
        if (-f $path and $entry ne "Icon\r") {
            push @found, $path;
        }
        # Recurse into subdirectories, but never into "."/".." (any all-dot
        # name) and never through symlinks — prevents infinite loops.
        elsif (-d $path and !(-l $path) and $entry !~ /^\.+$/) {
            push @found, @{ AddFiles($path) };
        }
    }
    # BUG FIX: the original called close() *after* return (unreachable), and
    # a directory handle must be released with closedir, not close.
    closedir $dirhandle;
    return \@found;
}