Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # detect_duplicates.pl
- # Usage: perl detect_duplicates.pl <DIRECTORY>
- # --------------------
- # By googiek, 2013
- # --------------------
- # This script searches recursively through a given directory and finds
- # duplicate files. Excellent for a large, shared dropbox folder.
- #
- # The script sorts the files by size, and finds files that have the same size.
- # Then, for files with the same size (i.e. potential duplicates), the script compares
- # the md5 checksum of the files to ensure that they are identical. (Checking by size first
- # saves time, since md5 is a slower operation than a size check.)
- #
- # NOTE: Be careful when working with old iWork files, or other files that are packages with lots
- # of tiny system files in them. Often they'll show up as identical to the system files in other
- # Pages, Numbers, etc files, and bring up a lot of duplicates.
- #
- # Running this script with no argument, "perl detect_duplicates.pl" brings up this help
- #
- # ----------------------------------------------------------------------------------------
- #
- # This script only detects files with exactly the same content. It does not find images that
- # were resized or given a different aspect ratio, GIFs with a different number of frames, watermarked copies, etc.
- # Adding this would be a cool exercise though, and would probably involve some Mad Math,
- # ie taking into account color profiles, finding the average color of certain areas in the images,
- # or even affine transformations.
- use strict;
- use Data::Dumper;
- use Digest::MD5;
# Show the built-in help and stop when no directory argument was supplied.
# The help text is kept verbatim; a non-interpolating heredoc avoids having
# to escape the double quotes inside it.
unless (@ARGV) {
    print <<'END_USAGE';

# detect_duplicates.pl
# Usage: perl detect_duplicates.pl <DIRECTORY>
# --------------------
# This script searches recursively through a given directory and finds
# duplicate files. Excellent for a large, shared dropbox folder.
#
# The script sorts the files by size, and finds files that have the same size.
# Then, for files with the same size (ie potential duplicates) the script compares
# the md5 checksum of the files to ensure that they are identical. (Checking by size first
# saves time, since md5 is a slower operation than a size check
#
# NOTE: Be careful when working with old iWork files, or other files that are packages with lots
# of tiny system files in them. Often they'll show up as identical to the system files in other
# Pages, Numbers, etc files, and bring up a lot of duplicates.
#
# Running this script with no argument, "perl detect_duplicates.pl" brings up this help
END_USAGE
    # Exit explicitly instead of calling a bare die: die with no message
    # prints a confusing "Died at ... line N." after the help text.
    # The exit status stays non-zero because no work was done.
    exit 1;
}
# ---------------------------------------------------------------------------
# Main program.
#
# 1. Collect every file below the directory given as the first argument.
# 2. Group the files by size; only groups with two or more members can
#    contain duplicates, so only those files are checksummed.
# 3. Inside each size group, files sharing an MD5 checksum are duplicates:
#    the first one seen is kept, every later one is queued for deletion.
# 4. Ask for confirmation, then delete the queued files.
#
# Paths matching /\.DS_Store/ are never reported or deleted.
# ---------------------------------------------------------------------------
my $dir = $ARGV[0];
my @todelete;

#get files
my @files = @{ AddFiles($dir) };
print scalar(@files) . " files found...\n";

# Group files by size. Each file is stat()ed exactly once here, instead of
# once per comparison inside a sort block.
my %files_of_size;
for my $file (@files) {
    next if $file =~ m/\.DS_Store/;

    my $size = (stat $file)[7];
    unless (defined $size) {
        # The file vanished or cannot be examined; skip it rather than abort.
        warn "Can't stat $file\n$!\n";
        next;
    }
    push @{ $files_of_size{$size} }, $file;
}
print "Files have been sorted...\n";

# Walk the size groups from largest to smallest, so the biggest duplicates
# are reported first.
for my $size (sort { $b <=> $a } keys %files_of_size) {
    my @candidates = @{ $files_of_size{$size} };
    next if @candidates < 2;    # a unique size cannot have a duplicate

    # Grouping by checksum compares every file in the size group with every
    # other one. Comparing only neighbours in a sorted list misses identical
    # files that are separated by a different file of the same size.
    my %kept_file_for;
    for my $file (@candidates) {
        my $digest = _md5_hex_of_file($file);
        next unless defined $digest;    # unreadable file: already warned

        if (exists $kept_file_for{$digest}) {
            # First column is the copy that will be deleted, second the copy kept.
            print "Duplicates found:\t$file\t$kept_file_for{$digest}\n";
            push(@todelete, $file);
        }
        else {
            $kept_file_for{$digest} = $file;
        }
    }
}

print "\nDelete ".scalar(@todelete)." files? (y/n)";
my $flag = <STDIN>;
$flag = '' unless defined $flag;    # EOF on STDIN counts as "no"
chomp($flag);

if (lc($flag) eq "y") {
    for my $file (@todelete) {
        unlink($file) or warn "Can't delete $file\n$!\n";
    }
    print "Files deleted.\n";
}
else {
    print "Files not deleted.\n";
}
print "\n\nDone!\n\n";

# Returns the hexadecimal MD5 checksum of the file at the given path, or
# undef (after printing a warning) when the file cannot be opened or read.
sub _md5_hex_of_file
{
    my ($path) = @_;

    # Three-argument open: the path is always treated as a plain file name,
    # even if it starts with '>' or ends with '|'.
    my $fh;
    unless (open($fh, '<', $path)) {
        warn "Can't open $path\n$!\n";
        return;
    }
    binmode($fh);    # checksum the raw bytes on every platform

    # addfile() croaks on a read error; turn that into a warning so one bad
    # file does not abort the whole run.
    my $digest = eval { Digest::MD5->new->addfile($fh)->hexdigest };
    warn "Can't read $path\n$@\n" unless defined $digest;

    close($fh);
    return $digest;
}
# This function recursively searches through a folder, and adds files to an array.
#
# Argument: the path of the directory to scan.
# Returns:  a reference to an array holding the path ("$dir/$name") of every
#           regular file found in the directory and in all directories below it.
# Dies when a directory cannot be opened.
#
# Not included in the result:
#   - symbolic links. Following a link to a directory can loop forever, and
#     a link to a file would be reported as a duplicate of its own target.
#   - files named "Icon\r".
sub AddFiles
{
    my ($dir) = @_;

    opendir(my $dirhandle, $dir) or die "Ouch! Can't open directory $dir\n$!\n\n";
    # Read all entries and close the handle before recursing, so only one
    # directory handle is open at a time however deep the tree is.
    my @entries = readdir($dirhandle);
    closedir($dirhandle);

    my @files;
    for my $entry (@entries) {
        # Skip only the two special entries; a real directory named "..."
        # must still be scanned.
        next if $entry eq '.' or $entry eq '..';

        my $path = "$dir/$entry";
        next if -l $path;

        if (-f $path) {
            push(@files, $path) if $entry ne "Icon\r";
        }
        elsif (-d $path) {
            push(@files, @{ AddFiles($path) });
        }
    }
    return \@files;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement