Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
use strict;
use warnings;
use feature 'state';    # 5.10+: persistent lexicals ('state' variables)
use Digest::SHA1;
use File::Compare;
use File::Spec;
use Getopt::Long qw(:config pass_through);
use Set::Scalar;
# Flag: when true (-r/--recursive), descend into subdirectories as well.
my $search_subdir=0; #flag to determine whether or not subdirectories should be searched.
#flag to determine whether or not we do a line-by-line comparison.
#If not enabled (default), then the SHA1 hashes of each file will be used for comparison.
my $line_by_line=0;
# pass_through (set in the Getopt::Long import above) leaves any unrecognized
# arguments in @ARGV, so the positional directory argument survives parsing.
GetOptions('recursive|r'=>\$search_subdir,'line_by_line|l'=>\$line_by_line);
my $dir="."; # default: search the current directory when no argument is given
# GetOptions has already consumed -r/-l, so $ARGV[0] (if present) is the
# target directory; any further positional arguments are ignored with a warning.
warn "WARNING: All arguments except " . $ARGV[0] . " will be ignored.\n" if @ARGV>1;
$dir=$ARGV[0] if @ARGV;
die "Argument $dir is not a directory" unless (-d $dir);
#For reasons I don't understand, File::Find doesn't seem to like relative directories...
my $abs_dir=File::Spec->rel2abs($dir);
# Collect the list of files to compare into @files (absolute paths).
my @files=();
if($search_subdir)
{
    #Do a depth-first grab of files in $dir and all subdirectories;
    #grab_files() (below) pushes each readable plain file onto @files.
    use File::Find;
    find(\&grab_files,$abs_dir);
}
else #Only grab files from $dir itself (no recursion).
{
    opendir(my $dh,$abs_dir) or die $!;
    #BUG FIX: readdir() returns bare entry names, so the -f test must run on
    #the full path. Testing the bare name checks the *current working
    #directory* instead, silently dropping (or wrongly keeping) entries
    #whenever $dir is not the cwd.
    @files=grep{-f $_}map{File::Spec->catfile($abs_dir,$_)}readdir($dh);
    closedir($dh);
}
# Nothing to do if the directory yielded no files at all.
if(!@files)
{
    print "No files found in directory $dir\n";
    exit 0;
}
# Each element of @duplicates is a Set::Scalar whose members are file paths
# with (pairwise) identical contents. Taken together the sets partition the
# subset of @files that has at least one duplicate.
my @duplicates=();
#We now compare all distinct pairs of files in @files.
#The comparison function (given below) depends on whether or not -l is enabled.
#BUG FIX: the loops previously ran $i up to $#files-2 and $j up to $#files-1,
#so the LAST file in @files was never compared against anything and its
#duplicates were silently missed. All pairs 0 <= $i < $j <= $#files are now
#covered.
foreach my $i (0..($#files-1))
{
    my $file1=$files[$i];
    foreach my $j (($i+1)..$#files)
    {
        my $file2=$files[$j];
        next unless compare_files($file1,$file2); #skip non-duplicates
        #The pair is identical: merge it into an existing duplicate set if
        #either file is already a member (set membership is transitive here,
        #since "same contents" is an equivalence relation)...
        my $found=0;
        foreach my $set (@duplicates)
        {
            if($set->has($file1))
            {
                $set->insert($file2);
                $found=1;
                last;
            }
            elsif($set->has($file2))
            {
                $set->insert($file1);
                $found=1;
                last;
            }
        }
        #...otherwise start a brand-new set containing the pair.
        push @duplicates,Set::Scalar->new($file1,$file2) unless $found;
    }
}
#Now we print out the results.
if(!@duplicates)
{
    print "No duplicate files found!\n";
    exit 0;
}
#Horizontal "line" used to keep each duplicate set visually separated.
my $hl="\n\n" . ('~' x 20) . "\n\n";
print "Duplicates:\n";
for my $set (@duplicates)
{
    print $hl;
    #One path per line, sorted so the output is deterministic.
    print "$_\n" for sort $set->elements;
    print $hl;
}
#Returns true if $file1 and $file2 have identical contents, false otherwise.
#With -l it defers to File::Compare::compare (byte/line comparison);
#otherwise it compares the files' SHA1 digests (memoized, so each file is
#hashed at most once across the whole O(n^2) pairwise loop).
#Dies if either file cannot be read or compared.
sub compare_files
{
    my ($file1,$file2)=@_;
    if($line_by_line) #using File::Compare::compare
    {
        my $ret_val=eval{compare($file1,$file2)};
        die "File::Compare::compare encountered an error: " . $@ if $@;
        #BUG FIX: compare() also reports failure by RETURNING -1 (with $!
        #set) rather than dying; previously that was silently treated as
        #"files differ".
        die "File::Compare::compare encountered an error: $!" if $ret_val==-1;
        return 1 if $ret_val==0; #compare() returns 0 if the files are the same...
        return; #bare return: undef/empty list, per context
    }
    else #Otherwise, we use Digest::SHA1.
    {
        return _sha1_hex($file1) eq _sha1_hex($file2);
    }
}

#Returns the SHA1 digest (40-char hex string) of a file's raw bytes.
#Digests are cached per path, so re-comparing the same file in later pairs
#costs a hash lookup instead of re-reading the whole file.
sub _sha1_hex
{
    my ($file)=@_;
    state %digest_of; #path => hex digest (needs 'use feature "state"')
    return $digest_of{$file} if exists $digest_of{$file};
    open(my $fh,'<',$file) or die "open $file: $!";
    #BUG FIX: hash the raw bytes. Without binmode, platforms with text-mode
    #I/O (e.g. Windows CRLF translation) produce wrong digests for binaries.
    binmode($fh);
    my $sha1=Digest::SHA1->new;
    $sha1->addfile($fh); #Reads the entire file.
    my $hex=$sha1->hexdigest;
    close($fh);
    return $digest_of{$file}=$hex;
}
#File::Find callback: collects every readable plain file into the file-level
#@files array; warns (without dying) about plain files we cannot read.
sub grab_files
{
    my $file=$File::Find::name; #/absolute/path/to/file/filename
    return unless -f $file; #skip directories, device nodes, etc.
    if(-r $file)
    {
        push @files,$file;
    }
    else
    {
        #Customizing the warning message in case we don't have a LOGNAME
        #value in %ENV (eg in Windows).
        my $warning_msg="WARNING: File $file is not readable";
        $warning_msg.=" by user $ENV{LOGNAME}" if exists $ENV{LOGNAME};
        warn "$warning_msg\n";
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement