#find_duplicates.pl

use strict;
use warnings;
use File::Compare;
use File::Spec;
use Digest::SHA1;
use Getopt::Long qw(:config pass_through);
use Set::Scalar;

my $search_subdir=0; #flag to determine whether or not subdirectories should be searched.

#flag to determine whether or not we compare the files directly with File::Compare.
#If not enabled (default), then the SHA1 hashes of each file will be used for comparison.
my $line_by_line=0;

#Recognised options: -r/--recursive and -l/--line_by_line.
#Getopt::Long is imported with pass_through, so anything it does not recognise stays in @ARGV.
GetOptions('recursive|r'=>\$search_subdir,'line_by_line|l'=>\$line_by_line);

my $dir="."; #Directory to scan; the current directory unless one is given on the command line.

warn "WARNING: All arguments except " . $ARGV[0] . " will be ignored.\n" if @ARGV>1;

$dir=$ARGV[0] if @ARGV;

die "Argument $dir is not a directory" unless (-d $dir);

#Work with an absolute path from here on. By default File::Find chdir()s into every
#directory it visits, so a relative $File::Find::name would no longer resolve by the
#time grab_files runs its file tests on it.
my $abs_dir=File::Spec->rel2abs($dir);

#Full paths of every readable regular file that will take part in the comparison.
#Filled in by grab_files in both modes.
my @files=();

if($search_subdir)
{
    #Do a depth-first grab of files in $dir and all subdirectories.
    use File::Find;
    find(\&grab_files,$abs_dir);
}
else #Only grab files from $dir.
{
    opendir(my $dh,$abs_dir) or die "Cannot open directory $abs_dir: $!";

    foreach my $entry (readdir($dh))
    {
        #readdir() returns bare entry names, so the directory has to be prepended
        #before any file test; otherwise the test runs against the current working
        #directory instead of $abs_dir.
        #Handing the full path to grab_files (through the variable it reads) applies
        #exactly the same "regular and readable" filter, and the same warning for
        #unreadable files, as the recursive branch above.
        local $File::Find::name=File::Spec->catfile($abs_dir,$entry);
        grab_files();
    }

    closedir($dh);
}

unless(@files)
{
    print "No files found in directory $dir\n";
    exit 0;
}

#Array of Set::Scalar objects,
#each of which represents a group of files that are (pairwise) duplicates.
#So, this forms a partition of the subset of @files that has a duplicate.

my @duplicates=();

#We now compare all distinct pairs of files in @files.
#The comparison function (compare_files) depends on whether or not -l is enabled.
#
#$i runs over every index except the last one and $j over every index after $i,
#up to and including the last one ($#files), so each unordered pair is visited
#exactly once and the final file takes part in the comparison as well.
#With fewer than two files both ranges are empty and nothing is compared.

foreach my $i (0..($#files-1))
{
    my $file1=$files[$i];

    foreach my $j (($i+1)..$#files)
    {
        my $file2=$files[$j];

        if(compare_files($file1,$file2)) #If they're the same...
        {
            #First, see if $file1 or $file2 is already in an element of @duplicates.
            my $found=0; #flag to see if we found $file1 or $file2

            foreach my $set (@duplicates)
            {
                if($set->has($file1))
                {
                    $set->insert($file2);
                    $found=1;
                    last;
                }
                elsif($set->has($file2))
                {
                    $set->insert($file1);
                    $found=1;
                    last;
                }
            }

            unless($found) #If we didn't find $file1 or $file2 in @duplicates, add a new set!
            {
                push @duplicates,Set::Scalar->new($file1,$file2);
            }
        }
    }
}

#Report what was found.

if(!@duplicates)
{
    print "No duplicate files found!\n";
    exit 0;
}

#Horizontal "line" (20 tildes padded with blank lines) printed before and after every group.
my $hl=join('',"\n\n",'~' x 20,"\n\n");

print "Duplicates:\n";

#One group per duplicate set: separator, the member paths in sorted order
#(one per line), separator.
for my $group (@duplicates)
{
    my @paths=$group->elements;

    print $hl,(map { "$_\n" } sort @paths),$hl;
}

#Decide whether two files have the same contents.
#
#Arguments: $file1,$file2 - paths of the two files to compare.
#Returns:   a true value if the files are considered identical, a false value otherwise.
#Dies:      if File::Compare reports an error, or if a file cannot be opened for hashing.
#
#The test used depends on the file-level $line_by_line flag (-l on the command line):
#  enabled  - the two paths are handed to File::Compare::compare();
#  disabled - the SHA1 digests of the two files are compared.
sub compare_files
{
    my ($file1,$file2)=@_;

    if($line_by_line) #using File::Compare::compare
    {
        my $ret_val=eval{compare($file1,$file2)};

        die "File::Compare::compare encountered an error: " . $@ if $@;

        #compare() returns 0 if the files are the same, 1 if they differ and -1 if it
        #hit an error (e.g. a file it could not open). Treat the error case as fatal
        #rather than silently reporting the files as different.
        die "File::Compare::compare could not compare $file1 and $file2: $!\n" if $ret_val<0;

        return $ret_val==0;
    }

    #Otherwise, we use Digest::SHA1.
    return _sha1_hex($file1) eq _sha1_hex($file2);
}

#Return the SHA1 digest of the contents of $file as a 40 character hex string.
#Dies if the file cannot be opened.
sub _sha1_hex
{
    my ($file)=@_;

    open(my $fh,"<",$file) or die "Cannot open $file: $!";
    binmode($fh); #Hash the raw bytes; no newline translation on any platform.

    my $sha1=Digest::SHA1->new;
    $sha1->addfile($fh); #Reads file.
    my $hex=$sha1->hexdigest;

    close($fh);

    return $hex;
}

#Callback handed to File::Find::find: looks at the path in $File::Find::name and
#  - appends it to the file-level @files if it is a readable regular file;
#  - emits a warning if it is a regular file that is not readable;
#  - ignores it otherwise (directories and other non-regular entries).
sub grab_files
{
    my $path=$File::Find::name; #path of the entry currently being visited

    return unless -f $path; #Only regular files are of interest.

    if(-r $path)
    {
        push @files,$path;
        return;
    }

    #A regular file we cannot read. Mention the user only when LOGNAME is
    #available in %ENV (it may not be, eg in Windows).
    my @parts=("WARNING: File $path is not readable");
    push @parts," by user " . $ENV{LOGNAME} if exists $ENV{LOGNAME};

    warn join('',@parts,"\n");
}