Advertisement
Guest User

Untitled

a guest
Sep 9th, 2023
101
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 7.22 KB | None | 0 0
  1. #!/usr/bin/perl
  2. use strict;
  3. use warnings;
  4.  
  5. use File::Find;
  6. use File::stat;
  7. use Digest::MD5;
  8. use Fcntl;
  9. use Cwd 'abs_path';
  10.  
  11.  
  12. my $progname = $0;
  13. $progname =~ s@.*/@@g;
  14.  
  15. unless (scalar(@ARGV) > 0) {
  16. print <<"USAGE";
  17.  
  18. Usage: $progname dir1 [dir2] ... [dirN] [-d] [-dt] [-same] [-all]
  19.  
  20. Scans dir1 for duplicates of any file found under dir1 to dirN. Also
  21. finds truncated copies of files, provided that at least the first 32K
  22. of each copy is intact. Also reports whether files are "similar", which
  23. means that 32K or more matched before a difference was found.
  24.  
  25. Switches:
  26.     -d: Delete duplicates found under dir1
  27.     -dt: Delete truncated dupes found anywhere (not just dir1)
  28.     -same: Only compares files which have the same name.
  29.     -all: Treat all directories as if they're dir1. -d will delete
  30.        files found under any directory.
  31.  
  32. Suggestion: do trial runs without -d / -dt first. Dupes and truncations
  33. printed in the left hand column are what will be deleted by -d or -dt.
  34.  
  35. USAGE
  36. exit;
  37. }
  38.  
  39. # parse command line arguments
  40. my @searchDirs;
  41. my $deleteDupeMode = 0;
  42. my $deleteTruncMode = 0;
  43. my $sameNameMode = 0;
  44. my $allMode = 0;
  45.  
  46. foreach my $arg (@ARGV) {
  47.    if ($arg eq "-d") {
  48.        $deleteDupeMode = 1;
  49.    } elsif ($arg eq "-dt") {
  50.        $deleteTruncMode = 1;
  51.    } elsif ($arg eq "-same") {
  52.        $sameNameMode = 1;
  53.    } elsif ($arg eq "-all") {
  54.        $allMode = 1;
  55.    } else {
  56.        if (-d $arg) {
  57.            push (@searchDirs, $arg)
  58.        } else {
  59.            die "Error: \"$arg\" is not a directory.\n";
  60.        }
  61.    }
  62. }
  63.  
  64. # Hash of fileInfo containing all files under examination.
  65. # Keyed by absolute pathname to guarantee we don't examine the same file twice.
  66. my %filesByAbsPath;
  67.  
  68. # Hash of arrays of fileInfo.  Keyed by MD5 tag; identifies lists of files which
  69. # should be compared to one another.
  70. my %filesByTag;
  71.  
  72. # Global to track whether files currently being added are in the first pass
  73. my $first = 1;
  74.  
  75.  
  76. # calcTag: returns the MD5 digest of the first 32K of the given file (its 'tag')
  77. sub calcTag($)
  78. {
  79.     my ($filename) = @_;
  80.  
  81.     if (-d $filename) {
  82.         # doing MD5 on a directory is not supported
  83.         return "unsupported"; # we need to return something
  84.     }
  85.  
  86.     # Use 'sysopen' to safely handle filenames with leading
  87.     # whitespace or leading "-"
  88.     sysopen(FILE, $filename, O_RDONLY)
  89.          or die "Unable to open file \"$filename\": $!\n";
  90.     binmode(FILE); # just in case we're on Windows!
  91.     my $data;
  92.     read(FILE, $data, 32768);
  93.     close(FILE);
  94.     return Digest::MD5->new->add($data)->hexdigest;
  95. }
  96.  
  97.  
  98. # checkFile: invoked from the 'find' routine on each file or directory in turn
  99. sub checkFile()
  100. {
  101.     return unless -f $_; # only interested in files, not directories
  102.  
  103.     my $filename = $_;
  104.     my $dirname = $File::Find::dir;
  105.     my $path = $File::Find::name;
  106.  
  107.     return if $filename =~ /^\._/; # ignore files whose names start with "._"
  108.     return if $filename =~ /^\.DS_Store$/;
  109.  
  110.     # Never examine the same file twice.  Avoids detecting the
  111.     # same file as a dupe of itself when the user invokes the
  112.     # script on nested directories.
  113.     my $abspath = abs_path($filename);
  114.     return if ($filesByAbsPath{$abspath});
  115.  
  116.     my $statInfo = stat($filename)
  117.         or warn "Can't stat file \"$dirname/$filename\": $!\n" and return;
  118.     my $size = $statInfo->size;
  119.     my $tag = calcTag("$filename");
  120.  
  121.     my $fileInfo = {
  122.         'name' => $filename,
  123.         'path' => $path,
  124.         'size' => $size,
  125.         'first' => $first,
  126.     };
  127.  
  128.     $filesByAbsPath{$abspath} = $fileInfo;
  129.     push(@{$filesByTag{$tag}}, $fileInfo);
  130. }
  131.  
  132.  
  133. use constant SIZE => 131072;
  134.  
  135. sub compareFiles
  136. {
  137.     my $finf1 = shift;
  138.     my $finf2 = shift;
  139.    
  140.     my $fn1 = $finf1->{path};
  141.     my $fn2 = $finf2->{path};
  142.    
  143.     if ($sameNameMode && $finf1->{name} ne $finf2->{name}) {
  144.         return;
  145.     }
  146.  
  147.     sysopen(FILE1, $fn1, O_RDONLY)
  148.          or die "Unable to open file \"$fn1\": $!\n";
  149.     binmode(FILE1); # just in case we're on Windows!
  150.  
  151.     sysopen(FILE2, $fn2, O_RDONLY)
  152.          or die "Unable to open file \"$fn2\": $!\n";
  153.     binmode(FILE2); # just in case we're on Windows!
  154.  
  155.     my ($data1, $data2);
  156.     my ($nbytes1, $nbytes2);
  157.     my $bytes_compared = 0;
  158.     my $same = 1;
  159.     my $trunc = 0;
  160.  
  161.     do {
  162.         $nbytes1 = read(FILE1, $data1, SIZE);
  163.         $nbytes2 = read(FILE2, $data2, SIZE);
  164.  
  165.         my $len = ($nbytes1 < $nbytes2) ? $nbytes1 : $nbytes2;
  166.         $bytes_compared += $len;
  167.  
  168.         if ($data1 ne $data2) {
  169.             $same = 0;
  170.             if ($nbytes1 != $nbytes2) {
  171.                 my $tdata1 = substr($data1, 0, $len);
  172.                 my $tdata2 = substr($data2, 0, $len);
  173.                 $trunc = 1 if ($tdata1 eq $tdata2);
  174.             }
  175.         }
  176.     } while ($nbytes1 == SIZE && $nbytes2 == SIZE);
  177.    
  178.     close(FILE1);
  179.     close(FILE2);
  180.  
  181.     if ($same) {
  182.         push(@{$finf1->{dupe}}, $finf2);
  183.     } elsif ($trunc) {
  184.         if ($finf1->{size} < $finf2->{size}) {
  185.             push(@{$finf1->{trunc}}, $finf2);
  186.         } else {
  187.             push(@{$finf2->{trunc}}, $finf1);
  188.         }
  189.     } else {
  190.         push(@{$finf1->{sim}}, $finf2);
  191.     }
  192. }
  193.  
  194.  
  195. MAIN:
  196. {
  197.     # Calculate MD5 hash tags for all files.
  198.     while (my $dir = shift @searchDirs) {
  199.         print "Hashing files under \"$dir\" ...\n";
  200.         find(\&checkFile, $dir);
  201.         $first = 0;
  202.     }
  203.  
  204.     # Check for duplicate, truncated, and similar files based on the MD5 hash tag.
  205.     foreach my $fileList (values %filesByTag) {
  206.         while (my $finfo1 = shift @{$fileList}) {
  207.             if ($allMode || $finfo1->{first}) {
  208.                 foreach my $finfo2 (@{$fileList}) {
  209.                     compareFiles($finfo1, $finfo2);
  210.                 }
  211.             }
  212.         }
  213.     }
  214.  
  215.     # Find the maximum path name length to prettify printout
  216.     my $maxlen = 0;
  217.     foreach my $finfo (values %filesByAbsPath) {
  218.         if ($finfo->{dupe} || $finfo->{trunc} || $finfo->{sim}) {
  219.             my $len = length($finfo->{path});
  220.             $maxlen = ($maxlen > $len) ? $maxlen : $len;
  221.         }
  222.     }
  223.     $maxlen += 1;
  224.  
  225.     # Print information about each dupe/trunc/similar and optionally delete
  226.     foreach my $finfo (sort values %filesByAbsPath) {
  227.         # only want to print the original name once, for clarity,
  228.         # so store it in a string and clear the string after each use
  229.         my $name = $finfo->{path};
  230.         foreach my $dupe (@{$finfo->{dupe}}) {
  231.             printf("%-${maxlen}s DUPE OF     %s\n", $name, $dupe->{path});
  232.             $name = "";
  233.         }
  234.  
  235.         foreach my $trunc (@{$finfo->{trunc}}) {
  236.             printf("%-${maxlen}s TRUNC OF    %s\n", $name, $trunc->{path});
  237.             $name = "";
  238.         }
  239.  
  240.         foreach my $sim (@{$finfo->{sim}}) {
  241.             printf("%-${maxlen}s similar to  %s\n", $name, $sim->{path});
  242.             $name = "";
  243.         }
  244.        
  245.         if (($finfo->{dupe} && $deleteDupeMode) || ($finfo->{trunc} && $deleteTruncMode)) {
  246.             if (unlink $finfo->{path}) {
  247.                 printf("%-${maxlen}s DELETED\n", $finfo->{path});
  248.             } else {
  249.                 warn "Couldn't delete $finfo->{path}\n";
  250.             }
  251.         }
  252.     }
  253. }
  254.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement