Advertisement
Guest User

fdupes

a guest
Oct 17th, 2011
185
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/perl -w
  2. # Usage: ./fdupes.pl <start directory>
  3.  
  4. use strict;
  5. use Term::ReadKey;
  6. use File::Find;
  7.  
  # Test switch for the interactive (non -d) path: when true, each set of
  # duplicates is only printed and no prompt is shown for it.
  my $testing = 0;

  # Size threshold in bytes used by wanted() to leave out small files.
  # Zero is allowed.
  my $minsize = 100;

  # Running totals shared by the rest of the script.
  my ( $filecount, $bytecount, $fileschecked, $wasted ) = ( 0, 0, 0, 0 );

  # Maps a file size in bytes to the list of paths found with that size.
  my %files;

  # At least one command-line argument is required.
  usage() unless @ARGV;

  # A leading "-d" switches on autodelete; the start directory is then
  # taken from the second argument instead of the first.
  my $autodelete;
  my $searchdir = $ARGV[0];
  ( $searchdir, $autodelete ) = ( $ARGV[1], "yes" ) if $searchdir eq '-d';
  25.  
  # File::Find callback, invoked once per directory entry with the entry
  # name in $_ and its full path in $File::Find::name.
  #
  # Symbolic links and anything that is not a plain file are ignored.
  # $bytecount accumulates the size of every plain file seen, including the
  # ones that are then skipped for not being larger than $minsize.  The
  # remaining files are counted in $filecount and filed in %files under
  # their size in bytes.
  sub wanted {
      my $entry = $_;
      return if !-f $entry or -l $entry;

      my $bytes = ( stat $entry )[7];
      $bytecount += $bytes;

      if ( $bytes > $minsize ) {
          $filecount++;
          push @{ $files{$bytes} }, $File::Find::name;
      }
      return;
  }
  36.  
  # Walk the tree below the start directory (the current directory when
  # none is set); wanted() fills %files and the counters.
  find( \&wanted, $searchdir ? $searchdir : "." );

  # Number of files between two refreshes of the progress line, chosen so
  # the line is redrawn roughly 1000 times at most over the whole run.
  my $update_period = 1 + int( $filecount / 1000 );

  # Initial progress line.  "\r" moves back to the start of the line
  # without a line feed, so later updates overwrite it in place.
  print "Progress: $fileschecked/$filecount\r"
    unless $fileschecked % $update_period;
  48.  
  # Each element of @dupesets is a reference to a list of paths whose
  # contents are identical: [ file1, file2, ... ].
  my @dupesets;

  # Only files of equal size can be duplicates, so every size bucket of
  # %files is examined on its own.  Sizes are visited in ascending order so
  # that the order of the resulting sets is repeatable between runs.
  foreach my $size ( sort { $a <=> $b } keys %files ) {
      my @entries = @{ $files{$size} };

      # A size that occurs only once cannot have a duplicate.
      if ( @entries == 1 ) {
          $fileschecked++;
          next;
      }

      # Take the first remaining file as the reference, move every other
      # file identical to it into @dupes, then repeat with what is left.
      # The loop tests @entries itself rather than the shifted value so a
      # path that is false as a string cannot end it early.
      while (@entries) {
          my $base = shift @entries;
          my ( @dupes, @unmatched );

          foreach my $candidate (@entries) {
              if ( !same( $base, $candidate ) ) {

                  # Still unclassified; it gets its turn as a reference
                  # or as a candidate in a later pass.
                  push @unmatched, $candidate;
                  next;
              }

              # A set always starts with the reference file itself.
              push @dupes, $base unless @dupes;
              push @dupes, $candidate;
              $wasted += $size;

              # The candidate is fully classified - update progress.
              if ( ++$fileschecked % $update_period == 0 ) {
                  print "Progress: $fileschecked/$filecount\r";
              }
          }
          @entries = @unmatched;

          push @dupesets, \@dupes if @dupes;

          # The reference file is done as well - update progress.
          if ( ++$fileschecked % $update_period == 0 ) {
              print "Progress: $fileschecked/$filecount\r";
          }
      }
  }
  # Present the duplicate sets, collect the files to remove, ask for
  # confirmation and delete.  Nothing in here runs when no duplicates
  # were found.
  if (@dupesets) {

      # Reads one keypress in raw terminal mode and restores the terminal
      # afterwards.  The read blocks, so the script sleeps while waiting
      # instead of polling in a tight loop.  End of input is returned as
      # an empty string, which every caller treats as "do nothing".
      my $read_key = sub {
          ReadMode(4);    # raw mode: keys arrive without pressing Enter
          my $pressed = ReadKey(0);
          ReadMode(0);    # restore the original terminal settings
          return defined $pressed ? $pressed : '';
      };

      my @deletelist;
      my $dupesetcount   = scalar @dupesets;
      my $dupesetcounter = 0;

      if ($autodelete) {

          # -d: keep the first file of every set and queue the others.
          foreach my $setref (@dupesets) {
              my @dupes = @$setref;
              push @deletelist, @dupes[ 1 .. $#dupes ];
          }
      } else {
        SET:
          foreach my $setref (@dupesets) {
              my @dupes = @$setref;

              if ($testing) {

                  # Test mode: list the set one path per line, no prompt.
                  print "$_\n" foreach @dupes;
                  print "\n";
                  next SET;
              }

              $dupesetcounter++;
              print "Duplicates found ($dupesetcounter / $dupesetcount)",
                "... Should I keep...\n";

              # Numbered menu (starting at 1) of the files in this set.
              foreach my $index ( 0 .. $#dupes ) {
                  print $index + 1, " : $dupes[$index]\n";
              }

              # Alternative options - keep all files, skip to the end.
              print "0: All\n";
              print "A: Skip all remaining duplicates\n";

              # NOTE(review): one ReadKey call is made per prompt, so it
              # looks like only entries 1-9 can be selected - confirm.
              my $key = $read_key->();

              # 'A' in either case abandons the remaining sets and goes
              # straight to the confirmation step.
              last SET if uc($key) eq 'A';

              # Anything that is not a number means "keep all files".
              $key = '0' unless $key =~ /\A[0-9]+\z/;

              my $keep = $key - 1;
              if ( $key > 0 and $keep <= $#dupes ) {
                  print "you chose: $dupes[$keep]\n";

                  # Queue every file of the set except the chosen one.
                  push @deletelist,
                    @dupes[ grep { $_ != $keep } 0 .. $#dupes ];
              }

              # A zero or out-of-range choice deletes nothing.
              print "\n";
          }
      }

      # Confirm before deleting anything.
      if (@deletelist) {
          print "\n------------------------\n";
          print "list of files to delete:\n";
          print "$_\n" foreach @deletelist;
          print "\nAre you *sure* you want to delete all these files?",
            " (Y/N)\n";

          if ( lc( $read_key->() ) eq 'y' ) {
              print "deleting\n";

              # Delete one at a time so each failure can be reported
              # with its own reason.
              foreach my $victim (@deletelist) {
                  unlink $victim
                    or warn "could not delete $victim: $!\n";
              }
          } else {
              print "wussing out\n";
          }
      }

      # Insert thousands separators into the byte count for display.
      1 while $wasted =~ s/^([-+]?\d+)(\d{3})/$1,$2/;
      print "$wasted bytes in duplicated files\n";
  }
  217.  
  # Reports whether two files have identical contents.
  #
  # Arguments: two file paths.
  # Returns:   1 when both files hold exactly the same bytes, 0 otherwise.
  # Dies:      with a message naming the file when one of them cannot be
  #            opened or read.
  #
  # The paths are opened with three-argument open, so characters such as
  # ">", "|" or surrounding whitespace in a file name are taken literally
  # and never interpreted as an open mode.  Both files are read in binary
  # mode and compared block by block, so memory use stays constant however
  # large the files are.  The lexical handles are closed automatically
  # when the routine returns.
  sub same {
      my ( $path_a, $path_b ) = @_;

      open( my $fh_a, '<', $path_a ) or die "cannot open $path_a: $!\n";
      open( my $fh_b, '<', $path_b ) or die "cannot open $path_b: $!\n";
      binmode $fh_a;
      binmode $fh_b;

      # Cheap rejection first: files of different length cannot match.
      my $size_a = ( stat $fh_a )[7];
      my $size_b = ( stat $fh_b )[7];
      return 0 if $size_a != $size_b;

      my $block_size = 64 * 1024;
      while (1) {
          my ( $block_a, $block_b ) = ( '', '' );
          my $got_a = read( $fh_a, $block_a, $block_size );
          my $got_b = read( $fh_b, $block_b, $block_size );
          die "cannot read $path_a: $!\n" unless defined $got_a;
          die "cannot read $path_b: $!\n" unless defined $got_b;

          return 0 if $block_a ne $block_b;

          # Both blocks were equal; an empty one means both files ended.
          return 1 if $got_a == 0;
      }
  }
  232.  
  # Prints the command-line synopsis to standard output and ends the
  # program.
  sub usage {
      print join "",
        "Usage: $0 [-d] <start directory>\n",
        "     -d : autodelete, keeps first\n";
      exit;
  }
  238.  
  239.  
  240.  
Advertisement
RAW Paste Data Copied
Advertisement