#!/usr/bin/perl -w
use strict;

our $VERSION = '1.2.0';

=pod

=head1 NAME

simplify_static_dir.pl - Scans an unchanging (static) directory and simplifies
it by creating hard and symbolic (`soft') links between identical files.

=head1 SYNOPSIS

simplify_static_dir.pl [OPTIONS] [DIR]...

=head1 DESCRIPTION

The more files this script can process at once, the better. It maintains an
internal hash of files indexed by their SHA1 checksum. When it finds a
collision, it removes the file with the fewest file system links, then creates
a hard link to the other in its place. The larger the number of files scanned,
the more likely it is that there will be collisions.
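
In essence, the bookkeeping boils down to a digest-keyed hash, as in the
following simplified sketch (C<$path> stands in for the file currently being
visited; the real code also records each file's C<lstat> data):

    my %files;    # digest => first file seen with that content
    my $digest = Digest::SHA->new->addfile($path)->hexdigest;
    if (exists $files{$digest}) {
        # duplicate contents: re-link $path against $files{$digest}
    } else {
        $files{$digest} = $path;
    }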

There are failsafes in place, though. If somehow two files' SHA1 checksums are
identical, but the file sizes do not match, this program will error out (and
you can go ahead and report the collision; it may be worth money).

There are other features built in as well. Following the logic that unique
data, once created, is unique as of that point in time, this script copies the
timestamp of the older file onto the newer file just before it makes the hard
link. Therefore, simplified directories will often appear older than they
actually are.
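
Concretely, the older C<mtime> decides which timestamp pair survives, roughly
as in this sketch (C<$entry> and C<$curr_entry> are the script's
C<lstat>-based records; C<$kept_name> is only a placeholder for the path that
remains):

    # keep the atime/mtime of whichever copy is older
    my ($atime, $mtime) = @{ (sort { $a->{'mtime'} <=> $b->{'mtime'} }
                              ($entry, $curr_entry))[0] }{qw/ atime mtime /};
    utime $atime, $mtime, $kept_name;    # applied after re-linking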

From the perspective of any program reading files from the simplified
directories, the files will look and behave just as they did before
simplification.

=head1 OPTIONS

=over

=item B<-v>

Verbose output.

=item B<-f>

Print a sum of the number of freed bytes.

=item B<-m> I<REGEX>

Only process file paths matching REGEX.

=item B<-M> I<REGEX>

Exclude file paths matching REGEX.

=item B<-s>

Generate symlinks only.

=item B<-S>

Do not generate ANY symlinks.

=item B<-z>

Include zero-length files in the search.

=item B<--help>

Output this help message and exit.

=item B<--version>

Output version information and exit.

=back

By default, scans the current directory. Files that cannot be hard-linked are
symlinked by default.
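
For example, to process only ISO images under two mirror trees, report each
action, and print the total space freed (the paths here are only
illustrative):

    simplify_static_dir.pl -v -f -m '\.iso$' /srv/mirror /srv/archive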

=head1 CHANGES

=over

=item 1.1.0

=over

=item *

Outputs GNU-style messages (a la `rm -v', `ln -v', etc.).

=back

=item 1.1.1

=over

=item *

Added `-m' and `-M' options.

=back

=item 1.1.2

=over

=item *

Added `-z' option. Now the default behavior is to skip empty files, because
what's the point of freeing up no space?

=back

=item 1.2.0

=over

=item *

Uses L<Digest::SHA> instead of L<Digest::MD5>. MD5 has been broken.

=back

=back

=head1 COPYRIGHT

Copyright (C) 2010 Dan Church.

License GPLv3+: GNU GPL version 3 or later (L<http://gnu.org/licenses/gpl.html>).

This is free software: you are free to change and redistribute it.

There is NO WARRANTY, to the extent permitted by law.

=head1 AUTHOR

Written by Dan Church S<E<lt>amphetamachine@gmail.comE<gt>>

=cut

use File::Find      qw/ find /;
#use Digest::MD5    qw//;
use Digest::SHA     qw//;
use Getopt::Std     qw/ getopts /;
use Pod::Text       qw//;

# Getopt::Std calls this for `--help'; render this script's POD as the help text
sub HELP_MESSAGE {
    my ($fh, $pod) = (shift, Pod::Text->new);
    $pod->parse_from_file($0, $fh);

    exit 0;
}

my %opts = (
    'v' => 0,
    'f' => 0,
    'm' => '',
    'M' => '',
    's' => 0,
    'S' => 0,
    'z' => 0,
);

&getopts('vfm:M:sSz', \%opts);

my %files;

# correct relative paths
# OR if no directories given, search the current directory
push @ARGV, $ENV{'PWD'} unless map { s#^([^/])#$ENV{'PWD'}/$1# } @ARGV;

my $freed_bytes = 0;

&find(\&findexec, @ARGV);

sub findexec {
    # limit to or exclude file patterns specified by `-m' or `-M',
    # respectively (an empty pattern imposes no restriction; note that a
    # literal m// would re-use the last successful match, hence the length
    # checks)

    # truth table
    # -m unset or matches | -M is used & matches | process?
    #          0          |          0           |    0
    #          0          |          1           |    0
    #          1          |          0           |    1
    #          1          |          1           |    0
    unless ((!length $opts{'m'} or $File::Find::name =~ m/$opts{'m'}/) and
        !(length $opts{'M'} and $File::Find::name =~ m/$opts{'M'}/)) {

        print STDERR "Skipping path `$File::Find::name'\n"
            if $opts{'v'};
        return;
    }

    # make sure the file exists and it's not a link
    if (-f $File::Find::name && ! -l $File::Find::name) {
        #my $ctx = Digest::MD5->new;
        my $ctx = Digest::SHA->new;
        open my $fh, '<', $File::Find::name or do {
            print STDERR "Failed to open `$File::Find::name': $!\n"
                if $opts{'v'};
            return;
        };
        $ctx->addfile($fh);
        close $fh;

        # save the hex digest because reading the value from
        # Digest::* destroys it
        my $digest = $ctx->hexdigest;

        # organize results from lstat into hash
        my $entry = {};
        (@{$entry}{qw/ name dev ino mode nlink uid gid rdev size
                atime mtime ctime blksize blocks /})
            = ($File::Find::name, lstat $File::Find::name);

        # skip zero-length files if wanted (`-z')
        # truth table:
        # -z | non-zero length | return?
        # 0  | 0               | 1
        # 0  | 1               | 0
        # 1  | 0               | 0
        # 1  | 1               | 0
        return unless $opts{'z'} or $entry->{'size'};

        # check to see if we've come across a file with the same checksum
        if (exists $files{$digest}) {
            my $curr_entry = $files{$digest};

            # don't waste my time
            return if $curr_entry->{'name'} eq $entry->{'name'} or
                ($curr_entry->{'dev'} == $entry->{'dev'} and
                    $curr_entry->{'ino'} == $entry->{'ino'});

            # identical files should be the same size (I'm paranoid
            # about checksumming procedures); if they're not, there's a
            # problem with the checksumming procedure or this script is
            # processing WAY too many files
            # (addendum: perhaps I should congratulate the user on
            # finding a collision in SHA-1)
            if ($curr_entry->{'size'} != $entry->{'size'}) {
                die "ERROR: checksums identical for two non-identical files!!!:\n".
                    "files:\t`$curr_entry->{'name'}'\n".
                    "\t`$entry->{'name'}'\n".
                    "SHA1: ($digest)\n".
                    '(this is probably a limit of SHA1; try processing fewer files)';
            }

            # find the oldest time stamp
            my ($atime, $mtime) = @{(sort
                {$a->{'mtime'} <=> $b->{'mtime'}}
                ($entry, $curr_entry)
            )[0]}{qw/ atime mtime /};

            # find the file less embedded in the file system
            my ($less_linked, $more_linked) = sort
                {$a->{'nlink'} <=> $b->{'nlink'}}
                ($entry, $curr_entry);

            # hard-linkable files must exist on the same device, and
            # hard linking must not have been disabled (`-s')
            if ($curr_entry->{'dev'} == $entry->{'dev'} &&
                ! $opts{'s'}) {
#               print "hard-linking $new_file\t=>$old_file\n";
                # attempt to unlink the file
                printf STDERR "removing file `%s'\n",
                    $less_linked->{'name'} if $opts{'v'};
                unless (unlink $less_linked->{'name'}) {

                    # couldn't do it; try more-linked file

                    printf STDERR <<EOF
Failed to remove file `%s': %s
(using `%s')
EOF
,
                    $less_linked->{'name'},
                    $!,
                    $more_linked->{'name'}
                        if $opts{'v'};

                    # if we can't do this, there's no point
                    # in continuing
                    unless (unlink $more_linked->{'name'}) {
printf STDERR <<EOF
Failed to remove file `%s' (second failure on match): %s
Skipping...
EOF
,
                        $more_linked->{'name'},
                        $!
                            if $opts{'v'};

                        return;
                    }

                    # the ol' switcheroo
                    ($more_linked, $less_linked) =
                    ($less_linked, $more_linked);

                }

                # one of the two names is now gone; count the bytes as
                # freed only if that name was the last link to its data
                $freed_bytes += $less_linked->{'size'}
                    unless $less_linked->{'nlink'} > 1;

                printf STDERR "hard linking `%s' => `%s'\n",
                $less_linked->{'name'}, $more_linked->{'name'}
                if $opts{'v'};

                # hard link the files
                link $more_linked->{'name'}, $less_linked->{'name'}
                    or print STDERR "Failed to hard link `$less_linked->{'name'}': $!\n";

                # update link count in our hash to reflect the
                # file system (referenced)
                ++$more_linked->{'nlink'};

                # preserve older time stamp
                utime $atime, $mtime, $less_linked->{'name'};
            } elsif (! $opts{'S'}) {
                # the files are on different devices (or `-s' was given);
                # the most that can be done is to soft-link them

                unless (unlink $less_linked->{'name'}) {

                    # couldn't do it; try more-linked file

                    printf STDERR "couldn't remove file `%s' (using `%s')\n",
                    $less_linked->{'name'},
                    $more_linked->{'name'} if $opts{'v'};

                    # if we can't do this, there's no point
                    # in continuing
                    unlink $more_linked->{'name'}
                        or return;

                    # the ol' switcheroo
                    ($more_linked, $less_linked) =
                    ($less_linked, $more_linked);

                }

                # one of the two names is now gone; count the bytes as
                # freed only if that name was the last link to its data
                $freed_bytes += $less_linked->{'size'}
                    unless $less_linked->{'nlink'} > 1;

                printf STDERR "soft-linking %s => %s\n",
                $less_linked->{'name'}, $more_linked->{'name'}
                if $opts{'v'};

                # create a soft link (TODO: relative links)
                symlink $more_linked->{'name'}, $less_linked->{'name'}
                    or print STDERR "Failed to symlink `$less_linked->{'name'}': $!\n";

                # preserve older time stamp
                utime $atime, $mtime, $less_linked->{'name'};
            }
        } else {
            # the file is unique (as far as we know)
            # create a new entry in the hash table
            $files{$digest} = $entry;
        }
    #} elsif (-l $File::Find::name) {
    #   # do something to simplify symlinks
    #   printf STDERR "FIXME: simplifying symlink `%s'\n",
    #   $File::Find::name
    #   if $opts{'v'};

    #   printf STDERR "symlink `%s' points to `%s'\n",
    #   $File::Find::name, readlink $File::Find::name;
    }
}

printf STDERR "freed %d bytes (%0.4G %s)\n",
    $freed_bytes, &hr_size($freed_bytes)
        if $opts{'f'} or $opts{'v'};

sub hr_size {
    my $sz = shift;
    my @sizes = qw/ B KB MB GB TB PB EB ZB YB /;
    my $fact = 1024;
    my $thresh = 0.1;
    my @ret;
    foreach my $exp (reverse 0 .. $#sizes) {
        if ($sz > (1 - $thresh) * $fact ** $exp) {
            @ret = ($sz / $fact ** $exp, $sizes[$exp]);
            last;
        }
    }

    # default to ($sz, 'bytes')
    @ret = ($sz, $sizes[0]) unless @ret;

    wantarray ? @ret : "@ret";
}