fant0men

MD5db perl script

Jul 31st, 2019
133
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/perl
  2.  
  3. use 5.16.0;
  4. use strict;
  5. use warnings;
  6. #use feature 'unicode_strings';
  7. use Cwd qw(abs_path cwd);
  8. use Digest::MD5 qw(md5_hex);
  9. use IO::Handle qw(autoflush);
  10. use File::Basename qw(basename dirname);
  11. #use File::Slurp qw(read_file);
  12. use diagnostics;
  13.  
  14. use threads qw(yield);
  15. use threads::shared;
  16. use Thread::Queue;
  17. use Thread::Semaphore;
  18. #use Fcntl qw(:flock);
  19. use POSIX qw(SIGINT);
  20. use POSIX qw(ceil);
  21.  
  22. # Create the thread queue.
  23. my $q = Thread::Queue->new();
  24.  
  25. chomp(my $cores = `grep -c ^processor /proc/cpuinfo`);
  26.  
  27. my (@lib, $mode);
  28.  
  29. # Path to and name of log file to be used for logging.
  30. my $logf = "$ENV{HOME}/md5db.log";
  31.  
  32. # Delimiter used for database
  33. my $delim = "\t\*\t";
  34.  
  35. # Array for storing the actual arguments used by the script internally.
  36. # Might be useful for debugging.
  37. my @cmd = (basename($0));
  38.  
  39. # Name of database file.
  40. my $db = 'md5.db';
  41.  
  42. # Clear screen command.
  43. my $clear = `clear && echo`;
  44.  
  45. # Creating a hash that will store the names of files that are
  46. # too big to fit into RAM. We'll process them last.
  47. my %large :shared;
  48.  
  49. # Creating the %gone_tmp hash that will store the names and hashes of possibly
  50. # deleted files.
  51. my %gone_tmp :shared;
  52.  
  53. # Creating a few shared variables.
  54. # %err will be used for errors
  55. # $n will be used to count the number of files processed
  56. # %md5h is the database hash
  57. my %err :shared;
  58. my $n :shared = 0;
  59. my %md5h :shared;
  60. my %file_contents :shared;
  61. my $stopping :shared = 0;
  62. my $file_stack :shared = 0;
  63. my $busy :shared = 0;
  64.  
  65. my $disk_size = 1000000000;
  66.  
  67. # This will be used to control access to the logger subroutine.
  68. my $semaphore = Thread::Semaphore->new();
  69.  
  70. POSIX::sigaction(SIGINT, POSIX::SigAction->new(\&handler))
  71. || die "Error setting SIGINT handler: $!\n";
  72.  
  73. # Creating a custom POSIX signal handler.
  74. # First we create a shared variable that will work as a SIGINT switch.
  75. # Then we define the handler subroutine.
  76. # Each subroutine to be used for starting threads will have to
  77. # take notice of the state of the $saw_sigint variable.
  78. my $saw_sigint :shared = 0;
  79. sub handler { $saw_sigint = 1; }
  80.  
  81. # Open file handle for the log file
  82. open(my $LOG, '>>', $logf) or die "Can't open '$logf': $!";
  83.  
  84. # Make the $LOG file handle unbuffered for instant logging.
  85. $LOG->autoflush(1);
  86.  
  87. # Duplicate STDERR as a regular file handle
  88. open(my $SE, ">&STDERR") or die "Can't duplicate STDERR: $!";
  89.  
  90. ### Subroutine for printing usage instructions
  91. sub usage {
  92.  
  93.     my $s = basename($0);
  94.  
  95.     say <<"HELP"
  96. Usage: $s [options] [directory 1] .. [directory N]
  97.  
  98.     -help Print this help message.
  99.  
  100.     -double Check database for files that have identical
  101.     hashes.
  102.  
  103.     -import Import MD5 sums to the database from already existing
  104.     \*.MD5 files in each directory.
  105.  
  106.     -index Index new files in each directory.
  107.  
  108.     -test Test the MD5 sums of the files in the database to see if
  109.     they've changed.
  110.  
  111. HELP
  112. }
  113.  
  114. # This loop goes through the argument list as passed to the script
  115. # by the user when ran.
  116. foreach my $arg (@ARGV) {
  117.  
  118.     # If argument starts with a dash '-', interprete it as an option
  119.     if ($arg =~ /^-/) {
  120.  
  121.         given ($arg) {
  122.             # When '-double', set script mode to 'double', and call
  123.             # the md5double subroutine later.
  124.             when (/^-double$/) {
  125.                 if (!$mode) { push(@cmd, $arg); $mode = 'double'; }
  126.             }
  127.  
  128.             # When '-import', set script mode to 'import', and call
  129.             # the md5import subroutine later.
  130.             when (/^-import$/) {
  131.                 if (!$mode) { push(@cmd, $arg); $mode = 'import'; }
  132.             }
  133.  
  134.             # When '-help', set script mode to 'help', and print
  135.             # usage instructions later.
  136.             when (/^-help$/) {
  137.                 if (!$mode) { push(@cmd, $arg); $mode = 'help'; }
  138.             }
  139.  
  140.             # When '-index', set script mode to 'index', and call
  141.             # the md5index subroutine later.
  142.             when (/^-index$/) {
  143.                 if (!$mode) { push(@cmd, $arg); $mode = 'index'; }
  144.             }
  145.  
  146.             # When '-test', set the script mode to 'test', and call
  147.             # the md5test subroutine later.
  148.             when (/^-test$/) {
  149.                 if (!$mode) { push(@cmd, $arg); $mode = 'test'; }
  150.             }
  151.         }
  152.     # If argument is a directory, include it in the @lib array
  153.     } elsif (-d $arg) {
  154.         my $dn = abs_path($arg);
  155.         push(@lib, $dn); push(@cmd, $dn); }
  156. }
  157.  
  158. # If no switches were used, print usage instructions
  159. if (!@lib || !$mode || $mode eq 'help')
  160.     { usage(); exit; }
  161.  
  162. #say "@cmd\n";
  163.  
  164. # This subroutine is for loading files into RAM.
  165. sub file2ram {
  166.     my $fn = shift;
  167.  
  168.     my $size = (stat($fn))[7];
  169.     if (!$size) { return(); }
  170.     if ($size < $disk_size) {
  171.  
  172.         open(my $read_fn, '<:raw', $fn) or die "Can't open '$fn': $!";
  173.         sysread($read_fn, $file_contents{$fn}, $size);
  174.         close($read_fn) or die "Can't close '$fn': $!";
  175.  
  176.         { lock($file_stack);
  177.         $file_stack += length($file_contents{$fn}); }
  178.  
  179.         $q->enqueue($fn);
  180.     } elsif ($size) {
  181.         $large{$fn} = 1;
  182.     }
  183. }
  184.  
  185. # This routine is called if something goes wrong and the script needs
  186. # to quit prematurely.
  187. sub iquit {
  188.     my $tid = threads->tid();
  189.     if ($tid == 1) {
  190.  
  191.         # Set the $stopping variable to let the threads know it's time
  192.         # to stop, and sleep for 1 second so they'll have time to quit.
  193.         { lock($stopping);
  194.         $stopping = 1; }
  195.  
  196.         sleep(1);
  197.  
  198.         # Write the hash to the database file and write to the log.
  199.         hash2file();
  200.         logger('int', $n);
  201.  
  202.         # Detaching the threads so Perl will clean up after us.
  203.         foreach my $t (threads->list()) { $t->detach(); }
  204.         exit;
  205.  
  206.     # If the thread calling this function isn't thread 0/1, yield until
  207.     # $stopping is set.
  208.     } elsif ($tid > 1) { while (!$stopping) { yield(); } }
  209. }
  210.  
  211. # Subroutine for controlling the log file
  212. # Applying a semaphore so multiple threads won't try to
  213. # access it at once, just in case ;-)
  214. sub logger {
  215.  
  216.     $semaphore->down();
  217.  
  218.     my($arg, $sw, @fn, $n);
  219.  
  220.     # Creating a variable to hold the current time.
  221.     my $now = localtime(time);
  222.  
  223.     # Array of accepted switches to this subroutine
  224.     my @larg = qw{start int gone corr diff end};
  225.  
  226.     # Loop through all the arguments passed to this subroutine
  227.     # Perform checks that decide which variable the arguments are to
  228.     # be assigned to.
  229.     CHECK: while (@_) {
  230.             $arg = shift(@_);
  231.  
  232.             # If $arg is a switch, set the $sw variable and start
  233.             # the next iteration of the CHECK loop.
  234.             foreach (@larg) {
  235.                 if ($_ eq $arg) { $sw = $arg; next CHECK; }
  236.             }
  237.  
  238.             # If $arg is a number assign it to $n, if it's a file
  239.             # add it to @fn.
  240.             if ($arg =~ /^[0-9]+$/) { $n = $arg; }
  241.             else { push(@fn, $arg); }
  242.     }
  243.     given ($sw) {
  244.         # Starts writing the log.
  245.         when ('start') {
  246.             say $LOG "\n**** Logging started on $now ****\n";
  247.             say $LOG "Running script in '$mode' mode on:\n";
  248.             foreach my $dn (@lib) { say $LOG $dn; }
  249.             say $LOG "";
  250.         }
  251.         # When the script is interrupted by user pressing ^C,
  252.         # say so in STDOUT, close the log.
  253.         when ('int') {
  254.             say "\nInterrupted by user!\n";
  255.             say $LOG $n . " file(s) were tested.";
  256.             say $LOG "\n**** Logging ended on $now ****\n";
  257.             close $LOG or die "Can't close '$LOG': $!";
  258.         }
  259.         # Called when file has been deleted or moved.
  260.         when ('gone') {
  261.             say $LOG $fn[0] . "\n\t" . "has been (re)moved.\n";
  262.             $err{$fn[0]} = "has been (re)moved.\n";
  263.         }
  264.         # Called when file has been corrupted.
  265.         when ('corr') {
  266.             say $LOG $fn[0] . "\n\t" .
  267.             "has been corrupted.\n";
  268.             $err{$fn[0]} = "has been corrupted.\n";
  269.         }
  270.         when ('diff') {
  271.             say $LOG $fn[0] . "\n\t" .
  272.                 "doesn't match the hash in database.\n";
  273.             $err{$fn[0]} = "doesn't match the hash in database.\n";
  274.         }
  275.         # Called when done, and to close the log.
  276.         # If no errors occurred write "Everything is OK!" to the log.
  277.         # If errors occurred print the %err hash.
  278.         # Either way, print number of files processed.
  279.         when ('end') {
  280.             if (!%err) {
  281.                 say $LOG "\nEverything is OK!\n";
  282.             } else {
  283.                 say "\n**** Errors Occurred ****\n";
  284.                 foreach my $fn (sort keys %err) {
  285.                     say $SE $fn . "\n\t" . $err{$fn};
  286.                 }
  287.             }
  288.  
  289.             say $LOG $n . " file(s) were tested.\n" if ($n);
  290.             say $LOG "\n**** Logging ended on $now ****\n";
  291.             close $LOG or die "Can't close '$LOG': $!";
  292.         }
  293.     }
  294.     $semaphore->up();
  295. }
  296.  
  297.  
  298. # Subroutine for reading a database file into the database hash.
  299. # This is the first subroutine that will be executed and all others
  300. # depend upon it, cause without it we don't have a
  301. # database hash to work with.
  302. sub file2hash {
  303.  
  304.     my $db = shift;
  305.     my $dn = shift;
  306.  
  307.     # The format string which is used for parsing the database file
  308.     my $format = qr/^.*\t\*\t[[:alnum:]]{32}$/;
  309.     my (@dbfile, $md5db_in);
  310.  
  311.     # Open the database file and read it into the @dbfile variable
  312.     open($md5db_in, '<', $db) or die "Can't open '$db': $!";
  313.     chomp (@dbfile = (<$md5db_in>));
  314.     close($md5db_in) or die "Can't close '$db': $!";
  315.  
  316.     # Loop through all the lines in the database file and split
  317.     # them before storing in the database hash.
  318.     # Also, print each line to STDOUT for debug purposes
  319.     foreach my $line (@dbfile) {
  320.  
  321.         # If current line matches the proper database file format,
  322.         # continue.
  323.         if ($line =~ /$format/) {
  324.  
  325.             # Split the line into relative file name, and MD5 sum.
  326.             # Also create another variable that contains the absolute
  327.             # file name.
  328.             my ($rel_fn, $hash) = (split(/\Q$delim/, $line));
  329.             my $abs_fn;
  330.             if ($dn ne '.') { $abs_fn = "$dn/$rel_fn"; }
  331.             else { $abs_fn = $rel_fn; }
  332.  
  333.             # If the file name matches "$HOME/.*", then ignore it.
  334.             # Directories in the home-dir of a user are usually
  335.             # configuration files for the desktop and various
  336.             # applications. These files change often and will therefore
  337.             # clog the log file created by this script, making it hard
  338.             # to read.
  339.  
  340.             # If the file name starts with a dot, check further to see
  341.             # if it matches "$HOME/.*".
  342.             if ($abs_fn =~ m/^\./) {
  343.                 my $absabs_fn = abs_path($abs_fn);
  344.                 if ($absabs_fn =~ m(^/home/[[:alnum:]]+/\.)) { next; }
  345.             }
  346.  
  347.             # If $abs_fn is a real file and not already in the hash,
  348.             # continue.
  349.             if (-f $abs_fn && ! $md5h{$abs_fn}) {
  350.                 $md5h{$abs_fn} = $hash;
  351.                 say $abs_fn . $delim . $hash;
  352.  
  353.             # If the file is in the database hash but the MD5 sum
  354.             # found in the database doesn't match the one in the hash,
  355.             # print to the log.
  356.             #
  357.             # This will most likely only be the case for any extra
  358.             # databases that are found in the search path given to
  359.             # the script.
  360.             } elsif (-f $abs_fn && $md5h{$abs_fn} ne $hash) {
  361.                 logger('diff', $abs_fn);
  362.             # Saves the names of deleted or moved files in '%gone'.
  363.             } elsif (! -f $abs_fn) {
  364.                 lock(%gone_tmp);
  365.                 $gone_tmp{${abs_fn}} = $hash;
  366.             }
  367.         }
  368.     }
  369.  
  370.     # Clears the screen, thereby scrolling past the database file print
  371.     print $clear;
  372. }
  373.  
  374. # Subroutine for printing the database hash to the database file
  375. sub hash2file {
  376.  
  377.     my $md5db_out;
  378.  
  379.     open($md5db_out, '>', $db) or die "Can't open '$db': $!";
  380.     # Loops through all the keys in the database hash and prints
  381.     # the entries (divided by the $delim variable) to the database file.
  382.     foreach my $k (sort(keys(%md5h))) {
  383.         say $md5db_out $k . $delim . $md5h{$k};
  384.     }
  385.     close($md5db_out) or die "Can't close '$db': $!";
  386. }
  387.  
  388. sub init_hash {
  389.     my $dn = shift;
  390.  
  391.     # Get all the file names in the path.
  392.     my($files, $md5dbs) = getfiles($dn);
  393.  
  394.     # But first import hashes from any databases found
  395.     # in the search path to avoid re-hashing them.
  396.     if (@{$md5dbs}) {
  397.         foreach my $db (@{$md5dbs}) {
  398.             my $dn = dirname($db);
  399.             file2hash($db, $dn);
  400.         }
  401.     }
  402.     return($files, $md5dbs);
  403. }
  404.  
  405. # Subroutine for finding files
  406. # Finds all the files inside the directory name passed to it,
  407. # and sorts the output before storing it in the @files array.
  408. sub getfiles {
  409.  
  410.     my $dn = shift;
  411.     my(@files, @md5dbs);
  412.  
  413.     open(FIND, '-|', 'find', $dn, '-type', 'f', '-name', '*', '-nowarn')
  414.     or die "Can't run 'find': $!";
  415.     while (my $fn = (<FIND>)) {
  416.             chomp($fn);
  417.  
  418.             # If the file name matches "$HOME/.*", then ignore it.
  419.             # Directories in the home-dir of a user are usually
  420.             # configuration files for the desktop and various
  421.             # applications. These files change often and will therefore
  422.             # clog the log file created by this script, making it hard
  423.             # to read.
  424.             #my $home = $ENV{"HOME"};
  425.             if ($fn =~ m(^/home/[[:alnum:]]+/\.)) { next; }
  426.  
  427.             # Using quotemeta operators here (\Q & \E) because Perl
  428.             # interprets the string as a regular expression when
  429.             # it's not.
  430.             $fn =~ s(\Q$dn\E/)();
  431.  
  432.             if (-f $fn && basename($fn) ne $db) {
  433.                 push(@files, $fn);
  434.             } elsif (-f $fn && basename($fn) eq $db) {
  435.                 push(@md5dbs, $fn);
  436.             }
  437.     }
  438.     close(FIND) or die "Can't close 'find': $!";
  439.     return(\@files, \@md5dbs);
  440. }
  441.  
  442. sub md5double {
  443.  
  444.     if (!keys(%md5h)) {
  445.         say "No database file. Run the script in 'index' mode first\n" .
  446.         "to index the files.";
  447.         exit;
  448.     }
  449.  
  450.     # Loop through the %md5h hash and save the checksums as keys in a
  451.     # new hash called %exists. Each of those keys will hold an
  452.     # anonymous array with the matching file names.
  453.     my %exists;
  454.     foreach my $fn (keys(%md5h)) {
  455.         my $hash = $md5h{$fn};
  456.         if (!$exists{${hash}}) {
  457.             $exists{${hash}}->[0] = $fn;
  458.         } else {
  459.             push(@{$exists{${hash}}}, $fn);
  460.         }
  461.     }
  462.  
  463.     # Loop through the %exists hash and print files that are identical,
  464.     # if any.
  465.     foreach my $hash (keys(%exists)) {
  466.         if (@{$exists{${hash}}} > 1) {
  467.             say "These files have the same hash (${hash}):";
  468.             foreach my $fn (@{$exists{${hash}}}) {
  469.                 say $fn;
  470.             }
  471.             say "";
  472.         }
  473.     }
  474. }
  475.  
  476. # Subroutine for finding and parsing *.MD5 files, adding the hashes
  477. # to the database hash and thereby also to the file.
  478. # It takes 1 argument:
  479. # (1) file name
  480. sub md5import {
  481.  
  482.     my $md5fn = shift;
  483.  
  484.     my ($fn, $hash, @fields, @lines);
  485.  
  486.     # The format string which is used for parsing the *.MD5 files.
  487.     my $format = qr/^[[:alnum:]]{32}\s\*.*/;
  488.  
  489.     # If the file extension is *.MD5 in either upper- or
  490.     # lowercase, continue.
  491.     if ($md5fn =~ /.md5$/i) {
  492.  
  493.         # Open the *.MD5 file and read its contents to the
  494.         # @lines array.
  495.         open(MD5, '<', $md5fn) or die "Can't open '$md5fn': $!";
  496.         chomp(@lines = (<MD5>));
  497.         close(MD5) or die "Can't close '$md5fn': $!";
  498.  
  499.         # Loop to check that the format of the *.MD5 file really
  500.         # is correct before proceeding.
  501.         foreach my $line (@lines) {
  502.  
  503.             # If format string matches the line(s) in the *.MD5
  504.             # file, continue.
  505.             if ($line =~ /$format/) {
  506.  
  507.                 # Split the line so that the hash and file name go
  508.                 # into @fields array.
  509.                 # After that strip the path (if any) of the file
  510.                 # name, and prepend the path of the *.MD5 file to
  511.                 # it instead.
  512.                 # Store hash and file name in the $hash and $fn
  513.                 # variables for readability.
  514.                 @fields = split(/\s\Q*/, $line, 2);
  515.                 my $path = dirname($md5fn);
  516.                 $hash = $fields[0];
  517.  
  518.                 if ($path eq '.') { $fn = basename($fields[1]); }
  519.                 else { $fn = dirname($md5fn)
  520.                 . '/' . basename($fields[1]); }
  521.  
  522.                 # Convert CR+LF newlines to proper LF to avoid
  523.                 # identical file names from being interpreted as
  524.                 # different.
  525.                 $fn =~ s/\r//;
  526.  
  527.                 # Unless file name already is in the database hash,
  528.                 # print a message, add it to the hash.
  529.                 if (! $md5h{$fn} && -f $fn) {
  530.  
  531.                     say "$fn" . "\n\t" .
  532.                     "Imported MD5 sum from '" .
  533.                     basename($md5fn) .
  534.                     "'.\n";
  535.  
  536.                     $md5h{$fn} = $hash;
  537.  
  538.                     # If file name is not a real file, add $fn
  539.                     # to %gone hash..
  540.                     # If file name is in database hash but the
  541.                     # MD5 sum from the MD5 file doesn't match,
  542.                     # print to the log.
  543.                 } elsif (! -f $fn) {
  544.                     lock(%gone_tmp);
  545.                     $gone_tmp{${fn}} = $hash;
  546.                 }
  547.                 elsif ($md5h{$fn} ne $hash)
  548.                 { logger('diff', $md5fn); }
  549.             }
  550.         }
  551.     }
  552. }
  553.  
  554.  
  555. sub md5sum {
  556.     my $fn = shift;
  557.     my $hash;
  558.  
  559.     while ($busy) { yield(); }
  560.  
  561.     if ($large{$fn}) {
  562.         lock($busy);
  563.         $busy = 1;
  564.         my $read_fn;
  565.         open($read_fn, '<:raw', $fn) or die "Can't open '$fn': $!";
  566.         $hash = Digest::MD5->new->addfile($read_fn)->hexdigest;
  567.         close($read_fn) or die "Can't close '$fn': $!";
  568.         $busy = 0;
  569.     } else {
  570.         $hash = md5_hex($file_contents{$fn});
  571.  
  572.         { lock($file_stack);
  573.         $file_stack -= length($file_contents{$fn}); }
  574.  
  575.         { lock(%file_contents);
  576.         delete($file_contents{$fn}); }
  577.     }
  578.  
  579.     return $hash;
  580. }
  581.  
  582. # Subroutine to index the files
  583. # i.e calculate and store the MD5 sums in the database hash/file.
  584. sub md5index {
  585.     my $tid = threads->tid();
  586.  
  587.     # Loop through the thread que.
  588.     LOOP2: while ((my $fn = $q->dequeue_nb()) || !$stopping) {
  589.             if (!$fn) { yield(); next; }
  590.  
  591.             $md5h{$fn} = md5sum($fn);
  592.             say "$tid $fn: done indexing (${file_stack})";
  593.  
  594.             { lock($n);
  595.             $n++; }
  596.  
  597.             # If the $saw_sigint variable has been tripped.
  598.             # Quit this 'while' loop, thereby closing the thread.
  599.             if ($saw_sigint) {
  600.                 say "Closing thread: " . $tid;
  601.                 iquit();
  602.             }
  603.     }
  604.  
  605.     while (!$stopping) {
  606.         goto(LOOP2);
  607.     }
  608. }
  609.  
  610. # Subroutine for testing to see if the MD5 sums
  611. # in the database file are correct (i.e. have changed or not).
  612. sub md5test {
  613.     my $tid = threads->tid();
  614.     my ($oldmd5, $newmd5);
  615.  
  616.     # Loop through the thread queue.
  617.     LOOP: while ((my $fn = $q->dequeue_nb()) || !$stopping) {
  618.                 if (!$fn) { yield(); next; }
  619.  
  620.                 $newmd5 = md5sum($fn);
  621.                 say "$tid $fn: done testing (${file_stack})";
  622.                 $oldmd5 = $md5h{$fn};
  623.  
  624.                 # If the new MD5 sum doesn't match the one in the hash,
  625.                 # and file doesn't already exist in the %err hash,
  626.                 # log it and replace the old MD5 sum in the hash with
  627.                 # the new one.
  628.                 if ($newmd5 ne $oldmd5 && ! $err{$fn}) {
  629.                     logger('diff', $fn);
  630.                     $md5h{$fn} = $newmd5;
  631.                 }
  632.  
  633.                 { lock($n);
  634.                 $n++; }
  635.  
  636.                 # If the $saw_sigint variable has been tripped.
  637.                 # Quit this 'while' loop, thereby closing the thread.
  638.                 if ($saw_sigint) {
  639.                     say "Closing thread: " . $tid;
  640.                     iquit();
  641.                 }
  642.     }
  643.  
  644.     while (!$stopping) {
  645.         goto(LOOP);
  646.     }
  647. }
  648.  
  649. sub md5flac {
  650.     my $fn = shift;
  651.     my (@req, $hash);
  652.  
  653.     if ($fn =~ /.flac$/i) {
  654.  
  655.         if (! @req) {
  656.             chomp(@req = ( `which flac metaflac 2>&-` ));
  657.  
  658.             if (! $req[0] || ! $req[1]) {
  659.                 say "You need both 'flac' and 'metaflac' to test FLAC files!\n" .
  660.                 "Using normal test method...\n";
  661.                 @req = '0';
  662.                 return;
  663.             }
  664.         }
  665.  
  666.         unless ($req[0] eq '0') {
  667.             chomp($hash = `metaflac --show-md5sum "$fn" 2>&-`);
  668.             if ($? != 0 && $? != 2) { logger('corr', $fn); return; }
  669.  
  670.             system('flac', '--totally-silent', '--test', "$fn");
  671.             if ($? != 0 && $? != 2) { logger('corr', $fn); return; }
  672.  
  673.             return $hash;
  674.         }
  675.     }
  676. }
  677.  
  678. # Subroutine for figuring out which files have gone missing.
  679. # If identical MD5 hashes can be found in %md5h, then delete those keys from %gone.
  680. # When done, loop through the %gone hash and echo each key to the logger.
  681. sub p_gone {
  682.  
  683.     # Unless @gone_fn is non-empty, return from this subroutine.
  684.     unless (%gone_tmp) { return; }
  685.  
  686.     my %gone;
  687.     my @gone;
  688.     my $size = %gone_tmp;
  689.  
  690.     # Translates the %gone_tmp hash to the %gone hash / array.
  691.     # We need to do it in this complicated way because 'threads::shared'
  692.     # has no support for hashes within hashes and arrays within arrays.
  693.     # That's why the global variables are only simple arrays, and we
  694.     # translate them to a hash / array here (in this subroutine).
  695.     foreach my $fn (keys(%gone_tmp)) {
  696.         my $hash = $gone_tmp{${fn}};
  697.         push(@{$gone{${hash}}}, $fn);
  698.     }
  699.  
  700.     # Deletes the %gone_tmp hash as it's not needed anymore.
  701.     { lock(%gone_tmp);
  702.     undef(%gone_tmp); }
  703.  
  704.     # Loops through the %md5h hash and deletes every matching MD5
  705.     # hash from the %gone hash / array.
  706.     foreach my $fn (keys(%md5h)) {
  707.         my $hash = ${md5h{${fn}}};
  708.         if ($gone{${hash}}) {
  709.             delete($gone{${hash}});
  710.         }
  711.     }
  712.  
  713.     # Translates the %gone hash to @gone array.
  714.     # Because then we can sort by filename before printing to the logger.
  715.     foreach my $hash (keys(%gone)) {
  716.         foreach my $fn (@{$gone{${hash}}}) {
  717.             push(@gone, $fn);
  718.         }
  719.     }
  720.  
  721.     # Deletes the %gone hash as it's not needed anymore.
  722.     undef(%gone);
  723.  
  724.     # Logs all missing files.
  725.     foreach my $fn (sort(@gone)) {
  726.         logger('gone', $fn);
  727.     }
  728. }
  729.  
  730. # Depending on which script mode is active,
  731. # set the @run array to the correct arguments.
  732. # This will be used to start the threads later.
  733. my @run;
  734. given ($mode) {
  735.     when ('index') {
  736.         @run = (\&md5index);
  737.     }
  738.     when ('test') {
  739.         @run = (\&md5test);
  740.     }
  741. }
  742.  
  743. # If script mode is either 'import' or 'double' we'll start only
  744. # one thread, else we'll start as many as the available number of CPUs.
  745. my @threads;
  746. if ($mode ne 'import' && $mode ne 'double') {
  747.     foreach (1 .. $cores) {
  748.         START_THREADS: push(@threads, threads->create(@run));
  749.     }
  750. }
  751.  
  752. # This loop is where the actual action takes place
  753. # (i.e. where all the subroutines get called from)
  754. foreach my $dn (@lib) {
  755.     if (-d $dn) {
  756.  
  757.         # Change directory to $dn.
  758.         chdir($dn)
  759.             or die "Can't change directory to '$dn': $!";
  760.  
  761.         # Start logging.
  762.         logger('start');
  763.  
  764.         # Initialize the database hash, and the files array.
  765.         # The init_hash function returns references.
  766.         my($files, $md5dbs) = init_hash($dn);
  767.  
  768.         given ($mode) {
  769.             when ('double') {
  770.                 # Find identical files in database.
  771.                 md5double();
  772.             }
  773.             when ('import') {
  774.  
  775.                 # For all the files in $dn, run md5import.
  776.                 foreach my $fn (@{$files}) { md5import($fn); }
  777.  
  778.             }
  779.             when ('index') {
  780.  
  781.                 foreach my $fn (@{$files}) {
  782.                     if ($saw_sigint) { iquit(); }
  783.                     while ($file_stack >= $disk_size) {
  784.                         my $active = threads->running();
  785.                         say("${active}: $file_stack > $disk_size");
  786.                         yield();
  787.                     }
  788.                     # Unless file name exists in the database hash,
  789.                     # continue.
  790.                     unless ($md5h{$fn}) {
  791.                         file2ram($fn);
  792.                     }
  793.                 }
  794.  
  795.             }
  796.             when ('test') {
  797.  
  798.                 # Fetch all the keys for the database hash and put
  799.                 # them in the queue.
  800.                 foreach my $fn (sort(keys(%md5h))) {
  801.                     if ($saw_sigint) { iquit(); }
  802.                     while ($file_stack >= $disk_size) {
  803.                         say("$file_stack > $disk_size");
  804.                         yield();
  805.                     }
  806.                     file2ram($fn);
  807.  
  808.                 }
  809.  
  810.             }
  811.         }
  812.  
  813.         if (%large) {
  814.             while ($file_stack > 0) {
  815.                 say("$file_stack > 0");
  816.                 yield();
  817.             }
  818.             foreach my $fn (sort(keys(%large))) {
  819.                 $q->enqueue($fn);
  820.             }
  821.         }
  822.  
  823. #   use Digest::MD5;
  824. #   $md5 = Digest::MD5->new;
  825. #   $md5->add('foo', 'bar');
  826. #   $md5->add('baz');
  827. #   $digest = $md5->hexdigest;
  828. #   print "Digest is $digest\n";
  829.  
  830.         { lock($stopping);
  831.         $stopping = 1; }
  832.  
  833.         foreach my $t (threads->list()) { $t->join(); }
  834.         #say("All threads joined");
  835.  
  836.         p_gone();
  837.  
  838.         # Print the hash to the database file and close the log
  839.         hash2file();
  840.         logger('end', $n);
  841.     }
  842. }
RAW Paste Data