Advertisement
Guest User

Untitled

a guest
Sep 26th, 2017
65
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 16.67 KB | None | 0 0
  1. #!/usr/bin/perl
  2.  
  3. # Keith Richards July 2010.
  4. # Accept two PDFs as input.
  5. # Base = the baseline PDF
  6. # Test = the new PDF that is being tested.
  7. # Turn the PDFs into images using Ghostscript,
  8. # then use imagemagick to compare and highlight differences
  9. # Turn the resulting images into a PDF and concatenate them.
  10.  
  11. # Sample usage:
  12. #   ./pdfcomp.pl -basefile LGNP.FIN.WFG1911.STND.OUTPDF.PDF -testfile LGNT.FIN.WFG1911.STND.OUTPDF.PDF
  13.  
  14. use warnings;
  15. use strict;
  16. use Getopt::Long;
  17. use File::Path;
  18. use Parallel::ForkManager;
  19. my $pm = new Parallel::ForkManager ( 3 );
  20.  
  21. my $maxFuzz = 100;    # The biggest fuzz factor I'm willing to accept.
  22. my $fileName_Base   = "NULL";  
  23. my $fileName_Test   = "NULL";  
  24. my $numPages        = 0;    # How many pages should I compare? If value is left at 0, I default to looking at all pages.
  25. my $baseStart       = 1;    # What page of the base file should I start looking at?
  26. my $testStart       = 1;    # What page of the test file should I start looking at?
  27. my $noClean;                  # Clean up the test and base images after comparing them?
  28. my $noLog;                    # Don't write to the log file. This isn't implemented yet.
  29. my $logFile         = "pdfcomp.log";
  30. my $outFile         = "pdfcomp.pdf";
  31. my $verbose;          # If set, then don't tell ghostscript to be quiet, don't add "-q" flag.
  32. my $fuzzFactor  = 0;  # how much of a fuzz factor to tell Imagemagic to apply when doing the compare
  33. my $chunkSize   = 100;# how many pages to compare at a time
  34. my $pixelThreshold = 0; # How many pixels can be differnt in images before a diff PDF is created for the pages.
  35. my $forkOkay;         # Use forking. Makes the standard output and log a mess, but can speed up compare on large files
  36. my $printHelp;        # Print a "usage" statement describing inputs
  37.  
  38.  
  39. # Grab the command line arguments so they can be printed to the log
  40. #
  41. my $cmdParms = " ";
  42. my $numArgs = $#ARGV;
  43. foreach my $argnum (0 .. $#ARGV) {
  44.    #print "@ARGV[$argnum]\n";
  45.    $cmdParms = $cmdParms . $ARGV[$argnum] . " ";
  46. }
  47.  
  48. # Set internal variables using the command line options
  49. #
  50. GetOptions
  51. (
  52.  "basefile=s"  => \$fileName_Base,
  53.  "testfile=s"  => \$fileName_Test,
  54.  "numpages=s"  => \$numPages,
  55.  "basestart=s" => \$baseStart,
  56.  "teststart=s" => \$testStart,
  57.  "logname=s"   => \$logFile,
  58.  "noclean"     => \$noClean,
  59.  "nolog"       => \$noLog,
  60.  "outfile=s"   => \$outFile,
  61.  "verbose"     => \$verbose,
  62.  "fuzz=s"      => \$fuzzFactor,
  63.  "chunksize=s" => \$chunkSize,
  64.  "pixelcnt=s"  => \$pixelThreshold,
  65.  "fork"        => \$forkOkay,
  66.  "help"        => \$printHelp
  67. );
  68.  
  69.  
  70. # Print out "usage" statement
  71. #
  72. if ( ( $printHelp ) or ( $fileName_Base eq "NULL" ) or ( $fileName_Test eq "NULL" ) )
  73. {
  74.     print "Usage: pdfcomp.pl -basefile=filename -testfile=filename -numpages=# -basestart=# \n";
  75.     print "-teststart=# -logname=filename -noclean -nolog -outfile=filename -verbose\n";
  76.     print "-chunksize=# -fork -help\n\n";
  77.     print "-basefile:   The baseline file to test against. Prompted if missing.\n";
  78.     print "-testfile:   The new file being tested. Prompted if missing.\n";
  79.     print "-numpages:   The number of pages to compare. Optional, default is all pages.\n";
  80.     print "-basestart:  Which page to start comparing in the base file. Optional, defaults to first page.\n";
  81.     print "-teststart:  Which page to start comparing in the test file. Optional, defaults to first page.\n";
  82.     print "-logname:    Over ride the default log file name. Optional, defaults to $logFile.\n";
  83.     print "-nolog:      Don't print a log file. Optional.\n";
  84.     print "-noclean:    Don't clean up the intermediate image and PDF files in the images directory. Optional.\n";
  85.     print "-outfile:    Over ride the default pdf output file name. Optional, defaults to $outFile.\n";
  86.     print "-verbose:    Print a lot more information to the screen and log file. Optional.\n";
  87.     print "-chunksize:  Over ride the default number of pages per a loop. Optional, defaults to $chunkSize.\n";
  88.     print "-pixelcnt:   When comparing pages, how many pixels must be different to have a diff PDF created.\n";
  89.     print "             Optional, defaults to 0.\n";
  90.     # print "-fork:       Okay to use forking to speed up processing large files. Optional.\n";
  91.     print "-help:       Print this usage statement. Optional\n\n";
  92. }
  93.  
  94. # Check for the input BASE and TEST file.
  95. #
  96. if ( $fileName_Base eq "NULL" )
  97. {
  98.     print "What is the name of the BASE file? ";
  99.     $fileName_Base = <STDIN>;
  100.     chomp $fileName_Base;
  101. }
  102. die "I can't find the input file $fileName_Base." unless ( -e $fileName_Base );
  103.  
  104. if ( $fileName_Test eq "NULL" )
  105. {
  106.     print "What is the name of the TEST file? ";
  107.     $fileName_Test = <STDIN>;
  108.     chomp $fileName_Test;
  109. }
  110. die "I can't find the input file $fileName_Test." unless ( -e $fileName_Test );
  111.  
  112. # Detect if input files are PDF or Postscript
  113. #
  114. my $baseHeadCommand = "head -n 1 $fileName_Base";
  115. my $testHeadCommand = "head -n 1 $fileName_Test";
  116. my $baseLineOne = `$baseHeadCommand`;
  117. my $testLineOne = `$testHeadCommand`;
  118. my $fileTypeBase = "NULL";
  119. my $fileTypeTest = "NULL";
  120.  
  121. if ( $baseLineOne =~ /PDF-/ )
  122. {
  123.   $fileTypeBase = "pdf";
  124. }
  125. elsif ( $baseLineOne =~ /PS-Adobe-/ )
  126. {
  127.   $fileTypeBase = "ps";
  128. }
  129. else
  130. {
  131.   $fileTypeTest = "INVALID";
  132.   print "Unknown file type: $testLineOne\n";
  133.   die ( "I give up.\n" );
  134. }
  135.  
  136. if ( $testLineOne =~ /PDF-/ )
  137. {
  138.   $fileTypeTest = "pdf";
  139. }
  140. elsif ( $testLineOne =~ /PS-Adobe-/ )
  141. {
  142.   $fileTypeTest = "ps";
  143. }
  144. else
  145. {
  146.   $fileTypeTest = "INVALID";
  147.   print "Unknown file type: $testLineOne\n";
  148.   die ( "I give up.\n" );
  149. }
  150.  
  151.  
  152. # Validation of input variables.
  153. #
  154. #print "fuzzFactor=$fuzzFactor\n";
  155. if ( $fuzzFactor > $maxFuzz )
  156. {
  157.   print "Fuzz Factor of $fuzzFactor is too high, I'm reseting it to $maxFuzz.\n";
  158.   $fuzzFactor = $maxFuzz;  
  159. }
  160. if ( $fuzzFactor < 0 )
  161. {
  162.   print "Fuzz Factor of $fuzzFactor is too low, I'm reseting it to 0.\n";
  163.   $fuzzFactor = 0;  
  164. }
  165.  
  166. # Test that I can write to the current directory
  167. #
  168. open ( TEMP, ">pdfcompTemp$$.txt" ) || die ( "ABEND: Error opening temp file, make sure you run this job from a file system you have permission to write to. Error: $!" );
  169. print TEMP "Making sure I can create files on the current file system. If you see this than I can.\n";
  170. close ( TEMP );
  171. unlink "pdfcompTemp$$.txt";
  172.  
  173.  
  174. # Open the log file
  175. #
  176. unless ( $noLog )
  177. {
  178.     print "Opening log file $logFile.\n";
  179.     open( STDOUT, "| tee $logFile" ) or die "Failed to dup STDOUT to $logFile: $!";
  180.     open( STDERR, ">&STDOUT" ) or die "Failed to dup STDERR: $!";
  181. }
  182.  
  183. print "Command line parameters: $cmdParms\n";;
  184. $|++;
  185.  
  186.  
  187. # Look at input PDFs, find page counts.
  188. #
  189. my $basePageCnt = 0;
  190. my $testPageCnt = 0;
  191. if ( $fileTypeBase eq "pdf" )
  192. {
  193.   print "Using pdfinfo to find the page count for $fileName_Base, this could take a while...\n";
  194.   my $pdfcommand = "pdfinfo $fileName_Base | grep '^Pages:'";
  195.   $basePageCnt = `$pdfcommand`;
  196.   $basePageCnt =~ /Pages:\s*(\d+)/;
  197.   $basePageCnt = $1;
  198.   print "Base PDF page count: $basePageCnt\n";
  199. }
  200. elsif ( $fileTypeBase eq "ps" )
  201. {
  202.   print "Using grep to find the page count of the Postscript, this could take a while...\n";
  203.   my $pdfcommand = "grep -c %%Page:  $fileName_Base";
  204.   $basePageCnt = `$pdfcommand`;
  205.   print "Base PS page count: $basePageCnt\n";
  206. }
  207. else
  208. {
  209.   print "Unknown file type, I'm quiting: $fileTypeBase.\n";
  210.   die;
  211. }
  212.  
  213. if ( $fileTypeTest eq "pdf" )
  214. {
  215.   print "Using pdfinfo to find the page count for $fileName_Test, this could take a while...\n";
  216.   my $pdfcommand = "pdfinfo $fileName_Test | grep '^Pages:'";
  217.   $testPageCnt = `$pdfcommand`;
  218.   $testPageCnt =~ /Pages:\s*(\d+)/;
  219.   $testPageCnt = $1;
  220.   print "Base PDF page count: $testPageCnt\n";
  221. }
  222. elsif ( $fileTypeTest eq "ps" )
  223. {
  224.   print "Using grep to find the page count for $fileName_Test, this could take a while...\n";
  225.   my $pdfcommand = "grep -c %%Page:  $fileName_Test";
  226.   $testPageCnt = `$pdfcommand`;
  227.   print "Test PS page count: $testPageCnt\n";
  228. }
  229. else
  230. {
  231.   print "Unknown file type, I'm quit: $fileTypeTest.\n";
  232.   die;
  233. }
  234.  
  235. $basePageCnt += 0;
  236. $testPageCnt += 0;
  237. print "WARNING: Base and Test PDFs do not have the same number of pages: Base:$basePageCnt, Test:$testPageCnt.\n" unless ( $basePageCnt eq $testPageCnt );
  238. print "WARNING: BASE PDF has less pages then you asked to be compared.\n"   unless ( $basePageCnt >= $numPages );
  239. print "WARNING: TEST PDF has less pages then you asked to be compared.\n"   unless ( $testPageCnt >= $numPages );
  240.  
  241.  
  242. # Deciding the range of pages to compare.
  243. #
  244. if ( ( $numPages == "0" ) or ( $numPages > ( $basePageCnt + $baseStart )  ) or ( $numPages > ( $testPageCnt + $testStart) ) )
  245. {
  246.     $numPages = $basePageCnt;
  247.     if ( $testPageCnt < $basePageCnt )
  248.     {
  249.         $numPages = $testPageCnt;
  250.     }  
  251. }
  252.  
  253. print "I have decided to compare $numPages pages.\n";
  254. my $baseStop = $baseStart + ( $numPages - 1 );
  255. my $testStop = $testStart + ( $numPages - 1 );         
  256. print "For the BASE file I will be looking at page $baseStart through $baseStop.\n";
  257. print "For the TEST file I will be looking at page $testStart through $testStop.\n";
  258.  
  259. # Clean up any previous runs
  260. #
  261. my $imageDir = "images";
  262. if ( -d "./$imageDir" )
  263. {
  264.     print "\nI found a previous $imageDir directory. I am deleting it and all the files in it...\n";
  265.     rmtree( $imageDir, 0, 1);
  266. }
  267. mkdir "$imageDir";
  268.  
  269.  
  270. # Convert the PDFs to Images
  271. #
  272. my $quietFlag;
  273. if ( $verbose )
  274. {
  275.   $quietFlag = " ";             # if "verbose" then don't pass in the "be quit" flag to ghost script
  276. }
  277. else
  278. {
  279.   $quietFlag = " -q ";
  280. }
  281.  
  282. # Calculate how many pages the current loop should look at
  283. #
  284. #print "numPage:$numPages\n";
  285. #print "chunkSize:$chunkSize\n";
  286. if ( $numPages <  $chunkSize )
  287. {
  288.   $chunkSize = $numPages - 1;    #  Make sure the chunksize isn't larger then the actual number of pages
  289. }
  290.  
  291. if ( ( $fileTypeBase eq "ps" ) or ( $fileTypeTest eq "ps" ) )
  292. {
  293.   $chunkSize = $testPageCnt - 1;
  294.   print "* Postscript files cannot be compared in pieces, I have to compare the whole file.\n";
  295. }
  296.  
  297. if ( $verbose ) { print "chunkSize:$chunkSize\n"; }
  298.  
  299. # base start and stop
  300. my $curBaseStart   = $baseStart;                   if ( $verbose ) { print "curBaseStart:$curBaseStart\n"; }
  301. my $curBaseEnd   = $baseStart + $chunkSize - 1;        if ( $verbose ) { print "curBaseEnd:$curBaseEnd\n"; }
  302.  
  303. # test start and stop
  304. my $curTestStart = $testStart;                     if ( $verbose ) { print "curTestStart:$curTestStart\n"; }
  305. my $curTestEnd   = $testStart + $chunkSize - 1;        if ( $verbose ) { print "curTestEnd:$curTestEnd\n"; }
  306.  
  307. # set up loop variables
  308. my $loopCnt = 0;
  309. my $diffCnt  = 0;           # Count of the number of pages found with differences
  310. my $pdfList  = " ";     # List of the pdf pages that had differences, used by Ghostscript to create one big final pdf.
  311.  
  312. while ( $curBaseEnd <= $baseStop )
  313. {
  314.   $loopCnt++;
  315.   print "\n***Loop $loopCnt\n";
  316.  
  317.  
  318.   # Make sure the current chunks stop pages are not greater then the
  319.   # end page the user asked for.
  320.   #
  321.   if ( $curBaseEnd > $baseStop )
  322.   {
  323.     $curBaseEnd = $baseStop;
  324.   }
  325.   if ( $curTestEnd > $testStop )
  326.   {
  327.     $curTestEnd = $testStop;
  328.   }
  329.    
  330.   # Fork to run the two Ghost scripts in seperate threads
  331.   #
  332. #  if ( $forkOkay )
  333.   # {
  334.     # Forks and returns the pid for the child:
  335. #    my $pid = $pm->start and next;
  336. #  }
  337.  
  338.  
  339.   # For the two Ghostscripts so the can run together.
  340.   # For BASE and TEST turn the current chunk of pages into images
  341.   #
  342.   if ( $verbose ) { print "Forking to run both ghostscript commands."; }
  343.   my $pid = fork();
  344.   if ( not defined $pid )
  345.   {
  346.     die ( "I was unable to fork: $!")
  347.   }
  348.   elsif ( $pid == 0 )
  349.   {
  350.     # I am the child process.
  351.     my $gsCommand = "gs" . $quietFlag . "-dSAFER -dBATCH -dNOPAUSE -sDEVICE=png16m -dGraphicsAlphaBits=4 -dFirstPage#$curBaseStart -dLastPage#$curBaseEnd -sOutputFile=./$imageDir/base_chunk${loopCnt}_%05d.png $fileName_Base";
  352.     if ( $verbose )
  353.     {
  354.       print "\nConverting BASELINE PDFs to images: $gsCommand\n";
  355.     }
  356.     else
  357.     {
  358.         print "\nRunning ghostscript on BASE file.";
  359.     }
  360.     system "$gsCommand";
  361.     if ( $verbose ) { print "ghostscript is finished converting BASE, chunk $loopCnt, pdf to images.\n"; }
  362.     exit(0);  
  363.   }
  364.   else
  365.   {
  366.     # I am the parent process.
  367.     my $gsCommand = "gs" . $quietFlag . "-dSAFER -dBATCH -dNOPAUSE -sDEVICE=png16m -dGraphicsAlphaBits=4 -dFirstPage#$curTestStart -dLastPage#$curTestEnd -sOutputFile=./$imageDir/test_chunk${loopCnt}_%05d.png $fileName_Test";
  368.     if ( $verbose )
  369.     {
  370.       print "\nConverting TEST PDFs to images: $gsCommand\n";
  371.     }
  372.     else
  373.     {
  374.         print "\nRunning ghostscript on TEST file.\n";
  375.     }
  376.     system "$gsCommand";
  377.     if ( $verbose ) { print "ghostscript is finished converting TEST, chunk $loopCnt, pdf to images.\n"; }
  378.     waitpid($pid,0);
  379.   }  
  380.  
  381.   # if ( $forkOkay )
  382.   # {
  383.   # $pm->finish; # Terminates the child process  
  384.   #}
  385.  
  386.  
  387.   # Loop through the images using ImageMagick to look for differences
  388.   #  
  389.   my $imageCnt = 0;     # Loop counter  
  390.   my $chunkTotalCnt = $curBaseEnd - $curBaseStart + 1; # For last chunk this won't be the chunk size
  391.  
  392.   for ( $imageCnt = 1; $imageCnt <= $chunkTotalCnt; $imageCnt++ )
  393.   {
  394.     my $numSufix = sprintf( "%05d", $imageCnt ); # recreate the file sufix numbers coming out of the ghostscript commands.
  395.    
  396.     # recreate the file names coming out of Ghostscript, make a similar name for difference PDF
  397.     #
  398.     my $file_base = "./$imageDir/base_chunk${loopCnt}_" . $numSufix . ".png";
  399.     my $file_test = "./$imageDir/test_chunk${loopCnt}_" . $numSufix . ".png";
  400.     my $file_diff = "./$imageDir/diff_chunk${loopCnt}_" . $numSufix . ".png";
  401.    
  402.     # Find the md5sum for the current base and test page
  403.     #
  404.     my $md5Command = "md5sum $file_base";
  405.     my $base_md5 = `$md5Command`;  
  406.     my @base_md5_array = split(/ /, $base_md5 );   
  407.     #print "base_md5: $base_md5_array[0]\n";       
  408.    
  409.        $md5Command = "md5sum $file_test";
  410.     my $test_md5 = `$md5Command`;
  411.     my @test_md5_array = split(/ /, $test_md5 );
  412.     #print "test_md5: $test_md5_array[0]\n";
  413.    
  414.     #  If md5sum is the same, don't bother making a compare image
  415.     #  else, make the compare image.
  416.     #
  417.     my $curPagecnt = $curBaseStart + $imageCnt - 1;   # the current pages number in the full file, not just this chunk
  418.    
  419.     if ( $base_md5_array[0] eq $test_md5_array[0] )
  420.     {
  421.       #print "$i: Base page $numBase and test page $numTest have same md5sum, skipping compare.\n";
  422.       #if ( $verbose )
  423.       #{
  424.         print  "$curPagecnt: Same md5sum, not comparing.\n";
  425.       #}
  426.       #print LOG "$i: Same (md5sum)\n";
  427.     }
  428.     else
  429.     {  
  430.       #my $imCommand = "/apps/ImageMagick-6.3.9/bin/compare -metric AE $file_base $file_test -highlight-color Red $file_diff";
  431.       my $imCommand = "/apps/ImageMagick-6.3.9/bin/compare -metric AE -fuzz " . $fuzzFactor . "% $file_base $file_test $file_diff";
  432.       #print "$i: Comparing: $imCommand\n";
  433.       print "$curPagecnt: Comparing...";
  434.       my $imDiff = `$imCommand 2>&1 1>/dev/null`;       # grab the STDOUT get the number of different pixels. Throw the STDERR away.
  435.       chomp $imDiff;
  436.       #system  "$imCommand";
  437.       print "Pages differ by $imDiff pixels, ";
  438.  
  439.       if ( $imDiff == $pixelThreshold )
  440.       {
  441.           print "below threshold of $pixelThreshold\n";
  442.           unless ( $noClean )
  443.           {
  444.             if ( $verbose ) {   print "     Deleting the compare image.\n"; }
  445.             unlink $file_diff unless ( $noClean );
  446.           }
  447.       }
  448.       else
  449.       {
  450.         $diffCnt++;
  451.         print "creating diff pdf\n";
  452.         my $convertCmd = "/apps/ImageMagick-6.3.9/bin/convert $file_diff $file_diff.pdf";
  453.         #print $convertCmd . "\n";
  454.         system ( $convertCmd );
  455.         $pdfList = $pdfList . " $file_diff.pdf";
  456.         #print LOG "$i: differnt. (pixel cnt=$imDiff)\n";
  457.       } # end else ( imagemagic found image differences.
  458.     } # end else (md5 didn't match
  459.   } # end for (looping through current chunks images)
  460.  
  461.  
  462.   # Update the current chunks counts so the loop can look at the next chunk.
  463.   #
  464.   $curBaseStart = $curBaseEnd + 1;
  465.   $curTestStart = $curTestEnd + 1;
  466.   $curBaseEnd += $chunkSize;
  467.   $curTestEnd += $chunkSize;
  468.  
  469.  #if ( $forkOkay )
  470.   #{
  471.     $pm->finish; # Terminates the child process  
  472.   #}
  473. }
  474.  
  475. print "$diffCnt of $numPages pages had differences.\n";
  476.  
  477. # Concatenating all the Pdfs into one
  478. #
  479. if ( $diffCnt > 0 )
  480. {
  481.     print "Creating difference pdf $outFile.\n";
  482.     my $catCmd = "gs -q -sPAPERSIZE=letter -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -sOutputFile=$outFile $pdfList";
  483.     system ( $catCmd );
  484. }
  485. else
  486. {
  487.     print "All pages match. I'm not creating a difference pdf.\n";
  488. }
  489.  
  490.  
  491. # Delete the base and test images for current page
  492. #  
  493. unless ( $noClean )
  494. {
  495.     print "\nRemoving intermediate files.\n";
  496.     rmtree( $imageDir, 0, 1);      
  497. }
  498.  
  499. print "\nDone.\n";
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement