Advertisement
Guest User

Untitled

a guest
Sep 26th, 2017
48
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 16.64 KB | None | 0 0
  1. #!/usr/bin/perl
  2.  
  3. # Accept two PDFs as input.
  4. # Base = the baseline PDF
  5. # Test = the new PDF that is being tested.
  6. # Turn the PDFs into images using Ghostscript,
  7. # then use imagemagick to compare and highlight differences
  8. # Turn the resulting images into a PDF and concatenate them.
  9.  
  10. # Sample usage:
  11. #   ./pdfcomp.pl -basefile LGNP.FIN.WFG1911.STND.OUTPDF.PDF -testfile LGNT.FIN.WFG1911.STND.OUTPDF.PDF
  12.  
  13. use warnings;
  14. use strict;
  15. use Getopt::Long;
  16. use File::Path;
  17. use Parallel::ForkManager;
  # Process pool capped at three children.
  # NOTE(review): the start/finish calls that would use it are commented out
  # in the main loop, so this looks reserved for the -fork option -- confirm.
  my $pm = Parallel::ForkManager->new( 3 );

  # Defaults for everything that can be overridden from the command line.
  my $maxFuzz = 100;    # The biggest fuzz factor I'm willing to accept.
  my $fileName_Base   = "NULL";   # "NULL" means "not supplied yet", the user is prompted later.
  my $fileName_Test   = "NULL";   # "NULL" means "not supplied yet", the user is prompted later.
  my $numPages        = 0;    # How many pages should I compare? If value is left at 0, I default to looking at all pages.
  my $baseStart       = 1;    # What page of the base file should I start looking at?
  my $testStart       = 1;    # What page of the test file should I start looking at?
  my $noClean;                  # If set, keep the intermediate images instead of deleting them.
  my $noLog;                    # If set, don't tee the output into the log file.
  my $logFile         = "pdfcomp.log";
  my $outFile         = "pdfcomp.pdf";
  my $verbose;          # If set, then don't tell ghostscript to be quiet, don't add "-q" flag.
  my $fuzzFactor  = 0;  # how much of a fuzz factor to tell ImageMagick to apply when doing the compare
  my $chunkSize   = 100;# how many pages to compare at a time
  my $pixelThreshold = 0; # How many pixels can be different in images before a diff PDF is created for the pages.
  my $forkOkay;         # Use forking. Makes the standard output and log a mess, but can speed up compare on large files
  my $printHelp;        # Print a "usage" statement describing inputs
  36.  
  37.  
  # Take a snapshot of the raw command line first: GetOptions removes the
  # switches it recognises from @ARGV, and the log should show what was typed.
  # The result is a leading blank followed by every argument plus a blank.
  my $numArgs  = $#ARGV;
  my $cmdParms = join( "", " ", map { $_ . " " } @ARGV );

  # Map every command line switch onto the variable that stores its value.
  my @optionSpec =
  (
    "basefile=s"  => \$fileName_Base,
    "testfile=s"  => \$fileName_Test,
    "numpages=s"  => \$numPages,
    "basestart=s" => \$baseStart,
    "teststart=s" => \$testStart,
    "logname=s"   => \$logFile,
    "noclean"     => \$noClean,
    "nolog"       => \$noLog,
    "outfile=s"   => \$outFile,
    "verbose"     => \$verbose,
    "fuzz=s"      => \$fuzzFactor,
    "chunksize=s" => \$chunkSize,
    "pixelcnt=s"  => \$pixelThreshold,
    "fork"        => \$forkOkay,
    "help"        => \$printHelp,
  );
  GetOptions( @optionSpec );
  67.  
  68.  
  # Print out "usage" statement
  #
  # Shown when -help is given or when either input file is missing. The
  # script keeps running afterwards: missing file names are prompted for
  # further down.
  if ( ( $printHelp ) or ( $fileName_Base eq "NULL" ) or ( $fileName_Test eq "NULL" ) )
  {
      print
          "Usage: pdfcomp.pl -basefile=filename -testfile=filename -numpages=# -basestart=# \n",
          "-teststart=# -logname=filename -noclean -nolog -outfile=filename -verbose\n",
          "-chunksize=# -pixelcnt=# -fuzz=# -fork -help\n\n",
          "-basefile:   The baseline file to test against. Prompted if missing.\n",
          "-testfile:   The new file being tested. Prompted if missing.\n",
          "-numpages:   The number of pages to compare. Optional, default is all pages.\n",
          "-basestart:  Which page to start comparing in the base file. Optional, defaults to first page.\n",
          "-teststart:  Which page to start comparing in the test file. Optional, defaults to first page.\n",
          "-logname:    Over ride the default log file name. Optional, defaults to $logFile.\n",
          "-nolog:      Don't print a log file. Optional.\n",
          "-noclean:    Don't clean up the intermediate image and PDF files in the images directory. Optional.\n",
          "-outfile:    Over ride the default pdf output file name. Optional, defaults to $outFile.\n",
          "-verbose:    Print a lot more information to the screen and log file. Optional.\n",
          "-chunksize:  Over ride the default number of pages per a loop. Optional, defaults to $chunkSize.\n",
          "-pixelcnt:   When comparing pages, how many pixels must be different to have a diff PDF created.\n",
          "             Optional, defaults to 0.\n",
          # -fuzz was accepted on the command line but never described here.
          "-fuzz:       Fuzz factor, as a percentage, handed to ImageMagick when comparing pages.\n",
          "             Optional, defaults to 0, the highest value accepted is $maxFuzz.\n",
          # "-fork:       Okay to use forking to speed up processing large files. Optional.\n",
          "-help:       Print this usage statement. Optional\n\n";
  }
  92.  
  # Check for the input BASE and TEST file.
  #
  # Any name that was not given on the command line (still "NULL") is read
  # from STDIN. Each file must exist before the script carries on.
  for my $input ( [ "BASE", \$fileName_Base ], [ "TEST", \$fileName_Test ] )
  {
      my ( $label, $nameRef ) = @{ $input };

      if ( ${ $nameRef } eq "NULL" )
      {
          print "What is the name of the $label file? ";
          my $answer = <STDIN>;

          # <STDIN> yields undef at end of input (closed pipe, Ctrl-D); stop
          # with a clear message rather than chomping an undefined value.
          die "I reached the end of the input without getting a name for the $label file.\n"
              unless ( defined $answer );

          chomp $answer;
          ${ $nameRef } = $answer;
      }

      my $fileName = ${ $nameRef };
      die "I can't find the input file $fileName." unless ( -e $fileName );
  }
  110.  
  # Detect if input files are PDF or Postscript
  #

  # Return the first line of $path, or an empty string for an empty file.
  # The file is read directly so that names containing blanks or shell
  # characters are handled correctly.
  sub _first_line
  {
      my ( $path ) = @_;
      open( my $fh, '<', $path ) or die "I can't read the input file $path: $!\n";
      my $line = <$fh>;
      close( $fh );
      return defined $line ? $line : "";
  }

  # Map a header line onto "pdf", "ps" or "INVALID".
  sub _classify_first_line
  {
      my ( $line ) = @_;
      return "pdf" if ( $line =~ /PDF-/ );
      return "ps"  if ( $line =~ /PS-Adobe-/ );
      return "INVALID";
  }

  my $baseLineOne = _first_line( $fileName_Base );
  my $testLineOne = _first_line( $fileName_Test );
  my $fileTypeBase = _classify_first_line( $baseLineOne );
  my $fileTypeTest = _classify_first_line( $testLineOne );

  # Report the header of the file that actually failed. The original BASE
  # branch printed the TEST header and flagged the TEST type instead.
  if ( $fileTypeBase eq "INVALID" )
  {
    print "Unknown file type: $baseLineOne\n";
    die ( "I give up.\n" );
  }

  if ( $fileTypeTest eq "INVALID" )
  {
    print "Unknown file type: $testLineOne\n";
    die ( "I give up.\n" );
  }
  149.  
  150.  
  # Validation of input variables.
  #
  # Keep the fuzz factor inside 0 .. $maxFuzz and tell the user whenever the
  # requested value had to be changed.
  my $fuzzTooHigh = ( $fuzzFactor > $maxFuzz );
  if ( $fuzzTooHigh )
  {
    print "Fuzz Factor of $fuzzFactor is too high, I'm reseting it to $maxFuzz.\n";
    $fuzzFactor = $maxFuzz;
  }

  my $fuzzTooLow = ( $fuzzFactor < 0 );
  if ( $fuzzTooLow )
  {
    print "Fuzz Factor of $fuzzFactor is too low, I'm reseting it to 0.\n";
    $fuzzFactor = 0;
  }
  164.  
  # Test that I can write to the current directory
  #
  # A scratch file named after the process id is created and removed again.
  # Lexical handle and three-argument open, so the file name can never be
  # mistaken for an open mode.
  my $probeFile = "pdfcompTemp$$.txt";
  open ( my $probe, '>', $probeFile ) || die ( "ABEND: Error opening temp file, make sure you run this job from a file system you have permission to write to. Error: $!" );
  print {$probe} "Making sure I can create files on the current file system. If you see this than I can.\n";
  close ( $probe );
  unlink $probeFile;
  171.  
  172.  
  # Open the log file
  #
  # Unless -nolog was given, STDOUT is piped through "tee" so everything is
  # shown on screen and also written to the log; STDERR is joined to it.
  unless ( $noLog )
  {
      print "Opening log file $logFile.\n";
      # List form: tee gets the log name as one argument, with no shell in
      # between, so names containing blanks or shell characters are safe.
      open( STDOUT, '|-', 'tee', $logFile ) or die "Failed to dup STDOUT to $logFile: $!";
      open( STDERR, ">&STDOUT" ) or die "Failed to dup STDERR: $!";
  }

  # Unbuffer STDOUT before anything else is printed, so that output from
  # this script and from the external commands stays in order.
  $| = 1;

  print "Command line parameters: $cmdParms\n";
  184.  
  185.  
  # Look at input PDFs, find page counts.
  #

  # Run an external command and return the number captured by the first
  # output line that matches $pattern (capture group 1), or 0 when nothing
  # matches or the command cannot be started. The command is given as a
  # list, so no shell is involved and file names need no quoting.
  sub _count_from_command
  {
      my ( $pattern, @command ) = @_;
      my $count = 0;
      my $found = 0;

      if ( open( my $out, '-|', @command ) )
      {
          # Read to the end even after a hit so the command is not cut off.
          while ( my $line = <$out> )
          {
              next if ( $found );
              if ( $line =~ $pattern )
              {
                  $count = $1 + 0;
                  $found = 1;
              }
          }
          close( $out );
      }
      else
      {
          print "WARNING: I could not run $command[0]: $!\n";
      }

      print "WARNING: $command[0] did not report a page count.\n" unless ( $found );
      return $count;
  }

  my $basePageCnt = 0;
  my $testPageCnt = 0;
  if ( $fileTypeBase eq "pdf" )
  {
    print "Using pdfinfo to find the page count for $fileName_Base, this could take a while...\n";
    $basePageCnt = _count_from_command( qr/^Pages:\s*(\d+)/, 'pdfinfo', $fileName_Base );
    print "Base PDF page count: $basePageCnt\n";
  }
  elsif ( $fileTypeBase eq "ps" )
  {
    print "Using grep to find the page count of the Postscript, this could take a while...\n";
    $basePageCnt = _count_from_command( qr/^(\d+)/, 'grep', '-c', '%%Page:', $fileName_Base );
    print "Base PS page count: $basePageCnt\n";
  }
  else
  {
    print "Unknown file type, I'm quiting: $fileTypeBase.\n";
    die;
  }

  if ( $fileTypeTest eq "pdf" )
  {
    print "Using pdfinfo to find the page count for $fileName_Test, this could take a while...\n";
    $testPageCnt = _count_from_command( qr/^Pages:\s*(\d+)/, 'pdfinfo', $fileName_Test );
    # This line used to be labelled "Base", which made the log misleading.
    print "Test PDF page count: $testPageCnt\n";
  }
  elsif ( $fileTypeTest eq "ps" )
  {
    print "Using grep to find the page count for $fileName_Test, this could take a while...\n";
    $testPageCnt = _count_from_command( qr/^(\d+)/, 'grep', '-c', '%%Page:', $fileName_Test );
    print "Test PS page count: $testPageCnt\n";
  }
  else
  {
    print "Unknown file type, I'm quit: $fileTypeTest.\n";
    die;
  }

  # Page counts are numbers, so compare them numerically.
  print "WARNING: Base and Test PDFs do not have the same number of pages: Base:$basePageCnt, Test:$testPageCnt.\n" unless ( $basePageCnt == $testPageCnt );
  print "WARNING: BASE PDF has less pages then you asked to be compared.\n"   unless ( $basePageCnt >= $numPages );
  print "WARNING: TEST PDF has less pages then you asked to be compared.\n"   unless ( $testPageCnt >= $numPages );
  239.  
  240.  
  # Deciding the range of pages to compare.
  #
  # The number of pages that can be compared is limited by what is left in
  # each file after its start page. The earlier check added the start page
  # to the page count, so a range could run past the end of either file.

  # Page numbers start at 1; anything lower is pulled up to the first page.
  if ( $baseStart < 1 )
  {
      print "WARNING: BASE start page $baseStart is not valid, I'm starting at page 1.\n";
      $baseStart = 1;
  }
  if ( $testStart < 1 )
  {
      print "WARNING: TEST start page $testStart is not valid, I'm starting at page 1.\n";
      $testStart = 1;
  }

  my $basePagesLeft = $basePageCnt - $baseStart + 1;
  my $testPagesLeft = $testPageCnt - $testStart + 1;
  my $maxComparable = ( $basePagesLeft < $testPagesLeft ) ? $basePagesLeft : $testPagesLeft;

  die "There are no pages to compare: BASE has $basePageCnt pages (start page $baseStart), TEST has $testPageCnt pages (start page $testStart).\n"
      if ( $maxComparable < 1 );

  # 0 (or any other unusable value) means "as many pages as possible".
  if ( ( $numPages < 1 ) or ( $numPages > $maxComparable ) )
  {
      $numPages = $maxComparable;
  }

  print "I have decided to compare $numPages pages.\n";
  my $baseStop = $baseStart + ( $numPages - 1 );
  my $testStop = $testStart + ( $numPages - 1 );
  print "For the BASE file I will be looking at page $baseStart through $baseStop.\n";
  print "For the TEST file I will be looking at page $testStart through $testStop.\n";
  257.  
  # Clean up any previous runs
  #
  # The working directory for the page images is recreated from scratch.
  my $imageDir = "images";
  if ( -d "./$imageDir" )
  {
      print "\nI found a previous $imageDir directory. I am deleting it and all the files in it...\n";
      rmtree( $imageDir, 0, 1);
  }
  # Stop here if the directory cannot be created; every later step writes
  # into it and would otherwise fail with far less obvious messages.
  mkdir( $imageDir ) or die "I could not create the $imageDir directory: $!\n";
  267.  
  268.  
  # Convert the PDFs to Images
  #
  # Ghostscript is told to be quiet ("-q") unless -verbose was requested, in
  # which case only a blank is placed into the command.
  my $quietFlag = $verbose ? " " : " -q ";
  280.  
  # Calculate how many pages the current loop should look at
  #
  # $chunkSize is the number of pages converted and compared per loop. The
  # first chunk ends at "start + chunkSize - 1", so the size must be the full
  # page count and not one less, otherwise the last page is never looked at.
  if ( $chunkSize < 1 )
  {
    print "WARNING: A chunk size of $chunkSize is not usable, I'm resetting it to 1.\n";
    $chunkSize = 1;
  }

  if ( $numPages <  $chunkSize )
  {
    $chunkSize = $numPages;    #  Make sure the chunksize isn't larger then the actual number of pages
  }

  if ( ( $fileTypeBase eq "ps" ) or ( $fileTypeTest eq "ps" ) )
  {
    # One single chunk that covers every page selected for the compare.
    $chunkSize = $numPages;
    print "* Postscript files cannot be compared in pieces, I have to compare the whole file.\n";
  }

  # A chunk of less than one page would keep the loop from ever advancing.
  $chunkSize = 1 if ( $chunkSize < 1 );

  if ( $verbose ) { print "chunkSize:$chunkSize\n"; }

  # base start and stop
  my $curBaseStart   = $baseStart;                   if ( $verbose ) { print "curBaseStart:$curBaseStart\n"; }
  my $curBaseEnd   = $baseStart + $chunkSize - 1;        if ( $verbose ) { print "curBaseEnd:$curBaseEnd\n"; }

  # test start and stop
  my $curTestStart = $testStart;                     if ( $verbose ) { print "curTestStart:$curTestStart\n"; }
  my $curTestEnd   = $testStart + $chunkSize - 1;        if ( $verbose ) { print "curTestEnd:$curTestEnd\n"; }

  # set up loop variables
  my $loopCnt = 0;
  my $diffCnt  = 0;           # Count of the number of pages found with differences
  my $pdfList  = " ";     # List of the pdf pages that had differences, used by Ghostscript to create one big final pdf.
  310.  
  # Main loop: work through the selected page range one chunk at a time.
  # For every chunk both files are rendered to PNG images by Ghostscript
  # (BASE in a forked child, TEST in the parent), then each pair of page
  # images is compared, first by md5sum and, if needed, by ImageMagick.

  # Directory holding the ImageMagick "compare" and "convert" programs.
  my $imBinDir = "/apps/ImageMagick-6.3.9/bin";

  # Pages per loop; never less than one, or the loop could not advance.
  my $pagesPerChunk = ( $chunkSize > 0 ) ? $chunkSize : 1;

  # Ghostscript is started without a shell, so the quiet flag string is
  # turned into a list of arguments (empty when running verbose).
  my @gsQuiet = split( ' ', $quietFlag );

  # Loop on the START page. Looping on the end page skipped the final,
  # shorter chunk whenever the page count was not a multiple of the chunk size.
  while ( $curBaseStart <= $baseStop )
  {
    $loopCnt++;
    print "\n***Loop $loopCnt\n";

    # Work out where this chunk ends and make sure the stop pages are not
    # greater then the end page the user asked for.
    #
    $curBaseEnd = $curBaseStart + $pagesPerChunk - 1;
    $curBaseEnd = $baseStop if ( $curBaseEnd > $baseStop );
    $curTestEnd = $curTestStart + $pagesPerChunk - 1;
    $curTestEnd = $testStop if ( $curTestEnd > $testStop );

    # Build both Ghostscript commands as argument lists.
    #
    my @gsCommon = ( 'gs', @gsQuiet, '-dSAFER', '-dBATCH', '-dNOPAUSE', '-sDEVICE=png16m', '-dGraphicsAlphaBits=4' );
    my @gsBase = ( @gsCommon, "-dFirstPage#$curBaseStart", "-dLastPage#$curBaseEnd",
                   "-sOutputFile=./$imageDir/base_chunk${loopCnt}_%05d.png", $fileName_Base );
    my @gsTest = ( @gsCommon, "-dFirstPage#$curTestStart", "-dLastPage#$curTestEnd",
                   "-sOutputFile=./$imageDir/test_chunk${loopCnt}_%05d.png", $fileName_Test );

    # Fork the two Ghostscripts so they can run together.
    # For BASE and TEST turn the current chunk of pages into images
    #
    if ( $verbose ) { print "Forking to run both ghostscript commands."; }
    my $pid = fork();
    if ( not defined $pid )
    {
      die ( "I was unable to fork: $!")
    }
    elsif ( $pid == 0 )
    {
      # I am the child process.
      if ( $verbose )
      {
        print "\nConverting BASELINE PDFs to images: @gsBase\n";
      }
      else
      {
          print "\nRunning ghostscript on BASE file.";
      }
      my $gsStatus = system( @gsBase );
      if ( $verbose ) { print "ghostscript is finished converting BASE, chunk $loopCnt, pdf to images.\n"; }
      # Report success or failure to the parent through the exit code.
      exit( $gsStatus == 0 ? 0 : 1 );
    }
    else
    {
      # I am the parent process.
      if ( $verbose )
      {
        print "\nConverting TEST PDFs to images: @gsTest\n";
      }
      else
      {
          print "\nRunning ghostscript on TEST file.\n";
      }
      my $gsStatus = system( @gsTest );
      print "WARNING: ghostscript reported a problem converting the TEST file.\n" if ( $gsStatus != 0 );
      if ( $verbose ) { print "ghostscript is finished converting TEST, chunk $loopCnt, pdf to images.\n"; }
      waitpid($pid,0);
      print "WARNING: ghostscript reported a problem converting the BASE file.\n" if ( $? != 0 );
    }

    # Loop through the images using ImageMagick to look for differences
    #
    my $chunkTotalCnt = $curBaseEnd - $curBaseStart + 1; # For last chunk this won't be the chunk size

    for my $imageCnt ( 1 .. $chunkTotalCnt )
    {
      my $numSufix = sprintf( "%05d", $imageCnt ); # recreate the file sufix numbers coming out of the ghostscript commands.

      # recreate the file names coming out of Ghostscript, make a similar name for difference PDF
      #
      my $file_base = "./$imageDir/base_chunk${loopCnt}_" . $numSufix . ".png";
      my $file_test = "./$imageDir/test_chunk${loopCnt}_" . $numSufix . ".png";
      my $file_diff = "./$imageDir/diff_chunk${loopCnt}_" . $numSufix . ".png";

      my $curPagecnt = $curBaseStart + $imageCnt - 1;   # the current pages number in the full file, not just this chunk

      # Without both images there is nothing to compare. Two missing images
      # used to produce two empty checksums and were reported as "same".
      #
      unless ( ( -e $file_base ) and ( -e $file_test ) )
      {
        print "$curPagecnt: WARNING: I don't have an image of this page for both files, skipping.\n";
        next;
      }

      # Find the md5sum for the current base and test page
      #
      my ( $base_md5 ) = split( / /, `md5sum $file_base` );
      my ( $test_md5 ) = split( / /, `md5sum $file_test` );

      #  If md5sum is the same, don't bother making a compare image
      #  else, make the compare image.
      #
      if ( ( defined $base_md5 ) and ( defined $test_md5 ) and ( $base_md5 eq $test_md5 ) )
      {
        print  "$curPagecnt: Same md5sum, not comparing.\n";
      }
      else
      {
        my $imCommand = "$imBinDir/compare -metric AE -fuzz " . $fuzzFactor . "% $file_base $file_test $file_diff";
        print "$curPagecnt: Comparing...";
        # "compare -metric" writes the pixel count to STDERR: capture STDERR
        # and throw STDOUT away.
        my $imDiff = `$imCommand 2>&1 1>/dev/null`;
        chomp $imDiff;
        print "Pages differ by $imDiff pixels, ";

        # Pull the number out of the output. Anything that is not a number
        # (an error message) is treated as a difference, not as zero pixels.
        my $pixelsOff;
        if ( $imDiff =~ /^\s*(\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)/ )
        {
          $pixelsOff = $1 + 0;
        }

        # At or below the threshold counts as a match. This used to test
        # for "equal to" only, which defeated any threshold above 0.
        if ( ( defined $pixelsOff ) and ( $pixelsOff <= $pixelThreshold ) )
        {
            print "below threshold of $pixelThreshold\n";
            unless ( $noClean )
            {
              if ( $verbose ) {   print "     Deleting the compare image.\n"; }
              unlink $file_diff;
            }
        }
        else
        {
          $diffCnt++;
          if ( -e $file_diff )
          {
            print "creating diff pdf\n";
            my $convertStatus = system( "$imBinDir/convert", $file_diff, "$file_diff.pdf" );
            if ( $convertStatus == 0 )
            {
              $pdfList = $pdfList . " $file_diff.pdf";
            }
            else
            {
              print "WARNING: I could not turn $file_diff into a pdf.\n";
            }
          }
          else
          {
            print "but no compare image was created, I can't add this page to the diff pdf.\n";
          }
        } # end else ( imagemagic found image differences.
      } # end else (md5 didn't match
    } # end for (looping through current chunks images)


    # Move on to the next chunk; the end pages are worked out at the top of
    # the loop. The stray ForkManager "finish" call that used to follow has
    # been dropped: no ForkManager child is ever started in this loop.
    #
    $curBaseStart = $curBaseEnd + 1;
    $curTestStart = $curTestEnd + 1;
  }
  473.  
  print "$diffCnt of $numPages pages had differences.\n";

  # Concatenating all the Pdfs into one
  #
  # $pdfList holds the per page diff pdfs separated by blanks. Ghostscript
  # is started with an argument list (no shell), so an output file name
  # containing blanks or shell characters is passed on untouched.
  my @diffPdfs = split( ' ', $pdfList );
  if ( ( $diffCnt > 0 ) and ( @diffPdfs ) )
  {
      print "Creating difference pdf $outFile.\n";
      my @catCmd = ( 'gs', '-q', '-sPAPERSIZE=letter', '-dNOPAUSE', '-dBATCH', '-sDEVICE=pdfwrite',
                     "-sOutputFile=$outFile", @diffPdfs );
      my $catStatus = system( @catCmd );
      print "WARNING: ghostscript reported a problem creating $outFile.\n" if ( $catStatus != 0 );
  }
  elsif ( $diffCnt > 0 )
  {
      print "Pages differ, but I have no diff pdfs to put into $outFile.\n";
  }
  else
  {
      print "All pages match. I'm not creating a difference pdf.\n";
  }
  488.  
  489.  
  # Remove the whole working directory of intermediate files, unless the
  # user asked to keep it with -noclean.
  #
  if ( !$noClean )
  {
      print "\nRemoving intermediate files.\n";
      rmtree( $imageDir, 0, 1 );
  }

  print "\nDone.\n";
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement