Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/perl
- # Accept two PDFs as input.
- # Base = the baseline PDF
- # Test = the new PDF that is being tested.
- # Turn the PDFs into images using Ghostscript,
- # then use imagemagick to compare and highlight differences
- # Turn the resulting images into a PDF and concatenate them.
- # Sample usage:
- # ./pdfcomp.pl -basefile LGNP.FIN.WFG1911.STND.OUTPDF.PDF -testfile LGNT.FIN.WFG1911.STND.OUTPDF.PDF
- use warnings;
- use strict;
- use Getopt::Long;
- use File::Path;
- use Parallel::ForkManager;
# ---------------------------------------------------------------------------
# Script-wide settings.  Most of these are defaults that the command line
# options (see the GetOptions call) may override.
# ---------------------------------------------------------------------------

# Fork manager limited to 3 processes.  Built with the direct
# Class->new(...) call; the indirect-object form ("new Class (...)") is
# ambiguous to the parser and is discouraged.
my $pm = Parallel::ForkManager->new( 3 );

my $maxFuzz        = 100;           # The biggest fuzz factor I'm willing to accept.
my $fileName_Base  = "NULL";        # Baseline file; "NULL" means "not supplied yet".
my $fileName_Test  = "NULL";        # File under test; "NULL" means "not supplied yet".
my $numPages       = 0;             # How many pages should I compare? If left at 0, I default to looking at all pages.
my $baseStart      = 1;             # What page of the base file should I start looking at?
my $testStart      = 1;             # What page of the test file should I start looking at?
my $noClean;                        # If set, keep the test and base images after comparing them.
my $noLog;                          # If set, don't write to the log file.
my $logFile        = "pdfcomp.log";
my $outFile        = "pdfcomp.pdf";
my $verbose;                        # If set, don't tell Ghostscript to be quiet (no "-q" flag).
my $fuzzFactor     = 0;             # Fuzz factor (percent) to tell ImageMagick to apply when doing the compare.
my $chunkSize      = 100;           # How many pages to compare at a time.
my $pixelThreshold = 0;             # How many pixels can be different in images before a diff PDF is created for the pages.
my $forkOkay;                       # Set by -fork.  Intended to enable forking for large files.
my $printHelp;                      # Print a "usage" statement describing inputs.
# Remember the raw command line so it can be echoed into the log later.
# The result is one leading space followed by every argument, each argument
# being followed by a single space (so an empty command line yields " ").
#
my $numArgs  = $#ARGV;    # index of the last argument, -1 when there are none
my $cmdParms = join( "", " ", map { $_ . " " } @ARGV );
# Set internal variables using the command line options.
#
# Options that hold numbers use numeric specifiers ("=i" integer, "=f"
# real number) so that Getopt::Long rejects a value such as "-numpages=abc"
# here, instead of letting it reach the numeric comparisons further down.
# -fuzz is a percentage and may be fractional, hence "=f".
#
# GetOptions returns false when it meets an unknown option or a bad value
# (it has already printed a warning by then), so stop rather than carry on
# with a half-parsed command line.
#
GetOptions
(
    "basefile=s"  => \$fileName_Base,
    "testfile=s"  => \$fileName_Test,
    "numpages=i"  => \$numPages,
    "basestart=i" => \$baseStart,
    "teststart=i" => \$testStart,
    "logname=s"   => \$logFile,
    "noclean"     => \$noClean,
    "nolog"       => \$noLog,
    "outfile=s"   => \$outFile,
    "verbose"     => \$verbose,
    "fuzz=f"      => \$fuzzFactor,
    "chunksize=i" => \$chunkSize,
    "pixelcnt=i"  => \$pixelThreshold,
    "fork"        => \$forkOkay,
    "help"        => \$printHelp
) or die "Invalid command line options. Run with -help for usage.\n";
# Print out the "usage" statement.
#
# It is shown when -help is given, and also when either input file is
# missing from the command line (the file names are then asked for
# interactively further down).  When the user explicitly asked for -help
# the script stops after printing: somebody asking for help does not expect
# a comparison to start.
#
if ( ( $printHelp ) or ( $fileName_Base eq "NULL" ) or ( $fileName_Test eq "NULL" ) )
{
    print
        "Usage: pdfcomp.pl -basefile=filename -testfile=filename -numpages=# -basestart=# \n",
        "-teststart=# -logname=filename -noclean -nolog -outfile=filename -verbose\n",
        "-chunksize=# -fuzz=# -pixelcnt=# -fork -help\n\n",
        "-basefile: The baseline file to test against. Prompted if missing.\n",
        "-testfile: The new file being tested. Prompted if missing.\n",
        "-numpages: The number of pages to compare. Optional, default is all pages.\n",
        "-basestart: Which page to start comparing in the base file. Optional, defaults to first page.\n",
        "-teststart: Which page to start comparing in the test file. Optional, defaults to first page.\n",
        "-logname: Over ride the default log file name. Optional, defaults to $logFile.\n",
        "-nolog: Don't print a log file. Optional.\n",
        "-noclean: Don't clean up the intermediate image and PDF files in the images directory. Optional.\n",
        "-outfile: Over ride the default pdf output file name. Optional, defaults to $outFile.\n",
        "-verbose: Print a lot more information to the screen and log file. Optional.\n",
        "-chunksize: Over ride the default number of pages per a loop. Optional, defaults to $chunkSize.\n",
        "-fuzz: Fuzz factor, in percent, given to ImageMagick when comparing pages.\n",
        " Optional, defaults to $fuzzFactor, values above $maxFuzz are lowered to $maxFuzz.\n",
        "-pixelcnt: When comparing pages, how many pixels must be different to have a diff PDF created.\n",
        " Optional, defaults to 0.\n",
        # "-fork: Okay to use forking to speed up processing large files. Optional.\n",
        "-help: Print this usage statement. Optional\n\n";

    exit( 0 ) if ( $printHelp );
}
# Check for the input BASE and TEST file.
#
# A name that still holds the "NULL" placeholder was not given on the
# command line, so it is asked for interactively.  Either way the file has
# to exist.  BASE is handled completely (prompt, then existence check)
# before TEST.
#
for my $input ( [ "BASE", \$fileName_Base ], [ "TEST", \$fileName_Test ] )
{
    my ( $label, $nameRef ) = @{ $input };

    if ( ${ $nameRef } eq "NULL" )
    {
        print "What is the name of the $label file? ";
        my $answer = <STDIN>;

        # <STDIN> gives undef at end of input (closed or exhausted stdin).
        # Without this check, chomp and -e would run on undef and the user
        # would only see warnings followed by a confusing "can't find" error.
        die "No $label file name was given and none could be read from standard input.\n"
            unless ( defined $answer );

        chomp $answer;
        ${ $nameRef } = $answer;
    }

    die "I can't find the input file ${ $nameRef }." unless ( -e ${ $nameRef } );
}
# Detect if input files are PDF or Postscript.
#
# detectFileType( $path, $label )
#   Classifies $path from its first line: "pdf" when that line contains
#   "PDF-", "ps" when it contains "PS-Adobe-".
#   $label ("BASE" or "TEST") is only used in messages.
#   Dies when the file cannot be opened or read, or matches neither marker.
#
#   The file is read directly instead of running "head -n 1 <name>" through
#   the shell, so names containing spaces or shell metacharacters are safe.
#   Only the first 1024 bytes are looked at; the first line is whatever
#   precedes the first newline in that range.
#
sub detectFileType
{
    my ( $path, $label ) = @_;

    open( my $fh, '<', $path ) or die "I can't open the $label file $path: $!\n";
    binmode( $fh );    # PDFs contain binary data, read raw bytes
    my $head = "";
    defined( read( $fh, $head, 1024 ) ) or die "I can't read the $label file $path: $!\n";
    close( $fh );

    # Keep only the first line.  An empty file yields no fields at all.
    my ( $lineOne ) = split( /\n/, $head, 2 );
    $lineOne = "" unless ( defined $lineOne );

    return "pdf" if ( $lineOne =~ /PDF-/ );
    return "ps"  if ( $lineOne =~ /PS-Adobe-/ );

    print "Unknown file type for the $label file $path: $lineOne\n";
    die ( "I give up.\n" );
}

# Each file is reported against its own name and first line.  (The failure
# branch for the BASE file used to set the TEST file's type and print the
# TEST file's first line.)
my $fileTypeBase = detectFileType( $fileName_Base, "BASE" );
my $fileTypeTest = detectFileType( $fileName_Test, "TEST" );
# Validation of input variables.
#
# Keep the fuzz factor inside 0 .. $maxFuzz, telling the user whenever the
# requested value had to be changed.
#
#print "fuzzFactor=$fuzzFactor\n";
if ( $maxFuzz < $fuzzFactor )
{
    # Too high: fall back to the largest value allowed.
    print "Fuzz Factor of $fuzzFactor is too high, I'm reseting it to $maxFuzz.\n";
    $fuzzFactor = $maxFuzz;
}
if ( 0 > $fuzzFactor )
{
    # Negative: fall back to no fuzz at all.
    print "Fuzz Factor of $fuzzFactor is too low, I'm reseting it to 0.\n";
    $fuzzFactor = 0;
}
# Test that I can write to the current directory.
#
# A small scratch file (named after this process id) is created, written,
# closed and removed again.  A lexical file handle and three-argument open
# are used, and close is checked too: buffered write errors such as a full
# disk only show up when the handle is closed.  The bare block keeps the
# helper variables out of the rest of the script.
#
{
    my $probeFile = "pdfcompTemp$$.txt";

    open( my $probe, '>', $probeFile )
        or die ( "ABEND: Error opening temp file, make sure you run this job from a file system you have permission to write to. Error: $!" );
    print {$probe} "Making sure I can create files on the current file system. If you see this than I can.\n";
    close( $probe )
        or die ( "ABEND: Error writing temp file, make sure you run this job from a file system you have permission to write to. Error: $!" );

    unlink $probeFile;
}
# Open the log file.
#
# Unless -nolog was given, STDOUT is piped through "tee" so everything that
# is printed appears on the screen and in the log file, and STDERR is
# pointed at the same pipe.  tee is started with the list form of a pipe
# open, which runs it directly without a shell, so a log file name with
# spaces or shell metacharacters is passed through untouched.  The "--"
# stops tee from reading a name that starts with "-" as an option.
#
unless ( $noLog )
{
    print "Opening log file $logFile.\n";
    open( STDOUT, '|-', 'tee', '--', $logFile ) or die "Failed to pipe STDOUT through tee to $logFile: $!";
    open( STDERR, '>&', \*STDOUT ) or die "Failed to dup STDERR: $!";
}
print "Command line parameters: $cmdParms\n";

# Unbuffer STDOUT from here on so progress messages show up immediately.
$| = 1;
# Look at input files, find page counts.
#
# countPages( $path, $type, $label )
#   Returns the number of pages in $path as a number.
#   $type  is "pdf" (count taken from the "Pages:" line printed by pdfinfo)
#          or "ps" (count of lines containing "%%Page:", via grep -c).
#   $label ("Base" or "Test") is used in the progress messages.
#   Dies when the helper program cannot be started, when no count can be
#   found in its output, or when $type is neither "pdf" nor "ps".
#
#   Both helper programs are started with the list form of a pipe open, so
#   no shell is involved and the file name needs no quoting.  The count is
#   only taken from a match that actually succeeded; relying on $1 after an
#   unchecked match would silently reuse the value of an earlier match.
#
sub countPages
{
    my ( $path, $type, $label ) = @_;
    my $count;

    if ( $type eq "pdf" )
    {
        print "Using pdfinfo to find the page count for $path, this could take a while...\n";
        open( my $info, '-|', 'pdfinfo', $path ) or die "I can't run pdfinfo on $path: $!\n";
        while ( my $line = <$info> )
        {
            # Keep reading to the end so pdfinfo is never cut off mid-output.
            if ( ( not defined $count ) and ( $line =~ /^Pages:\s*(\d+)/ ) )
            {
                $count = $1;
            }
        }
        close( $info );
        die "I could not find a page count in the pdfinfo output for $path.\n" unless ( defined $count );
        print "$label PDF page count: $count\n";
    }
    elsif ( $type eq "ps" )
    {
        print "Using grep to find the page count for $path, this could take a while...\n";
        open( my $grep, '-|', 'grep', '-c', '-e', '%%Page:', '--', $path ) or die "I can't run grep on $path: $!\n";
        my $grepOut = <$grep>;
        close( $grep );    # grep exits non-zero when the count is 0, which is not an error here
        if ( ( defined $grepOut ) and ( $grepOut =~ /^(\d+)/ ) )
        {
            $count = $1;
        }
        die "I could not get a page count from grep for $path.\n" unless ( defined $count );
        print "$label PS page count: $count\n";
    }
    else
    {
        print "Unknown file type, I'm quiting: $type.\n";
        die;
    }

    return $count + 0;
}

my $basePageCnt = countPages( $fileName_Base, $fileTypeBase, "Base" );
my $testPageCnt = countPages( $fileName_Test, $fileTypeTest, "Test" );

print "WARNING: Base and Test PDFs do not have the same number of pages: Base:$basePageCnt, Test:$testPageCnt.\n" unless ( $basePageCnt == $testPageCnt );
print "WARNING: BASE PDF has less pages then you asked to be compared.\n" unless ( $basePageCnt >= $numPages );
print "WARNING: TEST PDF has less pages then you asked to be compared.\n" unless ( $testPageCnt >= $numPages );
# Deciding the range of pages to compare.
#
# Starting at page S of an N page file leaves N - S + 1 pages.  The most
# that can be compared is the smaller of that figure for BASE and TEST.
# A request of 0 (or less) means "everything that is available", and a
# request for more than is available is cut back to what is available, so
# the stop pages computed below can never lie beyond the end of a file.
#
{
    die "The start pages must be 1 or higher (basestart=$baseStart, teststart=$testStart).\n"
        if ( ( $baseStart < 1 ) or ( $testStart < 1 ) );

    my $baseAvailable = $basePageCnt - $baseStart + 1;
    my $testAvailable = $testPageCnt - $testStart + 1;
    my $maxComparable = ( $baseAvailable < $testAvailable ) ? $baseAvailable : $testAvailable;

    die "Nothing to compare: a start page lies beyond the end of its file (Base: page $baseStart of $basePageCnt, Test: page $testStart of $testPageCnt).\n"
        if ( $maxComparable < 1 );

    if ( ( $numPages < 1 ) or ( $numPages > $maxComparable ) )
    {
        $numPages = $maxComparable;
    }
}
print "I have decided to compare $numPages pages.\n";
my $baseStop = $baseStart + ( $numPages - 1 );
my $testStop = $testStart + ( $numPages - 1 );
print "For the BASE file I will be looking at page $baseStart through $baseStop.\n";
print "For the TEST file I will be looking at page $testStart through $testStop.\n";
# Clean up any previous runs.
#
# The intermediate images live in ./images.  A directory left over from an
# earlier run is removed first, then a fresh one is created.  mkdir is
# checked: if the old directory could not be removed completely, or the new
# one cannot be created, later steps would otherwise work with stale or
# missing images.
#
my $imageDir = "images";
if ( -d "./$imageDir" )
{
    print "\nI found a previous $imageDir directory. I am deleting it and all the files in it...\n";
    rmtree( $imageDir, 0, 1);
}
mkdir( $imageDir ) or die "I can't create the $imageDir directory: $!\n";
# Convert the PDFs to Images
#
# Ghostscript is asked to be quiet (" -q ") unless -verbose was given, in
# which case only a blank separator is placed in the command.
my $quietFlag = $verbose ? " " : " -q ";
# Calculate how many pages each pass of the loop should look at.
#
#print "numPage:$numPages\n";
#print "chunkSize:$chunkSize\n";
if ( $numPages < $chunkSize )
{
    $chunkSize = $numPages;    # Make sure the chunk size isn't larger than the actual number of pages
}
if ( ( $fileTypeBase eq "ps" ) or ( $fileTypeTest eq "ps" ) )
{
    # One single chunk covering every page that is going to be compared.
    # NOTE(review): the image numbering below assumes Ghostscript honours
    # -dFirstPage/-dLastPage for Postscript input too - confirm for the
    # Ghostscript version in use when a start page other than 1 is given.
    $chunkSize = $numPages;
    print "* Postscript files cannot be compared in pieces, I have to compare the whole file.\n";
}
if ( $chunkSize < 1 )
{
    $chunkSize = 1;    # A chunk of 0 (or fewer) pages would never advance the loop below
}
if ( $verbose ) { print "chunkSize:$chunkSize\n"; }

# base start and stop
my $curBaseStart = $baseStart;                  if ( $verbose ) { print "curBaseStart:$curBaseStart\n"; }
my $curBaseEnd   = $baseStart + $chunkSize - 1; if ( $verbose ) { print "curBaseEnd:$curBaseEnd\n"; }

# test start and stop
my $curTestStart = $testStart;                  if ( $verbose ) { print "curTestStart:$curTestStart\n"; }
my $curTestEnd   = $testStart + $chunkSize - 1; if ( $verbose ) { print "curTestEnd:$curTestEnd\n"; }

# set up loop variables
my $loopCnt = 0;
my $diffCnt = 0;      # Count of the number of pages found with differences
my $pdfList = " ";    # List of the pdf pages that had differences, used by Ghostscript to create one big final pdf.

# ImageMagick programs used below.
# NOTE(review): these are fixed installation paths - confirm they exist on
# the machine this runs on.
my $imCompare = "/apps/ImageMagick-6.3.9/bin/compare";
my $imConvert = "/apps/ImageMagick-6.3.9/bin/convert";

# The fuzz factor ends up inside a shell command line, so only ever pass
# its numeric value.
my $fuzzPct = $fuzzFactor + 0;

# One pass per chunk.  The loop runs for as long as the chunk's FIRST page
# is inside the requested range; the chunk's last page is then cut back to
# the requested stop page.  (Testing the chunk's last page in the loop
# condition would drop the final, shorter chunk and leave the last pages
# uncompared.)
#
# The -fork option is not consulted: the two Ghostscript runs of a chunk are
# always started side by side with a plain fork().
#
while ( $curBaseStart <= $baseStop )
{
    $loopCnt++;
    print "\n***Loop $loopCnt\n";

    # Make sure the current chunk's stop pages are not greater than the
    # end page the user asked for.
    #
    if ( $curBaseEnd > $baseStop )
    {
        $curBaseEnd = $baseStop;
    }
    if ( $curTestEnd > $testStop )
    {
        $curTestEnd = $testStop;
    }

    # For BASE and TEST turn the current chunk of pages into images.
    # The commands are kept as argument lists and run without a shell, so
    # input file names with spaces or shell metacharacters work.
    # $quietFlag is either blank or " -q "; split turns it into zero or one
    # arguments.
    #
    my @gsCommon = ( "gs", split( ' ', $quietFlag ), "-dSAFER", "-dBATCH", "-dNOPAUSE", "-sDEVICE=png16m", "-dGraphicsAlphaBits=4" );
    my @gsBase = ( @gsCommon, "-dFirstPage#$curBaseStart", "-dLastPage#$curBaseEnd", "-sOutputFile=./$imageDir/base_chunk${loopCnt}_%05d.png", $fileName_Base );
    my @gsTest = ( @gsCommon, "-dFirstPage#$curTestStart", "-dLastPage#$curTestEnd", "-sOutputFile=./$imageDir/test_chunk${loopCnt}_%05d.png", $fileName_Test );

    # Fork so the two Ghostscripts can run together: the child converts
    # BASE, the parent converts TEST and then waits for the child.
    #
    if ( $verbose ) { print "Forking to run both ghostscript commands."; }
    my $pid = fork();
    if ( not defined $pid )
    {
        die ( "I was unable to fork: $!" );
    }
    elsif ( $pid == 0 )
    {
        # I am the child process.
        if ( $verbose )
        {
            print "\nConverting BASELINE PDFs to images: @gsBase\n";
        }
        else
        {
            print "\nRunning ghostscript on BASE file.";
        }
        my $gsStatus = system( @gsBase );
        if ( $verbose ) { print "ghostscript is finished converting BASE, chunk $loopCnt, pdf to images.\n"; }
        exit( $gsStatus == 0 ? 0 : 1 );    # let the parent see whether Ghostscript worked
    }
    else
    {
        # I am the parent process.
        if ( $verbose )
        {
            print "\nConverting TEST PDFs to images: @gsTest\n";
        }
        else
        {
            print "\nRunning ghostscript on TEST file.\n";
        }
        my $gsStatus = system( @gsTest );
        if ( $verbose ) { print "ghostscript is finished converting TEST, chunk $loopCnt, pdf to images.\n"; }
        print "WARNING: ghostscript reported a problem converting the TEST file, chunk $loopCnt.\n" if ( $gsStatus != 0 );

        waitpid( $pid, 0 );
        print "WARNING: ghostscript reported a problem converting the BASE file, chunk $loopCnt.\n" if ( $? != 0 );
    }

    # Loop through the images using ImageMagick to look for differences
    #
    my $chunkTotalCnt = $curBaseEnd - $curBaseStart + 1;    # For the last chunk this won't be the chunk size
    for my $imageCnt ( 1 .. $chunkTotalCnt )
    {
        my $numSufix = sprintf( "%05d", $imageCnt );    # recreate the file suffix numbers coming out of the ghostscript commands.

        # recreate the file names coming out of Ghostscript, make a similar name for the difference image
        #
        my $file_base = "./$imageDir/base_chunk${loopCnt}_" . $numSufix . ".png";
        my $file_test = "./$imageDir/test_chunk${loopCnt}_" . $numSufix . ".png";
        my $file_diff = "./$imageDir/diff_chunk${loopCnt}_" . $numSufix . ".png";

        my $curPagecnt = $curBaseStart + $imageCnt - 1;    # the current page's number in the full file, not just this chunk

        # A page without an image on either side cannot be compared.  It
        # must never be reported as "same" (two missing files would give
        # two equally empty checksums), so it is counted as a difference.
        #
        unless ( ( -e $file_base ) and ( -e $file_test ) )
        {
            $diffCnt++;
            print "$curPagecnt: WARNING: the page image is missing for BASE and/or TEST, counting the page as different.\n";
            next;
        }

        # Find the md5sum for the current base and test page.  The image
        # names are built by this script, so they are safe to hand to the
        # shell.
        #
        my ( $base_md5 ) = split( / /, `md5sum $file_base` );
        my ( $test_md5 ) = split( / /, `md5sum $file_test` );

        # If md5sum is the same, don't bother making a compare image.
        # When a checksum could not be obtained, fall through to the pixel
        # compare instead of guessing.
        #
        if ( ( defined $base_md5 ) and ( defined $test_md5 ) and ( $base_md5 eq $test_md5 ) )
        {
            print "$curPagecnt: Same md5sum, not comparing.\n";
            next;
        }

        my $imCommand = "$imCompare -metric AE -fuzz " . $fuzzPct . "% $file_base $file_test $file_diff";
        print "$curPagecnt: Comparing...";

        # compare writes the number of differing pixels to STDERR, so
        # STDERR is captured ("2>&1") and STDOUT is thrown away
        # ("1>/dev/null").
        my $imDiff = `$imCommand 2>&1 1>/dev/null`;
        chomp $imDiff;

        # Only trust the output when it starts with a number; anything else
        # is an error message from compare and must not count as "0 pixels".
        my $pixelsDiffer;
        if ( $imDiff =~ /\A\s*(\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)/ )
        {
            $pixelsDiffer = $1 + 0;
        }

        if ( defined $pixelsDiffer )
        {
            print "Pages differ by $imDiff pixels, ";
            if ( $pixelsDiffer <= $pixelThreshold )
            {
                # Up to and including the threshold is acceptable.
                print "within threshold of $pixelThreshold\n";
                unless ( $noClean )
                {
                    if ( $verbose ) { print " Deleting the compare image.\n"; }
                    unlink $file_diff;
                }
                next;
            }
        }
        else
        {
            print "compare failed ($imDiff), counting the page as different, ";
        }

        # The page differs (or could not be compared).
        $diffCnt++;
        if ( -e $file_diff )
        {
            print "creating diff pdf\n";
            if ( ( system( $imConvert, $file_diff, "$file_diff.pdf" ) == 0 ) and ( -e "$file_diff.pdf" ) )
            {
                $pdfList = $pdfList . " $file_diff.pdf";
            }
            else
            {
                print "WARNING: I could not turn $file_diff into a pdf.\n";
            }
        }
        else
        {
            print "no diff image was produced\n";
        }
    }    # end for (looping through current chunk's images)

    # Update the current chunk's counts so the loop can look at the next chunk.
    #
    $curBaseStart = $curBaseEnd + 1;
    $curTestStart = $curTestEnd + 1;
    $curBaseEnd += $chunkSize;
    $curTestEnd += $chunkSize;
}
print "$diffCnt of $numPages pages had differences.\n";

# Concatenating all the Pdfs into one
#
# $pdfList is a blank separated list of the per-page diff pdfs.  It is
# split into separate arguments and Ghostscript is run without a shell, so
# an output file name with spaces or shell metacharacters is passed
# through untouched.
#
if ( $diffCnt > 0 )
{
    my @diffPdfs = split( ' ', $pdfList );
    if ( @diffPdfs )
    {
        print "Creating difference pdf $outFile.\n";
        my @catCmd = ( "gs", "-q", "-sPAPERSIZE=letter", "-dNOPAUSE", "-dBATCH", "-sDEVICE=pdfwrite", "-sOutputFile=$outFile", @diffPdfs );
        if ( system( @catCmd ) != 0 )
        {
            print "WARNING: ghostscript could not create $outFile.\n";
        }
    }
    else
    {
        # Nothing to concatenate; running Ghostscript without input files
        # would only leave an unusable output file behind.
        print "WARNING: Differences were found, but there are no diff pdfs to put into $outFile.\n";
    }
}
else
{
    print "All pages match. I'm not creating a difference pdf.\n";
}
# Remove the whole intermediate image directory, with everything in it,
# unless the user asked to keep it (-noclean).
#
if ( not $noClean )
{
    print "\nRemoving intermediate files.\n";
    rmtree( $imageDir, 0, 1);
}

print "\nDone.\n";
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement