Advertisement
Guest User

lineprinter.pl

a guest
Mar 27th, 2014
72
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 12.70 KB | None | 0 0
  1. package PDF::PageLines;
  2.  
  3. use 5.014;
  4. use warnings;
  5. use base qw(CAM::PDF::GS);
  6. use List::Util qw(sum);
  7.  
  8. our $VERSION = '1.00';
  9.  
  10. # More exact matches first
  11. # These are used to group hunks of data into separate distinct sections
  12. my @boxes = (
  13.     #                                       Row  Col
  14.     # attr          X min/max   Y min/max   axix axis
  15.     #[ top       =>   75,480,    670,740,    'x', 'y' ],
  16.  
  17.     # Middle columns
  18.     # attr          X min/max   Y min/max
  19.     #[ left      =>   80,160,    550,660,    'x', 'y' ],
  20.     #[ center    =>  160,380,    550,660,    'x', 'y' ],
  21.     #[ right     =>  385,475,    550,660,    'x', 'y' ],
  22.     #[ bc_right  =>  480,570,    550,705,    'y', 'x' ],
  23.  
  24.     # Bottom barcode
  25.     #[ bc_bottom =>  290,550,     35,185,    'x', 'y' ],
  26.  
  27.     # Fine print
  28.     #[ fineprint =>    0,275,     50,350,    'x', 'y' ],
  29. );
  30. my $font_point = 12;
  31. my $spaces_per_fullsize = 3;
  32.  
  33.  
  34.  
  35. sub new {
  36.     my ($pkg, @args) = @_;
  37.     my $self = $pkg->SUPER::new(@args);
  38.     $self->{refs}{text} = [];
  39.     $self->{locations}  = \@boxes;
  40.     return $self;
  41. }
  42.  
  43. sub getTextBlocks {
  44.     my ($self) = @_;
  45.     return @{$self->{refs}->{text}};
  46. }
  47.  
  48. sub renderText
  49. {
  50.    my $self = shift;
  51.    my $string = shift;
  52.    my $width = shift;
  53.  
  54.    #my ($xu, $yu) = $self->textLineToUser(0, 0);
  55.    my ($x, $y) = $self->textToDevice(0, 0);
  56.    my ($x2, $y2) = $self->textToDevice(1, 0);
  57.    # Compute angles and deltas
  58.    my $xa = cos( atan2($y2-$y, $x2-$x) );
  59.    my $ya = sin( atan2($y2-$y, $x2-$x) );
  60.    my $xd = $xa*$width*$self->{Tfs};
  61.    my $yd = $ya*$width*$self->{Tfs};
  62.  
  63.    my $location;
  64.    for my $location_coords (@{$self->{locations}}) {
  65.        next unless $x > $location_coords->[1]-5 and ($x+$xd) < $location_coords->[2]+5;
  66.        next unless $y > $location_coords->[3]-5 and ($y+$yd) < $location_coords->[4]+5;
  67.        $location = $location_coords->[0];
  68.        last;
  69.    }
  70.  
  71.    my $text = length($string)|3 && ord($string)>127 ? join('', map {chr(288-ord)} split('',$string)) : $string;
  72. #say "$text $self->{Tf} $self->{Tfs} $self->{Tc} $self->{Tw} $self->{Tz}";
  73. #say join ' / ', keys %{$self};
  74.  
  75.    #my $a = (atan2($y2-$y, $x2-$x)/(4*atan2(1, 1)))*180;
  76.    #printf "(%7.2f, %7.2f) W:%6.2f L:%10s %s (%s)\n", $x,$y, $xd, $location//'UNKNOWN', $text, $string;
  77.    push $self->{refs}{text}, {
  78.        x => $x,     xd => $xd,
  79.        y => $y,     yd => $yd,
  80.        fontsize => $self->{Tfs},
  81.        location => $location // 'UNKNOWN',
  82.        text     => $text,
  83.        #text     => $self->{Tf} eq 'F3' ? decode( 'UTF-16BE', $string  )  : $string,
  84.        #font     => $self->{Tf},
  85.    };
  86.    return;
  87. }
  88.  
  89. # Used location data to build string of group members, assuming high Y locality
  90. sub _build_line {
  91.     my ($xy,$objects) = @_;
  92.     my $xyd = $xy . 'd';
  93.     my @strings;
  94.     my $last;
  95.     for my $o ( @$objects ) {
  96.         # Add any needed whitespace
  97.         if ( $last ) {
  98.             my $last_end = $last->{$xy} + $last->{$xyd};
  99.             my $spaces = ($o->{$xy} - $last_end) / (($last->{$xyd} / $o->{fontsize}) * $font_point * $spaces_per_fullsize);
  100. #warn "$last_end - $o->{$xy} = " .($o->{$xy} - $last_end).", $last->{$xyd} / ($o->{fontsize} ) * 14) = $spaces     | $o->{text}";
  101.             #my $spaces = (($o->{$xy} - $last_end) / ($o->{fontsize}*16));
  102. #warn "$last_end - $o->{$xy} = " .($o->{$xy} - $last_end)." / ($o->{fontsize}/3) = $spaces";
  103.             #say "$last->{x} + $last->{xd} = $last_end <? $o->{x} ($o->{fontsize})" if $spaces < 0;
  104.             # If there's at least half a space, make a space
  105.             $spaces = int($spaces) || int($spaces+0.5);
  106.             push @strings, ' ' x $spaces if $spaces;
  107.             #say "$spaces spaces between $last->{text} and $o->{text}" if $spaces < 0;
  108.             #if $last_end
  109.         }
  110.         push @strings, $o->{text};
  111.         $last = $o;
  112.     }
  113.  
  114.     #say "BUILT LINE ($last->{location}): " . join '', @strings;
  115.     return join '', @strings;
  116. }
  117.  
  118. # Pre-sort to get sorted groups
  119. sub _group {
  120.     my ($items, $variance, $key) = @_;
  121.     my @groups;
  122.  
  123.     sub group_add_member {
  124.         $_[0] ||= {members=>[]};
  125.         push $_[0]{members}, $_[1];
  126.         $_[0]{count} = @{$_[0]{members}};
  127.         $_[0]{avg} = (sum map { $_[2] ? $_[1]{$_[2]} : $_[1] } @{$_[0]{members}}) / $_[0]{count};
  128.         #say "Computed $_[0]{avg} from (" . join(", ", map {$_[2]?$_[1]{$_[2]}:$_[1]} values $_[0]{members}) . ")";
  129.         return $_[0];
  130.     }
  131.  
  132.     for my $item (@$items) {
  133.         my $val = $key ? $item->{$key} : $item;
  134.         my $added;
  135.         for my $group (@groups) {
  136.             #say "$group->{avg} $val";
  137.             next unless abs($group->{avg} - $val) < $variance;
  138.             group_add_member $group, $item, $key;
  139.             $added = 1;
  140.         }
  141.         next if $added;
  142.         push @groups, group_add_member undef, $item, $key;
  143.     }
  144.  
  145.     #say Dumper \@groups;
  146.     return @groups;
  147. }
  148.  
  149.  
  150. sub get_lines {
  151.     my ($self,%P) = @_;
  152.     my %lrows = %{$self->get_pdf_section_groups||{}};
  153.     my @all_text_lines = map { map {$_->{text_line}} @{$_} } values %lrows;
  154.     #use Data::Dumper; say Dumper \%lrows;
  155.     die 'No grouped location rows found!' unless %lrows;
  156.     return @all_text_lines;
  157. }
  158.  
  159. sub get_pdf_section_groups {
  160.     my ($self,$want,$ignore) = @_;
  161.     $want = qr/$want/ if $want and ref $want ne 'Regexp';
  162.     $ignore = qr/$ignore/ if $ignore and ref $ignore ne 'Regexp';
  163.     my @texts = $self->getTextBlocks;
  164.  
  165.     my %locations;
  166.     for my $tblock ( sort { $b->{location} cmp $a->{location} } @texts ) {
  167.         next if $want and not $tblock->{location} =~ $want;
  168.         next if $ignore and $tblock->{location} =~ $ignore;
  169.         my $group = $locations{$tblock->{location}}||=[]; # Make array if there isn't one
  170.         push $group, $tblock;
  171.     }
  172.  
  173.     #say "Grouping headers for page $page";
  174.     my %lrows;
  175.     push $self->{locations}, [ UNKNOWN => 0,0,0,0,'x','y' ] unless grep {$_->[0] eq 'UNKNOWN'} @{$self->{locations}};
  176.     for my $location_data ( @{$self->{locations}} ) {
  177.         my ($name,$x1,$x2,$y1,$y2,$row_axis,$col_axis) = @$location_data;
  178.         next unless $locations{$name};
  179.         $row_axis ||= 'x';
  180.         $col_axis ||= 'y';
  181.         my @rows = sort { $b->{avg} <=> $a->{avg} } _group($locations{$name}, 3, $col_axis);
  182.         for my $row ( @rows ) {
  183.             $row->{members} = [ sort { $a->{$row_axis} <=> $b->{$row_axis} } @{$row->{members}} ];
  184.             $row->{text_line} = _build_line( $row_axis, $row->{members} );
  185.         }
  186.         $lrows{$name} = \@rows;
  187.     }
  188.  
  189.     return \%lrows;
  190. }
  191.  
  192.  
  193.  
  194. 1;
  195.  
  196. package PDF::Lines;
  197. use 5.014;
  198. use warnings;
  199. use base qw(CAM::PDF);
  200. use Data::Dumper;
  201. use Try::Tiny;
  202. use File::Slurp qw(read_file);
  203. use File::Temp;
  204. use List::Util qw(sum);
  205. #use PDF::PageLines;
  206.  
  207. my @AUTOTYPECHAIN = qw( TF::PDF::TMTicket TF::PDF::TMSeasonTicket );
  208.  
  209. sub new {
  210.     my ($pkg, @args) = @_;
  211.     my $self;
  212.     my $gs;
  213.     $gs = pop @args if @args>1 and $args[-1] =~ /ghostscript_rewrite/i;
  214.     try {
  215.         $self = $pkg->SUPER::new(@args) or die "Unable to read PDF: $CAM::PDF::errstr";
  216.         my $test_processor = $self->getPageContentTree(1)->traverse('PDF::PageLines');
  217.         # Catch errors early, attempt ghostscript fix if we can
  218.         my $hastext = $test_processor->getTextBlocks; # Count
  219.         die "No text found to proces in PDF" unless $hastext;
  220.     }
  221.     catch {
  222.         die $_ unless not $gs and /Incorrect password|The document cannot be decrypted|No text found|Unrecognized type in parseAny/i;
  223.         warn $_ if grep { $args[$_] eq 'debug' and $args[+$_] } 0 .. $#args;
  224.         my $pdf_data = read_file( shift @args );
  225.         my ($ofh, $outfile) = File::Temp::tempfile( 'gs_pdfconv_XXXX', DIR => '/var/tmp/TF' );
  226.         open( my $gsfh, '|-', "gs -q -dNOPAUSE -sDEVICE=pdfwrite -dCompatibilityLevel=1.3 -sOUTPUTFILE=$outfile -dBATCH -") or die $!;
  227.         print $gsfh $pdf_data or die "Unable to push all data through GS for PDF conversion! $!";
  228.         close($gsfh);
  229.         $self = $pkg->SUPER::new( $outfile, @args, 'ghostscript_rewrite' ) or die "Unable to read GS converted PDF: $CAM::PDF::errstr";;
  230.         unlink($outfile);
  231.     };
  232.     $self->{pdfinfo} = undef;
  233.     $self->{pages}   = [];
  234.     return $self;
  235. }
  236.  
  237. sub recurse_values {
  238.     my $self = shift;
  239.     my @returns;
  240.     for ( @_ ) {
  241.         if ( ref eq 'ARRAY' ) {
  242.             #say 'recurse A';
  243.             push @returns, [$self->recurse_values( @$_ )];
  244.         }
  245.         elsif ( ref eq 'HASH' ) {
  246.             #say 'recurse H';
  247.             my $h = $_;
  248.             push @returns, { map { $_ => $self->recurse_values( $h->{$_} ) } keys $h };
  249.         }
  250.         elsif ( ref eq 'CAM::PDF::Node' ) {
  251.             #say 'node';
  252.             push @returns, $self->recurse_values( $self->getValue( $_ ) );
  253.         }
  254.         else {
  255.             #say "val $_";
  256.             push @returns, $_;
  257.         }
  258.     }
  259.     return @returns;
  260. }
  261.  
  262. sub pdfinfo {
  263.     my $self = shift;
  264.     return $self->{pdfinfo} if $self->{pdfinfo};
  265.     return $self->{pdfinfo} = {} unless $self->{trailer}{Info};
  266.     my $pdfinfo = $self->getValue( $self->{trailer}{Info} );
  267.     $pdfinfo->{$_} = $self->getValue( $pdfinfo->{$_} ) for keys $pdfinfo;
  268.     return $self->{pdfinfo} = $pdfinfo;
  269. }
  270.  
  271. sub getPageAnnotations {
  272.     my $self = shift;
  273.     my $page = shift or die 'Page for annotations needed!';
  274.     my $raw_annots = $self->getPage( $page )->{Annots} or return;
  275.     my ($annots) = $self->recurse_values( $self->getValue( $raw_annots ) );
  276.     return @$annots;
  277. }
  278.  
  279. sub lines {
  280.     my ($self,%P) = @_;
  281.     return @{$self->{tickets}} if $self->{tickets} and ref $self->{tickets} eq 'ARRAY';
  282.     @P{qw(page_start page_stop)} = ($P{page},$P{page}) if $P{page};
  283.     $P{page_start}  //= 1;
  284.     $P{page_stop}   //= $self->numPages;
  285.  
  286.     # Try with supplied type, or known type, fall back to type list if we fail
  287.     $P{type} ||= $self->{ticket_type} || $AUTOTYPECHAIN[0];
  288.     for my $page ($P{page_start} .. $P{page_stop}) {
  289. #warn "page $page";
  290.         my @annotations = $self->getPageAnnotations( $page );
  291.         # Specific processor to season or regular ticket, etc.
  292.         my $processor = $self->getPageContentTree($page)->traverse('PDF::PageLines');
  293.         #die "FULLSTOP: No text found to proces in PDF" unless $processor->getTextBlocks;
  294.         my @lines = $processor->get_lines(%P);
  295.  
  296.         push $self->{pages}, \@lines;
  297.     }
  298.  
  299.     return @{$self->{pages}}
  300. }
  301.  
  302. sub ticket_set_info {
  303.     my ($self,%P) = @_;
  304.     die $self->is_excluded if $self->is_excluded;
  305.     return $self->{ticket_set} if $self->{ticket_set} and ref $self->{ticket_set} eq 'HASH';
  306.     $P{type} = "TF::PDF::$P{type}" if $P{type} and not $P{type} =~ /::/;
  307.  
  308.     my @tickets = $self->tickets(%P);
  309.     my %TS = map { $_ => $tickets[0]{$_} } grep { not /^(?:seat|barcode|pdf_page|notes)$/ } keys $tickets[0];
  310.     my @sorted_seats = map { $_->{seat} } sort { $a->{seat} <=> $b->{seat} } @tickets;
  311.     $TS{seats}     = join ',', @sorted_seats;
  312.     $TS{low_seat}  = $sorted_seats[0];
  313.     $TS{high_seat} = $sorted_seats[-1];
  314.     $TS{seat_range}= @tickets > 1 ? "$TS{low_seat}-$TS{high_seat}" : $tickets[0]{seat};
  315.     $TS{quantity}  = @tickets;
  316.     # Get all notes from every ticket as keys in anonymous hash, and then take those keys and join them
  317.     $TS{description}    = join ' / ', keys { map { map { $_ => 1 } @{$_->{notes}} } @tickets };
  318.     $TS{ticket_type}    = $self->{ticket_type}=~s/.*:://gr;
  319.  
  320.     # Make sure there's not different tickets joined in this set
  321.     for my $field ( qw(order_number name venue section row event_date) ) {
  322.         next unless @tickets > 1;
  323.         die "$field of ticket 1 and ticket $_ don't match!" for grep { $TS{$field} and $TS{$field} ne $tickets[$_]{$field} } (1 .. $#tickets);
  324.     }
  325.  
  326.     #printf "%15s: %s\n", $_, $TS{$_} for keys %TS;
  327.     $self->{ticket_set} = \%TS;
  328.     return $self->{ticket_set};
  329. }
  330.  
  331. sub extract_pages {
  332.     my ($self, @page_numbers) = @_;
  333.     my $new_pdf = __PACKAGE__->new( $self->toPDF ) or die 'Unable to duplicate PDF';
  334.     $new_pdf->extractPages( join ',', grep { not /\D/ } map { split /\s*,\s*/ } @page_numbers );
  335.     $new_pdf->clean();
  336.     return $new_pdf;
  337. }
  338.  
  339. sub replace_text {
  340.     my ($self,$from,$to) = @_;
  341.     $from = qr/\Q$from\E/i unless ref $from eq 'Regexp';
  342.     for my $p (1 .. $self->numPages()) {
  343.        my $content = $self->getPageContent($p);
  344.        $content =~ s/$from/$to/gms or next;
  345.        $self->setPageContent($p, $content);
  346.     }
  347.     $self->clean();
  348.     return $self;
  349. }
  350.  
  351. 1;
  352.  
  353. package main;
  354.  
  355. use 5.014;
  356. use warnings;
  357. #use PDF::Lines;
  358. use Data::Dumper;
  359.  
  360. my $file = shift or die "Usage: $0 file.pdf [page]\n";
  361. my $page = shift;
  362. my $pdf = PDF::Lines->new( $file );
  363. my @pages = $pdf->lines( page=>$page );
  364.  
  365. $page||=1;
  366. map { say "=== PAGE $page ==="; $page++; say for @$_ } @pages;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement