Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package PDF::PageLines;
- use 5.014;
- use warnings;
- use base qw(CAM::PDF::GS);
- use List::Util qw(sum);
- our $VERSION = '1.00';
- # More exact matches first
- # These are used to group hunks of data into separate distinct sections
- my @boxes = (
- # Row Col
- # attr X min/max Y min/max axix axis
- #[ top => 75,480, 670,740, 'x', 'y' ],
- # Middle columns
- # attr X min/max Y min/max
- #[ left => 80,160, 550,660, 'x', 'y' ],
- #[ center => 160,380, 550,660, 'x', 'y' ],
- #[ right => 385,475, 550,660, 'x', 'y' ],
- #[ bc_right => 480,570, 550,705, 'y', 'x' ],
- # Bottom barcode
- #[ bc_bottom => 290,550, 35,185, 'x', 'y' ],
- # Fine print
- #[ fineprint => 0,275, 50,350, 'x', 'y' ],
- );
- my $font_point = 12;
- my $spaces_per_fullsize = 3;
- sub new {
- my ($pkg, @args) = @_;
- my $self = $pkg->SUPER::new(@args);
- $self->{refs}{text} = [];
- $self->{locations} = \@boxes;
- return $self;
- }
- sub getTextBlocks {
- my ($self) = @_;
- return @{$self->{refs}->{text}};
- }
- sub renderText
- {
- my $self = shift;
- my $string = shift;
- my $width = shift;
- #my ($xu, $yu) = $self->textLineToUser(0, 0);
- my ($x, $y) = $self->textToDevice(0, 0);
- my ($x2, $y2) = $self->textToDevice(1, 0);
- # Compute angles and deltas
- my $xa = cos( atan2($y2-$y, $x2-$x) );
- my $ya = sin( atan2($y2-$y, $x2-$x) );
- my $xd = $xa*$width*$self->{Tfs};
- my $yd = $ya*$width*$self->{Tfs};
- my $location;
- for my $location_coords (@{$self->{locations}}) {
- next unless $x > $location_coords->[1]-5 and ($x+$xd) < $location_coords->[2]+5;
- next unless $y > $location_coords->[3]-5 and ($y+$yd) < $location_coords->[4]+5;
- $location = $location_coords->[0];
- last;
- }
- my $text = length($string)|3 && ord($string)>127 ? join('', map {chr(288-ord)} split('',$string)) : $string;
- #say "$text $self->{Tf} $self->{Tfs} $self->{Tc} $self->{Tw} $self->{Tz}";
- #say join ' / ', keys %{$self};
- #my $a = (atan2($y2-$y, $x2-$x)/(4*atan2(1, 1)))*180;
- #printf "(%7.2f, %7.2f) W:%6.2f L:%10s %s (%s)\n", $x,$y, $xd, $location//'UNKNOWN', $text, $string;
- push $self->{refs}{text}, {
- x => $x, xd => $xd,
- y => $y, yd => $yd,
- fontsize => $self->{Tfs},
- location => $location // 'UNKNOWN',
- text => $text,
- #text => $self->{Tf} eq 'F3' ? decode( 'UTF-16BE', $string ) : $string,
- #font => $self->{Tf},
- };
- return;
- }
- # Used location data to build string of group members, assuming high Y locality
- sub _build_line {
- my ($xy,$objects) = @_;
- my $xyd = $xy . 'd';
- my @strings;
- my $last;
- for my $o ( @$objects ) {
- # Add any needed whitespace
- if ( $last ) {
- my $last_end = $last->{$xy} + $last->{$xyd};
- my $spaces = ($o->{$xy} - $last_end) / (($last->{$xyd} / $o->{fontsize}) * $font_point * $spaces_per_fullsize);
- #warn "$last_end - $o->{$xy} = " .($o->{$xy} - $last_end).", $last->{$xyd} / ($o->{fontsize} ) * 14) = $spaces | $o->{text}";
- #my $spaces = (($o->{$xy} - $last_end) / ($o->{fontsize}*16));
- #warn "$last_end - $o->{$xy} = " .($o->{$xy} - $last_end)." / ($o->{fontsize}/3) = $spaces";
- #say "$last->{x} + $last->{xd} = $last_end <? $o->{x} ($o->{fontsize})" if $spaces < 0;
- # If there's at least half a space, make a space
- $spaces = int($spaces) || int($spaces+0.5);
- push @strings, ' ' x $spaces if $spaces;
- #say "$spaces spaces between $last->{text} and $o->{text}" if $spaces < 0;
- #if $last_end
- }
- push @strings, $o->{text};
- $last = $o;
- }
- #say "BUILT LINE ($last->{location}): " . join '', @strings;
- return join '', @strings;
- }
- # Pre-sort to get sorted groups
- sub _group {
- my ($items, $variance, $key) = @_;
- my @groups;
- sub group_add_member {
- $_[0] ||= {members=>[]};
- push $_[0]{members}, $_[1];
- $_[0]{count} = @{$_[0]{members}};
- $_[0]{avg} = (sum map { $_[2] ? $_[1]{$_[2]} : $_[1] } @{$_[0]{members}}) / $_[0]{count};
- #say "Computed $_[0]{avg} from (" . join(", ", map {$_[2]?$_[1]{$_[2]}:$_[1]} values $_[0]{members}) . ")";
- return $_[0];
- }
- for my $item (@$items) {
- my $val = $key ? $item->{$key} : $item;
- my $added;
- for my $group (@groups) {
- #say "$group->{avg} $val";
- next unless abs($group->{avg} - $val) < $variance;
- group_add_member $group, $item, $key;
- $added = 1;
- }
- next if $added;
- push @groups, group_add_member undef, $item, $key;
- }
- #say Dumper \@groups;
- return @groups;
- }
- sub get_lines {
- my ($self,%P) = @_;
- my %lrows = %{$self->get_pdf_section_groups||{}};
- my @all_text_lines = map { map {$_->{text_line}} @{$_} } values %lrows;
- #use Data::Dumper; say Dumper \%lrows;
- die 'No grouped location rows found!' unless %lrows;
- return @all_text_lines;
- }
- sub get_pdf_section_groups {
- my ($self,$want,$ignore) = @_;
- $want = qr/$want/ if $want and ref $want ne 'Regexp';
- $ignore = qr/$ignore/ if $ignore and ref $ignore ne 'Regexp';
- my @texts = $self->getTextBlocks;
- my %locations;
- for my $tblock ( sort { $b->{location} cmp $a->{location} } @texts ) {
- next if $want and not $tblock->{location} =~ $want;
- next if $ignore and $tblock->{location} =~ $ignore;
- my $group = $locations{$tblock->{location}}||=[]; # Make array if there isn't one
- push $group, $tblock;
- }
- #say "Grouping headers for page $page";
- my %lrows;
- push $self->{locations}, [ UNKNOWN => 0,0,0,0,'x','y' ] unless grep {$_->[0] eq 'UNKNOWN'} @{$self->{locations}};
- for my $location_data ( @{$self->{locations}} ) {
- my ($name,$x1,$x2,$y1,$y2,$row_axis,$col_axis) = @$location_data;
- next unless $locations{$name};
- $row_axis ||= 'x';
- $col_axis ||= 'y';
- my @rows = sort { $b->{avg} <=> $a->{avg} } _group($locations{$name}, 3, $col_axis);
- for my $row ( @rows ) {
- $row->{members} = [ sort { $a->{$row_axis} <=> $b->{$row_axis} } @{$row->{members}} ];
- $row->{text_line} = _build_line( $row_axis, $row->{members} );
- }
- $lrows{$name} = \@rows;
- }
- return \%lrows;
- }
- 1;
- package PDF::Lines;
- use 5.014;
- use warnings;
- use base qw(CAM::PDF);
- use Data::Dumper;
- use Try::Tiny;
- use File::Slurp qw(read_file);
- use File::Temp;
- use List::Util qw(sum);
- #use PDF::PageLines;
- my @AUTOTYPECHAIN = qw( TF::PDF::TMTicket TF::PDF::TMSeasonTicket );
- sub new {
- my ($pkg, @args) = @_;
- my $self;
- my $gs;
- $gs = pop @args if @args>1 and $args[-1] =~ /ghostscript_rewrite/i;
- try {
- $self = $pkg->SUPER::new(@args) or die "Unable to read PDF: $CAM::PDF::errstr";
- my $test_processor = $self->getPageContentTree(1)->traverse('PDF::PageLines');
- # Catch errors early, attempt ghostscript fix if we can
- my $hastext = $test_processor->getTextBlocks; # Count
- die "No text found to proces in PDF" unless $hastext;
- }
- catch {
- die $_ unless not $gs and /Incorrect password|The document cannot be decrypted|No text found|Unrecognized type in parseAny/i;
- warn $_ if grep { $args[$_] eq 'debug' and $args[+$_] } 0 .. $#args;
- my $pdf_data = read_file( shift @args );
- my ($ofh, $outfile) = File::Temp::tempfile( 'gs_pdfconv_XXXX', DIR => '/var/tmp/TF' );
- open( my $gsfh, '|-', "gs -q -dNOPAUSE -sDEVICE=pdfwrite -dCompatibilityLevel=1.3 -sOUTPUTFILE=$outfile -dBATCH -") or die $!;
- print $gsfh $pdf_data or die "Unable to push all data through GS for PDF conversion! $!";
- close($gsfh);
- $self = $pkg->SUPER::new( $outfile, @args, 'ghostscript_rewrite' ) or die "Unable to read GS converted PDF: $CAM::PDF::errstr";;
- unlink($outfile);
- };
- $self->{pdfinfo} = undef;
- $self->{pages} = [];
- return $self;
- }
- sub recurse_values {
- my $self = shift;
- my @returns;
- for ( @_ ) {
- if ( ref eq 'ARRAY' ) {
- #say 'recurse A';
- push @returns, [$self->recurse_values( @$_ )];
- }
- elsif ( ref eq 'HASH' ) {
- #say 'recurse H';
- my $h = $_;
- push @returns, { map { $_ => $self->recurse_values( $h->{$_} ) } keys $h };
- }
- elsif ( ref eq 'CAM::PDF::Node' ) {
- #say 'node';
- push @returns, $self->recurse_values( $self->getValue( $_ ) );
- }
- else {
- #say "val $_";
- push @returns, $_;
- }
- }
- return @returns;
- }
- sub pdfinfo {
- my $self = shift;
- return $self->{pdfinfo} if $self->{pdfinfo};
- return $self->{pdfinfo} = {} unless $self->{trailer}{Info};
- my $pdfinfo = $self->getValue( $self->{trailer}{Info} );
- $pdfinfo->{$_} = $self->getValue( $pdfinfo->{$_} ) for keys $pdfinfo;
- return $self->{pdfinfo} = $pdfinfo;
- }
- sub getPageAnnotations {
- my $self = shift;
- my $page = shift or die 'Page for annotations needed!';
- my $raw_annots = $self->getPage( $page )->{Annots} or return;
- my ($annots) = $self->recurse_values( $self->getValue( $raw_annots ) );
- return @$annots;
- }
- sub lines {
- my ($self,%P) = @_;
- return @{$self->{tickets}} if $self->{tickets} and ref $self->{tickets} eq 'ARRAY';
- @P{qw(page_start page_stop)} = ($P{page},$P{page}) if $P{page};
- $P{page_start} //= 1;
- $P{page_stop} //= $self->numPages;
- # Try with supplied type, or known type, fall back to type list if we fail
- $P{type} ||= $self->{ticket_type} || $AUTOTYPECHAIN[0];
- for my $page ($P{page_start} .. $P{page_stop}) {
- #warn "page $page";
- my @annotations = $self->getPageAnnotations( $page );
- # Specific processor to season or regular ticket, etc.
- my $processor = $self->getPageContentTree($page)->traverse('PDF::PageLines');
- #die "FULLSTOP: No text found to proces in PDF" unless $processor->getTextBlocks;
- my @lines = $processor->get_lines(%P);
- push $self->{pages}, \@lines;
- }
- return @{$self->{pages}}
- }
- sub ticket_set_info {
- my ($self,%P) = @_;
- die $self->is_excluded if $self->is_excluded;
- return $self->{ticket_set} if $self->{ticket_set} and ref $self->{ticket_set} eq 'HASH';
- $P{type} = "TF::PDF::$P{type}" if $P{type} and not $P{type} =~ /::/;
- my @tickets = $self->tickets(%P);
- my %TS = map { $_ => $tickets[0]{$_} } grep { not /^(?:seat|barcode|pdf_page|notes)$/ } keys $tickets[0];
- my @sorted_seats = map { $_->{seat} } sort { $a->{seat} <=> $b->{seat} } @tickets;
- $TS{seats} = join ',', @sorted_seats;
- $TS{low_seat} = $sorted_seats[0];
- $TS{high_seat} = $sorted_seats[-1];
- $TS{seat_range}= @tickets > 1 ? "$TS{low_seat}-$TS{high_seat}" : $tickets[0]{seat};
- $TS{quantity} = @tickets;
- # Get all notes from every ticket as keys in anonymous hash, and then take those keys and join them
- $TS{description} = join ' / ', keys { map { map { $_ => 1 } @{$_->{notes}} } @tickets };
- $TS{ticket_type} = $self->{ticket_type}=~s/.*:://gr;
- # Make sure there's not different tickets joined in this set
- for my $field ( qw(order_number name venue section row event_date) ) {
- next unless @tickets > 1;
- die "$field of ticket 1 and ticket $_ don't match!" for grep { $TS{$field} and $TS{$field} ne $tickets[$_]{$field} } (1 .. $#tickets);
- }
- #printf "%15s: %s\n", $_, $TS{$_} for keys %TS;
- $self->{ticket_set} = \%TS;
- return $self->{ticket_set};
- }
- sub extract_pages {
- my ($self, @page_numbers) = @_;
- my $new_pdf = __PACKAGE__->new( $self->toPDF ) or die 'Unable to duplicate PDF';
- $new_pdf->extractPages( join ',', grep { not /\D/ } map { split /\s*,\s*/ } @page_numbers );
- $new_pdf->clean();
- return $new_pdf;
- }
- sub replace_text {
- my ($self,$from,$to) = @_;
- $from = qr/\Q$from\E/i unless ref $from eq 'Regexp';
- for my $p (1 .. $self->numPages()) {
- my $content = $self->getPageContent($p);
- $content =~ s/$from/$to/gms or next;
- $self->setPageContent($p, $content);
- }
- $self->clean();
- return $self;
- }
- 1;
- package main;
- use 5.014;
- use warnings;
- #use PDF::Lines;
- use Data::Dumper;
- my $file = shift or die "Usage: $0 file.pdf [page]\n";
- my $page = shift;
- my $pdf = PDF::Lines->new( $file );
- my @pages = $pdf->lines( page=>$page );
- $page||=1;
- map { say "=== PAGE $page ==="; $page++; say for @$_ } @pages;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement