Untitled

#!/usr/bin/perl
use strict;
use File::Find::Rule;
use Time::HiRes qw ( time alarm sleep );
use utf8;
use Text::Unidecode;
use Data::Dumper;
use List::Util qw /min/;
use OcrConfig;

#use Devel::Size qw(size total_size);
#use Devel::DumpSizes qw/dump_sizes/;

my $time1 = time;
my $countFiles;
my $countAll;

my ( $countInDic, $CountInDicLC, $countCleanInDic, $countCleanLCInDicLC,
    $notInDict, $tooShort );

my @files;
my %text;
my $item;
my %anagramProposals;

my $debug = 1;
warn "Begin time: " . time if ($debug);

# sets line-break character to nonsense -> sucks in whole text.
#$/ = "\x04";

my $year = $ARGV[1];

#my $pruneDictFreq = $ARGV[2];

# DA: moved hashes up to global level => accessibility and mutability in subs
# May require resetting hashes in outmost foreach loop
my (
    %IDs,               %words,         %wordsClean,
    %subs_type,         %firstWord,     %lastWord,
    %delimiter,         %WC,            %CC,
    %HPOS,              %VPOS,          %WIDTH,
    %HEIGHT,            %dirtLeft,      %dirtRight,
    %lowFreq,           %lowFreqLC,     @toCorrect,
    %inlineMerge,       %inlineMergeLC, %acrosslineMerge,
    %acrosslineMergeLC, %possessive
);

# load dictionarie(s) to memory
# SH: this loads the relevant files into global hashes - probably not good practice?
my ( %freqDict, %freqDictLC, %anagramHashDict, %anagramHashBasic,
    %confusion_list );
loadDictionaries( dictionaryPath( dictionary() ) );
loadAnagramHash( dictionaryPath( anagramHashMap() ) );
loadConfusionAlphabet( dictionaryPath( confusionAlphabet() ) );

warn "Dictionaries loaded " . time if ($debug);

@files = File::Find::Rule->file->in( $ARGV[0] );

foreach my $path (@files) {
    next unless ( $path =~ m/\/([^\/]+\d+)\.xml$/ );
    my $filename = $1;

    next unless ( $path =~ /straitstimes_19510102_0002.xml/ );

    print STDERR "$path\n";

    # sets input record separator character to undef -> enable "slurp" mode
    local $/;

    open( H, "<:utf8", "$path" ) || die "can't open file $path";
    my $file = <H>;
    close(H);

    $countFiles++;

    $file =~ tr/(\n|\r)/ /;

    my $countWords = 0;

    my ( $firstWord, $delimiter, $WC, $CC );

    while ( $file =~ m/<TextBlock([^>]+)>(.*?)<\/TextBlock>/g ) {
        my $textBlockAtt = $1;
        my $TextBlock    = $2;

        while ( $TextBlock =~ m/<TextLine([^>]+)>(.*?)<\/TextLine>/g ) {
            my $textLineAtt   = $1;
            my $TextLine      = $2;
            my $beginTextLine = 1;
            while ( $TextLine =~
m/<String ID="([^"]+)" HPOS="([^"]+)" VPOS="([^"]+)" WIDTH="([^"]+)" HEIGHT="([^"]+)" (.*?)\/>/g
              )
            {
                $countWords++;
                $countAll++;
                $IDs{$countWords}    = $1;
                $HPOS{$countWords}   = $2;
                $VPOS{$countWords}   = $3;
                $WIDTH{$countWords}  = $4;
                $HEIGHT{$countWords} = $5;
                my $rest = $6;

# This parse does not implement splitting on split_string, so the following would not be split:
# <String ID="P2_ST00064" HPOS="1371" VPOS="235" WIDTH="142" HEIGHT="18" CONTENT="January,l9sl." WC="0.95" CC="1101002002101"/>
# can we add this and still keep track of things?

                if ($beginTextLine) {
                    $firstWord{$countWords}      = 1;
                    $beginTextLine               = 0;
                    $lastWord{ $countWords - 1 } = 1
                      unless ( $countWords == 1 );
                }
                else {
                    $firstWord{$countWords} = 0;
                }

                if ( $rest =~ m/SUBS_TYPE="HypPart1"/ ) {
                    $subs_type{$countWords} = 1;
                }
                elsif ( $rest =~ m/SUBS_TYPE="HypPart2"/ ) {
                    $subs_type{$countWords} = 2;
                }
                else {
                    $subs_type{$countWords} = 0;
                }
                my $item;
                if ( $rest =~ m/CONTENT="([^"]+)"/ ) {
                    $item = $1;
                    $delimiter{$countWords} = "double";
                }
                elsif ( $rest =~ m/CONTENT='([^']+)'/ ) {
                    $item = $1;
                    $delimiter{$countWords} = "single";
                }
                else {
                    warn
                      "Can't work out content attribute for $TextLine in $path"
                      if ($debug);
                }

                if ( $rest =~ m/WC="([^"]+)" CC="([^"]+)"/ ) {
                    $WC{$countWords} = $1;
                    $CC{$countWords} = $2;
                }
                else {
                    warn
"Can't work out WC and CC attributes for $TextLine in $path"
                      if ($debug);
                }
                $words{$countWords} = $item;

                if ( $item =~ s/\A(\P{Word}+)// ) {
                    $dirtLeft{$countWords} = $1;
                }
                else {
                    $dirtLeft{$countWords} = "";
                }
                if ( $item =~ s/(\P{Word}+)\Z// ) {
                    $dirtRight{$countWords} = $1;
                }
                else {
                    $dirtRight{$countWords} = "";
                }

                $possessive{$countWords} = 1 if ( $item =~ m/\'s\Z/ );

                $wordsClean{$countWords} = $item;

            }
        }
    }

    # Now we have the relevant data in the hashes...
    # First check status depending on whether the word is in the dictionary

    my (%status);
    for ( my $count = 1 ; $count <= $countWords ; $count++ ) {

        # check first for manual correction rules
        # not yet implemented

# Status:
# 0 = in dictionary as is, no need to correct
# 1 = in dictionary, but only lowercase; no need to correct, but check case
# 2 = in dictionary as is, if dirt removed; deal with dirt (?)
# 3 = in dictionary as lowercase, if dirt removed; no need to correct, but check case, and deal with dirt
# 4 = manual correction rule
# 5 = not in dictionary, but too short
# 6 = not in dictionary, but number or possible abbreviation
# 7 = needs to be corrected

        if ( $freqDict{ $words{$count} } ) {
            $status{$count} = 0;
            $countInDic++;
            $lowFreq{$count} = 1
              if ( $freqDict{ $words{$count} } < freqThreshold() );
        }
        elsif ( $freqDictLC{ lc( $words{$count} ) } ) {
            $status{$count} = 1;
            $CountInDicLC++;
            $lowFreqLC{$count} = 1
              if ( $freqDictLC{ lc( $words{$count} ) } < freqThreshold() );
        }
        elsif ( $freqDict{ $wordsClean{$count} } ) {
            $status{$count} = 2;
            $countCleanInDic++;
            $lowFreq{$count} = 1
              if ( $freqDict{ $wordsClean{$count} } < freqThreshold() );
        }
        elsif ( $freqDictLC{ lc( $wordsClean{$count} ) } ) {
            $status{$count} = 3;
            $countCleanLCInDicLC++;
            $lowFreqLC{$count} = 1
              if ( $freqDictLC{ lc( $wordsClean{$count} ) } < freqThreshold() );
        }
        elsif ( length( $wordsClean{$count} ) < 3 ) {
            $status{$count} = 5;
            $tooShort++;
        }
        elsif (( $wordsClean{$count} =~ m/\A[\d\.\-\/\,]+\Z/ )
            || ( $wordsClean{$count} =~ m/\A[a-z](\.[a-z])+\Z/i ) )
        {
            $status{$count} = 6;
        }
        else {
            $notInDict++;
            $status{$count} = 7;
            push @toCorrect, $count;
        }

    }

#   if ($debug) {
#       my $percent = 100 * $countInDic / $countAll;
#       print STDERR "In dictionary as is: $countInDic ($percent%)\n";
#       my $percent = 100 * $CountInDicLC / $countAll;
#       print STDERR "In LC dictionary as is: $CountInDicLC ($percent%)\n";
#       my $percent = 100 * $countCleanInDic / $countAll;
#       print STDERR "In dictionary cleaned: $countCleanInDic ($percent%)\n";
#       my $percent = 100 * $countCleanLCInDicLC / $countAll;
#       print STDERR "In LC dictionary cleaned: $countCleanLCInDicLC ($percent%)\n";
#       my $percent = 100 * $tooShort / $countAll;
#       print STDERR "Not in dictionary but too short to fix (<3): $tooShort ($percent%)\n";
#       my $percent = 100 * $notInDict / $countAll;
#       print STDERR "Not in dictionary: $notInDict ($percent%)\n\n";
#
#   }

    # now work through the words that are not in the dictionary

    # first check for inline/across-line merging
    foreach my $count (@toCorrect) {
        my $before = $count - 1;
        my $after  = $count + 1;

        # left merging check
        unless ( ( $firstWord{$count} )
            || ( $subs_type{$count} == 2 )
            || ( $wordsClean{$count} =~ m/\A\d+/ )
            || ( $wordsClean{$before} =~ m/\d+\Z/ ) )
        {
            check_left( $before, $count, $after );
        }

        # right merging check
        unless ( ( $lastWord{$count} )
            || ( $subs_type{$count} == 1 )
            || ( $wordsClean{$count} =~ m/\d+\Z/ )
            || ( $wordsClean{$after} =~ m/\A\d+/ ) )
        {
            check_right( $before, $count, $after );
        }
        if (   ( $lastWord{$count} )
            && ( !$subs_type{$count} )
            && $words{$count} =~ m/\-\Z/ )
        {
            merge_right( $before, $count, $after );
        }
        if (   ( $firstWord{$count} )
            && ( !$subs_type{$count} )
            && $words{$before} =~ m/\-\Z/
            && ( !$acrosslineMerge{$before} ) )
        {
            merge_left( $before, $count, $after );
        }
    }

# Note: inline merging should also work with correction proposals. Maybe move down in the chain?
# Note: There are quite a few hyphenated words across lines that have not been detected as one word - check!

    my $theT1 = time;

# get anagram-hash list of correction proposals for each word that requires fixing
# problem: The list generates proposals that differ in one character â€“ or possibly by two characters, because a bi-gram
# is substituted for a single character. This does not therefore catch words where two different characters were mis-OCR-ed.
    foreach my $count (@toCorrect) {

        # don't do anything if it is mergable or part of a hyphenated line-break
        # or if its an abbreviation or number
        next
          if ( ( $inlineMerge{$count} )
            || ( $inlineMergeLC{$count} )
            || ( $acrosslineMerge{$count} )
            || ( $acrosslineMergeLC{$count} )
            || ( $subs_type{$count} )
            || $status{$count} == 6 );
        my $focused_word = lc( $wordsClean{$count} );
        my $length       = length($focused_word);

# disregards short words because they produce more than 1000 correction proposals - can we optimise this?
        next if ( $length < 4 );
        my $ld_limit = 3;
        my $focused_char_hash;
        my (%sameHashValues);
        warn $focused_word;
        unless ( $anagramProposals{$focused_word} ) {

            #warn $focused_word;
            my $focused_word_hash = anagram_hash($focused_word);

            for ( my $l = -1 ; $l < $length ; $l++ ) {
                for (
                    my $l2 = 1 ;
                    ( $l2 < 3 ) && ( $l + $l2 <= $length ) ;
                    $l2++
                  )
                {
                    if ( $l < 0 ) {
                        $focused_char_hash = 0;
                        $l2                = $length;
                    }
                    else {
                        $focused_char_hash =
                          anagram_hash( substr( $focused_word, $l, $l + $l2 ) );
                    }

                    foreach my $simulated_char_hash ( keys %confusion_list ) {
                        my $simulated_word_hash =
                          $focused_word_hash + $simulated_char_hash -
                          $focused_char_hash;

                        # this speeds up things by about 25%
                        next unless $anagramHashBasic{$simulated_char_hash};
                        foreach my $theWord (
                            keys %{ $anagramHashDict{$simulated_word_hash} } )
                        {
                            $sameHashValues{$theWord}++;
                        }
                    }
                }

            }

            #warn "before: " . time;
            foreach my $similarWord ( keys %sameHashValues ) {
                my $distance = get_distance( $focused_word, $similarWord );
                if ( $distance <= $ld_limit ) {
                    $anagramProposals{$focused_word}{$similarWord} = $distance;
                }
            }

            #$anagramProposals{$focused_word} = 1;
            if ( $focused_word eq "reputabje" ) {
                print STDERR
"word: $focused_word - focused_word_hash: $focused_word_hash:\n";
                foreach my $bla ( keys %{ $anagramProposals{$focused_word} } ) {
                    print STDERR
                      "$bla: $anagramProposals{$focused_word}{$bla}\n";
                }
            }
        }
        else {

            #warn "#### saved time $focused_word";
        }
    }
    my $theT2   = time;
    my $theDiff = $theT2 - $theT1;
    print STDERR "time taken: $theDiff\n";

}

my $time2      = time;
my $difference = $time2 - $time1;
my $average    = $difference / $countFiles;

print STDERR
"Number of files processed: $countFiles\nNumber of words: $countAll\nTime taken: $difference\nAverage time per file: $average\n"
  if ($debug);

sub loadDictionaries {
    my $myPath           = shift;
    my $myDictionaryName = shift;

    my $openPath = dictionaryPath($myDictionaryName);
    open( H, "<:utf8", "$openPath" ) || die "can't open file $openPath";
    while (<H>) {
        my $line = $_;
        chomp($line);
        ( my $freqWord, my $theFreq ) = split( /\t/, $line );
        my $freqWordLC = lc($freqWord);
        $freqDict{$freqWord}     += $theFreq;
        $freqDictLC{$freqWordLC} += $theFreq;
    }
    close(H);
}

sub loadAnagramHash {
    my $myPath           = shift;
    my $myDictionaryName = shift;

    my $openPath = dictionaryPath($myDictionaryName);
    open( H, "<:utf8", "$openPath" ) || die "can't open file $openPath";
    while (<H>) {
        my $line = $_;
        chomp($line);
        ( my $hash, my $freqWord, my $theFreq ) = split( /\t/, $line );
        $anagramHashDict{$hash}{$freqWord} = $theFreq;
        $anagramHashBasic{$hash}++;
    }
    close(H);
}

sub loadConfusionAlphabet {
    my $myPath           = shift;
    my $myDictionaryName = shift;

    my $openPath = dictionaryPath($myDictionaryName);
    open( H, "<:utf8", "$openPath" ) || die "can't open file $openPath";
    while (<H>) {
        my $line = $_;
        chomp($line);
        $confusion_list{$line} = 1;
    }
    close(H);
}

sub check_left {
    my ( $before, $count, $after ) = @_;
    my $mergedWord = $wordsClean{$before} . $wordsClean{$count};
    if ( $freqDict{$mergedWord} ) {
        $inlineMerge{$count} = "L";
    }
    elsif ( $freqDictLC{ lc($mergedWord) } ) {
        $inlineMergeLC{$count} = "L";
    }
}

sub check_right {
    my ( $before, $count, $after ) = @_;

    # ignore if word-merging involves digits at word boundaries
    my $mergedWord = $wordsClean{$count} . $wordsClean{$after};

    if ( $freqDict{$mergedWord} ) {
        if ( $inlineMerge{$count} eq "L" ) {
            $inlineMerge{$count} = "B";
        }
        else {
            $inlineMerge{$count} = "R";
        }
    }
    elsif ( $freqDictLC{ lc($mergedWord) } ) {
        if ( $inlineMergeLC{$count} eq "L" ) {
            $inlineMergeLC{$count} = "B";
        }
        else {
            $inlineMergeLC{$count} = "R";
        }
    }
}

sub merge_left {
    my ( $before, $count, $after ) = @_;
    my $mergedWord = $wordsClean{$before} . $wordsClean{$count};
    if ( $freqDict{$mergedWord} ) {
        $acrosslineMerge{$count}  = "L";
        $acrosslineMerge{$before} = "R";
        warn "merged hyphenated word: $wordsClean{$before} $wordsClean{$count}";
    }
    elsif ( $freqDictLC{ lc($mergedWord) } ) {
        $acrosslineMergeLC{$count}  = "L";
        $acrosslineMergeLC{$before} = "R";
        warn "merged hyphenated word: $wordsClean{$before} $wordsClean{$count}";
    }
}

sub merge_right {
    my ( $before, $count, $after ) = @_;
    my $mergedWord = $wordsClean{$count} . $wordsClean{$after};
    if ( $freqDict{$mergedWord} ) {
        $acrosslineMerge{$count} = "R";
        $acrosslineMerge{$after} = "L";
        warn "merged hyphenated word: $wordsClean{$count} $wordsClean{$after}";
    }
    elsif ( $freqDictLC{ lc($mergedWord) } ) {
        $acrosslineMergeLC{$count} = "R";
        $acrosslineMergeLC{$after} = "L";
        warn "merged hyphenated word: $wordsClean{$count} $wordsClean{$after}";
    }
}