Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/perl
- use strict;
- use File::Find::Rule;
- use Time::HiRes qw ( time alarm sleep );
- use utf8;
- use Text::Unidecode;
- use Data::Dumper;
- use List::Util qw /min/;
- use OcrConfig;
- #use Devel::Size qw(size total_size);
- #use Devel::DumpSizes qw/dump_sizes/;
- my $time1 = time;
- my $countFiles;
- my $countAll;
- my ( $countInDic, $CountInDicLC, $countCleanInDic, $countCleanLCInDicLC,
- $notInDict, $tooShort );
- my @files;
- my %text;
- my $item;
- my %anagramProposals;
- my $debug = 1;
- warn "Begin time: " . time if ($debug);
- # sets line-break character to nonsense -> sucks in whole text.
- #$/ = "\x04";
- my $year = $ARGV[1];
- #my $pruneDictFreq = $ARGV[2];
- # DA: moved hashes up to global level => accessibility and mutability in subs
- # May require resetting hashes in outmost foreach loop
- my (
- %IDs, %words, %wordsClean,
- %subs_type, %firstWord, %lastWord,
- %delimiter, %WC, %CC,
- %HPOS, %VPOS, %WIDTH,
- %HEIGHT, %dirtLeft, %dirtRight,
- %lowFreq, %lowFreqLC, @toCorrect,
- %inlineMerge, %inlineMergeLC, %acrosslineMerge,
- %acrosslineMergeLC, %possessive
- );
- # load dictionarie(s) to memory
- # SH: this loads the relevant files into global hashes - probably not good practice?
- my ( %freqDict, %freqDictLC, %anagramHashDict, %anagramHashBasic,
- %confusion_list );
- loadDictionaries( dictionaryPath( dictionary() ) );
- loadAnagramHash( dictionaryPath( anagramHashMap() ) );
- loadConfusionAlphabet( dictionaryPath( confusionAlphabet() ) );
- warn "Dictionaries loaded " . time if ($debug);
- @files = File::Find::Rule->file->in( $ARGV[0] );
- foreach my $path (@files) {
- next unless ( $path =~ m/\/([^\/]+\d+)\.xml$/ );
- my $filename = $1;
- next unless ( $path =~ /straitstimes_19510102_0002.xml/ );
- print STDERR "$path\n";
- # sets input record separator character to undef -> enable "slurp" mode
- local $/;
- open( H, "<:utf8", "$path" ) || die "can't open file $path";
- my $file = <H>;
- close(H);
- $countFiles++;
- $file =~ tr/(\n|\r)/ /;
- my $countWords = 0;
- my ( $firstWord, $delimiter, $WC, $CC );
- while ( $file =~ m/<TextBlock([^>]+)>(.*?)<\/TextBlock>/g ) {
- my $textBlockAtt = $1;
- my $TextBlock = $2;
- while ( $TextBlock =~ m/<TextLine([^>]+)>(.*?)<\/TextLine>/g ) {
- my $textLineAtt = $1;
- my $TextLine = $2;
- my $beginTextLine = 1;
- while ( $TextLine =~
- m/<String ID="([^"]+)" HPOS="([^"]+)" VPOS="([^"]+)" WIDTH="([^"]+)" HEIGHT="([^"]+)" (.*?)\/>/g
- )
- {
- $countWords++;
- $countAll++;
- $IDs{$countWords} = $1;
- $HPOS{$countWords} = $2;
- $VPOS{$countWords} = $3;
- $WIDTH{$countWords} = $4;
- $HEIGHT{$countWords} = $5;
- my $rest = $6;
- # This parse does not implement splitting on split_string, so the following would not be split:
- # <String ID="P2_ST00064" HPOS="1371" VPOS="235" WIDTH="142" HEIGHT="18" CONTENT="January,l9sl." WC="0.95" CC="1101002002101"/>
- # can we add this and still keep track of things?
- if ($beginTextLine) {
- $firstWord{$countWords} = 1;
- $beginTextLine = 0;
- $lastWord{ $countWords - 1 } = 1
- unless ( $countWords == 1 );
- }
- else {
- $firstWord{$countWords} = 0;
- }
- if ( $rest =~ m/SUBS_TYPE="HypPart1"/ ) {
- $subs_type{$countWords} = 1;
- }
- elsif ( $rest =~ m/SUBS_TYPE="HypPart2"/ ) {
- $subs_type{$countWords} = 2;
- }
- else {
- $subs_type{$countWords} = 0;
- }
- my $item;
- if ( $rest =~ m/CONTENT="([^"]+)"/ ) {
- $item = $1;
- $delimiter{$countWords} = "double";
- }
- elsif ( $rest =~ m/CONTENT='([^']+)'/ ) {
- $item = $1;
- $delimiter{$countWords} = "single";
- }
- else {
- warn
- "Can't work out content attribute for $TextLine in $path"
- if ($debug);
- }
- if ( $rest =~ m/WC="([^"]+)" CC="([^"]+)"/ ) {
- $WC{$countWords} = $1;
- $CC{$countWords} = $2;
- }
- else {
- warn
- "Can't work out WC and CC attributes for $TextLine in $path"
- if ($debug);
- }
- $words{$countWords} = $item;
- if ( $item =~ s/\A(\P{Word}+)// ) {
- $dirtLeft{$countWords} = $1;
- }
- else {
- $dirtLeft{$countWords} = "";
- }
- if ( $item =~ s/(\P{Word}+)\Z// ) {
- $dirtRight{$countWords} = $1;
- }
- else {
- $dirtRight{$countWords} = "";
- }
- $possessive{$countWords} = 1 if ( $item =~ m/\'s\Z/ );
- $wordsClean{$countWords} = $item;
- }
- }
- }
- # Now we have the relevant data in the hashes...
- # First check status depending on whether the word is in the dictionary
- my (%status);
- for ( my $count = 1 ; $count <= $countWords ; $count++ ) {
- # check first for manual correction rules
- # not yet implemented
- # Status:
- # 0 = in dictionary as is, no need to correct
- # 1 = in dictionary, but only lowercase; no need to correct, but check case
- # 2 = in dictionary as is, if dirt removed; deal with dirt (?)
- # 3 = in dictionary as lowercase, if dirt removed; no need to correct, but check case, and deal with dirt
- # 4 = manual correction rule
- # 5 = not in dictionary, but too short
- # 6 = not in dictionary, but number or possible abbreviation
- # 7 = needs to be corrected
- if ( $freqDict{ $words{$count} } ) {
- $status{$count} = 0;
- $countInDic++;
- $lowFreq{$count} = 1
- if ( $freqDict{ $words{$count} } < freqThreshold() );
- }
- elsif ( $freqDictLC{ lc( $words{$count} ) } ) {
- $status{$count} = 1;
- $CountInDicLC++;
- $lowFreqLC{$count} = 1
- if ( $freqDictLC{ lc( $words{$count} ) } < freqThreshold() );
- }
- elsif ( $freqDict{ $wordsClean{$count} } ) {
- $status{$count} = 2;
- $countCleanInDic++;
- $lowFreq{$count} = 1
- if ( $freqDict{ $wordsClean{$count} } < freqThreshold() );
- }
- elsif ( $freqDictLC{ lc( $wordsClean{$count} ) } ) {
- $status{$count} = 3;
- $countCleanLCInDicLC++;
- $lowFreqLC{$count} = 1
- if ( $freqDictLC{ lc( $wordsClean{$count} ) } < freqThreshold() );
- }
- elsif ( length( $wordsClean{$count} ) < 3 ) {
- $status{$count} = 5;
- $tooShort++;
- }
- elsif (( $wordsClean{$count} =~ m/\A[\d\.\-\/\,]+\Z/ )
- || ( $wordsClean{$count} =~ m/\A[a-z](\.[a-z])+\Z/i ) )
- {
- $status{$count} = 6;
- }
- else {
- $notInDict++;
- $status{$count} = 7;
- push @toCorrect, $count;
- }
- }
- # if ($debug) {
- # my $percent = 100 * $countInDic / $countAll;
- # print STDERR "In dictionary as is: $countInDic ($percent%)\n";
- # my $percent = 100 * $CountInDicLC / $countAll;
- # print STDERR "In LC dictionary as is: $CountInDicLC ($percent%)\n";
- # my $percent = 100 * $countCleanInDic / $countAll;
- # print STDERR "In dictionary cleaned: $countCleanInDic ($percent%)\n";
- # my $percent = 100 * $countCleanLCInDicLC / $countAll;
- # print STDERR "In LC dictionary cleaned: $countCleanLCInDicLC ($percent%)\n";
- # my $percent = 100 * $tooShort / $countAll;
- # print STDERR "Not in dictionary but too short to fix (<3): $tooShort ($percent%)\n";
- # my $percent = 100 * $notInDict / $countAll;
- # print STDERR "Not in dictionary: $notInDict ($percent%)\n\n";
- #
- # }
- # now work through the words that are not in the dictionary
- # first check for inline/across-line merging
- foreach my $count (@toCorrect) {
- my $before = $count - 1;
- my $after = $count + 1;
- # left merging check
- unless ( ( $firstWord{$count} )
- || ( $subs_type{$count} == 2 )
- || ( $wordsClean{$count} =~ m/\A\d+/ )
- || ( $wordsClean{$before} =~ m/\d+\Z/ ) )
- {
- check_left( $before, $count, $after );
- }
- # right merging check
- unless ( ( $lastWord{$count} )
- || ( $subs_type{$count} == 1 )
- || ( $wordsClean{$count} =~ m/\d+\Z/ )
- || ( $wordsClean{$after} =~ m/\A\d+/ ) )
- {
- check_right( $before, $count, $after );
- }
- if ( ( $lastWord{$count} )
- && ( !$subs_type{$count} )
- && $words{$count} =~ m/\-\Z/ )
- {
- merge_right( $before, $count, $after );
- }
- if ( ( $firstWord{$count} )
- && ( !$subs_type{$count} )
- && $words{$before} =~ m/\-\Z/
- && ( !$acrosslineMerge{$before} ) )
- {
- merge_left( $before, $count, $after );
- }
- }
- # Note: inline merging should also work with correction proposals. Maybe move down in the chain?
- # Note: There are quite a few hyphenated words across lines that have not been detected as one word - check!
- my $theT1 = time;
- # get anagram-hash list of correction proposals for each word that requires fixing
- # problem: The list generates proposals that differ in one character – or possibly by two characters, because a bi-gram
- # is substituted for a single character. This does not therefore catch words where two different characters were mis-OCR-ed.
- foreach my $count (@toCorrect) {
- # don't do anything if it is mergable or part of a hyphenated line-break
- # or if its an abbreviation or number
- next
- if ( ( $inlineMerge{$count} )
- || ( $inlineMergeLC{$count} )
- || ( $acrosslineMerge{$count} )
- || ( $acrosslineMergeLC{$count} )
- || ( $subs_type{$count} )
- || $status{$count} == 6 );
- my $focused_word = lc( $wordsClean{$count} );
- my $length = length($focused_word);
- # disregards short words because they produce more than 1000 correction proposals - can we optimise this?
- next if ( $length < 4 );
- my $ld_limit = 3;
- my $focused_char_hash;
- my (%sameHashValues);
- warn $focused_word;
- unless ( $anagramProposals{$focused_word} ) {
- #warn $focused_word;
- my $focused_word_hash = anagram_hash($focused_word);
- for ( my $l = -1 ; $l < $length ; $l++ ) {
- for (
- my $l2 = 1 ;
- ( $l2 < 3 ) && ( $l + $l2 <= $length ) ;
- $l2++
- )
- {
- if ( $l < 0 ) {
- $focused_char_hash = 0;
- $l2 = $length;
- }
- else {
- $focused_char_hash =
- anagram_hash( substr( $focused_word, $l, $l + $l2 ) );
- }
- foreach my $simulated_char_hash ( keys %confusion_list ) {
- my $simulated_word_hash =
- $focused_word_hash + $simulated_char_hash -
- $focused_char_hash;
- # this speeds up things by about 25%
- next unless $anagramHashBasic{$simulated_char_hash};
- foreach my $theWord (
- keys %{ $anagramHashDict{$simulated_word_hash} } )
- {
- $sameHashValues{$theWord}++;
- }
- }
- }
- }
- #warn "before: " . time;
- foreach my $similarWord ( keys %sameHashValues ) {
- my $distance = get_distance( $focused_word, $similarWord );
- if ( $distance <= $ld_limit ) {
- $anagramProposals{$focused_word}{$similarWord} = $distance;
- }
- }
- #$anagramProposals{$focused_word} = 1;
- if ( $focused_word eq "reputabje" ) {
- print STDERR
- "word: $focused_word - focused_word_hash: $focused_word_hash:\n";
- foreach my $bla ( keys %{ $anagramProposals{$focused_word} } ) {
- print STDERR
- "$bla: $anagramProposals{$focused_word}{$bla}\n";
- }
- }
- }
- else {
- #warn "#### saved time $focused_word";
- }
- }
- my $theT2 = time;
- my $theDiff = $theT2 - $theT1;
- print STDERR "time taken: $theDiff\n";
- }
- my $time2 = time;
- my $difference = $time2 - $time1;
- my $average = $difference / $countFiles;
- print STDERR
- "Number of files processed: $countFiles\nNumber of words: $countAll\nTime taken: $difference\nAverage time per file: $average\n"
- if ($debug);
- sub loadDictionaries {
- my $myPath = shift;
- my $myDictionaryName = shift;
- my $openPath = dictionaryPath($myDictionaryName);
- open( H, "<:utf8", "$openPath" ) || die "can't open file $openPath";
- while (<H>) {
- my $line = $_;
- chomp($line);
- ( my $freqWord, my $theFreq ) = split( /\t/, $line );
- my $freqWordLC = lc($freqWord);
- $freqDict{$freqWord} += $theFreq;
- $freqDictLC{$freqWordLC} += $theFreq;
- }
- close(H);
- }
- sub loadAnagramHash {
- my $myPath = shift;
- my $myDictionaryName = shift;
- my $openPath = dictionaryPath($myDictionaryName);
- open( H, "<:utf8", "$openPath" ) || die "can't open file $openPath";
- while (<H>) {
- my $line = $_;
- chomp($line);
- ( my $hash, my $freqWord, my $theFreq ) = split( /\t/, $line );
- $anagramHashDict{$hash}{$freqWord} = $theFreq;
- $anagramHashBasic{$hash}++;
- }
- close(H);
- }
- sub loadConfusionAlphabet {
- my $myPath = shift;
- my $myDictionaryName = shift;
- my $openPath = dictionaryPath($myDictionaryName);
- open( H, "<:utf8", "$openPath" ) || die "can't open file $openPath";
- while (<H>) {
- my $line = $_;
- chomp($line);
- $confusion_list{$line} = 1;
- }
- close(H);
- }
- sub check_left {
- my ( $before, $count, $after ) = @_;
- my $mergedWord = $wordsClean{$before} . $wordsClean{$count};
- if ( $freqDict{$mergedWord} ) {
- $inlineMerge{$count} = "L";
- }
- elsif ( $freqDictLC{ lc($mergedWord) } ) {
- $inlineMergeLC{$count} = "L";
- }
- }
- sub check_right {
- my ( $before, $count, $after ) = @_;
- # ignore if word-merging involves digits at word boundaries
- my $mergedWord = $wordsClean{$count} . $wordsClean{$after};
- if ( $freqDict{$mergedWord} ) {
- if ( $inlineMerge{$count} eq "L" ) {
- $inlineMerge{$count} = "B";
- }
- else {
- $inlineMerge{$count} = "R";
- }
- }
- elsif ( $freqDictLC{ lc($mergedWord) } ) {
- if ( $inlineMergeLC{$count} eq "L" ) {
- $inlineMergeLC{$count} = "B";
- }
- else {
- $inlineMergeLC{$count} = "R";
- }
- }
- }
- sub merge_left {
- my ( $before, $count, $after ) = @_;
- my $mergedWord = $wordsClean{$before} . $wordsClean{$count};
- if ( $freqDict{$mergedWord} ) {
- $acrosslineMerge{$count} = "L";
- $acrosslineMerge{$before} = "R";
- warn "merged hyphenated word: $wordsClean{$before} $wordsClean{$count}";
- }
- elsif ( $freqDictLC{ lc($mergedWord) } ) {
- $acrosslineMergeLC{$count} = "L";
- $acrosslineMergeLC{$before} = "R";
- warn "merged hyphenated word: $wordsClean{$before} $wordsClean{$count}";
- }
- }
- sub merge_right {
- my ( $before, $count, $after ) = @_;
- my $mergedWord = $wordsClean{$count} . $wordsClean{$after};
- if ( $freqDict{$mergedWord} ) {
- $acrosslineMerge{$count} = "R";
- $acrosslineMerge{$after} = "L";
- warn "merged hyphenated word: $wordsClean{$count} $wordsClean{$after}";
- }
- elsif ( $freqDictLC{ lc($mergedWord) } ) {
- $acrosslineMergeLC{$count} = "R";
- $acrosslineMergeLC{$after} = "L";
- warn "merged hyphenated word: $wordsClean{$count} $wordsClean{$after}";
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement