Advertisement
Guest User

Untitled

a guest
Mar 8th, 2014
80
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 14.24 KB | None | 0 0
  1. #!/usr/bin/perl
  2. use strict;
  3. use File::Find::Rule;
  4. use Time::HiRes qw ( time alarm sleep );
  5. use utf8;
  6. use Text::Unidecode;
  7. use Data::Dumper;
  8. use List::Util qw /min/;
  9. use OcrConfig;
  10.  
  11. #use Devel::Size qw(size total_size);
  12. #use Devel::DumpSizes qw/dump_sizes/;
  13.  
  14. my $time1 = time;
  15. my $countFiles;
  16. my $countAll;
  17.  
  18. my ( $countInDic, $CountInDicLC, $countCleanInDic, $countCleanLCInDicLC,
  19.     $notInDict, $tooShort );
  20.  
  21. my @files;
  22. my %text;
  23. my $item;
  24. my %anagramProposals;
  25.  
  26. my $debug = 1;
  27. warn "Begin time: " . time if ($debug);
  28.  
  29. # sets line-break character to nonsense -> sucks in whole text.
  30. #$/ = "\x04";
  31.  
  32. my $year = $ARGV[1];
  33.  
  34. #my $pruneDictFreq = $ARGV[2];
  35.  
  36. # DA: moved hashes up to global level => accessibility and mutability in subs
  37. # May require resetting hashes in outmost foreach loop
  38. my (
  39.     %IDs,               %words,         %wordsClean,
  40.     %subs_type,         %firstWord,     %lastWord,
  41.     %delimiter,         %WC,            %CC,
  42.     %HPOS,              %VPOS,          %WIDTH,
  43.     %HEIGHT,            %dirtLeft,      %dirtRight,
  44.     %lowFreq,           %lowFreqLC,     @toCorrect,
  45.     %inlineMerge,       %inlineMergeLC, %acrosslineMerge,
  46.     %acrosslineMergeLC, %possessive
  47. );
  48.  
  49. # load dictionarie(s) to memory
  50. # SH: this loads the relevant files into global hashes - probably not good practice?
  51. my ( %freqDict, %freqDictLC, %anagramHashDict, %anagramHashBasic,
  52.     %confusion_list );
  53. loadDictionaries( dictionaryPath( dictionary() ) );
  54. loadAnagramHash( dictionaryPath( anagramHashMap() ) );
  55. loadConfusionAlphabet( dictionaryPath( confusionAlphabet() ) );
  56.  
  57. warn "Dictionaries loaded " . time if ($debug);
  58.  
  59. @files = File::Find::Rule->file->in( $ARGV[0] );
  60.  
  61. foreach my $path (@files) {
  62.     next unless ( $path =~ m/\/([^\/]+\d+)\.xml$/ );
  63.     my $filename = $1;
  64.  
  65.     next unless ( $path =~ /straitstimes_19510102_0002.xml/ );
  66.  
  67.     print STDERR "$path\n";
  68.  
  69.     # sets input record separator character to undef -> enable "slurp" mode
  70.     local $/;
  71.  
  72.     open( H, "<:utf8", "$path" ) || die "can't open file $path";
  73.     my $file = <H>;
  74.     close(H);
  75.  
  76.     $countFiles++;
  77.  
  78.     $file =~ tr/(\n|\r)/ /;
  79.  
  80.     my $countWords = 0;
  81.  
  82.     my ( $firstWord, $delimiter, $WC, $CC );
  83.  
  84.     while ( $file =~ m/<TextBlock([^>]+)>(.*?)<\/TextBlock>/g ) {
  85.         my $textBlockAtt = $1;
  86.         my $TextBlock    = $2;
  87.  
  88.         while ( $TextBlock =~ m/<TextLine([^>]+)>(.*?)<\/TextLine>/g ) {
  89.             my $textLineAtt   = $1;
  90.             my $TextLine      = $2;
  91.             my $beginTextLine = 1;
  92.             while ( $TextLine =~
  93. m/<String ID="([^"]+)" HPOS="([^"]+)" VPOS="([^"]+)" WIDTH="([^"]+)" HEIGHT="([^"]+)" (.*?)\/>/g
  94.               )
  95.             {
  96.                 $countWords++;
  97.                 $countAll++;
  98.                 $IDs{$countWords}    = $1;
  99.                 $HPOS{$countWords}   = $2;
  100.                 $VPOS{$countWords}   = $3;
  101.                 $WIDTH{$countWords}  = $4;
  102.                 $HEIGHT{$countWords} = $5;
  103.                 my $rest = $6;
  104.  
  105. # This parse does not implement splitting on split_string, so the following would not be split:
  106. # <String ID="P2_ST00064" HPOS="1371" VPOS="235" WIDTH="142" HEIGHT="18" CONTENT="January,l9sl." WC="0.95" CC="1101002002101"/>
  107. # can we add this and still keep track of things?
  108.  
  109.                 if ($beginTextLine) {
  110.                     $firstWord{$countWords}      = 1;
  111.                     $beginTextLine               = 0;
  112.                     $lastWord{ $countWords - 1 } = 1
  113.                       unless ( $countWords == 1 );
  114.                 }
  115.                 else {
  116.                     $firstWord{$countWords} = 0;
  117.                 }
  118.  
  119.                 if ( $rest =~ m/SUBS_TYPE="HypPart1"/ ) {
  120.                     $subs_type{$countWords} = 1;
  121.                 }
  122.                 elsif ( $rest =~ m/SUBS_TYPE="HypPart2"/ ) {
  123.                     $subs_type{$countWords} = 2;
  124.                 }
  125.                 else {
  126.                     $subs_type{$countWords} = 0;
  127.                 }
  128.                 my $item;
  129.                 if ( $rest =~ m/CONTENT="([^"]+)"/ ) {
  130.                     $item = $1;
  131.                     $delimiter{$countWords} = "double";
  132.                 }
  133.                 elsif ( $rest =~ m/CONTENT='([^']+)'/ ) {
  134.                     $item = $1;
  135.                     $delimiter{$countWords} = "single";
  136.                 }
  137.                 else {
  138.                     warn
  139.                       "Can't work out content attribute for $TextLine in $path"
  140.                       if ($debug);
  141.                 }
  142.  
  143.                 if ( $rest =~ m/WC="([^"]+)" CC="([^"]+)"/ ) {
  144.                     $WC{$countWords} = $1;
  145.                     $CC{$countWords} = $2;
  146.                 }
  147.                 else {
  148.                     warn
  149. "Can't work out WC and CC attributes for $TextLine in $path"
  150.                       if ($debug);
  151.                 }
  152.                 $words{$countWords} = $item;
  153.  
  154.                 if ( $item =~ s/\A(\P{Word}+)// ) {
  155.                     $dirtLeft{$countWords} = $1;
  156.                 }
  157.                 else {
  158.                     $dirtLeft{$countWords} = "";
  159.                 }
  160.                 if ( $item =~ s/(\P{Word}+)\Z// ) {
  161.                     $dirtRight{$countWords} = $1;
  162.                 }
  163.                 else {
  164.                     $dirtRight{$countWords} = "";
  165.                 }
  166.  
  167.                 $possessive{$countWords} = 1 if ( $item =~ m/\'s\Z/ );
  168.  
  169.                 $wordsClean{$countWords} = $item;
  170.  
  171.             }
  172.         }
  173.     }
  174.  
  175.     # Now we have the relevant data in the hashes...
  176.     # First check status depending on whether the word is in the dictionary
  177.  
  178.     my (%status);
  179.     for ( my $count = 1 ; $count <= $countWords ; $count++ ) {
  180.  
  181.         # check first for manual correction rules
  182.         # not yet implemented
  183.  
  184. # Status:
  185. # 0 = in dictionary as is, no need to correct
  186. # 1 = in dictionary, but only lowercase; no need to correct, but check case
  187. # 2 = in dictionary as is, if dirt removed; deal with dirt (?)
  188. # 3 = in dictionary as lowercase, if dirt removed; no need to correct, but check case, and deal with dirt
  189. # 4 = manual correction rule
  190. # 5 = not in dictionary, but too short
  191. # 6 = not in dictionary, but number or possible abbreviation
  192. # 7 = needs to be corrected
  193.  
  194.         if ( $freqDict{ $words{$count} } ) {
  195.             $status{$count} = 0;
  196.             $countInDic++;
  197.             $lowFreq{$count} = 1
  198.               if ( $freqDict{ $words{$count} } < freqThreshold() );
  199.         }
  200.         elsif ( $freqDictLC{ lc( $words{$count} ) } ) {
  201.             $status{$count} = 1;
  202.             $CountInDicLC++;
  203.             $lowFreqLC{$count} = 1
  204.               if ( $freqDictLC{ lc( $words{$count} ) } < freqThreshold() );
  205.         }
  206.         elsif ( $freqDict{ $wordsClean{$count} } ) {
  207.             $status{$count} = 2;
  208.             $countCleanInDic++;
  209.             $lowFreq{$count} = 1
  210.               if ( $freqDict{ $wordsClean{$count} } < freqThreshold() );
  211.         }
  212.         elsif ( $freqDictLC{ lc( $wordsClean{$count} ) } ) {
  213.             $status{$count} = 3;
  214.             $countCleanLCInDicLC++;
  215.             $lowFreqLC{$count} = 1
  216.               if ( $freqDictLC{ lc( $wordsClean{$count} ) } < freqThreshold() );
  217.         }
  218.         elsif ( length( $wordsClean{$count} ) < 3 ) {
  219.             $status{$count} = 5;
  220.             $tooShort++;
  221.         }
  222.         elsif (( $wordsClean{$count} =~ m/\A[\d\.\-\/\,]+\Z/ )
  223.             || ( $wordsClean{$count} =~ m/\A[a-z](\.[a-z])+\Z/i ) )
  224.         {
  225.             $status{$count} = 6;
  226.         }
  227.         else {
  228.             $notInDict++;
  229.             $status{$count} = 7;
  230.             push @toCorrect, $count;
  231.         }
  232.  
  233.     }
  234.  
  235. #   if ($debug) {
  236. #       my $percent = 100 * $countInDic / $countAll;
  237. #       print STDERR "In dictionary as is: $countInDic ($percent%)\n";
  238. #       my $percent = 100 * $CountInDicLC / $countAll;
  239. #       print STDERR "In LC dictionary as is: $CountInDicLC ($percent%)\n";
  240. #       my $percent = 100 * $countCleanInDic / $countAll;
  241. #       print STDERR "In dictionary cleaned: $countCleanInDic ($percent%)\n";
  242. #       my $percent = 100 * $countCleanLCInDicLC / $countAll;
  243. #       print STDERR "In LC dictionary cleaned: $countCleanLCInDicLC ($percent%)\n";
  244. #       my $percent = 100 * $tooShort / $countAll;
  245. #       print STDERR "Not in dictionary but too short to fix (<3): $tooShort ($percent%)\n";
  246. #       my $percent = 100 * $notInDict / $countAll;
  247. #       print STDERR "Not in dictionary: $notInDict ($percent%)\n\n";
  248. #
  249. #   }
  250.  
  251.     # now work through the words that are not in the dictionary
  252.  
  253.     # first check for inline/across-line merging
  254.     foreach my $count (@toCorrect) {
  255.         my $before = $count - 1;
  256.         my $after  = $count + 1;
  257.  
  258.         # left merging check
  259.         unless ( ( $firstWord{$count} )
  260.             || ( $subs_type{$count} == 2 )
  261.             || ( $wordsClean{$count} =~ m/\A\d+/ )
  262.             || ( $wordsClean{$before} =~ m/\d+\Z/ ) )
  263.         {
  264.             check_left( $before, $count, $after );
  265.         }
  266.  
  267.         # right merging check
  268.         unless ( ( $lastWord{$count} )
  269.             || ( $subs_type{$count} == 1 )
  270.             || ( $wordsClean{$count} =~ m/\d+\Z/ )
  271.             || ( $wordsClean{$after} =~ m/\A\d+/ ) )
  272.         {
  273.             check_right( $before, $count, $after );
  274.         }
  275.         if (   ( $lastWord{$count} )
  276.             && ( !$subs_type{$count} )
  277.             && $words{$count} =~ m/\-\Z/ )
  278.         {
  279.             merge_right( $before, $count, $after );
  280.         }
  281.         if (   ( $firstWord{$count} )
  282.             && ( !$subs_type{$count} )
  283.             && $words{$before} =~ m/\-\Z/
  284.             && ( !$acrosslineMerge{$before} ) )
  285.         {
  286.             merge_left( $before, $count, $after );
  287.         }
  288.     }
  289.  
  290. # Note: inline merging should also work with correction proposals. Maybe move down in the chain?
  291. # Note: There are quite a few hyphenated words across lines that have not been detected as one word - check!
  292.  
  293.     my $theT1 = time;
  294.  
  295. # get anagram-hash list of correction proposals for each word that requires fixing
  296. # problem: The list generates proposals that differ in one character – or possibly by two characters, because a bi-gram
  297. # is substituted for a single character. This does not therefore catch words where two different characters were mis-OCR-ed.
  298.     foreach my $count (@toCorrect) {
  299.  
  300.         # don't do anything if it is mergable or part of a hyphenated line-break
  301.         # or if its an abbreviation or number
  302.         next
  303.           if ( ( $inlineMerge{$count} )
  304.             || ( $inlineMergeLC{$count} )
  305.             || ( $acrosslineMerge{$count} )
  306.             || ( $acrosslineMergeLC{$count} )
  307.             || ( $subs_type{$count} )
  308.             || $status{$count} == 6 );
  309.         my $focused_word = lc( $wordsClean{$count} );
  310.         my $length       = length($focused_word);
  311.  
  312. # disregards short words because they produce more than 1000 correction proposals - can we optimise this?
  313.         next if ( $length < 4 );
  314.         my $ld_limit = 3;
  315.         my $focused_char_hash;
  316.         my (%sameHashValues);
  317.         warn $focused_word;
  318.         unless ( $anagramProposals{$focused_word} ) {
  319.  
  320.             #warn $focused_word;
  321.             my $focused_word_hash = anagram_hash($focused_word);
  322.  
  323.             for ( my $l = -1 ; $l < $length ; $l++ ) {
  324.                 for (
  325.                     my $l2 = 1 ;
  326.                     ( $l2 < 3 ) && ( $l + $l2 <= $length ) ;
  327.                     $l2++
  328.                   )
  329.                 {
  330.                     if ( $l < 0 ) {
  331.                         $focused_char_hash = 0;
  332.                         $l2                = $length;
  333.                     }
  334.                     else {
  335.                         $focused_char_hash =
  336.                           anagram_hash( substr( $focused_word, $l, $l + $l2 ) );
  337.                     }
  338.  
  339.                     foreach my $simulated_char_hash ( keys %confusion_list ) {
  340.                         my $simulated_word_hash =
  341.                           $focused_word_hash + $simulated_char_hash -
  342.                           $focused_char_hash;
  343.  
  344.                         # this speeds up things by about 25%
  345.                         next unless $anagramHashBasic{$simulated_char_hash};
  346.                         foreach my $theWord (
  347.                             keys %{ $anagramHashDict{$simulated_word_hash} } )
  348.                         {
  349.                             $sameHashValues{$theWord}++;
  350.                         }
  351.                     }
  352.                 }
  353.  
  354.             }
  355.  
  356.             #warn "before: " . time;
  357.             foreach my $similarWord ( keys %sameHashValues ) {
  358.                 my $distance = get_distance( $focused_word, $similarWord );
  359.                 if ( $distance <= $ld_limit ) {
  360.                     $anagramProposals{$focused_word}{$similarWord} = $distance;
  361.                 }
  362.             }
  363.  
  364.             #$anagramProposals{$focused_word} = 1;
  365.             if ( $focused_word eq "reputabje" ) {
  366.                 print STDERR
  367. "word: $focused_word - focused_word_hash: $focused_word_hash:\n";
  368.                 foreach my $bla ( keys %{ $anagramProposals{$focused_word} } ) {
  369.                     print STDERR
  370.                       "$bla: $anagramProposals{$focused_word}{$bla}\n";
  371.                 }
  372.             }
  373.         }
  374.         else {
  375.  
  376.             #warn "#### saved time $focused_word";
  377.         }
  378.     }
  379.     my $theT2   = time;
  380.     my $theDiff = $theT2 - $theT1;
  381.     print STDERR "time taken: $theDiff\n";
  382.  
  383. }
  384.  
  385. my $time2      = time;
  386. my $difference = $time2 - $time1;
  387. my $average    = $difference / $countFiles;
  388.  
  389. print STDERR
  390. "Number of files processed: $countFiles\nNumber of words: $countAll\nTime taken: $difference\nAverage time per file: $average\n"
  391.   if ($debug);
  392.  
  393. sub loadDictionaries {
  394.     my $myPath           = shift;
  395.     my $myDictionaryName = shift;
  396.  
  397.     my $openPath = dictionaryPath($myDictionaryName);
  398.     open( H, "<:utf8", "$openPath" ) || die "can't open file $openPath";
  399.     while (<H>) {
  400.         my $line = $_;
  401.         chomp($line);
  402.         ( my $freqWord, my $theFreq ) = split( /\t/, $line );
  403.         my $freqWordLC = lc($freqWord);
  404.         $freqDict{$freqWord}     += $theFreq;
  405.         $freqDictLC{$freqWordLC} += $theFreq;
  406.     }
  407.     close(H);
  408. }
  409.  
  410. sub loadAnagramHash {
  411.     my $myPath           = shift;
  412.     my $myDictionaryName = shift;
  413.  
  414.     my $openPath = dictionaryPath($myDictionaryName);
  415.     open( H, "<:utf8", "$openPath" ) || die "can't open file $openPath";
  416.     while (<H>) {
  417.         my $line = $_;
  418.         chomp($line);
  419.         ( my $hash, my $freqWord, my $theFreq ) = split( /\t/, $line );
  420.         $anagramHashDict{$hash}{$freqWord} = $theFreq;
  421.         $anagramHashBasic{$hash}++;
  422.     }
  423.     close(H);
  424. }
  425.  
  426. sub loadConfusionAlphabet {
  427.     my $myPath           = shift;
  428.     my $myDictionaryName = shift;
  429.  
  430.     my $openPath = dictionaryPath($myDictionaryName);
  431.     open( H, "<:utf8", "$openPath" ) || die "can't open file $openPath";
  432.     while (<H>) {
  433.         my $line = $_;
  434.         chomp($line);
  435.         $confusion_list{$line} = 1;
  436.     }
  437.     close(H);
  438. }
  439.  
  440. sub check_left {
  441.     my ( $before, $count, $after ) = @_;
  442.     my $mergedWord = $wordsClean{$before} . $wordsClean{$count};
  443.     if ( $freqDict{$mergedWord} ) {
  444.         $inlineMerge{$count} = "L";
  445.     }
  446.     elsif ( $freqDictLC{ lc($mergedWord) } ) {
  447.         $inlineMergeLC{$count} = "L";
  448.     }
  449. }
  450.  
  451. sub check_right {
  452.     my ( $before, $count, $after ) = @_;
  453.  
  454.     # ignore if word-merging involves digits at word boundaries
  455.     my $mergedWord = $wordsClean{$count} . $wordsClean{$after};
  456.  
  457.     if ( $freqDict{$mergedWord} ) {
  458.         if ( $inlineMerge{$count} eq "L" ) {
  459.             $inlineMerge{$count} = "B";
  460.         }
  461.         else {
  462.             $inlineMerge{$count} = "R";
  463.         }
  464.     }
  465.     elsif ( $freqDictLC{ lc($mergedWord) } ) {
  466.         if ( $inlineMergeLC{$count} eq "L" ) {
  467.             $inlineMergeLC{$count} = "B";
  468.         }
  469.         else {
  470.             $inlineMergeLC{$count} = "R";
  471.         }
  472.     }
  473. }
  474.  
  475. sub merge_left {
  476.     my ( $before, $count, $after ) = @_;
  477.     my $mergedWord = $wordsClean{$before} . $wordsClean{$count};
  478.     if ( $freqDict{$mergedWord} ) {
  479.         $acrosslineMerge{$count}  = "L";
  480.         $acrosslineMerge{$before} = "R";
  481.         warn "merged hyphenated word: $wordsClean{$before} $wordsClean{$count}";
  482.     }
  483.     elsif ( $freqDictLC{ lc($mergedWord) } ) {
  484.         $acrosslineMergeLC{$count}  = "L";
  485.         $acrosslineMergeLC{$before} = "R";
  486.         warn "merged hyphenated word: $wordsClean{$before} $wordsClean{$count}";
  487.     }
  488. }
  489.  
  490. sub merge_right {
  491.     my ( $before, $count, $after ) = @_;
  492.     my $mergedWord = $wordsClean{$count} . $wordsClean{$after};
  493.     if ( $freqDict{$mergedWord} ) {
  494.         $acrosslineMerge{$count} = "R";
  495.         $acrosslineMerge{$after} = "L";
  496.         warn "merged hyphenated word: $wordsClean{$count} $wordsClean{$after}";
  497.     }
  498.     elsif ( $freqDictLC{ lc($mergedWord) } ) {
  499.         $acrosslineMergeLC{$count} = "R";
  500.         $acrosslineMergeLC{$after} = "L";
  501.         warn "merged hyphenated word: $wordsClean{$count} $wordsClean{$after}";
  502.     }
  503. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement