SHARE
TWEET

Untitled

a guest Dec 9th, 2018 62 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. # Skip XML tags.
  2. if re.search(r"(<S[^>]*>)", token):
  3.      continue
  4. # Skip if sentence start symbols.
  5. elif token in self.DELAYED_SENT_START:
  6.     continue
  7.    
  8. # Resets the `is_first_word` after seeing sent end symbols.
  9. if not is_first_word and token in self.SENT_END:
  10.     is_first_word = True
  11.     continue
  12.  
  13. # Skips words with nothing to case.
  14. if not re.search(r"[{}]".format(ll_lu_lt), token):
  15.     is_first_word = False
  16.     continue
  17.    
  18. current_word_weight = 0
  19. if not is_first_word:
  20.     current_word_weight = 1
  21. elif possibly_use_first_token:
  22.     # Gated special handling of first word of sentence.
  23.     # Check if first characer of token is lowercase.
  24.     if token[0].is_lower():
  25.         current_word_weight = 1
  26.     elif i == 1:
  27.         current_word_weight = 0.1
  28.  
  29. if current_word_weight > 0:
  30.     casing[token.lower()][token] += current_word_weight
  31.  
  32. is_first_word = False
  33.    
  34. import re
  35.  
  36. from collections import defaultdict, Counter
  37. from six import text_type
  38.  
  39. from sacremoses.corpus import Perluniprops
  40. from sacremoses.corpus import NonbreakingPrefixes
  41.  
  42. perluniprops = Perluniprops()
  43.  
  44.  
  class MosesTruecaser(object):
      """
      This is a Python port of the Moses Truecaser from
      https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/train-truecaser.perl
      https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/truecase.perl

      The class collects, from a tokenized text file, how often each word
      appears in each of its surface casings (see `train`).
      """
      # Perl Unicode Properties character sets.
      Lowercase_Letter = text_type(''.join(perluniprops.chars('Lowercase_Letter')))
      Uppercase_Letter = text_type(''.join(perluniprops.chars('Uppercase_Letter')))
      # NOTE(review): this loads 'Uppercase_Letter' a second time, so the
      # attribute currently duplicates `Uppercase_Letter`. It presumably should
      # load 'Titlecase_Letter' -- confirm that data set exists before changing.
      Titlecase_Letter = text_type(''.join(perluniprops.chars('Uppercase_Letter')))

      def __init__(self):
          """Set up the letter pattern and the sentence-boundary symbol lists."""
          super(MosesTruecaser, self).__init__()
          # Character class matching any single letter from the three sets
          # above. The sets are class attributes, so they have to be reached
          # through `self`; bare names are not visible inside a method.
          self.SKIP_LETTERS_REGEX = u"[{}{}{}]".format(
              self.Lowercase_Letter,
              self.Uppercase_Letter,
              self.Titlecase_Letter,
          )

          # Tokens that end a sentence: the next word is sentence-initial.
          self.SENT_END = [".", ":", "?", "!"]
          # Tokens that may precede the first word of a sentence without
          # taking over its "first word" status.
          self.DELAYED_SENT_START = [
              "(", "[", "\"", "'", "&apos;", "&quot;", "&#91;", "&#93;",
          ]

      def train(self, filename, possibly_use_first_token=False):
          """
          Collect casing statistics from a whitespace-tokenized text file.

          :param filename: Path of the text file to read; each line is split
              on whitespace into tokens.
          :param possibly_use_first_token: When False, the first word of a
              sentence is never counted. When True, it is counted with full
              weight if its first character is lowercase, and with weight 0.1
              in the case flagged below.
          :return: A ``defaultdict(Counter)`` mapping each lowercased token to
              a Counter of its observed surface forms and their summed weights.
          """
          casing = defaultdict(Counter)
          # Both patterns are loop-invariant, so compile them once.
          xml_tag = re.compile(r"(<\S[^>]*>)")
          has_letter = re.compile(self.SKIP_LETTERS_REGEX)
          with open(filename) as fin:
              for line in fin:
                  # Keep track of first words in the sentence(s) of the line.
                  is_first_word = True
                  for i, token in enumerate(line.split()):
                      # Skip XML tags.
                      if xml_tag.search(token):
                          continue
                      # Skip sentence start symbols; the next token is still
                      # treated as the first word.
                      if token in self.DELAYED_SENT_START:
                          continue

                      # Resets the `is_first_word` after seeing sent end symbols.
                      if not is_first_word and token in self.SENT_END:
                          is_first_word = True
                          continue

                      # Skips words with nothing to case (no letter at all).
                      if not has_letter.search(token):
                          is_first_word = False
                          continue

                      current_word_weight = 0
                      if not is_first_word:
                          current_word_weight = 1
                      elif possibly_use_first_token:
                          # Gated special handling of first word of sentence.
                          # Check if first character of token is lowercase.
                          if token[0].islower():
                              current_word_weight = 1
                          # NOTE(review): this tests the token's index, not the
                          # number of tokens on the line -- confirm which one
                          # is intended.
                          elif i == 1:
                              current_word_weight = 0.1

                      if current_word_weight > 0:
                          casing[token.lower()][token] += current_word_weight

                      is_first_word = False
          return casing
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top