Advertisement
Guest User

Untitled

a guest
Dec 9th, 2018
88
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.78 KB | None | 0 0
  1. # Skip XML tags.
  2. if re.search(r"(<S[^>]*>)", token):
  3. continue
  4. # Skip if sentence start symbols.
  5. elif token in self.DELAYED_SENT_START:
  6. continue
  7.  
  8. # Resets the `is_first_word` after seeing sent end symbols.
  9. if not is_first_word and token in self.SENT_END:
  10. is_first_word = True
  11. continue
  12.  
  13. # Skips words with nothing to case.
  14. if not re.search(r"[{}]".format(ll_lu_lt), token):
  15. is_first_word = False
  16. continue
  17.  
  18. current_word_weight = 0
  19. if not is_first_word:
  20. current_word_weight = 1
  21. elif possibly_use_first_token:
  22. # Gated special handling of first word of sentence.
  23. # Check if first character of token is lowercase.
  24. if token[0].is_lower():
  25. current_word_weight = 1
  26. elif i == 1:
  27. current_word_weight = 0.1
  28.  
  29. if current_word_weight > 0:
  30. casing[token.lower()][token] += current_word_weight
  31.  
  32. is_first_word = False
  33.  
  34. import re
  35.  
  36. from collections import defaultdict, Counter
  37. from six import text_type
  38.  
  39. from sacremoses.corpus import Perluniprops
  40. from sacremoses.corpus import NonbreakingPrefixes
  41.  
  42. perluniprops = Perluniprops()
  43.  
  44.  
  45. class MosesTruecaser(object):
  46. """
  47. This is a Python port of the Moses Truecaser from
  48. https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/train-truecaser.perl
  49. https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/truecase.perl
  50. """
  51. # Perl Unicode Properties character sets.
  52. Lowercase_Letter = text_type(''.join(perluniprops.chars('Lowercase_Letter')))
  53. Uppercase_Letter = text_type(''.join(perluniprops.chars('Uppercase_Letter')))
  54. Titlecase_Letter = text_type(''.join(perluniprops.chars('Uppercase_Letter')))
  55.  
  56. def __init__(self):
  57. # Initialize the object.
  58. super(MosesTruecaser, self).__init__()
  59. # Initialize the language specific nonbreaking prefixes.
  60. self.SKIP_LETTERS_REGEX = r"[{}{}{}]".format(Lowercase_Letter,
  61. Uppercase_Letter, Titlecase_Letter)
  62.  
  63. self.SENT_END = [".", ":", "?", "!"]
  64. self.DELAYED_SENT_START = ["(", "[", """, "'", "&apos;", """, "&#91;", "&#93;"]
  65.  
  66. def train(self, filename, possibly_use_first_token=False):
  67. casing = defaultdict(Counter)
  68. with open(filename) as fin:
  69. for line in fin:
  70. # Keep track of first words in the sentence(s) of the line.
  71. is_first_word = True
  72. for i, token in enumerate(line.split()):
  73. # Skip XML tags.
  74. if re.search(r"(<S[^>]*>)", token):
  75. continue
  76. # Skip if sentence start symbols.
  77. elif token in self.DELAYED_SENT_START:
  78. continue
  79.  
  80. # Resets the `is_first_word` after seeing sent end symbols.
  81. if not is_first_word and token in self.SENT_END:
  82. is_first_word = True
  83. continue
  84.  
  85. # Skips words with nothing to case.
  86. if not re.search(r"[{}]".format(ll_lu_lt), token):
  87. is_first_word = False
  88. continue
  89.  
  90. current_word_weight = 0
  91. if not is_first_word:
  92. current_word_weight = 1
  93. elif possibly_use_first_token:
  94. # Gated special handling of first word of sentence.
  95. # Check if first characer of token is lowercase.
  96. if token[0].is_lower():
  97. current_word_weight = 1
  98. elif i == 1:
  99. current_word_weight = 0.1
  100.  
  101. if current_word_weight > 0:
  102. casing[token.lower()][token] += current_word_weight
  103.  
  104. is_first_word = False
  105. return casing
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement