Advertisement
Guest User

Untitled

a guest
May 25th, 2016
74
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.25 KB | None | 0 0
  1. from collections import Counter
  2. import re
  3. import glob
  4.  
  5. class ConllEntry:
  6. def __init__(self, id, form, pos, cpos, parent_id=None, relation=None):
  7. self.id = id
  8. self.form = form
  9. self.norm = normalize(form)
  10. self.cpos = cpos.upper()
  11. self.pos = pos.upper()
  12. self.parent_id = parent_id
  13. self.relation = relation
  14. # Added so that write_conll accepts these.
  15. self.pred_parent_id = parent_id
  16. self.pred_relation = relation
  17.  
  18. class ParseForest:
  19. def __init__(self, sentence):
  20. self.roots = list(sentence)
  21.  
  22. for root in self.roots:
  23. root.children = []
  24. root.scores = None
  25. root.parent = None
  26. root.pred_parent_id = 0 # None
  27. root.pred_relation = 'rroot' # None
  28. root.vecs = None
  29. root.lstms = None
  30.  
  31. def __len__(self):
  32. return len(self.roots)
  33.  
  34.  
  35. def Attach(self, parent_index, child_index):
  36. parent = self.roots[parent_index]
  37. child = self.roots[child_index]
  38.  
  39. child.pred_parent_id = parent.id
  40. del self.roots[child_index]
  41.  
  42.  
  43. def isProj(sentence):
  44. forest = ParseForest(sentence)
  45. unassigned = {entry.id: sum([1 for pentry in sentence if pentry.parent_id == entry.id]) for entry in sentence}
  46.  
  47. for _ in xrange(len(sentence)):
  48. for i in xrange(len(forest.roots) - 1):
  49. if forest.roots[i].parent_id == forest.roots[i+1].id and unassigned[forest.roots[i].id] == 0:
  50. unassigned[forest.roots[i+1].id]-=1
  51. forest.Attach(i+1, i)
  52. break
  53. if forest.roots[i+1].parent_id == forest.roots[i].id and unassigned[forest.roots[i+1].id] == 0:
  54. unassigned[forest.roots[i].id]-=1
  55. forest.Attach(i, i+1)
  56. break
  57.  
  58. return len(forest.roots) == 1
  59.  
  60. def vocab(conll_path):
  61. wordsCount = Counter()
  62. posCount = Counter()
  63. relCount = Counter()
  64.  
  65. with open(conll_path, 'r') as conllFP:
  66. for sentence in read_conll(conllFP, True):
  67. wordsCount.update([node.norm for node in sentence])
  68. posCount.update([node.pos for node in sentence])
  69. relCount.update([node.relation for node in sentence])
  70.  
  71. return (wordsCount, {w: i for i, w in enumerate(wordsCount.keys())}, posCount.keys(), relCount.keys())
  72.  
  73. def read_conll(fh, proj):
  74. root = ConllEntry(0, '*root*', 'ROOT-POS', 'ROOT-CPOS', 0, 'rroot')
  75. tokens = [root]
  76. for line in fh:
  77. tok = line.strip().split()
  78. if not tok:
  79. if len(tokens)>1:
  80. if not proj or isProj(tokens):
  81. yield tokens
  82. else:
  83. print 'Non-projective sentence dropped'
  84. tokens = [root]
  85. id = 0
  86. else:
  87. tokens.append(ConllEntry(int(tok[0]), tok[1], tok[3], tok[4], int(tok[6]) if tok[6] != '_' else -1, tok[7]))
  88. if len(tokens) > 1:
  89. yield tokens
  90.  
  91.  
  92. def write_conll(fn, conll_gen):
  93. with open(fn, 'w') as fh:
  94. for sentence in conll_gen:
  95. for entry in sentence[1:]:
  96. fh.write('\t'.join([str(entry.id), entry.form, '_', entry.pos, entry.cpos, '_', str(entry.pred_parent_id), entry.pred_relation, '_', '_']))
  97. fh.write('\n')
  98. fh.write('\n')
  99.  
  100. # ---------------------------
  101. # ADDED
  102.  
  103. # CONLL-U format:
  104.  
  105. # 0 ID: Word index, integer starting at 1 for each new sentence; may be a range for tokens with multiple words.
  106. # 1 FORM: Word form or punctuation symbol.
  107. # 2 LEMMA: Lemma or stem of word form.
  108. # 3 UPOSTAG: Universal part-of-speech tag drawn from our revised version of the Google universal POS tags.
  109. # 4 XPOSTAG: Language-specific part-of-speech tag; underscore if not available.
  110. # 5 FEATS: List of morphological features from the universal feature inventory or from a defined language-specific extension; underscore if not available.
  111. # 6 HEAD: Head of the current token, which is either a value of ID or zero (0).
  112. # 7 DEPREL: Universal Stanford dependency relation to the HEAD (root iff HEAD = 0) or a defined language-specific subtype of one.
  113. # 8 DEPS: List of secondary dependencies (head-deprel pairs).
  114. # 9 MISC: Any other annotation.
  115.  
  116. def read_conll_u(filename):
  117. with open(filename) as f:
  118. root = ConllEntry(0, '*root*', 'ROOT-POS', 'ROOT-CPOS', 0, 'rroot')
  119. tokens = [root]
  120. for line in f:
  121. if line.startswith('#'):
  122. # This is just metadata.
  123. continue
  124. elif line == '\n':
  125. yield tokens
  126. tokens = [root]
  127. else:
  128. tok = line.strip().split()
  129. entry = ConllEntry(id = int(tok[0]),
  130. form = tok[1],
  131. pos = tok[3],
  132. cpos = '_',
  133. parent_id = int(tok[6]) if tok[6] != '_' else -1,
  134. relation = tok[7])
  135. tokens.append(entry)
  136.  
  137. def convert_conllu_to_conll(filename):
  138. if filename.endswith('conllu'):
  139. write_conll(fn = filename[:-1],
  140. conll_gen = read_conll_u(filename))
  141. else:
  142. raise ValueErrorError('This does not seem to be a .conllu file!')
  143.  
  144. def convert_all(dirname):
  145. files = glob.glob(dirname + '/*.conllu')
  146. print "Converting %d files" % len(files)
  147. for filename in files:
  148. convert_conllu_to_conll(filename)
  149.  
  150. # ----------------------------------------
  151.  
  152. numberRegex = re.compile("[0-9]+|[0-9]+\\.[0-9]+|[0-9]+[0-9,]+");
  153. def normalize(word):
  154. return 'NUM' if numberRegex.match(word) else word.lower()
  155.  
  156. cposTable = {"PRP$": "PRON", "VBG": "VERB", "VBD": "VERB", "VBN": "VERB", ",": ".", "''": ".", "VBP": "VERB", "WDT": "DET", "JJ": "ADJ", "WP": "PRON", "VBZ": "VERB",
  157. "DT": "DET", "#": ".", "RP": "PRT", "$": ".", "NN": "NOUN", ")": ".", "(": ".", "FW": "X", "POS": "PRT", ".": ".", "TO": "PRT", "PRP": "PRON", "RB": "ADV",
  158. ":": ".", "NNS": "NOUN", "NNP": "NOUN", "``": ".", "WRB": "ADV", "CC": "CONJ", "LS": "X", "PDT": "DET", "RBS": "ADV", "RBR": "ADV", "CD": "NUM", "EX": "DET",
  159. "IN": "ADP", "WP$": "PRON", "MD": "VERB", "NNPS": "NOUN", "JJS": "ADJ", "JJR": "ADJ", "SYM": "X", "VB": "VERB", "UH": "X", "ROOT-POS": "ROOT-CPOS",
  160. "-LRB-": ".", "-RRB-": "."}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement