Advertisement
Guest User

Untitled

a guest
Jun 17th, 2019
94
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.99 KB | None | 0 0
  1. #!/usr/bin/python3
  2. import nltk
  3. from nltk.wsd import lesk
  4. from nltk.corpus import wordnet as wn
  5. import wikipedia
  6. from nltk.parse import CoreNLPParser
  7. import itertools
  8. from collections import OrderedDict
  9. import os
  10.  
  11. def wikilinker(query, wordlist):
  12. try:
  13. return wikipedia.page(query).url
  14. except wikipedia.exceptions.DisambiguationError as e:
  15. ll = []
  16. sumlist = []
  17. for option in e.options:
  18. dis = 1/nltk.edit_distance(query, option)
  19. ll.append(dis)
  20. try:
  21. a = nltk.word_tokenize(wikipedia.page(option).summary)
  22. match = set(a).intersection(wordlist)
  23. sumlist.append(len(match))
  24. except:
  25.  
  26. sumlist.append(0)
  27. multilist = []
  28. for (ws,match) in zip(ll, sumlist):
  29. multilist.append(ws*match)
  30. maxval = multilist.index(max(multilist))
  31. return wikipedia.page(e.options[maxval]).url
  32. except wikipedia.exceptions.WikipediaException as r:
  33. pass
  34.  
  35. def sfNERTagger(rawText, POSFile):
  36. '''(sf = stanford) get the raw text from a file and convert that to a list with tuples of each word with a StanFord annotated NER-tag'''
  37. parser = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
  38. tupleList = list(parser.tag(rawText.split()))
  39. #convert list of tuple to list of lists, so we can change tags we dont need
  40. NERList = [list(tuple) for tuple in tupleList]
  41.  
  42. #change tags we dont need
  43. for item in NERList:
  44. if item[1] == 'COUNTRY': item[1] = 'COU'
  45. elif item[1] == 'PERSON': item[1] = 'PER'
  46. elif item[1] == 'CITY': item[1] = 'CIT'
  47. elif item[1] == 'ORGANIZATION': item[1] = 'ORG'
  48. else: item[1] = ''
  49. #add the pos tag as a third item to every list within the NERList
  50. with open(POSFile) as f:
  51. POSLines = f.readlines()
  52.  
  53. #remove the '\n'
  54. for i in POSLines:
  55. i = i.strip('\n')
  56. #add postag to every appriopriate item
  57. for lineNumber,item in enumerate(NERList):
  58. item.append(POSLines[lineNumber].split()[4])
  59.  
  60. return NERList
  61.  
  62. def ownTagger (NERList):
  63. #take nerlist, when i[1] =='': see if this tagger can tag it, other wise leave empty. output ex: [[word, tag][word, tag]]
  64.  
  65.  
  66. synlist = []
  67. ll = []
  68. onesynlist = []
  69. simlist = []
  70. current_chunk = []
  71.  
  72. #make list out of NERList that only contains the words that are not tagged by NERTagger
  73.  
  74. t = []
  75.  
  76. for p in NERList:
  77. if p[1] == '' and p[2].startswith('N'):
  78. onedeflist = []
  79. deflist = []
  80. klist = []
  81. for syns in wn.synsets(p[0]):
  82. deflist.append(syns.definition())
  83. onedeflist.append(syns)
  84. klist.append(syns)
  85. if not deflist:
  86. ll.append([])
  87. elif len(deflist) == 1:
  88. onesynlist.append(onedeflist[0])
  89. ll.append(klist)
  90. else:
  91. synlist.append(deflist)
  92. ll.append(klist)
  93.  
  94. newlist = []
  95. for item in ll:
  96. itemb = []
  97. for i in item:
  98. synb = []
  99. for a in onesynlist:
  100. ps = i.path_similarity(a)
  101. synb.append(ps)
  102. itemb.append(synb)
  103. newlist.append(itemb)
  104.  
  105.  
  106. newlist = [[[0 if x is None else x for x in i] for i in item] for item in newlist]
  107. newlist = [[sum(i) for i in item] for item in newlist]
  108.  
  109. maxlist = []
  110. newsynlist = []
  111.  
  112. for item in newlist:
  113. if not item:
  114. maxlist.append([])
  115. else:
  116. maxlist.append(item.index(max(item)))
  117.  
  118.  
  119. for x,value in zip(ll, maxlist):
  120. if value == []:
  121. newsynlist.append([])
  122. else:
  123. a = x[value].definition()
  124. newsynlist.append(a)
  125. synlist = newsynlist
  126.  
  127. print(synlist)
  128.  
  129. #lists of all words that could appear in the definitions of the unigrams and bigrams
  130. city = [' city ', ' village ', ' town ', 'capital']
  131. country = [' nation ', ' republic ', ' monarchy ', ' province ', ' island ' , ' archipelago ']
  132. sport = [' sport ', 'combat', ' game ']
  133. natural_places = [' desert ', ' volcano ', ' sea ', ' ocean ', ' lake ', ' river ', ' jungle ', ' waterfall ', ' glacier ', ' mountain ', ' forest ' , ' crater ', ' cave ', ' canyon ', ' fjord ', ' park ', ' bay ', ' valley ', ' cliff ', ' reef ']
  134. entertainment = [' book ', 'magazine', 'film', 'movie', 'song', 'journal', 'newspaper']
  135. animal = ['mammal', 'bird', 'fish', 'amphibian', 'reptil', 'crustacean', 'insect', 'carnivore', 'herbivore', 'species', 'breed', 'cattle', 'quadruped', 'pachyderm', 'feline', 'ungulate']
  136. person = ['born']
  137. organization = ['organization']
  138.  
  139.  
  140.  
  141. #if one of the words appears in the definition of the uni- or bigram, append a tuple to a list with the word and the NER tag
  142. for deflist in synlist:
  143. if any(x in deflist for x in city):
  144. NERList[synlist.index(deflist)][1] = 'CIT'
  145. elif any(x in deflist for x in country):
  146. NERList[synlist.index(deflist)][1] = 'COU'
  147. elif any(x in deflist for x in sport):
  148. NERList[synlist.index(deflist)][1] = 'SPO'
  149. elif any(x in deflist for x in natural_places):
  150. NERList[synlist.index(deflist)][1] = 'NAT'
  151. elif any(x in deflist for x in entertainment):
  152. NERList[synlist.index(deflist)][1] = 'ENT'
  153. elif any(x in deflist for x in animal):
  154. NERList[synlist.index(deflist)][1] = 'ANI'
  155.  
  156. return NERList
  157.  
  158. def sfNERWriter(POSFile, NERList, rawlist):
  159. '''Takes output of sfNERTagger() -->NERList, iters over the POSFile, if NERList[index][1] is meaningful: add the appropriate tag. create ENTFile and write every line'''
  160. with open(POSFile, "r") as f1:
  161. POSLines = f1.readlines()
  162. with open(str(POSFile + ".test"),"w") as f2:
  163. for lineNumber, line in enumerate(POSLines):
  164. line = line.strip('\n')
  165. if line.split()[3] == NERList[lineNumber][0]:
  166. f2.write(str(line + " " + NERList[lineNumber][1] + '\n'))
  167. else:
  168. f2.write("error")
  169. if len(line.split()) > 5:
  170. f2.write(wikilinker(NERList[lineNumber][0], rawlist))
  171. # with open("posfile", "r") as posfile:
  172. #lines = posfile.readlines()
  173. #with open("posfile", "w") as posfile:
  174. #for line in lines:
  175. #sources.write(blablabla)
  176.  
  177.  
  178. def getContinuousChunks(NERList):
  179. '''takes NERList and lists together words that need chunking'''
  180. continuous_chunk = []
  181. current_chunk = []
  182.  
  183. for token, tag, postag in NERList:
  184. if tag != "":
  185. current_chunk.append((token))
  186. else:
  187. if current_chunk: # if the current chunk is not empty
  188. continuous_chunk.append(current_chunk)
  189. current_chunk = []
  190. # Flush the final current_chunk into the continuous_chunk, if any.
  191. if current_chunk:
  192. continuous_chunk.append(current_chunk)
  193. return continuous_chunk
  194.  
  195.  
  196.  
  197. def main():
  198. POSFile = 'data/p51/d0069/en.tok.off.pos'
  199.  
  200. with open('data/p51/d0069/en.raw') as f1:
  201. rawText = f1.read()
  202. rawlist = rawText.split()
  203. NERList = sfNERTagger(rawText, POSFile)
  204. # x output example: [('out', 'O'),('two','Date")] etc
  205.  
  206. #with open('data/p51/d0060/en.tok.off.pos') as f2:
  207. #for lineNumber, line in enumerate(f2):
  208. #if line.split()[3] == x[lineNumber][0]:
  209. #print('True', NERRaw[lineNumber], f2.name + ".ent")
  210.  
  211.  
  212. #print(NERRaw)
  213.  
  214. #x = getContinuousChunks(NERList)
  215. y = ownTagger(NERList)
  216. sfNERWriter(POSFile, NERList, rawlist)
  217.  
  218. if __name__ == "__main__":
  219. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement