Advertisement
Guest User

Untitled

a guest
Jan 21st, 2020
80
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.60 KB | None | 0 0
  1. from SPARQLWrapper import SPARQLWrapper, JSON
  2. from pynif import NIFCollection
  3. from itertools import tee, islice, chain
  4. import rdflib
  5. import re
  6. import nltk
  7. import numpy as np
  8. from nltk.corpus import stopwords
  9. nltk.download('stopwords')
  10.  
  11.  
  12. class WordAndClass():
  13. "Stores words and theirs classes"
  14. def __init__(self, word, wordsClass, whereFound):
  15. self.word = word
  16. self.wordsClass = wordsClass
  17. self.whereFound = whereFound
  18.  
  19. def printText(self):
  20. print("Word: " + self.word + " matches to " + self.wordsClass + " class")
  21. print("From Text: " + self.whereFound)
  22.  
  23.  
  24. def openAndPurifyFile(file):
  25. with open(file) as openfile:
  26. for line in openfile:
  27. if "isString" in line:
  28. head, sep, tail = line.partition('@')
  29. searchedText = head.split()
  30. searchedText.pop(0)
  31. finalText = ' '.join(searchedText)
  32. finalText = re.sub('[^A-Za-z0-9]+', ' ', finalText)
  33. return finalText
  34.  
  35. def removeStopwords(tokens):
  36. stop_words = set(stopwords.words('english'))
  37. return list(filter(lambda x: x not in stop_words, tokens))
  38.  
  39. def tokenize(sentence):
  40. wordsList=[]
  41. for word in sentence.split():
  42. cleanString = re.sub('\W+','', word)
  43. wordsList.append(cleanString)
  44. return wordsList
  45.  
  46. def capitalizeList(wordsList):
  47. newList = []
  48. for word in wordsList:
  49. newList.append(word.title())
  50. return newList
  51.  
  52. def previous_and_next(some_iterable):
  53. prevs, items, nexts = tee(some_iterable, 3)
  54. prevs = chain([None], prevs)
  55. nexts = chain(islice(nexts, 1, None), [None])
  56. return zip(prevs, items, nexts)
  57.  
  58. def getSentencesFromWords(tokens):
  59. serie = ''
  60. series = []
  61. counter = 0
  62. for previous, token, nxt in previous_and_next(tokens):
  63. if token[0].isupper():
  64. if serie == '':
  65. serie = serie + token
  66. series += [serie]
  67. else:
  68. serie = serie + '_' + token
  69. counter = counter + 1
  70. if counter == 2:
  71. series += [serie]
  72. if (nxt[0].isupper() and (serie.count('_')) == 1):
  73. serie = serie + '_' + nxt
  74. series += [serie]
  75. counter = 0
  76. serie = ''
  77. else:
  78. if serie != '':
  79. series += [serie]
  80. series += [token]
  81. serie = ''
  82. counter = 0
  83.  
  84. series = list(set(series))
  85. return series
  86.  
def sendDBPediaQuery(wordsList, wordClasses):
    """Ask DBpedia for the rdf:type of each word; keep named-entity matches.

    A word is kept when one of its dbpedia-ontology types is directly in
    *wordClasses*, or when climbing rdfs:subClassOf (isOntologyOfSubclass)
    reaches one of them. Returns the list of matching words.

    NOTE(review): a word can be appended once per matching type binding —
    confirm whether a `break` after the first match was intended.
    NOTE(review): the resource IRI is built by raw string concatenation;
    words with special characters would corrupt the SPARQL query.
    """
    output=[]
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    for word in wordsList:
        sparql.setQuery("""
        SELECT ?label
        WHERE { <http://dbpedia.org/resource/"""+word+"""> rdf:type ?label }
        """)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()

        for result in results["results"]["bindings"]:
            # Only dbpedia-ontology types are of interest; skip owl/foaf/etc.
            if 'http://dbpedia.org/ontology/' in result['label']['value']:
                # Strip the namespace prefix to get the bare class name.
                finalText = re.sub('http://dbpedia.org/ontology/', '', result['label']['value'])
                if isWordInSearchedClasses(finalText, wordClasses):
                    print('Word: ' + word + ', class: ' + result['label']['value'])
                    output.append(word)
                else:
                    # No direct hit: walk up the subclass hierarchy instead.
                    foundWordClass = isOntologyOfSubclass(word)
                    if foundWordClass in wordClasses:
                        print('Word: ' + word + ', class: ' + 'http://dbpedia.org/ontology/' + foundWordClass)
                        output.append(word)
    return output
  112.  
  113. def isWordInSearchedClasses(word, wordClasses):
  114. if word in wordClasses:
  115. return True
  116. return False
  117.  
  118. def findIndexesOfFoundWordInOriginalText(word, file):
  119. with open(file) as openfile:
  120. for line in openfile:
  121. if "isString" in line:
  122. head, sep, tail = line.partition('@')
  123. searchedText = head.split()
  124. searchedText.pop(0)
  125. finalText = ' '.join(searchedText)
  126. finalText = re.sub('["]', '', finalText)
  127. formatted = getSentencesFromWords(finalText.split())
  128.  
  129. final = ''
  130. for formattedWord in formatted:
  131. final += formattedWord
  132. final += ' '
  133. startIndex = final.find(word)
  134. start, end = startIndex, startIndex + len(word)
  135. textLen = len(final) - 1
  136. return start, end, textLen
  137.  
  138. def fromStringToOutputFile(outputString, start, end, textLen):
  139. f = open("output","w+")
  140. f_input = open("input","r")
  141. contents = f_input.read()
  142. f.write(contents + '\n')
  143. f.write('<http://example.com/example-task1#char=' + str(start) + ',' + str(end) + '>\n')
  144. f.write(' a nif:RFC5147String , nif:String ;\n')
  145. f.write(' nif:anchorOf "Florence May Harding"@en ;\n')
  146. f.write(' nif:beginIndex "' + str(start) + '"^^xsd:nonNegativeInteger ;\n')
  147. f.write(' nif:endIndex "' + str(end) + '"^^xsd:nonNegativeInteger ;\n')
  148. f.write(' nif:referenceContext <http://example.com/example-task1#char=0,' + str(textLen) + '> ;\n')
  149. f.write(' itsrdf:taIdentRef dbpedia:' + outputString + ' .\n')
  150.  
  151. f.close()
  152. f_input.close()
  153.  
  154.  
def isOntologyOfSubclass(word):
    """Walk rdfs:subClassOf upward from dbo:<word> toward a target class.

    Repeatedly queries DBpedia for the superclass bindings of the current
    class and follows the first dbpedia-ontology superclass found. Returns
    the class name as soon as it lands on 'Person', 'Place' or
    'Organisation'; returns the string 'False' when the walk dead-ends.

    NOTE(review): the failure sentinel is the *string* 'False', not the
    boolean — callers test membership (`in wordClasses`), so it works, but
    it is fragile.
    """
    wordClasses = ['Person', 'Place', 'Organisation']
    finalText = "Initial Text"  # non-None sentinel so the loop is entered
    while finalText != None:
        sparql = SPARQLWrapper("http://dbpedia.org/sparql")
        # NOTE(review): class IRI built by string concatenation — assumes
        # *word* contains no SPARQL-special characters.
        sparql.setQuery("""
        SELECT ?value
        WHERE { <http://dbpedia.org/ontology/"""+word+"""> rdfs:subClassOf ?value }
        """)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        values = results['results']['bindings']
        if values:
            if 'http://dbpedia.org/ontology/' in str(values):
                uniqueList = unique(values)
                for x in uniqueList:
                    singleOntology = x['value']['value']
                    # Strip the namespace to get the bare superclass name.
                    finalText = re.sub('http://dbpedia.org/ontology/', '', singleOntology)
                    word = finalText  # continue the walk from this superclass
                    if word in wordClasses:
                        return word
                    # Only the first superclass binding is followed.
                    break
            else:
                # No dbpedia-ontology superclass: stop the walk.
                finalText = None
        else:
            # No bindings at all: the class has no subClassOf entry.
            finalText = None
    return 'False'
  183.  
  184.  
  185. def unique(list1):
  186. # intilize a null list
  187. unique_list = []
  188. for x in list1:
  189. # check if exists in unique_list or not
  190. if x not in unique_list:
  191. unique_list.append(x)
  192. return unique_list
  193.  
  194.  
  195. def fromStringToInputFile(inputText):
  196. f = open("input","w+")
  197. f.write('@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n')
  198. f.write('@prefix nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#> .\n')
  199. f.write('@prefix dbpedia: <http://dbpedia.org/resource/> .\n')
  200. f.write('@prefix itsrdf: <http://www.w3.org/2005/11/its/rdf#> .\n')
  201. f.write('<http://example.com/example-task1#char=0,' + len(inputText) + '>\n')
  202. f.write(' a nif:RFC5147String , nif:String , nif:Context ;\n')
  203. f.write(' nif:beginIndex "0"^^xsd:nonNegativeInteger ;\n')
  204. f.write(' nif:endIndex "' + len(inputText) + '"^^xsd:nonNegativeInteger ;\n')
  205. f.write(' nif:isString "' + inputText + '"@en .\n')
  206. f.close()
  207.  
if __name__ == "__main__":
    # Pipeline: read NIF input -> tokenize -> filter -> phrase-join ->
    # query DBpedia -> write NIF annotations for each matched entity.
    # Ontology classes accepted as named-entity types.
    wordClasses = ['Person', 'Place', 'Organisation']

    # Pull the isString text out of the NIF input file and strip punctuation.
    openedFile = openAndPurifyFile('./input')
    # Tokenize on whitespace (stopwords are removed in the next step).
    purifiedString = tokenize(openedFile)
    # Remove stopwords
    purifiedString = removeStopwords(purifiedString)
    # Join runs of capitalized tokens into candidate multi-word phrases.
    purifiedString = getSentencesFromWords(purifiedString)
    # Make all words/sentences start with uppercase (DBpedia resource style).
    wordsList = capitalizeList(purifiedString)
    # Send DBPedia Queries; keep only words typed as one of wordClasses.
    output = sendDBPediaQuery(wordsList, wordClasses)
    for word in output:
        # Map each match back to character offsets in the rebuilt text.
        start, end, textLen = findIndexesOfFoundWordInOriginalText(word, './input')
        print("Word "+ word + " position [" + str(start) + ', ' + str(end) + ']')
        # Emit a NIF annotation block for the match.
        fromStringToOutputFile(word, start, end, textLen)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement