Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from SPARQLWrapper import SPARQLWrapper, JSON
- from pynif import NIFCollection
- from itertools import tee, islice, chain
- import rdflib
- import re
- import nltk
- import numpy as np
- from nltk.corpus import stopwords
- nltk.download('stopwords')
class WordAndClass():
    """Pair a word with its matched class and the text it was found in."""

    def __init__(self, word, wordsClass, whereFound):
        # Plain attribute storage; no validation is performed.
        self.word, self.wordsClass, self.whereFound = word, wordsClass, whereFound

    def printText(self):
        """Print the word/class match, then the text the word came from."""
        print("".join(("Word: ", self.word, " matches to ", self.wordsClass, " class")))
        print("".join(("From Text: ", self.whereFound)))
def openAndPurifyFile(file):
    """Return the ``isString`` text of a file with punctuation stripped.

    The first line containing ``isString`` is used. Everything from the
    first ``@`` onward is cut off (e.g. an ``@en`` suffix), the first
    whitespace-separated token (the predicate) is dropped, and every run of
    characters other than ASCII letters and digits is replaced by one space.
    Surrounding quotes therefore turn into a leading/trailing space.

    Args:
        file: Path of the file to read.

    Returns:
        The cleaned text.

    Raises:
        ValueError: If no line of the file contains ``isString``.
    """
    with open(file) as openfile:
        for line in openfile:
            if "isString" not in line:
                continue
            head, _, _ = line.partition('@')
            # Slice rather than pop(0) so an empty head cannot raise IndexError.
            finalText = ' '.join(head.split()[1:])
            return re.sub('[^A-Za-z0-9]+', ' ', finalText)
    # Fail with a clear message instead of an unbound-variable error.
    raise ValueError("no 'isString' line found in " + str(file))
def removeStopwords(tokens):
    """Return the tokens that are not in NLTK's English stopword list.

    The comparison is exact (case-sensitive) and the input order is kept.
    """
    english_stopwords = set(stopwords.words('english'))
    return [token for token in tokens if token not in english_stopwords]
def tokenize(sentence):
    """Split a sentence on whitespace and strip non-word characters.

    Each whitespace-separated chunk has every character that is not a
    letter, digit or underscore removed. Chunks that end up empty (pure
    punctuation such as ``-``) are dropped, so callers never receive empty
    tokens.

    Args:
        sentence: The text to split.

    Returns:
        A list of cleaned, non-empty tokens in their original order.
    """
    # Raw string: a bare backslash-W in a normal literal is an invalid escape.
    cleaned = (re.sub(r'\W+', '', word) for word in sentence.split())
    return [word for word in cleaned if word]
def capitalizeList(wordsList):
    """Return a new list with ``str.title()`` applied to every entry."""
    return [entry.title() for entry in wordsList]
def previous_and_next(some_iterable):
    """Iterate over ``(previous, item, next)`` triples of an iterable.

    ``previous`` is ``None`` for the first item and ``next`` is ``None`` for
    the last one. Returns a ``zip`` iterator.
    """
    behind, current, ahead = tee(some_iterable, 3)
    # Shift one copy back by a leading None and one forward by a trailing None.
    lagging = chain((None,), behind)
    leading = chain(islice(ahead, 1, None), (None,))
    return zip(lagging, current, leading)
def getSentencesFromWords(tokens):
    """Build entity candidates by joining runs of capitalised tokens with ``_``.

    Rules, applied token by token:

    * A capitalised token that starts a run is emitted on its own.
    * The second consecutive capitalised token is joined to the first with
      ``_`` and the pair is emitted. If the token after it is capitalised
      too, the three-word combination is emitted as well. The run is then
      reset, so that following token also starts a run of its own.
    * Any other token is emitted unchanged and ends the current run.

    Args:
        tokens: Iterable of string tokens. Empty strings are ignored.

    Returns:
        The unique candidates as a list, in order of first appearance.
    """
    # Empty tokens have no first character to inspect, so drop them up front.
    tokens = [token for token in tokens if token]
    serie = ''
    series = []
    counter = 0
    # Pair every token with its successor; the last token is paired with None.
    for token, nxt in zip(tokens, tokens[1:] + [None]):
        if token[0].isupper():
            if serie == '':
                serie = token
                series.append(serie)
            else:
                serie = serie + '_' + token
            counter += 1
            if counter == 2:
                series.append(serie)
                # nxt is None at the end of the input; guard before indexing.
                if nxt is not None and nxt[0].isupper() and serie.count('_') == 1:
                    series.append(serie + '_' + nxt)
                counter = 0
                serie = ''
        else:
            if serie != '':
                series.append(serie)
            series.append(token)
            serie = ''
            counter = 0
    # dict.fromkeys removes duplicates while keeping a deterministic order,
    # unlike a set whose string ordering varies between runs.
    return list(dict.fromkeys(series))
def sendDBPediaQuery(wordsList, wordClasses):
    """Return the words whose DBpedia resource has one of the wanted classes.

    For every word the ``rdf:type`` values of
    ``http://dbpedia.org/resource/<word>`` are fetched from the public
    DBpedia SPARQL endpoint. A word is accepted when one of its
    ``http://dbpedia.org/ontology/`` types is in ``wordClasses``, or when
    ``isOntologyOfSubclass`` reports an ancestor class of such a type that is
    in ``wordClasses``. Each accepted match is printed.

    Args:
        wordsList: Candidate resource names; they are pasted into the
            resource IRI unchanged.
        wordClasses: Collection of ontology class names to accept.

    Returns:
        The accepted words, each at most once, in input order.
    """
    ontologyPrefix = 'http://dbpedia.org/ontology/'
    output = []
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    for word in wordsList:
        sparql.setQuery("""
            SELECT ?label
            WHERE { <http://dbpedia.org/resource/""" + word + """> rdf:type ?label }
        """)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        for result in results["results"]["bindings"]:
            typeUri = result['label']['value']
            if ontologyPrefix not in typeUri:
                continue
            className = typeUri.replace(ontologyPrefix, '')
            if isWordInSearchedClasses(className, wordClasses):
                print('Word: ' + word + ', class: ' + typeUri)
                output.append(word)
                # One match is enough; stop so the word is not added twice.
                break
            # Walk up from the ontology class (not the resource name): the
            # helper looks its argument up under the ontology namespace.
            foundWordClass = isOntologyOfSubclass(className)
            if foundWordClass in wordClasses:
                print('Word: ' + word + ', class: ' + ontologyPrefix + foundWordClass)
                output.append(word)
                break
    return output
def isWordInSearchedClasses(word, wordClasses):
    """Tell whether ``word`` is one of the entries of ``wordClasses``."""
    return word in wordClasses
def findIndexesOfFoundWordInOriginalText(word, file):
    """Locate an entity candidate inside the ``isString`` text of a file.

    The text is taken from the first line containing ``isString``: the part
    before the first ``@``, minus its first whitespace-separated token, with
    runs of whitespace collapsed to single spaces and double quotes removed.
    Underscores in ``word`` are treated as spaces. The search is exact
    first, then case-insensitive.

    Args:
        word: Candidate to look for, e.g. ``Florence_May_Harding``.
        file: Path of the file to read.

    Returns:
        A ``(start, end, textLen)`` tuple: the character offsets of the
        match in the text and the length of the text. ``start`` and ``end``
        are both ``-1`` when the word does not occur.

    Raises:
        ValueError: If no line of the file contains ``isString``.
    """
    text = None
    with open(file) as openfile:
        for line in openfile:
            if "isString" in line:
                head, _, _ = line.partition('@')
                text = ' '.join(head.split()[1:]).replace('"', '')
                break
    if text is None:
        raise ValueError("no 'isString' line found in " + str(file))

    # Search the text itself so the offsets refer to real character positions.
    phrase = word.replace('_', ' ')
    start = text.find(phrase)
    if start == -1:
        # Candidates may have been title-cased; retry ignoring case.
        start = text.lower().find(phrase.lower())
    textLen = len(text)
    if start == -1:
        return -1, -1, textLen
    return start, start + len(phrase), textLen
def fromStringToOutputFile(outputString, start, end, textLen):
    """Write ``output``: the contents of ``input`` plus one entity annotation.

    Both files are looked up in the current working directory. ``output`` is
    rewritten from scratch on every call, so it only ever holds the
    annotation of the most recent call.

    Args:
        outputString: DBpedia resource name of the entity (words joined by
            ``_``). Also used, with underscores turned into spaces, as the
            ``nif:anchorOf`` text.
        start: Begin offset of the entity in the context text.
        end: End offset of the entity in the context text.
        textLen: Length of the context text, used in the reference URI.
    """
    # Derive the anchor from the entity instead of a fixed example name.
    anchor = outputString.replace('_', ' ')
    with open("input", "r") as f_input:
        contents = f_input.read()
    with open("output", "w") as f:
        f.write(contents + '\n')
        f.write('<http://example.com/example-task1#char=' + str(start) + ',' + str(end) + '>\n')
        f.write(' a nif:RFC5147String , nif:String ;\n')
        f.write(' nif:anchorOf "' + anchor + '"@en ;\n')
        f.write(' nif:beginIndex "' + str(start) + '"^^xsd:nonNegativeInteger ;\n')
        f.write(' nif:endIndex "' + str(end) + '"^^xsd:nonNegativeInteger ;\n')
        f.write(' nif:referenceContext <http://example.com/example-task1#char=0,' + str(textLen) + '> ;\n')
        f.write(' itsrdf:taIdentRef dbpedia:' + outputString + ' .\n')
def isOntologyOfSubclass(word):
    """Walk up the DBpedia ontology from a class until a target class is hit.

    Starting at ``http://dbpedia.org/ontology/<word>``, the
    ``rdfs:subClassOf`` values are fetched from the public DBpedia SPARQL
    endpoint. If one of the parents in the DBpedia ontology namespace is
    ``Person``, ``Place`` or ``Organisation`` it is returned; otherwise the
    walk continues from the first such parent. Parents outside that
    namespace are ignored.

    Args:
        word: Ontology class name to start from.

    Returns:
        The matching target class name, or the string ``'False'`` when the
        walk ends without reaching one. (A string rather than a boolean is
        kept so existing ``in wordClasses`` checks behave as before.)
    """
    wordClasses = ('Person', 'Place', 'Organisation')
    ontologyPrefix = 'http://dbpedia.org/ontology/'
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    visited = set()
    current = word
    # The visited set stops the walk if the hierarchy ever loops back.
    while current not in visited:
        visited.add(current)
        sparql.setQuery("""
            SELECT ?value
            WHERE { <http://dbpedia.org/ontology/""" + current + """> rdfs:subClassOf ?value }
        """)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        parents = [
            binding['value']['value'][len(ontologyPrefix):]
            for binding in results['results']['bindings']
            if binding['value']['value'].startswith(ontologyPrefix)
        ]
        if not parents:
            break
        for parent in parents:
            if parent in wordClasses:
                return parent
        current = parents[0]
    return 'False'
def unique(list1):
    """Return the distinct items of ``list1`` in order of first appearance.

    Membership is tested against a list, so unhashable items such as dicts
    are accepted.
    """
    kept = []
    for candidate in list1:
        # Skip anything that compares equal to an item already kept.
        if candidate in kept:
            continue
        kept.append(candidate)
    return kept
def fromStringToInputFile(inputText):
    """Write ``input``: a NIF context document (Turtle) for ``inputText``.

    The file is created in the current working directory and replaces any
    existing ``input`` file. It holds four prefix declarations and a single
    context resource spanning characters ``0`` to ``len(inputText)``.

    Args:
        inputText: The text to store as ``nif:isString``. It is written
            verbatim, without escaping.
    """
    # Convert once: concatenating the int from len() to a str raises TypeError.
    textLen = str(len(inputText))
    with open("input", "w") as f:
        f.write('@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n')
        f.write('@prefix nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#> .\n')
        f.write('@prefix dbpedia: <http://dbpedia.org/resource/> .\n')
        f.write('@prefix itsrdf: <http://www.w3.org/2005/11/its/rdf#> .\n')
        f.write('<http://example.com/example-task1#char=0,' + textLen + '>\n')
        f.write(' a nif:RFC5147String , nif:String , nif:Context ;\n')
        f.write(' nif:beginIndex "0"^^xsd:nonNegativeInteger ;\n')
        f.write(' nif:endIndex "' + textLen + '"^^xsd:nonNegativeInteger ;\n')
        f.write(' nif:isString "' + inputText + '"@en .\n')
if __name__ == "__main__":
    inputPath = './input'
    wordClasses = ['Person', 'Place', 'Organisation']

    # Read the isString text of the input file with punctuation stripped
    rawText = openAndPurifyFile(inputPath)
    # Tokenize, drop English stopwords, then join capitalised runs into candidates
    candidates = getSentencesFromWords(removeStopwords(tokenize(rawText)))
    # Title-case every candidate before it is used as a resource name
    wordsList = capitalizeList(candidates)
    # Query DBpedia and keep the candidates that match one of the classes
    matches = sendDBPediaQuery(wordsList, wordClasses)

    for word in matches:
        start, end, textLen = findIndexesOfFoundWordInOriginalText(word, inputPath)
        print("Word {} position [{}, {}]".format(word, start, end))
        fromStringToOutputFile(word, start, end, textLen)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement