# Untitled

#!/usr/bin/python3
import nltk
from nltk.wsd import lesk
from nltk.corpus import wordnet as wn
import wikipedia
from nltk.parse import CoreNLPParser
import itertools
from collections import OrderedDict
import os

def wikilinker(query, wordlist):
    """Return the URL of the Wikipedia page for *query*, or None.

    The query is looked up directly first. If Wikipedia reports it as
    ambiguous, every disambiguation option is scored as
    ``weight * overlap`` where ``weight`` is the inverse edit distance
    between the query and the option title, and ``overlap`` is the number of
    distinct tokens of the option's summary that also occur in *wordlist*.
    The first option with the highest score is used.

    Args:
        query: Title (or search term) to look up.
        wordlist: Iterable of context words compared against each
            candidate's summary.

    Returns:
        The page URL as returned by the ``wikipedia`` package, or None when
        no page could be resolved.
    """
    try:
        return wikipedia.page(query).url
    except wikipedia.exceptions.DisambiguationError as e:
        context = set(wordlist)
        scores = []
        for option in e.options:
            distance = nltk.edit_distance(query, option)
            # An option identical to the query has distance 0; give it a
            # weight above every non-zero distance (whose weight is <= 1)
            # instead of dividing by zero.
            weight = 1 / distance if distance else 2.0
            try:
                tokens = nltk.word_tokenize(wikipedia.page(option).summary)
            except Exception:
                # Best effort: an option whose summary cannot be fetched or
                # tokenized simply contributes no overlap.
                overlap = 0
            else:
                overlap = len(context.intersection(tokens))
            scores.append(weight * overlap)

        if not scores:
            # Ambiguous query without any option to choose from.
            return None

        best_option = e.options[scores.index(max(scores))]
        try:
            return wikipedia.page(best_option).url
        except wikipedia.exceptions.WikipediaException:
            # The chosen option may itself be ambiguous or missing.
            return None
    except wikipedia.exceptions.WikipediaException:
        return None

def sfNERTagger(rawText, POSFile):
    """(sf = Stanford) NER-tag the raw text and attach the POS tag of each token.

    The text is split on whitespace and sent to the CoreNLP server at
    http://localhost:9000. Only the COUNTRY, PERSON, CITY and ORGANIZATION
    labels are kept (shortened to COU, PER, CIT and ORG); every other label
    is replaced by the empty string. The POS tag of token ``i`` is read from
    the fifth whitespace-separated column of line ``i`` of *POSFile*.

    Args:
        rawText: The document text.
        POSFile: Path of the POS file, one token per line.

    Returns:
        A list of ``[token, tag, pos]`` lists, one per token.

    Raises:
        IndexError: If *POSFile* has no POS column for one of the tokens.
    """
    parser = CoreNLPParser(url='http://localhost:9000', tagtype='ner')

    # Stanford labels we keep, mapped to the short tags used downstream.
    short_tags = {
        'COUNTRY': 'COU',
        'PERSON': 'PER',
        'CITY': 'CIT',
        'ORGANIZATION': 'ORG',
    }
    # Lists rather than tuples, because later stages overwrite the tag.
    NERList = [
        [token, short_tags.get(tag, '')]
        for token, tag in parser.tag(rawText.split())
    ]

    with open(POSFile) as f:
        POSLines = f.readlines()

    # Add the POS tag as a third item; str.split() already discards the
    # trailing newline, so no separate stripping pass is needed.
    for lineNumber, item in enumerate(NERList):
        try:
            item.append(POSLines[lineNumber].split()[4])
        except IndexError:
            raise IndexError(
                "no POS tag for token %r on line %d of %s"
                % (item[0], lineNumber + 1, POSFile)
            ) from None

    return NERList

def ownTagger(NERList):
    """Tag nouns that the Stanford tagger left untagged, using WordNet.

    A token is a candidate when its tag is empty and its POS tag starts with
    'N'. For every candidate one WordNet sense is chosen: the sense whose
    summed path similarity to the senses of all single-sense candidates is
    highest (the first sense wins ties). The definition of the chosen sense
    is then searched for category keywords, and the first matching category
    is written into the token's tag: CIT, COU, SPO, NAT, ENT or ANI.

    Args:
        NERList: List of ``[token, tag, pos]`` lists. Modified in place.

    Returns:
        The same *NERList* object, with the new tags filled in.
    """
    # Positions (in NERList) and WordNet senses of the candidate tokens; the
    # two lists stay parallel so a tag can be written back to the right token.
    candidate_positions = []
    candidate_senses = []
    # Senses of candidates that have exactly one sense; they act as the
    # context against which ambiguous words are disambiguated.
    anchors = []

    for position, entry in enumerate(NERList):
        if entry[1] == '' and entry[2].startswith('N'):
            senses = list(wn.synsets(entry[0]))
            if len(senses) == 1:
                anchors.append(senses[0])
            candidate_positions.append(position)
            candidate_senses.append(senses)

    # One entry per candidate: the definition of the chosen sense, or [] when
    # WordNet does not know the word. Scored once, after all anchors are known.
    synlist = []
    for senses in candidate_senses:
        if not senses:
            synlist.append([])
            continue
        scores = []
        for sense in senses:
            similarities = (sense.path_similarity(anchor) for anchor in anchors)
            # path_similarity() yields None when no path exists; count it as 0.
            scores.append(sum(0 if s is None else s for s in similarities))
        synlist.append(senses[scores.index(max(scores))].definition())

    print(synlist)

    # Keywords that may appear in a definition, per tag, in priority order.
    # Padded entries only match whole words surrounded by spaces.
    categories = (
        ('CIT', [' city ', ' village ', ' town ', 'capital']),
        ('COU', [' nation ', ' republic ', ' monarchy ', ' province ',
                 ' island ', ' archipelago ']),
        ('SPO', [' sport ', 'combat', ' game ']),
        ('NAT', [' desert ', ' volcano ', ' sea ', ' ocean ', ' lake ',
                 ' river ', ' jungle ', ' waterfall ', ' glacier ',
                 ' mountain ', ' forest ', ' crater ', ' cave ', ' canyon ',
                 ' fjord ', ' park ', ' bay ', ' valley ', ' cliff ',
                 ' reef ']),
        ('ENT', [' book ', 'magazine', 'film', 'movie', 'song', 'journal',
                 'newspaper']),
        ('ANI', ['mammal', 'bird', 'fish', 'amphibian', 'reptil',
                 'crustacean', 'insect', 'carnivore', 'herbivore', 'species',
                 'breed', 'cattle', 'quadruped', 'pachyderm', 'feline',
                 'ungulate']),
    )

    # Write each tag to the token the definition belongs to, not to the
    # definition's index in the (shorter) candidate list.
    for position, definition in zip(candidate_positions, synlist):
        if not definition:
            continue
        for tag, keywords in categories:
            if any(keyword in definition for keyword in keywords):
                NERList[position][1] = tag
                break

    return NERList

def sfNERWriter(POSFile, NERList, rawlist):
    """Write a copy of *POSFile* with the NER tag appended to every line.

    The output goes to ``POSFile + ".test"`` and holds exactly one line per
    input line:

    * ``<input line> <tag>`` when the fourth column of the input line equals
      the token of the NERList entry with the same index;
    * ``error`` when it does not, when the input line has fewer than four
      columns, or when NERList has no entry for that line.

    When the input line has more than five columns, the Wikipedia URL found
    by wikilinker() for the token is appended to the same output line,
    provided a URL was found.

    Args:
        POSFile: Path of the POS file to read.
        NERList: List of ``[token, tag, ...]`` lists, aligned with the lines
            of *POSFile*.
        rawlist: Context words handed to wikilinker().
    """
    with open(POSFile, "r") as f1:
        POSLines = f1.readlines()
    with open(POSFile + ".test", "w") as f2:
        for lineNumber, line in enumerate(POSLines):
            line = line.strip('\n')
            fields = line.split()
            entry = NERList[lineNumber] if lineNumber < len(NERList) else None

            if entry is not None and len(fields) > 3 and fields[3] == entry[0]:
                record = line + " " + entry[1]
            else:
                record = "error"

            if entry is not None and len(fields) > 5:
                link = wikilinker(entry[0], rawlist)
                # wikilinker() returns None when no page could be resolved.
                if link:
                    record += " " + link

            # Terminate every record so output lines stay aligned with the
            # input lines.
            f2.write(record + "\n")
	# with open("posfile", "r") as posfile:
	#lines = posfile.readlines()
#with open("posfile", "w") as posfile:
	#for line in lines:
		#sources.write(blablabla)


def getContinuousChunks(NERList):
    """Group the tokens of consecutive tagged entries into chunks.

    Every maximal run of neighbouring entries whose tag is not the empty
    string becomes one list of tokens; untagged entries only separate runs.

    Args:
        NERList: Iterable of ``(token, tag, postag)`` triples.

    Returns:
        A list of token lists, in document order.
    """
    # Reduce every triple to (token, is_tagged) up front.
    flagged = [(token, tag != "") for token, tag, postag in NERList]

    # groupby() yields one group per run of equal flags; keep the tagged runs.
    runs = itertools.groupby(flagged, key=lambda pair: pair[1])
    return [
        [token for token, _flag in run]
        for is_tagged, run in runs
        if is_tagged
    ]


def main():
    """Run the tagging pipeline on document data/p51/d0069.

    Reads the raw text, tags it with sfNERTagger(), refines the result with
    ownTagger() and writes the tagged POS file with sfNERWriter().
    """
    pos_path = 'data/p51/d0069/en.tok.off.pos'

    with open('data/p51/d0069/en.raw') as raw_file:
        raw_text = raw_file.read()
        context_words = raw_text.split()
        # One [token, tag, pos] list per whitespace-separated token.
        entries = sfNERTagger(raw_text, pos_path)

    # ownTagger() fills the missing tags in place and returns the same list.
    ownTagger(entries)
    sfNERWriter(pos_path, entries, context_words)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
	main()