# Widen the Jupyter notebook cells to the full browser width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import glob
import math
import os
import re
import sys
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
#nltk.download('stopwords')

# Touch the English stopword list once so a missing download fails early
stopwords.words('english')
def remove_string_special_chars(s):
    """
    This function removes any special chars from a string
    :parameter
        s(str) : single input string
    :return:
        stripped(str) : a string with special chars removed
    """
    stripped = re.sub(r'[^\w\s]', '', s)
    stripped = re.sub(r'_', '', stripped)
    stripped = re.sub(r'\s+', ' ', stripped)
    stripped = stripped.strip()
    return stripped
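
# Quick illustrative sanity check (the input string here is made up, not from the corpus):
# remove_string_special_chars("U.S. stocks rose -- again!")  ->  "US stocks rose again"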
def create_freq_dic(corpus):
    """
    This function creates a frequency dictionary for the words of one document.
    `corpus` is a single {'docid': ..., 'text': ...} entry, because the function
    is mapped over the corpus list with multiprocessing.Pool.map below.
    """
    freqDic_list = []
    freq_dic = {}
    docid = corpus['docid']
    textContent = corpus['text']
    words = word_tokenize(textContent)
    for word in words:
        word = word.lower()
        if word in freq_dic:
            freq_dic[word] += 1
        else:
            freq_dic[word] = 1
    temp = {'docid': docid, 'freqdict': freq_dic}
    freqDic_list.append(temp)
    #print("Created frequency dictionary")
    return freqDic_list
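
# For a document whose text is "the cat saw the dog", the returned structure would be
# (illustrative example only, not taken from the real corpus):
# [{'docid': 'doc1', 'freqdict': {'the': 2, 'cat': 1, 'saw': 1, 'dog': 1}}]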
def computeTF(freqDict_list):
    """
    tf = ( frequency of the term in the doc / total number of terms in the doc )
    `freqDict_list` is one {'docid': ..., 'freqdict': ...} entry, since this
    function is also mapped over the per-document dictionaries with Pool.map.
    """
    TF_scores = []
    docid = freqDict_list['docid']
    freq = freqDict_list['freqdict']
    nwords = sum(freq.values())
    for k in freq:
        tf = freq[k] / nwords
        temp = {'docid': docid, 'term': k, 'raw': freq[k], 'tf': tf}
        TF_scores.append(temp)
    return TF_scores
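
# Worked example (numbers are illustrative): in a 100-token document where "cat"
# appears 4 times, tf("cat") = 4 / 100 = 0.04.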
def loadTF(corpora):
    TF_scores = []
    inputFile = "/home/joao/workspace/SAC/resources/"+corpora+"/mention.tf"
    fin = open(inputFile, 'r', 1, encoding='utf-8')
    for line in fin:
        docid, term, raw, tf = line.rstrip('\n').split('\t')
        temp = {'docid': docid, 'term': term, 'raw': int(raw), 'tf': float(tf)}
        TF_scores.append(temp)
    fin.close()
    return TF_scores
def computeIDF(freqDic_list, freqDic_listCOPY, numdocs):
    """
    idf = ln (total number of docs / number of docs with term in it )
    `freqDic_list` is one per-document dictionary; `freqDic_listCOPY` is the full
    list of per-document dictionaries, used to count the document frequency.
    """
    IDF_scores = []
    docid = freqDic_list['docid']
    for k in freqDic_list['freqdict'].keys():
        df = sum([k in tempDict['freqdict'] for tempDict in freqDic_listCOPY])
        temp = {'docid': docid, 'df': df, 'idf': math.log(numdocs / df), 'term': k}
        IDF_scores.append(temp)
    return IDF_scores
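
# Worked example (numbers are illustrative): with 1000 documents in the corpus and
# "cat" appearing in 10 of them, idf("cat") = ln(1000 / 10) = ln(100) ≈ 4.605.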
def loadIDF(corpora):
    IDF_scores = []
    inputFile = "/home/joao/workspace/SAC/resources/"+corpora+"/mention.idf"
    fin = open(inputFile, 'r', 1, encoding='utf-8')  # 'r', not 'w': this file is being read
    for line in fin:
        docid, term, df, idf = line.rstrip('\n').split('\t')
        temp = {'docid': docid, 'df': int(df), 'idf': float(idf), 'term': term}
        IDF_scores.append(temp)
    fin.close()
    return IDF_scores
def computeTFIDF(TF_scores, IDF_scores):
    TFIDF_scores = []
    for j in IDF_scores:
        for i in TF_scores:
            if j['term'] == i['term'] and j['docid'] == i['docid']:
                tfXidf = j['idf'] * i['tf']
                temp = {'docid': j['docid'], 'tfXidf': tfXidf, 'term': i['term']}
                TFIDF_scores.append(temp)
    #print("Created term frequency x inverse document frequency")
    return TFIDF_scores
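
# Combining the two worked examples above (illustrative numbers only):
# tfXidf("cat") = tf * idf = 0.04 * 4.605 ≈ 0.184.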
def count_words(sentence):
    """
    This function returns the total number of words in the input
    :param sentence:
    :return: count(int) : number of tokens in the sentence
    """
    count = 0
    words = word_tokenize(sentence)
    for word in words:
        count += 1
    return count
def read_mentions_set(easy, medium, hard):
    mentionsSet = set()
    # read the mentions from the files labeled easy, medium and hard; each line is
    # doc \t mention \t offset \t el1 \t el2 \t el3
    for label, path in (('easy', easy), ('medium', medium), ('hard', hard)):
        fin = open(path, 'r', 1, encoding='utf-8')
        for i in fin:
            doc, mention, offset, el1, el2, el3 = i.split('\t')
            mention = mention.lower()
            mentionsSet.add(mention)
        fin.close()
        print("read " + label + " mentions")
    return mentionsSet
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for i in symbols:
        data = np.char.replace(data, i, ' ')
    return data
def remove_stop_words(data):
    en_stops = set(stopwords.words('english'))
    new_text = ""
    sentence = data.split()
    for word in sentence:
        if word not in en_stops:
            new_text = new_text + " " + word
    return new_text

def remove_single_chars(words):
    new_text = ""
    for w in words:
        if len(w) > 1:
            new_text = new_text + " " + w
    return new_text
corpora = "iitb"

### Reading the mentions set
easy = "/home/joao/workspace/SAC/resources/"+corpora+"/"+corpora+".easy.out"
hard = "/home/joao/workspace/SAC/resources/"+corpora+"/"+corpora+".hard.out"
medium = "/home/joao/workspace/SAC/resources/"+corpora+"/"+corpora+".medium.out"
mentionsSet = read_mentions_set(easy, medium, hard)

### Putting docs content into a dictionary
corpus = []
#mypath = "/home/joao/datasets/NYT/nyt_corpus/TEXT_FILES/"
mypath = "/home/joao/datasets/"+corpora+"/TEXT_FILES/"
txtfiles = []
for file in glob.glob(mypath + "*.txt"):
    txtfiles.append(file)
ndocs = len(txtfiles)
for file in txtfiles:
    fopen = open(file, 'r', 1, encoding='utf-8')
    text = fopen.read()
    text = text.lower()
    text = remove_string_special_chars(text)
    text = remove_stop_words(text)
    #docid = file.split('/')[-1]
    docid = os.path.basename(file)
    docid = docid.split('.txt')[0]
    corpus.append({'docid': docid, 'text': text})
    fopen.close()
print("num files :" + str(len(txtfiles)))
# Parallelizing using Pool.map()
import multiprocessing as mp

### Calculating Frequency Dictionary
pool = mp.Pool(mp.cpu_count())
freq_dic = pool.map(create_freq_dic, corpus)
pool.close()
FreqDic = []
for elem in range(ndocs):
    FreqDic.append(freq_dic[elem][0])
#print(FreqDic[0])
# Note: this is a second name for the same list, not an independent copy
FreqDicCOPY = FreqDic
#############################################
### Calculating Term Frequency
pool = mp.Pool(mp.cpu_count())
TF = pool.map(computeTF, FreqDic)
pool.close()
TF_SCORE = []
for elem in range(ndocs):
    for i in TF[elem]:
        TF_SCORE.append(i)
print("Created term frequency")
#print(TF_SCORE)
#############################################
### Calculating Inverse Document Frequency
# Note: pool.apply blocks until each result is ready, so these jobs run one at a time
pool = mp.Pool(mp.cpu_count())
IDF = [pool.apply(computeIDF, args=(row, FreqDicCOPY, ndocs)) for row in FreqDic]
pool.close()
#############################################
IDF_SCORE = []
for elem in range(ndocs):
    for i in IDF[elem]:
        IDF_SCORE.append(i)
print("Created inverse document frequency")
#print(IDF_SCORE[0])
pool = mp.Pool(mp.cpu_count())
TFxIDF = [pool.apply(computeTFIDF, args=(TF_SCORE, row)) for row in IDF]
pool.close()
print("Created tf x idf")
#print(TFxIDF[0][0])  #tfXidf = computeTFIDF(TF_SCORE, IDF_SCORE)
TFxIDF_SCORES = []
for elem in range(ndocs):
    for i in TFxIDF[elem]:
        TFxIDF_SCORES.append(i)
# The commented-out block below sketches the intended output line format for the
# mention terms (docid, term, raw count, tf, df, idf, tfXidf).
for x in TFxIDF_SCORES:
    token = x['term']
    #if token in mentionsSet:
    #    print(token)
    #    for y in IDF_SCORE:
    #        for z in TF_SCORE:
    #            if x['docid'] == y['docid'] and x['docid'] == z['docid'] and x['term'] == y['term'] and x['term'] == z['term']:
    #                #fout.write(z['docid'] + "\t" + z['term'] + "\t" + str(z['raw']) + "\t" + str(z['tf']) + "\t" + str(y['df']) + "\t" + str(y['idf']) + "\t" + str(x['tfXidf']) + "\n")
    #                print(z['docid'] + "\t" + z['term'] + "\t" + str(z['raw']) + "\t" + str(z['tf']) + "\t" + str(y['df']) + "\t" + str(y['idf']) + "\t" + str(x['tfXidf']) + "\n")

### Calculating TFxIDF
#pool = mp.Pool(mp.cpu_count())
#TFIDF = [pool.apply(computeIDF,args = (row,FreqDicCOPY, ndocs)) for row in FreqDic]
#pool.close()
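
# A minimal sketch of how the commented-out block above could be completed, assuming
# the goal is to keep only the TF-IDF scores of terms that appear in mentionsSet and
# write them to a tab-separated file. The output path "mention.tfidf" is a guess,
# not something defined elsewhere in this script.
#
# outputFile = "/home/joao/workspace/SAC/resources/" + corpora + "/mention.tfidf"
# fout = open(outputFile, 'w', 1, encoding='utf-8')
# for x in TFxIDF_SCORES:
#     if x['term'] in mentionsSet:
#         fout.write(x['docid'] + "\t" + x['term'] + "\t" + str(x['tfXidf']) + "\n")
# fout.close()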