# Widen the Jupyter notebook cells to the full browser width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import glob
import math
import os
import re
import sys
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
#nltk.download('stopwords')

# Touch the English stopword list once so a missing download fails early
stopwords.words('english')
def remove_string_special_chars(s):
    """
    This function removes any special chars from a string
    :parameter
        s(str) : single input string
    :return:
        stripped(str) : a string with special chars removed
    """
    stripped = re.sub(r'[^\w\s]', '', s)
    stripped = re.sub(r'_', '', stripped)
    stripped = re.sub(r'\s+', ' ', stripped)
    stripped = stripped.strip()
    return stripped
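
# Quick illustrative sanity check (the input string here is made up, not from the corpus):
# remove_string_special_chars("U.S. stocks rose -- again!")  ->  "US stocks rose again"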
def create_freq_dic(corpus):
    """
    This function creates a frequency dictionary for the words of one document.
    `corpus` is a single {'docid': ..., 'text': ...} entry, because the function
    is mapped over the corpus list with multiprocessing.Pool.map below.
    """
    freqDic_list = []
    freq_dic = {}
    docid = corpus['docid']
    textContent = corpus['text']
    words = word_tokenize(textContent)
    for word in words:
        word = word.lower()
        if word in freq_dic:
            freq_dic[word] += 1
        else:
            freq_dic[word] = 1
    temp = {'docid': docid, 'freqdict': freq_dic}
    freqDic_list.append(temp)
    #print("Created frequency dictionary")
    return freqDic_list
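
# For a document whose text is "the cat saw the dog", the returned structure would be
# (illustrative example only, not taken from the real corpus):
# [{'docid': 'doc1', 'freqdict': {'the': 2, 'cat': 1, 'saw': 1, 'dog': 1}}]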
def computeTF(freqDict_list):
    """
    tf = ( frequency of the term in the doc / total number of terms in the doc )
    `freqDict_list` is one {'docid': ..., 'freqdict': ...} entry, since this
    function is also mapped over the per-document dictionaries with Pool.map.
    """
    TF_scores = []
    docid = freqDict_list['docid']
    freq = freqDict_list['freqdict']
    nwords = sum(freq.values())
    for k in freq:
        tf = freq[k] / nwords
        temp = {'docid': docid, 'term': k, 'raw': freq[k], 'tf': tf}
        TF_scores.append(temp)
    return TF_scores
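
# Worked example (numbers are illustrative): in a 100-token document where "cat"
# appears 4 times, tf("cat") = 4 / 100 = 0.04.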
def loadTF(corpora):
    TF_scores = []
    inputFile = "/home/joao/workspace/SAC/resources/"+corpora+"/mention.tf"
    fin = open(inputFile, 'r', 1, encoding='utf-8')
    for line in fin:
        docid, term, raw, tf = line.rstrip('\n').split('\t')
        temp = {'docid': docid, 'term': term, 'raw': int(raw), 'tf': float(tf)}
        TF_scores.append(temp)
    fin.close()
    return TF_scores
def computeIDF(freqDic_list, freqDic_listCOPY, numdocs):
    """
    idf = ln (total number of docs / number of docs with term in it )
    `freqDic_list` is one per-document dictionary; `freqDic_listCOPY` is the full
    list of per-document dictionaries, used to count the document frequency.
    """
    IDF_scores = []
    docid = freqDic_list['docid']
    for k in freqDic_list['freqdict'].keys():
        df = sum([k in tempDict['freqdict'] for tempDict in freqDic_listCOPY])
        temp = {'docid': docid, 'df': df, 'idf': math.log(numdocs / df), 'term': k}
        IDF_scores.append(temp)
    return IDF_scores
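
# Worked example (numbers are illustrative): with 1000 documents in the corpus and
# "cat" appearing in 10 of them, idf("cat") = ln(1000 / 10) = ln(100) ≈ 4.605.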
def loadIDF(corpora):
    IDF_scores = []
    inputFile = "/home/joao/workspace/SAC/resources/"+corpora+"/mention.idf"
    fin = open(inputFile, 'r', 1, encoding='utf-8')  # 'r', not 'w': this file is being read
    for line in fin:
        docid, term, df, idf = line.rstrip('\n').split('\t')
        temp = {'docid': docid, 'df': int(df), 'idf': float(idf), 'term': term}
        IDF_scores.append(temp)
    fin.close()
    return IDF_scores
def computeTFIDF(TF_scores, IDF_scores):
    TFIDF_scores = []
    for j in IDF_scores:
        for i in TF_scores:
            if j['term'] == i['term'] and j['docid'] == i['docid']:
                tfXidf = j['idf'] * i['tf']
                temp = {'docid': j['docid'], 'tfXidf': tfXidf, 'term': i['term']}
                TFIDF_scores.append(temp)
    #print("Created term frequency x inverse document frequency")
    return TFIDF_scores
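
# Combining the two worked examples above (illustrative numbers only):
# tfXidf("cat") = tf * idf = 0.04 * 4.605 ≈ 0.184.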
def count_words(sentence):
    """
    This function returns the total number of words in the input
    :param sentence:
    :return: count(int) : number of tokens in the sentence
    """
    count = 0
    words = word_tokenize(sentence)
    for word in words:
        count += 1
    return count
def read_mentions_set(easy, medium, hard):
    mentionsSet = set()
    # read the mentions from the files labeled easy, medium and hard; each line is
    # doc \t mention \t offset \t el1 \t el2 \t el3
    for label, path in (('easy', easy), ('medium', medium), ('hard', hard)):
        fin = open(path, 'r', 1, encoding='utf-8')
        for i in fin:
            doc, mention, offset, el1, el2, el3 = i.split('\t')
            mention = mention.lower()
            mentionsSet.add(mention)
        fin.close()
        print("read " + label + " mentions")
    return mentionsSet
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for i in symbols:
        data = np.char.replace(data, i, ' ')
    return data
def remove_stop_words(data):
    en_stops = set(stopwords.words('english'))
    new_text = ""
    sentence = data.split()
    for word in sentence:
        if word not in en_stops:
            new_text = new_text + " " + word
    return new_text

def remove_single_chars(words):
    new_text = ""
    for w in words:
        if len(w) > 1:
            new_text = new_text + " " + w
    return new_text
corpora = "iitb"

### Reading the mentions set
easy = "/home/joao/workspace/SAC/resources/"+corpora+"/"+corpora+".easy.out"
hard = "/home/joao/workspace/SAC/resources/"+corpora+"/"+corpora+".hard.out"
medium = "/home/joao/workspace/SAC/resources/"+corpora+"/"+corpora+".medium.out"
mentionsSet = read_mentions_set(easy, medium, hard)

### Putting docs content into a dictionary
corpus = []
#mypath = "/home/joao/datasets/NYT/nyt_corpus/TEXT_FILES/"
mypath = "/home/joao/datasets/"+corpora+"/TEXT_FILES/"
txtfiles = []
for file in glob.glob(mypath + "*.txt"):
    txtfiles.append(file)
ndocs = len(txtfiles)
for file in txtfiles:
    fopen = open(file, 'r', 1, encoding='utf-8')
    text = fopen.read()
    text = text.lower()
    text = remove_string_special_chars(text)
    text = remove_stop_words(text)
    #docid = file.split('/')[-1]
    docid = os.path.basename(file)
    docid = docid.split('.txt')[0]
    corpus.append({'docid': docid, 'text': text})
    fopen.close()
print("num files :" + str(len(txtfiles)))
# Parallelizing using Pool.map()
import multiprocessing as mp

### Calculating Frequency Dictionary
pool = mp.Pool(mp.cpu_count())
freq_dic = pool.map(create_freq_dic, corpus)
pool.close()
FreqDic = []
for elem in range(ndocs):
    FreqDic.append(freq_dic[elem][0])
#print(FreqDic[0])
# Note: this is a second name for the same list, not an independent copy
FreqDicCOPY = FreqDic
#############################################
### Calculating Term Frequency
pool = mp.Pool(mp.cpu_count())
TF = pool.map(computeTF, FreqDic)
pool.close()
TF_SCORE = []
for elem in range(ndocs):
    for i in TF[elem]:
        TF_SCORE.append(i)
print("Created term frequency")
#print(TF_SCORE)
#############################################
### Calculating Inverse Document Frequency
# Note: pool.apply blocks until each result is ready, so these jobs run one at a time
pool = mp.Pool(mp.cpu_count())
IDF = [pool.apply(computeIDF, args=(row, FreqDicCOPY, ndocs)) for row in FreqDic]
pool.close()
#############################################
IDF_SCORE = []
for elem in range(ndocs):
    for i in IDF[elem]:
        IDF_SCORE.append(i)
print("Created inverse document frequency")
#print(IDF_SCORE[0])
pool = mp.Pool(mp.cpu_count())
TFxIDF = [pool.apply(computeTFIDF, args=(TF_SCORE, row)) for row in IDF]
pool.close()
print("Created tf x idf")
#print(TFxIDF[0][0])  #tfXidf = computeTFIDF(TF_SCORE, IDF_SCORE)
TFxIDF_SCORES = []
for elem in range(ndocs):
    for i in TFxIDF[elem]:
        TFxIDF_SCORES.append(i)
# The commented-out block below sketches the intended output line format for the
# mention terms (docid, term, raw count, tf, df, idf, tfXidf).
for x in TFxIDF_SCORES:
    token = x['term']
    #if token in mentionsSet:
    #    print(token)
    #    for y in IDF_SCORE:
    #        for z in TF_SCORE:
    #            if x['docid'] == y['docid'] and x['docid'] == z['docid'] and x['term'] == y['term'] and x['term'] == z['term']:
    #                #fout.write(z['docid'] + "\t" + z['term'] + "\t" + str(z['raw']) + "\t" + str(z['tf']) + "\t" + str(y['df']) + "\t" + str(y['idf']) + "\t" + str(x['tfXidf']) + "\n")
    #                print(z['docid'] + "\t" + z['term'] + "\t" + str(z['raw']) + "\t" + str(z['tf']) + "\t" + str(y['df']) + "\t" + str(y['idf']) + "\t" + str(x['tfXidf']) + "\n")

### Calculating TFxIDF
#pool = mp.Pool(mp.cpu_count())
#TFIDF = [pool.apply(computeIDF,args = (row,FreqDicCOPY, ndocs)) for row in FreqDic]
#pool.close()
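
# A minimal sketch of how the commented-out block above could be completed, assuming
# the goal is to keep only the TF-IDF scores of terms that appear in mentionsSet and
# write them to a tab-separated file. The output path "mention.tfidf" is a guess,
# not something defined elsewhere in this script.
#
# outputFile = "/home/joao/workspace/SAC/resources/" + corpora + "/mention.tfidf"
# fout = open(outputFile, 'w', 1, encoding='utf-8')
# for x in TFxIDF_SCORES:
#     if x['term'] in mentionsSet:
#         fout.write(x['docid'] + "\t" + x['term'] + "\t" + str(x['tfXidf']) + "\n")
# fout.close()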