from IPython.core.display import display, HTML
# Widen the Jupyter notebook cells to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

import glob
import math
import os
import re

import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
#nltk.download('stopwords')  # uncomment on first run to fetch the corpus

# Touch the stopword list once so a missing download fails early.
stopwords.words('english')

def remove_string_special_chars(s):
    """
    Remove any special characters from a string.

    :param s: (str) single input string
    :return: stripped (str) the string with special characters removed
    """
    stripped = re.sub(r'[^\w\s]', '', s)      # drop punctuation
    stripped = re.sub(r'_', '', stripped)     # drop underscores (kept by \w)
    stripped = re.sub(r'\s+', ' ', stripped)  # collapse runs of whitespace
    stripped = stripped.strip()
    return stripped

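# Quick sanity check (illustrative only):
# remove_string_special_chars("Hello, world!  foo_bar") returns "Hello world foobar".
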
def create_freq_dic(doc):
    """
    Build a word-frequency dictionary for one document.
    Mapped over the corpus with pool.map, so it receives a single
    {'docid': ..., 'text': ...} dict and returns a one-element list.
    """
    freqDic_list = []
    freq_dic = {}
    docid = doc['docid']
    words = word_tokenize(doc['text'])
    for word in words:
        word = word.lower()
        if word in freq_dic:
            freq_dic[word] += 1
        else:
            freq_dic[word] = 1
    freqDic_list.append({'docid': docid, 'freqdict': freq_dic})
    return freqDic_list

def computeTF(freqDict_list):
    """
    tf = (frequency of the term in the doc / total number of terms in the doc)
    Mapped over the frequency dictionaries, so it receives a single
    {'docid', 'freqdict'} entry.
    """
    TF_scores = []
    docid = freqDict_list['docid']
    freq = freqDict_list['freqdict']
    nwords = sum(freq.values())  # total number of tokens in the document
    for k in freq:
        tf = freq[k] / nwords
        TF_scores.append({'docid': docid, 'term': k, 'raw': freq[k], 'tf': tf})
    return TF_scores

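# Worked example (illustrative only): for a document tokenized to
# ["the", "cat", "sat", "the"], freq is {'the': 2, 'cat': 1, 'sat': 1} and
# nwords = 4, so tf('the') = 2/4 = 0.5 and tf('cat') = 1/4 = 0.25.
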
def loadTF(corpora):
    """Load precomputed term frequencies from a tab-separated file."""
    TF_scores = []
    inputFile = "/home/joao/workspace/SAC/resources/" + corpora + "/mention.tf"
    fin = open(inputFile, 'r', 1, encoding='utf-8')
    for line in fin:
        docid, term, raw, tf = line.rstrip('\n').split('\t')
        TF_scores.append({'docid': docid, 'term': term, 'raw': raw, 'tf': tf})
    fin.close()
    return TF_scores

def computeIDF(freqDic_list, freqDic_listCOPY, numdocs):
    """
    idf = ln(total number of docs / number of docs with the term in it)
    Receives one {'docid', 'freqdict'} entry plus the full list of
    frequency dictionaries, which is needed to count document frequency.
    """
    IDF_scores = []
    docid = freqDic_list['docid']
    for k in freqDic_list['freqdict'].keys():
        # df = number of documents whose frequency dict contains the term
        df = sum([k in tempDict['freqdict'] for tempDict in freqDic_listCOPY])
        IDF_scores.append({'docid': docid, 'df': df,
                           'idf': math.log(numdocs / df), 'term': k})
    return IDF_scores

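# Worked example (illustrative only): with numdocs = 10 and a term that
# occurs in 2 of the documents, df = 2 and idf = ln(10 / 2) ≈ 1.609.
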
def loadIDF(corpora):
    """Load precomputed inverse document frequencies from a tab-separated file."""
    IDF_scores = []
    inputFile = "/home/joao/workspace/SAC/resources/" + corpora + "/mention.idf"
    fin = open(inputFile, 'r', 1, encoding='utf-8')
    for line in fin:
        docid, term, df, idf = line.rstrip('\n').split('\t')
        IDF_scores.append({'docid': docid, 'df': df, 'idf': idf, 'term': term})
    fin.close()
    return IDF_scores

def computeTFIDF(TF_scores, IDF_scores):
    """tf-idf = tf * idf, joining the two score lists on (docid, term)."""
    TFIDF_scores = []
    for j in IDF_scores:
        for i in TF_scores:
            if j['term'] == i['term'] and j['docid'] == i['docid']:
                tfXidf = j['idf'] * i['tf']
                TFIDF_scores.append({'docid': j['docid'],
                                     'tfXidf': tfXidf, 'term': i['term']})
    return TFIDF_scores

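# Worked example (illustrative only): combining the two examples above,
# tf = 0.5 and idf ≈ 1.609 give tfXidf ≈ 0.805 for that (docid, term) pair.
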
def count_words(sentence):
    """
    Return the total number of words in the input.

    :param sentence: (str) the text to tokenize
    :return: (int) the number of tokens
    """
    return len(word_tokenize(sentence))

def read_mentions_set(easy, medium, hard):
    """Collect the lower-cased mentions from the easy, medium and hard files."""
    mentionsSet = set()
    for label, path in (('easy', easy), ('medium', medium), ('hard', hard)):
        fin = open(path, 'r', 1, encoding='utf-8')
        for line in fin:
            doc, mention, offset, el1, el2, el3 = line.split('\t')
            mentionsSet.add(mention.lower())
        fin.close()
        print("read " + label + " mentions")
    return mentionsSet

def remove_punctuation(data):
    """Replace punctuation symbols and newlines with spaces."""
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for i in symbols:
        data = np.char.replace(data, i, ' ')
    return data


def remove_stop_words(data):
    """Drop English stopwords; the result keeps a leading space per kept word."""
    en_stops = set(stopwords.words('english'))
    new_text = ""
    for word in data.split():
        if word not in en_stops:
            new_text = new_text + " " + word
    return new_text


def remove_single_chars(words):
    """Keep only the tokens longer than one character."""
    new_text = ""
    for w in words:
        if len(w) > 1:
            new_text = new_text + " " + w
    return new_text

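# Illustrative example of the cleaning chain applied below:
# remove_stop_words(remove_string_special_chars("The cat, the hat!".lower()))
# returns " cat hat" ("the" is a stopword; note the leading space).
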
corpora = "iitb"

### Reading the mentions set
easy = "/home/joao/workspace/SAC/resources/" + corpora + "/" + corpora + ".easy.out"
hard = "/home/joao/workspace/SAC/resources/" + corpora + "/" + corpora + ".hard.out"
medium = "/home/joao/workspace/SAC/resources/" + corpora + "/" + corpora + ".medium.out"

mentionsSet = read_mentions_set(easy, medium, hard)


### Putting the content of each document into a dictionary
corpus = []
#mypath = "/home/joao/datasets/NYT/nyt_corpus/TEXT_FILES/"
mypath = "/home/joao/datasets/" + corpora + "/TEXT_FILES/"

txtfiles = []
for file in glob.glob(mypath + "*.txt"):
    txtfiles.append(file)

ndocs = len(txtfiles)

for file in txtfiles:
    fopen = open(file, 'r', 1, encoding='utf-8')
    text = fopen.read()
    text = text.lower()
    text = remove_string_special_chars(text)
    text = remove_stop_words(text)

    # The document id is the file name without its .txt extension.
    docid = os.path.basename(file)
    docid = docid.split('.txt')[0]
    corpus.append({'docid': docid, 'text': text})
    fopen.close()

print("num files: " + str(len(txtfiles)))

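# At this point corpus looks like (illustrative):
# [{'docid': 'doc1', 'text': ' cleaned lower cased text'}, ...]
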
# Parallelizing with Pool.map(). Note: on platforms that spawn rather than
# fork worker processes (e.g. Windows), module-level Pool usage like this
# needs an `if __name__ == '__main__':` guard.
import multiprocessing as mp

### Calculating the frequency dictionaries
pool = mp.Pool(mp.cpu_count())
freq_dic = pool.map(create_freq_dic, corpus)
pool.close()

FreqDic = []
for elem in range(ndocs):
    # create_freq_dic returns a one-element list per document
    FreqDic.append(freq_dic[elem][0])
#print(FreqDic[0])
# Note: this is an alias, not a copy; both names refer to the same list.
FreqDicCOPY = FreqDic
#############################################


### Calculating term frequency
pool = mp.Pool(mp.cpu_count())
TF = pool.map(computeTF, FreqDic)
pool.close()
TF_SCORE = []
for elem in range(ndocs):
    for i in TF[elem]:
        TF_SCORE.append(i)
print("Created term frequency")

#print(TF_SCORE)
#############################################

### Calculating inverse document frequency
pool = mp.Pool(mp.cpu_count())
# pool.apply blocks until each call returns, so this comprehension actually
# runs the documents one at a time; pool.starmap would run them in parallel.
IDF = [pool.apply(computeIDF, args=(row, FreqDicCOPY, ndocs)) for row in FreqDic]
pool.close()
#############################################
IDF_SCORE = []
for elem in range(ndocs):
    for i in IDF[elem]:
        IDF_SCORE.append(i)
print("Created inverse document frequency")

#print(IDF_SCORE[0])

### Calculating tf x idf
pool = mp.Pool(mp.cpu_count())
TFxIDF = [pool.apply(computeTFIDF, args=(TF_SCORE, row)) for row in IDF]
pool.close()
print("Created tf x idf")

TFxIDF_SCORES = []
for elem in range(ndocs):
    for i in TFxIDF[elem]:
        TFxIDF_SCORES.append(i)

#print("num elems dic " + str(len(TFxIDF_SCORES)))
# Inspect the scored terms; the commented code below would join the TF, IDF
# and tf-idf rows on (docid, term) and write one combined line per term.
for x in TFxIDF_SCORES:
    token = x['term']
    #if token in mentionsSet:
    #    print(token)
    #for y in IDF_SCORE:
    #    for z in TF_SCORE:
    #        if x['docid'] == y['docid'] == z['docid'] and x['term'] == y['term'] == z['term']:
    #            fout.write(z['docid'] + "\t" + z['term'] + "\t" + str(z['raw']) + "\t" + str(z['tf'])
    #                       + "\t" + str(y['df']) + "\t" + str(y['idf']) + "\t" + str(x['tfXidf']) + "\n")