Advertisement
Guest User

Untitled

a guest
Dec 11th, 2018
59
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.42 KB | None | 0 0
  1. import sys
  2. import re
  3. import json
  4. import numpy as np
  5. import math
  6.  
# global declarations for doclist, postings, vocabulary
docids = []       # docids[docid] -> document name (index position is the docid)
postings = {}     # str(termid) -> postings list of [docid, occurrence-count] pairs
vocab = []        # vocab[termid] -> term string; termid is the list index
docLengths= []    # docLengths[docid] -> total number of terms in that document
  12.  
  13. def main():
  14. global docids
  15. global postings
  16. global vocab
  17. global docLengths
  18. read_index_files()
  19. print('read index files')
  20. if len(sys.argv) < 2:
  21. print ('usage: ./retriever.py term [term ...]')
  22. sys.exit(1)
  23.  
  24.  
  25.  
  26.  
  27.  
  28.  
  29.  
  30. #gets query from file
  31.  
  32. queries = readFile(sys.argv[1])
  33.  
  34. #Initializes the retrieve vector function which links to the rest of functions
  35. queryScore = retrieve_vector(queries)[0]
  36.  
  37.  
  38.  
  39.  
  40.  
  41. def read_index_files():
  42. ## reads existing data from index files: docids, vocab, postings
  43. # uses JSON to preserve list/dictionary data structures
  44. # declare refs to global variables
  45. global docids
  46. global postings
  47. global vocab
  48. global docLengths
  49. # open the files
  50. in_d = open('docids.txt', 'r')
  51. in_v = open('vocab.txt', 'r')
  52. in_p = open('postings.txt', 'r')
  53. in_dl= open('doclengths.txt', 'r')
  54. print('opened text files')
  55.  
  56. # load the data
  57. docids = json.load(in_d)
  58. vocab = json.load(in_v)
  59. postings = json.load(in_p)
  60. docLengths = json.load(in_dl)
  61. # close the files
  62. in_d.close()
  63. in_v.close()
  64. in_p.close()
  65. in_dl.close()
  66.  
  67.  
  68. return
  69.  
  70.  
  71. #def retrieve_bool(query_terms):
  72. ## a function to perform Boolean retrieval
  73. # assumes the postings lists are lists, not dicts
  74.  
  75. # declare refs to global variables
  76. #global docids
  77. #global postings
  78. #global vocab
  79. #global doclength
  80.  
  81. #answer = []
  82. ###############################
  83. #### your code starts here ####
  84.  
  85. """
  86. pseudocode for lab3 boolean retrieval if postings is a dictionary
  87. for each term, get its termid from vocab
  88. print message if term not in vocab
  89.  
  90. initialise:
  91. pop the first term's posting onto the answer
  92.  
  93. for each termid
  94. get its posting list
  95. copy the answer to the second list
  96. clear answer
  97. for each docid in posting list
  98. if docid in second list
  99. append docid to answer
  100.  
  101. return answer
  102. """
  103.  
  104.  
  105. #query_termids = []
  106. #operators = []
  107. #for term in query_terms:
  108. #if term in ('AND', 'OR', 'NOT'):
  109. #operators.append(term)
  110. #continue
  111. #if term in vocab:
  112. #query_termids.append(vocab.index(term)) # will change if vocab is dict
  113. #else:
  114. #print (term, 'is not in vocab')
  115.  
  116. #answer = postings.get(str(query_termids.pop(0))) # make the initial answer the postings list for the first term
  117.  
  118. # find shortest list of the current answer and the postings list for the next term
  119. #for termid in query_termids:
  120. #if len(postings.get(str(termid))) > len(answer):
  121. #list1 = answer
  122. #list2 = postings.get(str(termid))
  123. #else:
  124. #list2 = answer
  125. #list1 = postings.get(str(termid))
  126. #answer = []
  127. #operator = operators.pop()
  128. # step through the list
  129. #for post in list1:
  130. #if operator == 'AND':
  131. #if post in list2:
  132. #answer.append(post)
  133. #if operator == 'NOT':
  134. #if post not in list2:
  135. #answer.append(post)
  136. #if operator == 'OR':
  137. #answer = list(set(list1 + list2))
  138.  
  139.  
  140. #### your code ends here ####
  141. ###############################
  142.  
  143. #return answer
  144.  
  145.  
  146. #def merge(list1, list2):
  147. # check that list1 is the shorter
  148. #if len(list1) > len(list2):
  149. # answer = list1
  150. # list1 = list2
  151. # list2 = answer
  152.  
  153. #answer = []
  154. #p2 = -1
  155. #for p1 in list1:
  156. #print ('merge 1: ', p1, p2)
  157. #while int(p2) < int(p1):
  158. #print ('merge 2: ', p1, p2)
  159. #try:
  160. #p2 = list2.pop(0)
  161. #except IndexError:
  162. #return answer
  163. #if p2 == p1:
  164. #print ('merge answer: ', p1, p2)
  165. #answer.append(p1)
  166. #print ('merge: returning ', answer)
  167. #return answer
  168.  
  169.  
  170. def write_topTen(writtenLine, storei):
  171.  
  172. with open('results.csv','a') as file:
  173. file.write(str(storei) + ': ' + writtenLine)
  174. file.write('\n')
  175. return
  176.  
  177.  
  178. def readFile(fileName):
  179. queryArray=[]
  180. queryFile = open(fileName, 'r')
  181. for line in queryFile:
  182. queryArray.append(line.split())
  183. queryFile.close()
  184. return queryArray
  185.  
  186.  
  187.  
  188. def normaliseTerms(query):
  189. normaliseArr = []
  190.  
  191. for term in query:
  192.  
  193. term = term.lower() #method object.lower makes all terms lower case
  194.  
  195. # removes all characters that aren't words(punctuation)
  196. term = re.sub(r'[^\w\s]', '', term)
  197.  
  198. #appends the term to the string of normalised terms
  199. if term:
  200. normaliseArr.append(re.sub(r'[^\w\s]', '', term))
  201.  
  202.  
  203. return normaliseArr
  204.  
  205.  
  206.  
  207.  
  208. def retrieve_vector(queries):
  209. ## a function to perform vector model retrieval with tf*idf weighting
  210. #
  211. #imports global variable
  212. global docids # list of doc names - the index is the docid (i.e. 0-4)
  213. scoreArr = []
  214. #new array to store scores
  215.  
  216.  
  217.  
  218. #Goes through each line in the query text file
  219. for query in queries:
  220.  
  221.  
  222. #intialises the normalisation of terms
  223. query=normaliseTerms(query)
  224.  
  225. #initialising new function to get cosine score
  226. scoreOrder = getCosineScore(query)
  227.  
  228.  
  229. #Makes a top 10 search results
  230. topTen=[]
  231. #starts the process by making i = 0 this is the set position before ordering starts
  232. i=0
  233.  
  234. #reverses the order so that bigger values come first in position rather than last
  235. cosineScoreArray = sorted(scoreOrder, reverse=True)
  236.  
  237.  
  238. #returns this to the user
  239.  
  240.  
  241. while(i<10):
  242. #initialises score order function
  243. #np tracks movement in position after score is calculated
  244. scoreOrder = np.array(scoreOrder)
  245.  
  246.  
  247. #gets index values but not frequency
  248. indexVals = np.where(scoreOrder == cosineScoreArray[i])[0]
  249.  
  250.  
  251.  
  252. #Adds top 3 to a list and prints this list
  253. for indexVals in indexVals:
  254. if i<10:
  255. topTen.append(indexVals)
  256. writtenLine=str(docids[indexVals])
  257. storei= i+1
  258.  
  259.  
  260.  
  261. #prints out order number and docid for specific index value
  262. print(i+1, ":", docids[indexVals])
  263.  
  264. i +=1
  265. write_topTen(writtenLine, storei)
  266.  
  267.  
  268.  
  269.  
  270.  
  271.  
  272. scoreArr.append(topTen)
  273.  
  274.  
  275.  
  276.  
  277.  
  278.  
  279.  
  280. #### your code ends here ####
  281. return scoreArr
  282.  
  283. def getCosineScore(query):
  284. global docLengths
  285. global postings
  286. global vocab
  287.  
  288.  
  289. #Makes an array which combines the doclengths and sets them to 0
  290. cosineScoreArray = [0] * len(docLengths)
  291.  
  292.  
  293. for term in query:
  294.  
  295. if term in vocab:
  296.  
  297. #adds vocab indexes of terms into an array
  298.  
  299. postingsList = postings[str(vocab.index(term))]
  300.  
  301. #initializes
  302. idfValue = getIDF(postingsList)
  303.  
  304. #this was added because if 'the' appeared in every document for example
  305. #it would have an idf value of 0 therefore it is rendered pointless to
  306. #the search
  307. if idfValue != 0:
  308. for posting in postingsList:
  309. #initializes term frequency function and gets posting input
  310. tfValue = getTF(posting)
  311. #calculates cosine score for each
  312. cosineScoreArray[posting[0]] += tfValue*idfValue
  313.  
  314.  
  315. return cosineScoreArray
  316.  
  317. def getIDF(postingsList):
  318. global postings
  319. global docLengths
  320. totalNumDocs = len(docLengths)
  321. #this gets how frequent the term is
  322. termOccurence = len(postingsList)
  323. #the log takes into account that some terms are more frequent than others
  324. #so evens out the effect of this
  325. idf=math.log(totalNumDocs/termOccurence)
  326. return idf
  327.  
  328. def getTF (posting):
  329. global docLengths
  330.  
  331. termOccurence = posting[1]
  332.  
  333. totalTerms = docLengths[posting[0]]
  334. return termOccurence/totalTerms
  335.  
# Standard boilerplate to call the main() function
# (only runs when executed as a script, not when imported as a module)
if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement