import sys
import re
import json
import math

import numpy as np

# global declarations for the doc list, postings, vocabulary and doc lengths
docids = []      # list of doc names - the index is the docid
postings = {}    # termid (as a string key) -> list of [docid, tf] postings
vocab = []       # list of terms - the index is the termid
docLengths = []  # docid -> total number of terms in that document

def main():
    global docids
    global postings
    global vocab
    global docLengths
    if len(sys.argv) < 2:
        print('usage: ./retriever.py queryfile')
        sys.exit(1)
    read_index_files()
    print('read index files')
    # gets the queries (one per line) from the file named on the command line
    queries = readFile(sys.argv[1])
    # retrieve_vector drives the rest of the retrieval functions;
    # [0] keeps the top-ten list for the first query
    queryScore = retrieve_vector(queries)[0]

def read_index_files():
    ## reads existing data from index files: docids, vocab, postings, doclengths
    # uses JSON to preserve list/dictionary data structures
    # declare refs to global variables
    global docids
    global postings
    global vocab
    global docLengths
    # open the files
    in_d = open('docids.txt', 'r')
    in_v = open('vocab.txt', 'r')
    in_p = open('postings.txt', 'r')
    in_dl = open('doclengths.txt', 'r')
    print('opened text files')
    # load the data
    docids = json.load(in_d)
    vocab = json.load(in_v)
    postings = json.load(in_p)
    docLengths = json.load(in_dl)
    # close the files
    in_d.close()
    in_v.close()
    in_p.close()
    in_dl.close()
    return

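# A sketch of the index file contents this loader assumes, inferred from how
# the structures are used below (the actual lab files may differ):
#   docids.txt     -> ["doc1.txt", "doc2.txt", ...]      list; index = docid
#   vocab.txt      -> ["apple", "banana", ...]           list; index = termid
#   postings.txt   -> {"0": [[0, 3], [2, 1]], ...}       termid -> [docid, tf] pairs
#   doclengths.txt -> [120, 87, ...]                     docid -> total terms
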
def retrieve_bool(query_terms):
    ## a function to perform Boolean retrieval
    # assumes the postings lists are plain lists of docids, not the
    # [docid, tf] pairs used by the vector model below
    """
    pseudocode for lab3 boolean retrieval if postings is a dictionary:
    for each term, get its termid from vocab
        print a message if the term is not in vocab
    initialise:
        pop the first term's postings list onto the answer
    for each remaining termid:
        get its postings list
        copy the answer to a second list
        clear the answer
        for each docid in the postings list:
            if the docid is in the second list:
                append the docid to the answer
    return answer
    """
    # declare refs to global variables
    global docids
    global postings
    global vocab
    ###############################
    #### your code starts here ####
    query_termids = []
    operators = []
    for term in query_terms:
        if term in ('AND', 'OR', 'NOT'):
            operators.append(term)
            continue
        if term in vocab:
            query_termids.append(vocab.index(term))  # will change if vocab is a dict
        else:
            print(term, 'is not in vocab')
    # make the initial answer the postings list for the first term
    answer = postings.get(str(query_termids.pop(0)))
    for termid in query_termids:
        list2 = postings.get(str(termid))
        # apply the operators left to right
        operator = operators.pop(0)
        if operator == 'AND':
            answer = [post for post in answer if post in list2]
        elif operator == 'NOT':
            # operand order matters here: keep docids in the current answer
            # that are absent from the new term's postings list
            answer = [post for post in answer if post not in list2]
        elif operator == 'OR':
            # union without duplicates, preserving the answer's order
            answer = answer + [post for post in list2 if post not in answer]
    #### your code ends here ####
    ###############################
    return answer

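# Example (postings values made up for illustration): with 'cat' in docs
# [0, 2, 5] and 'dog' in docs [2, 3, 5]:
#   retrieve_bool(['cat', 'AND', 'dog'])  ->  [2, 5]
#   retrieve_bool(['cat', 'NOT', 'dog'])  ->  [0]
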
def merge(list1, list2):
    ## intersects two sorted lists of docids
    # check that list1 is the shorter
    if len(list1) > len(list2):
        list1, list2 = list2, list1
    answer = []
    p2 = -1
    for p1 in list1:
        # advance through list2 until it catches up with p1
        while int(p2) < int(p1):
            try:
                p2 = list2.pop(0)
            except IndexError:
                # list2 is exhausted, so no further matches are possible
                return answer
        if int(p2) == int(p1):
            answer.append(p1)
    return answer

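# e.g. merge([1, 3, 7], [2, 3, 5, 7]) returns [3, 7]; note that merge pops
# items off the longer list as it scans, so pass a copy if the caller needs
# to reuse that list afterwards.
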
def write_topTen(writtenLine, storei):
    ## appends one ranked result line to results.csv
    with open('results.csv', 'a') as file:
        file.write(str(storei) + ': ' + writtenLine)
        file.write('\n')
    return

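# e.g. write_topTen('doc4.txt', 1) appends the line "1: doc4.txt" to
# results.csv (the document name is made up for illustration).
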
def readFile(fileName):
    ## reads the query file; each line becomes one query (a list of terms)
    queryArray = []
    queryFile = open(fileName, 'r')
    for line in queryFile:
        queryArray.append(line.split())
    queryFile.close()
    return queryArray

def normaliseTerms(query):
    normaliseArr = []
    for term in query:
        term = term.lower()  # str.lower() makes the term lower case
        # removes all characters that aren't word characters or whitespace
        # (i.e. strips punctuation)
        term = re.sub(r'[^\w\s]', '', term)
        # appends the term to the list of normalised terms, skipping terms
        # that were nothing but punctuation
        if term:
            normaliseArr.append(term)
    return normaliseArr

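# e.g. normaliseTerms(['The', 'cat,', "didn't"]) returns ['the', 'cat', 'didnt']
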
def retrieve_vector(queries):
    ## a function to perform vector model retrieval with tf*idf weighting
    # imports global variable
    global docids  # list of doc names - the index is the docid (i.e. 0-4)
    scoreArr = []  # new array to store one top-ten list per query
    # goes through each line (query) in the query text file
    for query in queries:
        # normalises the query terms (lower case, strip punctuation)
        query = normaliseTerms(query)
        # gets the cosine score of every document against this query
        scoreOrder = getCosineScore(query)
        # makes a top ten of search results
        topTen = []
        # i is the current rank position before ordering starts
        i = 0
        # sorts a copy in reverse order so bigger values come first rather than last
        cosineScoreArray = sorted(scoreOrder, reverse=True)
        # np.where below finds which documents produced each sorted score
        scoreOrder = np.array(scoreOrder)
        # stop early if there are fewer than ten documents
        limit = min(10, len(cosineScoreArray))
        while i < limit:
            # gets the index values (docids) that have this score but not
            # the score itself; tied documents all come back at once
            indexVals = np.where(scoreOrder == cosineScoreArray[i])[0]
            # adds up to ten results to the list and prints them to the user
            for indexVal in indexVals:
                if i < limit:
                    topTen.append(indexVal)
                    writtenLine = str(docids[indexVal])
                    storei = i + 1
                    # prints out the rank number and docid for this index value
                    print(i + 1, ":", docids[indexVal])
                    i += 1
                    write_topTen(writtenLine, storei)
        scoreArr.append(topTen)
    return scoreArr

def getCosineScore(query):
    global docLengths
    global postings
    global vocab
    # makes an array with one score per document, all initialised to 0
    cosineScoreArray = [0] * len(docLengths)
    for term in query:
        if term in vocab:
            # looks the term up in vocab to find its postings list
            postingsList = postings[str(vocab.index(term))]
            # gets the term's inverse document frequency
            idfValue = getIDF(postingsList)
            # if a term appears in every document ('the', for example) it
            # has an idf value of 0 and is therefore pointless to the
            # search, so it can be skipped
            if idfValue != 0:
                for posting in postingsList:
                    # gets the term frequency for this posting's document
                    tfValue = getTF(posting)
                    # accumulates the tf*idf score for each document
                    cosineScoreArray[posting[0]] += tfValue * idfValue
    return cosineScoreArray

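# e.g. (values made up for illustration) with 5 documents, a query term whose
# postings list is [[2, 4], [4, 1]] and docLengths of [100, 80, 50, 120, 40]:
# idf = ln(5/2) ≈ 0.916, so doc 2 accrues (4/50) * 0.916 ≈ 0.073 and doc 4
# accrues (1/40) * 0.916 ≈ 0.023.
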
def getIDF(postingsList):
    global docLengths
    totalNumDocs = len(docLengths)
    # the number of documents the term occurs in
    termOccurence = len(postingsList)
    # the log takes into account that some terms are more frequent than
    # others and evens out the effect of this
    idf = math.log(totalNumDocs / termOccurence)
    return idf

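# e.g. with 5 documents in total, a term occurring in 2 of them gets
# idf = ln(5/2) ≈ 0.916, while a term occurring in all 5 gets ln(5/5) = 0,
# which is why getCosineScore skips terms whose idf is 0.
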
def getTF(posting):
    global docLengths
    # posting is a [docid, count] pair; divide the count by the length of
    # the document to normalise for document size
    termOccurence = posting[1]
    totalTerms = docLengths[posting[0]]
    return termOccurence / totalTerms

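# e.g. a posting of [3, 4] (4 occurrences in docid 3) in a document of 100
# terms gives tf = 4/100 = 0.04 (the doc length is made up for illustration).
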
# Standard boilerplate to call the main() function
if __name__ == '__main__':
    main()