import sys
import re
import json
import math

import numpy as np

# global declarations for the doc list, postings, vocabulary and doc lengths
docids = []      # list of doc names - the index is the docid
postings = {}    # termid (as a string key) -> list of [docid, tf] postings
vocab = []       # list of terms - the index is the termid
docLengths = []  # docid -> total number of terms in that document

def main():
    global docids
    global postings
    global vocab
    global docLengths
    if len(sys.argv) < 2:
        print('usage: ./retriever.py queryfile')
        sys.exit(1)
    read_index_files()
    print('read index files')
    # gets the queries (one per line) from the file named on the command line
    queries = readFile(sys.argv[1])
    # retrieve_vector drives the rest of the retrieval functions;
    # [0] keeps the top-ten list for the first query
    queryScore = retrieve_vector(queries)[0]

def read_index_files():
    ## reads existing data from index files: docids, vocab, postings, doclengths
    # uses JSON to preserve list/dictionary data structures
    # declare refs to global variables
    global docids
    global postings
    global vocab
    global docLengths
    # open the files
    in_d = open('docids.txt', 'r')
    in_v = open('vocab.txt', 'r')
    in_p = open('postings.txt', 'r')
    in_dl = open('doclengths.txt', 'r')
    print('opened text files')
    # load the data
    docids = json.load(in_d)
    vocab = json.load(in_v)
    postings = json.load(in_p)
    docLengths = json.load(in_dl)
    # close the files
    in_d.close()
    in_v.close()
    in_p.close()
    in_dl.close()
    return

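# A sketch of the index file contents this loader assumes, inferred from how
# the structures are used below (the actual lab files may differ):
#   docids.txt     -> ["doc1.txt", "doc2.txt", ...]      list; index = docid
#   vocab.txt      -> ["apple", "banana", ...]           list; index = termid
#   postings.txt   -> {"0": [[0, 3], [2, 1]], ...}       termid -> [docid, tf] pairs
#   doclengths.txt -> [120, 87, ...]                     docid -> total terms
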
def retrieve_bool(query_terms):
    ## a function to perform Boolean retrieval
    # assumes the postings lists are plain lists of docids, not the
    # [docid, tf] pairs used by the vector model below
    """
    pseudocode for lab3 boolean retrieval if postings is a dictionary:
    for each term, get its termid from vocab
        print a message if the term is not in vocab
    initialise:
        pop the first term's postings list onto the answer
    for each remaining termid:
        get its postings list
        copy the answer to a second list
        clear the answer
        for each docid in the postings list:
            if the docid is in the second list:
                append the docid to the answer
    return answer
    """
    # declare refs to global variables
    global docids
    global postings
    global vocab
    ###############################
    #### your code starts here ####
    query_termids = []
    operators = []
    for term in query_terms:
        if term in ('AND', 'OR', 'NOT'):
            operators.append(term)
            continue
        if term in vocab:
            query_termids.append(vocab.index(term))  # will change if vocab is a dict
        else:
            print(term, 'is not in vocab')
    # make the initial answer the postings list for the first term
    answer = postings.get(str(query_termids.pop(0)))
    for termid in query_termids:
        list2 = postings.get(str(termid))
        # apply the operators left to right
        operator = operators.pop(0)
        if operator == 'AND':
            answer = [post for post in answer if post in list2]
        elif operator == 'NOT':
            # operand order matters here: keep docids in the current answer
            # that are absent from the new term's postings list
            answer = [post for post in answer if post not in list2]
        elif operator == 'OR':
            # union without duplicates, preserving the answer's order
            answer = answer + [post for post in list2 if post not in answer]
    #### your code ends here ####
    ###############################
    return answer

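# Example (postings values made up for illustration): with 'cat' in docs
# [0, 2, 5] and 'dog' in docs [2, 3, 5]:
#   retrieve_bool(['cat', 'AND', 'dog'])  ->  [2, 5]
#   retrieve_bool(['cat', 'NOT', 'dog'])  ->  [0]
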
def merge(list1, list2):
    ## intersects two sorted lists of docids
    # check that list1 is the shorter
    if len(list1) > len(list2):
        list1, list2 = list2, list1
    answer = []
    p2 = -1
    for p1 in list1:
        # advance through list2 until it catches up with p1
        while int(p2) < int(p1):
            try:
                p2 = list2.pop(0)
            except IndexError:
                # list2 is exhausted, so no further matches are possible
                return answer
        if int(p2) == int(p1):
            answer.append(p1)
    return answer

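# e.g. merge([1, 3, 7], [2, 3, 5, 7]) returns [3, 7]; note that merge pops
# items off the longer list as it scans, so pass a copy if the caller needs
# to reuse that list afterwards.
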
def write_topTen(writtenLine, storei):
    ## appends one ranked result line to results.csv
    with open('results.csv', 'a') as file:
        file.write(str(storei) + ': ' + writtenLine)
        file.write('\n')
    return

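# e.g. write_topTen('doc4.txt', 1) appends the line "1: doc4.txt" to
# results.csv (the document name is made up for illustration).
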
def readFile(fileName):
    ## reads the query file; each line becomes one query (a list of terms)
    queryArray = []
    queryFile = open(fileName, 'r')
    for line in queryFile:
        queryArray.append(line.split())
    queryFile.close()
    return queryArray

def normaliseTerms(query):
    normaliseArr = []
    for term in query:
        term = term.lower()  # str.lower() makes the term lower case
        # removes all characters that aren't word characters or whitespace
        # (i.e. strips punctuation)
        term = re.sub(r'[^\w\s]', '', term)
        # appends the term to the list of normalised terms, skipping terms
        # that were nothing but punctuation
        if term:
            normaliseArr.append(term)
    return normaliseArr

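# e.g. normaliseTerms(['The', 'cat,', "didn't"]) returns ['the', 'cat', 'didnt']
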
def retrieve_vector(queries):
    ## a function to perform vector model retrieval with tf*idf weighting
    # imports global variable
    global docids  # list of doc names - the index is the docid (i.e. 0-4)
    scoreArr = []  # new array to store one top-ten list per query
    # goes through each line (query) in the query text file
    for query in queries:
        # normalises the query terms (lower case, strip punctuation)
        query = normaliseTerms(query)
        # gets the cosine score of every document against this query
        scoreOrder = getCosineScore(query)
        # makes a top ten of search results
        topTen = []
        # i is the current rank position before ordering starts
        i = 0
        # sorts a copy in reverse order so bigger values come first rather than last
        cosineScoreArray = sorted(scoreOrder, reverse=True)
        # np.where below finds which documents produced each sorted score
        scoreOrder = np.array(scoreOrder)
        # stop early if there are fewer than ten documents
        limit = min(10, len(cosineScoreArray))
        while i < limit:
            # gets the index values (docids) that have this score but not
            # the score itself; tied documents all come back at once
            indexVals = np.where(scoreOrder == cosineScoreArray[i])[0]
            # adds up to ten results to the list and prints them to the user
            for indexVal in indexVals:
                if i < limit:
                    topTen.append(indexVal)
                    writtenLine = str(docids[indexVal])
                    storei = i + 1
                    # prints out the rank number and docid for this index value
                    print(i + 1, ":", docids[indexVal])
                    i += 1
                    write_topTen(writtenLine, storei)
        scoreArr.append(topTen)
    return scoreArr

def getCosineScore(query):
    global docLengths
    global postings
    global vocab
    # makes an array with one score per document, all initialised to 0
    cosineScoreArray = [0] * len(docLengths)
    for term in query:
        if term in vocab:
            # looks the term up in vocab to find its postings list
            postingsList = postings[str(vocab.index(term))]
            # gets the term's inverse document frequency
            idfValue = getIDF(postingsList)
            # if a term appears in every document ('the', for example) it
            # has an idf value of 0 and is therefore pointless to the
            # search, so it can be skipped
            if idfValue != 0:
                for posting in postingsList:
                    # gets the term frequency for this posting's document
                    tfValue = getTF(posting)
                    # accumulates the tf*idf score for each document
                    cosineScoreArray[posting[0]] += tfValue * idfValue
    return cosineScoreArray

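# e.g. (values made up for illustration) with 5 documents, a query term whose
# postings list is [[2, 4], [4, 1]] and docLengths of [100, 80, 50, 120, 40]:
# idf = ln(5/2) ≈ 0.916, so doc 2 accrues (4/50) * 0.916 ≈ 0.073 and doc 4
# accrues (1/40) * 0.916 ≈ 0.023.
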
def getIDF(postingsList):
    global docLengths
    totalNumDocs = len(docLengths)
    # the number of documents the term occurs in
    termOccurence = len(postingsList)
    # the log takes into account that some terms are more frequent than
    # others and evens out the effect of this
    idf = math.log(totalNumDocs / termOccurence)
    return idf

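# e.g. with 5 documents in total, a term occurring in 2 of them gets
# idf = ln(5/2) ≈ 0.916, while a term occurring in all 5 gets ln(5/5) = 0,
# which is why getCosineScore skips terms whose idf is 0.
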
def getTF(posting):
    global docLengths
    # posting is a [docid, count] pair; divide the count by the length of
    # the document to normalise for document size
    termOccurence = posting[1]
    totalTerms = docLengths[posting[0]]
    return termOccurence / totalTerms

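# e.g. a posting of [3, 4] (4 occurrences in docid 3) in a document of 100
# terms gives tf = 4/100 = 0.04 (the doc length is made up for illustration).
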
# Standard boilerplate to call the main() function
if __name__ == '__main__':
    main()