SHARE
TWEET

s

a guest Dec 6th, 2019 113 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/python3
  2.  
  3. import os
  4. import sys
  5. import math
  6.  
# Names of the indexed documents; a document's position in this list is its
# docid (0-4).
docids = ["./LookingGlass/lg001.html/", "./LookingGlass/lg002.html/", "./LookingGlass/lg003.html/",
          "./LookingGlass/lg004.html/", "./LookingGlass/lg005.html/"]
# Number of terms in each document, keyed by docid.
doclength = {0: 66, 1: 80, 2: 101, 3: 152, 4: 122}
# Terms found in the documents; a term's position in this list is its termid,
# which is the key used by the postings dictionary.
# NOTE(review): a few entries repeat (e.g. "do", "you'd"); vocab.index() only
# ever resolves a repeated term to its first position.
vocab = ["1", "looking", "glass", "house", "one", "thing", "was", "certain", "that", "the", "white", "kitten", "had",
         "nothing", "to", "do", "with", "it", "black", "fault", "entirely", "for", "been", "having", "its", "face",
         "washed", "by", "old", "cat", "last", "quarter", "of", "an", "hour", "and", "bearing", "pretty", "well",
         "considering", "so", "you", "see", "couldn't", "have", "any", "hand", "in", "mischief", "2", "way", "dinah",
         "her", "children", "faces", "this", "first", "she", "held", "poor", "down", "ear", "paw", "then", "other",
         "rubbed", "all", "over", "wrong", "beginning", "at", "nose", "just", "now", "as", "i", "said", "hard", "work",
         "on", "which", "lying", "quite", "still", "trying", "purrno", "doubt", "feeling", "meant", "good", "3", "but",
         "finished", "earlier", "afternoon", "while", "alice", "sitting", "curled", "up", "a", "corner", "great", "arm",
         "chair", "half", "talking", "herself", "asleep", "grand", "game", "romps", "ball", "worsted", "wind",
         "rolling", "till", "come", "undone", "again", "there", "spread", "hearth", "rug", "knots", "tangles",
         "running", "after", "own", "tail", "middle", "4", "oh", "wicked", "little", "thing'", "cried", "catching",
         "giving", "kiss", "make", "understand", "disgrace", "really", "ought", "taught", "better", "manners", "know",
         "ought'", "added", "reproachfully", "speaking", "cross", "voice", "could", "manage", "scrambled", "back",
         "into", "taking", "began", "winding", "\u0001", "not", "get", "very", "fast", "time", "sometimes", "kitty",
         "sat", "demurely", "knee", "pretending", "watch", "progress", "putting", "out", "gently", "touching", "if",
         "would", "be", "glad", "help", "might", "5", "do", "what", "morrow", "is", "kitty?'", "you'd", "guessed",
         "you'd", "window", "meonly", "making", "tidy", "watching", "boys", "getting", "sticks", "bonfireand", "wants",
         "plenty", "only", "got", "cold", "snowed", "they", "leave", "off", "never", "mind", "we'll", "go", "bonfire",
         "morrow'", "here", "wound", "two", "or", "three", "turns", "round", "neck", "how", "look", "led", "scramble",
         "rolled", "upon", "floor", "yards", "unwound"]
# Postings dictionary: the key is a termid (an index into vocab) and the value
# is a list of postings entries, each of which is a two-item list holding a
# docid and the term's frequency in that document.
postings = {0: [[0, 2], [1, 1], [2, 1], [3, 1], [4, 1]], 1: [[0, 1], [1, 1], [2, 1], [3, 2], [4, 1]],
            2: [[0, 1], [1, 1], [2, 1], [3, 1], [4, 1]], 3: [[0, 1], [1, 1], [2, 1], [3, 1], [4, 1]],
            4: [[0, 1], [1, 1], [3, 1]], 5: [[0, 1], [1, 1]], 6: [[0, 2], [1, 4], [2, 2], [3, 2], [4, 2]], 7: [[0, 1]],
            8: [[0, 2], [1, 1], [3, 1]], 9: [[0, 6], [1, 6], [2, 8], [3, 11], [4, 8]], 10: [[0, 2], [1, 1]],
            11: [[0, 3], [1, 1], [2, 3], [3, 3], [4, 1]], 12: [[0, 4], [2, 5], [4, 1]], 13: [[0, 1]],
            14: [[0, 1], [1, 1], [2, 2], [3, 6], [4, 5]], 15: [[0, 1]], 16: [[0, 1], [1, 2], [2, 3], [3, 1], [4, 1]],
            17: [[0, 3], [1, 1], [2, 3], [3, 5], [4, 5]], 18: [[0, 1], [2, 1]], 19: [[0, 1]], 20: [[0, 1]],
            21: [[0, 2], [1, 1], [4, 1]], 22: [[0, 1], [2, 4], [4, 1]], 23: [[0, 1], [2, 1]],
            24: [[0, 1], [1, 3], [2, 1]], 25: [[0, 1], [1, 1]], 26: [[0, 1], [1, 1]], 27: [[0, 1], [1, 1]],
            28: [[0, 1], [3, 1]], 29: [[0, 1], [3, 1]], 30: [[0, 1]], 31: [[0, 1]],
            32: [[0, 1], [2, 3], [3, 1], [4, 3]], 33: [[0, 1]], 34: [[0, 1]],
            35: [[0, 1], [1, 3], [2, 6], [3, 9], [4, 4]], 36: [[0, 1]], 37: [[0, 1]], 38: [[0, 1]], 39: [[0, 1]],
            40: [[0, 1], [2, 1], [4, 3]], 41: [[0, 1], [3, 5], [4, 3]], 42: [[0, 1], [4, 2]], 43: [[0, 1]],
            44: [[0, 1], [3, 1], [4, 1]], 45: [[0, 1]], 46: [[0, 1]], 47: [[0, 1], [2, 3], [3, 2], [4, 3]],
            48: [[0, 1]], 49: [[1, 1]], 50: [[1, 2]], 51: [[1, 1], [3, 2], [4, 1]], 52: [[1, 1], [3, 2]], 53: [[1, 1]],
            54: [[1, 1]], 55: [[1, 1], [4, 1]], 56: [[1, 1]], 57: [[1, 3], [3, 5]], 58: [[1, 1]], 59: [[1, 1]],
            60: [[1, 1], [2, 1], [4, 1]], 61: [[1, 1]], 62: [[1, 2], [3, 1]], 63: [[1, 1], [3, 2]], 64: [[1, 1]],
            65: [[1, 1]], 66: [[1, 2], [2, 2], [3, 1]], 67: [[1, 1], [2, 1]], 68: [[1, 1]], 69: [[1, 1]],
            70: [[1, 2], [3, 1]], 71: [[1, 1]], 72: [[1, 1], [4, 1]], 73: [[1, 1], [3, 1]], 74: [[1, 1], [3, 4]],
            75: [[1, 1], [4, 1]], 76: [[1, 1]], 77: [[1, 1]], 78: [[1, 1]], 79: [[1, 1], [3, 2]], 80: [[1, 1], [4, 1]],
            81: [[1, 1]], 82: [[1, 1]], 83: [[1, 1]], 84: [[1, 1], [2, 1]], 85: [[1, 1]], 86: [[1, 1]], 87: [[1, 1]],
            88: [[1, 1]], 89: [[1, 1]], 90: [[2, 1]], 91: [[2, 1], [3, 1]], 92: [[2, 1]], 93: [[2, 1]], 94: [[2, 1]],
            95: [[2, 1]], 96: [[2, 2], [3, 1], [4, 2]], 97: [[2, 1]], 98: [[2, 1]], 99: [[2, 3], [3, 2], [4, 1]],
            100: [[2, 2], [3, 2], [4, 1]], 101: [[2, 1]], 102: [[2, 1]], 103: [[2, 1], [3, 1]], 104: [[2, 1], [3, 1]],
            105: [[2, 2]], 106: [[2, 1], [3, 1]], 107: [[2, 1], [3, 1]], 108: [[2, 1]], 109: [[2, 1]], 110: [[2, 1]],
            111: [[2, 1]], 112: [[2, 1], [3, 2], [4, 1]], 113: [[2, 1], [3, 1], [4, 1]], 114: [[2, 1]], 115: [[2, 1]],
            116: [[2, 1]], 117: [[2, 1]], 118: [[2, 1]], 119: [[2, 1], [3, 1], [4, 1]], 120: [[2, 1]], 121: [[2, 1]],
            122: [[2, 1]], 123: [[2, 1]], 124: [[2, 1]], 125: [[2, 1]], 126: [[2, 1]], 127: [[2, 1]], 128: [[2, 1]],
            129: [[2, 1]], 130: [[2, 1]], 131: [[3, 1]], 132: [[3, 1]], 133: [[3, 1]], 134: [[3, 2]], 135: [[3, 1]],
            136: [[3, 1]], 137: [[3, 1]], 138: [[3, 1]], 139: [[3, 1]], 140: [[3, 1]], 141: [[3, 1]], 142: [[3, 1]],
            143: [[3, 1]], 144: [[3, 2]], 145: [[3, 1]], 146: [[3, 1]], 147: [[3, 1]], 148: [[3, 1], [4, 1]],
            149: [[3, 1]], 150: [[3, 1]], 151: [[3, 1]], 152: [[3, 1]], 153: [[3, 1]], 154: [[3, 1]], 155: [[3, 1]],
            156: [[3, 1]], 157: [[3, 1]], 158: [[3, 1]], 159: [[3, 1]], 160: [[3, 1]], 161: [[3, 1], [4, 1]],
            162: [[3, 2]], 163: [[3, 1], [4, 1]], 164: [[3, 1], [4, 1]], 165: [[3, 1]], 166: [[3, 2]], 167: [[3, 1]],
            168: [[3, 1]], 169: [[3, 2]], 170: [[3, 1], [4, 2]], 171: [[3, 1]], 172: [[3, 1]], 173: [[3, 1]],
            174: [[3, 1]], 175: [[3, 1]], 176: [[3, 1]], 177: [[3, 1]], 178: [[3, 1]], 179: [[3, 1]], 180: [[3, 1]],
            181: [[3, 2], [4, 1]], 182: [[3, 1], [4, 1]], 183: [[3, 1]], 184: [[3, 1]], 185: [[3, 1]], 186: [[3, 1]],
            187: [[4, 1]], 188: [[4, 1]], 189: [[4, 1]], 190: [[4, 1]], 191: [[4, 1]], 192: [[4, 1]], 193: [[4, 1]],
            194: [[4, 1]], 195: [[4, 1]], 196: [[4, 1]], 197: [[4, 1]], 198: [[4, 1]], 199: [[4, 1]], 200: [[4, 1]],
            201: [[4, 1]], 202: [[4, 1]], 203: [[4, 2]], 204: [[4, 1]], 205: [[4, 1]], 206: [[4, 1]], 207: [[4, 1]],
            208: [[4, 2]], 209: [[4, 1]], 210: [[4, 1]], 211: [[4, 1]], 212: [[4, 1]], 213: [[4, 1]], 214: [[4, 1]],
            215: [[4, 1]], 216: [[4, 1]], 217: [[4, 1]], 218: [[4, 1]], 219: [[4, 1]], 220: [[4, 1]], 221: [[4, 1]],
            222: [[4, 1]], 223: [[4, 1]], 224: [[4, 1]], 225: [[4, 1]], 226: [[4, 1]], 227: [[4, 1]], 228: [[4, 1]],
            229: [[4, 1]], 230: [[4, 1]], 231: [[4, 1]], 232: [[4, 1]], 233: [[4, 1]], 234: [[4, 1]], 235: [[4, 2]],
            236: [[4, 1]]}
  75.  
  76.  
  77. def main():
  78.     # code for testing offline
  79.     if len(sys.argv) < 2:
  80.         print('usage: ./retriever.py term [term ...]')
  81.         sys.exit(1)
  82.     query_terms = sys.argv[1:]
  83.  
  84.     answer = []
  85.     answer = retrieve_vector(query_terms)
  86.  
  87.     # print ('Query: ', query_terms)
  88.     # i = 0
  89.     # for docid in answer:
  90.     #    i += 1
  91.     #    print (i, docids[docid[0]])
  92.  
  93.     print(answer)
  94.  
  95.  
  96. def retrieve_vector(query_terms):
  97.     # a function to perform vector model retrieval with tf*idf weighting
  98.  
  99.     global docids  # list of doc names - the index is the docid (i.e. 0-4)
  100.     global doclength  # number of terms in each document
  101.     global vocab  # list of terms found (237) - the index is the termid
  102.     global postings  # postings dictionary; the key is a termid
  103.     # the value is a list of postings entries,
  104.     # each of which is a list containing a docid and frequency
  105.     answer = []
  106.  
  107.     merge = []
  108.     idf = {}
  109.     scores = {}
  110.     query_vector = []
  111.  
  112.     query_set = set(query_terms)
  113.  
  114.     for term in query_set:
  115.         try:
  116.             termid = vocab.index(term.lower())
  117.         except: # the term is not in the vocab
  118.             print("not found: ", term, "is not in the vocabulary")
  119.             continue
  120.         idf[termid] = (1 + math.log(len(postings.get(termid))))/(len(doclength))
  121.  
  122.         # print ("retrieve_vector: term = ", term, "termid = ", termid, "idf = ", idf[termid])
  123.  
  124.         # create merge_list in descending idf order will need cut-off
  125.         # heuristic to avoid including low-value terms in calculation
  126.  
  127.         i = -1
  128.         # now calculate tf*idf and score for each doc and the query
  129.         for termid in sorted(idf, key=idf.get, reverse=True):
  130.             i += 1
  131.             query_vector.append(idf[termid]/len(query_set))
  132.  
  133.             for post in postings.get(termid):
  134.                 print("post[0] = ", post[0],
  135.                       "post[1] = ", post[1],
  136.                       "idf = ", idf.get(termid),
  137.                       "doclength = ", doclength.get(post[0])
  138.                       )
  139.                 if post[0] in scores:
  140.                     scores[post[0]] += (idf.get(termid) * post[1]) / doclength.get(post[0]) * query_vector[i]
  141.                 else:
  142.                     scores[post[0]] = (idf.get(termid) * post[1]) / doclength.get(post[0]) * query_vector[i]
  143.                 # print((idf.get(termid)* post[1])/doclength.get(post[0]))
  144.  
  145.         for docid in sorted(scores, key=scores.get, reverse=True):
  146.             print("retrieve_vector: docid = ", docid, "score = ", scores.get(docid))
  147.             #answer.append([docid, scores.get(docid)]) # testing only
  148.             answer.append(docid)
  149.  
  150.     return answer
  151.  
# Standard boilerplate: call main() only when this file is run as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top