#!/usr/bin/python3

import os
import sys
import math

docids = ["./LookingGlass/lg001.html/", "./LookingGlass/lg002.html/", "./LookingGlass/lg003.html/",
          "./LookingGlass/lg004.html/", "./LookingGlass/lg005.html/"]
doclength = {0: 66, 1: 80, 2: 101, 3: 152, 4: 122}
vocab = ["1", "looking", "glass", "house", "one", "thing", "was", "certain", "that", "the", "white", "kitten", "had",
         "nothing", "to", "do", "with", "it", "black", "fault", "entirely", "for", "been", "having", "its", "face",
         "washed", "by", "old", "cat", "last", "quarter", "of", "an", "hour", "and", "bearing", "pretty", "well",
         "considering", "so", "you", "see", "couldn't", "have", "any", "hand", "in", "mischief", "2", "way", "dinah",
         "her", "children", "faces", "this", "first", "she", "held", "poor", "down", "ear", "paw", "then", "other",
         "rubbed", "all", "over", "wrong", "beginning", "at", "nose", "just", "now", "as", "i", "said", "hard", "work",
         "on", "which", "lying", "quite", "still", "trying", "purrno", "doubt", "feeling", "meant", "good", "3", "but",
         "finished", "earlier", "afternoon", "while", "alice", "sitting", "curled", "up", "a", "corner", "great", "arm",
         "chair", "half", "talking", "herself", "asleep", "grand", "game", "romps", "ball", "worsted", "wind",
         "rolling", "till", "come", "undone", "again", "there", "spread", "hearth", "rug", "knots", "tangles",
         "running", "after", "own", "tail", "middle", "4", "oh", "wicked", "little", "thing'", "cried", "catching",
         "giving", "kiss", "make", "understand", "disgrace", "really", "ought", "taught", "better", "manners", "know",
         "ought'", "added", "reproachfully", "speaking", "cross", "voice", "could", "manage", "scrambled", "back",
         "into", "taking", "began", "winding", "\u0001", "not", "get", "very", "fast", "time", "sometimes", "kitty",
         "sat", "demurely", "knee", "pretending", "watch", "progress", "putting", "out", "gently", "touching", "if",
         "would", "be", "glad", "help", "might", "5", "do", "what", "morrow", "is", "kitty?'", "you'd", "guessed",
         "you'd", "window", "meonly", "making", "tidy", "watching", "boys", "getting", "sticks", "bonfireand", "wants",
         "plenty", "only", "got", "cold", "snowed", "they", "leave", "off", "never", "mind", "we'll", "go", "bonfire",
         "morrow'", "here", "wound", "two", "or", "three", "turns", "round", "neck", "how", "look", "led", "scramble",
         "rolled", "upon", "floor", "yards", "unwound"]
postings = {0: [[0, 2], [1, 1], [2, 1], [3, 1], [4, 1]], 1: [[0, 1], [1, 1], [2, 1], [3, 2], [4, 1]],
            2: [[0, 1], [1, 1], [2, 1], [3, 1], [4, 1]], 3: [[0, 1], [1, 1], [2, 1], [3, 1], [4, 1]],
            4: [[0, 1], [1, 1], [3, 1]], 5: [[0, 1], [1, 1]], 6: [[0, 2], [1, 4], [2, 2], [3, 2], [4, 2]], 7: [[0, 1]],
            8: [[0, 2], [1, 1], [3, 1]], 9: [[0, 6], [1, 6], [2, 8], [3, 11], [4, 8]], 10: [[0, 2], [1, 1]],
            11: [[0, 3], [1, 1], [2, 3], [3, 3], [4, 1]], 12: [[0, 4], [2, 5], [4, 1]], 13: [[0, 1]],
            14: [[0, 1], [1, 1], [2, 2], [3, 6], [4, 5]], 15: [[0, 1]], 16: [[0, 1], [1, 2], [2, 3], [3, 1], [4, 1]],
            17: [[0, 3], [1, 1], [2, 3], [3, 5], [4, 5]], 18: [[0, 1], [2, 1]], 19: [[0, 1]], 20: [[0, 1]],
            21: [[0, 2], [1, 1], [4, 1]], 22: [[0, 1], [2, 4], [4, 1]], 23: [[0, 1], [2, 1]],
            24: [[0, 1], [1, 3], [2, 1]], 25: [[0, 1], [1, 1]], 26: [[0, 1], [1, 1]], 27: [[0, 1], [1, 1]],
            28: [[0, 1], [3, 1]], 29: [[0, 1], [3, 1]], 30: [[0, 1]], 31: [[0, 1]],
            32: [[0, 1], [2, 3], [3, 1], [4, 3]], 33: [[0, 1]], 34: [[0, 1]],
            35: [[0, 1], [1, 3], [2, 6], [3, 9], [4, 4]], 36: [[0, 1]], 37: [[0, 1]], 38: [[0, 1]], 39: [[0, 1]],
            40: [[0, 1], [2, 1], [4, 3]], 41: [[0, 1], [3, 5], [4, 3]], 42: [[0, 1], [4, 2]], 43: [[0, 1]],
            44: [[0, 1], [3, 1], [4, 1]], 45: [[0, 1]], 46: [[0, 1]], 47: [[0, 1], [2, 3], [3, 2], [4, 3]],
            48: [[0, 1]], 49: [[1, 1]], 50: [[1, 2]], 51: [[1, 1], [3, 2], [4, 1]], 52: [[1, 1], [3, 2]], 53: [[1, 1]],
            54: [[1, 1]], 55: [[1, 1], [4, 1]], 56: [[1, 1]], 57: [[1, 3], [3, 5]], 58: [[1, 1]], 59: [[1, 1]],
            60: [[1, 1], [2, 1], [4, 1]], 61: [[1, 1]], 62: [[1, 2], [3, 1]], 63: [[1, 1], [3, 2]], 64: [[1, 1]],
            65: [[1, 1]], 66: [[1, 2], [2, 2], [3, 1]], 67: [[1, 1], [2, 1]], 68: [[1, 1]], 69: [[1, 1]],
            70: [[1, 2], [3, 1]], 71: [[1, 1]], 72: [[1, 1], [4, 1]], 73: [[1, 1], [3, 1]], 74: [[1, 1], [3, 4]],
            75: [[1, 1], [4, 1]], 76: [[1, 1]], 77: [[1, 1]], 78: [[1, 1]], 79: [[1, 1], [3, 2]], 80: [[1, 1], [4, 1]],
            81: [[1, 1]], 82: [[1, 1]], 83: [[1, 1]], 84: [[1, 1], [2, 1]], 85: [[1, 1]], 86: [[1, 1]], 87: [[1, 1]],
            88: [[1, 1]], 89: [[1, 1]], 90: [[2, 1]], 91: [[2, 1], [3, 1]], 92: [[2, 1]], 93: [[2, 1]], 94: [[2, 1]],
            95: [[2, 1]], 96: [[2, 2], [3, 1], [4, 2]], 97: [[2, 1]], 98: [[2, 1]], 99: [[2, 3], [3, 2], [4, 1]],
            100: [[2, 2], [3, 2], [4, 1]], 101: [[2, 1]], 102: [[2, 1]], 103: [[2, 1], [3, 1]], 104: [[2, 1], [3, 1]],
            105: [[2, 2]], 106: [[2, 1], [3, 1]], 107: [[2, 1], [3, 1]], 108: [[2, 1]], 109: [[2, 1]], 110: [[2, 1]],
            111: [[2, 1]], 112: [[2, 1], [3, 2], [4, 1]], 113: [[2, 1], [3, 1], [4, 1]], 114: [[2, 1]], 115: [[2, 1]],
            116: [[2, 1]], 117: [[2, 1]], 118: [[2, 1]], 119: [[2, 1], [3, 1], [4, 1]], 120: [[2, 1]], 121: [[2, 1]],
            122: [[2, 1]], 123: [[2, 1]], 124: [[2, 1]], 125: [[2, 1]], 126: [[2, 1]], 127: [[2, 1]], 128: [[2, 1]],
            129: [[2, 1]], 130: [[2, 1]], 131: [[3, 1]], 132: [[3, 1]], 133: [[3, 1]], 134: [[3, 2]], 135: [[3, 1]],
            136: [[3, 1]], 137: [[3, 1]], 138: [[3, 1]], 139: [[3, 1]], 140: [[3, 1]], 141: [[3, 1]], 142: [[3, 1]],
            143: [[3, 1]], 144: [[3, 2]], 145: [[3, 1]], 146: [[3, 1]], 147: [[3, 1]], 148: [[3, 1], [4, 1]],
            149: [[3, 1]], 150: [[3, 1]], 151: [[3, 1]], 152: [[3, 1]], 153: [[3, 1]], 154: [[3, 1]], 155: [[3, 1]],
            156: [[3, 1]], 157: [[3, 1]], 158: [[3, 1]], 159: [[3, 1]], 160: [[3, 1]], 161: [[3, 1], [4, 1]],
            162: [[3, 2]], 163: [[3, 1], [4, 1]], 164: [[3, 1], [4, 1]], 165: [[3, 1]], 166: [[3, 2]], 167: [[3, 1]],
            168: [[3, 1]], 169: [[3, 2]], 170: [[3, 1], [4, 2]], 171: [[3, 1]], 172: [[3, 1]], 173: [[3, 1]],
            174: [[3, 1]], 175: [[3, 1]], 176: [[3, 1]], 177: [[3, 1]], 178: [[3, 1]], 179: [[3, 1]], 180: [[3, 1]],
            181: [[3, 2], [4, 1]], 182: [[3, 1], [4, 1]], 183: [[3, 1]], 184: [[3, 1]], 185: [[3, 1]], 186: [[3, 1]],
            187: [[4, 1]], 188: [[4, 1]], 189: [[4, 1]], 190: [[4, 1]], 191: [[4, 1]], 192: [[4, 1]], 193: [[4, 1]],
            194: [[4, 1]], 195: [[4, 1]], 196: [[4, 1]], 197: [[4, 1]], 198: [[4, 1]], 199: [[4, 1]], 200: [[4, 1]],
            201: [[4, 1]], 202: [[4, 1]], 203: [[4, 2]], 204: [[4, 1]], 205: [[4, 1]], 206: [[4, 1]], 207: [[4, 1]],
            208: [[4, 2]], 209: [[4, 1]], 210: [[4, 1]], 211: [[4, 1]], 212: [[4, 1]], 213: [[4, 1]], 214: [[4, 1]],
            215: [[4, 1]], 216: [[4, 1]], 217: [[4, 1]], 218: [[4, 1]], 219: [[4, 1]], 220: [[4, 1]], 221: [[4, 1]],
            222: [[4, 1]], 223: [[4, 1]], 224: [[4, 1]], 225: [[4, 1]], 226: [[4, 1]], 227: [[4, 1]], 228: [[4, 1]],
            229: [[4, 1]], 230: [[4, 1]], 231: [[4, 1]], 232: [[4, 1]], 233: [[4, 1]], 234: [[4, 1]], 235: [[4, 2]],
            236: [[4, 1]]}
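
# Index layout, for reference: vocab[t] is the term with termid t (e.g.
# vocab[9] == "the"); postings[t] is its list of [docid, frequency] pairs (e.g.
# postings[9] == [[0, 6], [1, 6], [2, 8], [3, 11], [4, 8]] means "the" occurs
# 6 times in doc 0, 6 times in doc 1, 8 times in doc 2, and so on); and
# doclength[d] is the total number of terms in document d.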


def main():
    # code for testing offline
    if len(sys.argv) < 2:
        print('usage: ./retriever.py term [term ...]')
        sys.exit(1)
    query_terms = sys.argv[1:]

    answer = []
    answer = retrieve_vector(query_terms)

    # print('Query: ', query_terms)
    # i = 0
    # for docid in answer:
    #     i += 1
    #     print(i, docids[docid[0]])

    print(answer)


def retrieve_vector(query_terms):
    # a function to perform vector model retrieval with tf*idf weighting

    global docids     # list of doc names - the index is the docid (i.e. 0-4)
    global doclength  # number of terms in each document
    global vocab      # list of terms found (237) - the index is the termid
    global postings   # postings dictionary; the key is a termid,
                      # the value is a list of postings entries,
                      # each of which is a list containing a docid and frequency
    answer = []

    merge = []
    idf = {}
    scores = {}
    query_vector = []

    query_set = set(query_terms)

    for term in query_set:
        try:
            termid = vocab.index(term.lower())
        except ValueError:  # the term is not in the vocab
            print("not found: ", term, "is not in the vocabulary")
            continue
        idf[termid] = (1 + math.log(len(postings.get(termid)))) / len(doclength)

        # print("retrieve_vector: term = ", term, "termid = ", termid, "idf = ", idf[termid])

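    # Worked example of the weight above (illustration only, using the data at
    # the top of this file): for the query term "the", vocab.index("the") == 9,
    # postings[9] lists all 5 documents, and len(doclength) == 5, so
    # idf[9] = (1 + ln(5)) / 5, roughly 0.52. Note this is the weighting as
    # written here; a textbook idf would instead be log(N / df).
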
    # create merge_list in descending idf order; will need a cut-off
    # heuristic to avoid including low-value terms in the calculation

    i = -1
    # now calculate tf*idf and score for each doc and the query
    for termid in sorted(idf, key=idf.get, reverse=True):
        i += 1
        query_vector.append(idf[termid] / len(query_set))

        for post in postings.get(termid):
            print("post[0] = ", post[0],
                  "post[1] = ", post[1],
                  "idf = ", idf.get(termid),
                  "doclength = ", doclength.get(post[0])
                  )
            if post[0] in scores:
                scores[post[0]] += (idf.get(termid) * post[1]) / doclength.get(post[0]) * query_vector[i]
            else:
                scores[post[0]] = (idf.get(termid) * post[1]) / doclength.get(post[0]) * query_vector[i]
            # print((idf.get(termid) * post[1]) / doclength.get(post[0]))

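    # In other words: for each query term t and each document d in its postings
    # list, the loop above adds
    #     (idf[t] * tf[t, d] / doclength[d]) * (idf[t] / len(query_set))
    # to scores[d], i.e. the document's length-normalised tf*idf weight for t
    # multiplied by the query's weight for t.
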
    for docid in sorted(scores, key=scores.get, reverse=True):
        print("retrieve_vector: docid = ", docid, "score = ", scores.get(docid))
        # answer.append([docid, scores.get(docid)])  # testing only
        answer.append(docid)

    return answer


# Standard boilerplate to call the main() function
if __name__ == '__main__':
    main()
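
# Example invocation (illustrative; the exact numbers depend on the query):
#
#   $ ./retriever.py kitten worsted
#
# This prints the per-posting debug lines, then one "retrieve_vector: docid ..."
# line per matching document in descending score order, and finally the list of
# docids with the best match first. Each docid indexes into docids, e.g.
# docids[0] is "./LookingGlass/lg001.html/".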