Guest User

Untitled

a guest
Feb 18th, 2019
83
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.90 KB | None | 0 0
  1. import nltk
  2. from nltk.corpus import stopwords
  3. from nltk import word_tokenize
  4.  
  5. nltk.download('stopwords')
  6.  
  7.  
  8. class InvertedIndex:
  9. def __init__(self):
  10. self.inv_index = {}
  11.  
  12. # Magic method to get value
  13. def __getitem__(self, term):
  14. normalized_term = self.normalize(term) # Normalize term
  15. if normalized_term in self.inv_index:
  16. return self.inv_index[normalized_term] # Look in Inverted index dictionary
  17. else:
  18. return "Term is not indexed. Please try different term."
  19.  
  20. def normalize(self, term):
  21. stemmer = nltk.stem.PorterStemmer()
  22. normalized_term = stemmer.stem(term)
  23. return normalized_term.lower()
  24.  
  25. def add_term(self, term, docID):
  26. normalized_term = self.normalize(term)
  27. if normalized_term in self.inv_index:
  28. # If that term already exist in our inverted index data structure
  29. self.inv_index[normalized_term].append(docID)
  30. else:
  31. self.inv_index[normalized_term] = [docID]
  32.  
  33. def add_document(self, document, id):
  34. stop_words = set(stopwords.words('english'))
  35.  
  36. word_tokens = word_tokenize(document) # Divide documents into words
  37. for w in word_tokens:
  38. if not w in stop_words: # Check if word is stop word or not . If not than append in filtered sentence list
  39. self.add_term(self.normalize(w), id + 1) # Add normalized term to inverted index data structure
  40.  
  41. def build_index(self, corpus):
  42. for i, document in enumerate(corpus):
  43. self.add_document(document, i)
  44.  
  45.  
  46. if __name__ == '__main__':
  47. a = InvertedIndex()
  48. # You can add your own document
  49. a.build_index(["Hello My Name is XYZ", "Who are you XYZ", "Oh I like your name XYZ",
  50. "Who is the person behind you XYZ"])
  51. term = input("Please input string you want to find documents --- ")
  52. print("Document ID for ", term, " --- ", a[term])
Add Comment
Please, Sign In to add comment