Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

# word_tokenize() needs the 'punkt' tokenizer models in addition to the
# stop-word list; without it, indexing raises LookupError on a fresh install.
nltk.download('stopwords')
nltk.download('punkt')
class InvertedIndex:
    """Map normalized (Porter-stemmed, lower-cased) terms to document-ID lists."""

    def __init__(self):
        # term -> list of IDs of the documents containing that term
        self.inv_index = {}

    def __getitem__(self, term):
        """Return the posting list for *term*.

        Returns an explanatory string (not an exception) for unknown terms,
        preserved for backward compatibility with callers that print the result.
        """
        normalized_term = self.normalize(term)
        if normalized_term in self.inv_index:
            return self.inv_index[normalized_term]
        return "Term is not indexed. Please try different term."

    def normalize(self, term):
        """Stem *term* with the Porter stemmer and lower-case the result."""
        stemmer = nltk.stem.PorterStemmer()
        return stemmer.stem(term).lower()

    def add_term(self, term, docID):
        """Record that document *docID* contains *term*.

        *term* is normalized here. Fix: skip duplicate IDs so a term that
        occurs several times in one document is listed once per document.
        """
        normalized_term = self.normalize(term)
        postings = self.inv_index.setdefault(normalized_term, [])
        if docID not in postings:
            postings.append(docID)

    def add_document(self, document, id):
        """Tokenize *document* and index its non-stop-word tokens under ID ``id + 1``.

        Fixes: compare tokens case-insensitively against NLTK's all-lowercase
        stop-word list (previously "Who", "The", ... slipped through), and stop
        normalizing twice — add_term() already normalizes its argument.
        """
        stop_words = set(stopwords.words('english'))
        for token in word_tokenize(document):
            # Stop-word lists are lower-case; lower the token before the test.
            if token.lower() not in stop_words:
                self.add_term(token, id + 1)  # document IDs are 1-based

    def build_index(self, corpus):
        """Index every document in *corpus*; IDs are 1-based positions."""
        for i, document in enumerate(corpus):
            self.add_document(document, i)
if __name__ == '__main__':
    # Build an index over a small demo corpus; swap in your own documents.
    index = InvertedIndex()
    demo_corpus = [
        "Hello My Name is XYZ",
        "Who are you XYZ",
        "Oh I like your name XYZ",
        "Who is the person behind you XYZ",
    ]
    index.build_index(demo_corpus)
    # Interactive lookup: prints the posting list (or the not-indexed message).
    term = input("Please input string you want to find documents --- ")
    print("Document ID for ", term, " --- ", index[term])
Add Comment
Please sign in to add a comment.