Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import codecs
- import time
- from sklearn.feature_extraction.text import TfidfVectorizer
class NipTester:
    """Fuzzy lookup of NIP strings against a reference list.

    Reference strings are vectorised with a character n-gram (1 to 3 chars)
    TF-IDF model; a query is compared to every reference entry via the dot
    product of their TF-IDF vectors.

    Usage: call ``init()`` once, then ``find_similar()`` as often as needed.
    ``init()`` sets the attributes ``nips``, ``vect`` and ``tfidf`` that
    ``find_similar()`` reads.
    """

    def load_reference_nips(self, path="nips.txt", copies=7):
        """Read the reference strings from *path* into ``self.nips``.

        Each non-blank line is stripped of surrounding whitespace and stored
        *copies* times in a row.

        NOTE(review): the 7x repetition looks like a way to inflate the
        database for the timing benchmark -- confirm before relying on it.

        Args:
            path: UTF-8 text file with one reference string per line.
            copies: How many times each entry is appended.
        """
        self.nips = []
        # The context manager guarantees the file handle is released even if
        # decoding fails part-way through.
        with codecs.open(path, "r", "utf8") as handle:
            for line in handle:
                line = line.strip()
                if line:
                    self.nips.extend([line] * copies)

    def init(self):
        """Load the reference list and fit the TF-IDF model on it."""
        self.load_reference_nips()
        self.vect = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), min_df=0.0, max_df=1.0)
        self.tfidf = self.vect.fit_transform(self.nips)

    def find_similar(self, nip, threshold=0.6):
        """Return the reference entries whose score against *nip* exceeds *threshold*.

        Args:
            nip: Query string to look up.
            threshold: Exclusive lower bound on the similarity score.

        Returns:
            A list of ``(reference_string, score)`` tuples in reference-list
            order; ``score`` is a plain ``float``.
        """
        nip_tfidf = self.vect.transform([nip])
        # (n_refs x n_features) * (n_features x 1) -> one score per reference.
        # Presumably a cosine similarity, since TfidfVectorizer L2-normalises
        # rows by default -- verify if the vectoriser settings change.
        # .toarray() is the portable spelling of the old ``.A`` shorthand.
        scores = (self.tfidf * nip_tfidf.T).toarray().ravel()
        return [
            (reference, float(score))
            for reference, score in zip(self.nips, scores)
            if score > threshold
        ]
# Query strings used for the timing run.
test_nips = ["986o239432", "986o239.32"]

nipTester = NipTester()

# Phase 1: time loading the reference list and fitting the model.
start = time.time()
nipTester.init()
print("Loading time: %4.2f s" % (time.time() - start))
print("Size of the reference NIP database: %d" % len(nipTester.nips))

# Phase 2: time repeated look-ups of every query string.
loops = 5
searches = 0
start = time.time()
for _ in range(loops):
    for nip in test_nips:
        matches = nipTester.find_similar(nip)
        searches += 1
        for reference, score in matches:
            print("'%s' is similar to '%s' with score of %8.5f" % (nip, reference, score))
print("Searching time: %4.2f s" % (time.time() - start))
print("Number of searches: %d" % searches)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement