Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Find pairs of documents whose TF-IDF similarity reaches a percentage
# threshold, then keep each pair only once (not in both directions).
#
# Inputs (defined outside this snippet):
#   documents   -- sequence of document strings.
#   min_percent -- similarity threshold as a percentage; anything int() accepts.
# Outputs:
#   result       -- {doc index: [(other index, score), ...]}, both directions.
#   clean_result -- same shape, with the mirrored duplicate of each pair removed.
#
# NOTE(review): assumes TfidfVectorizer / linear_kernel are scikit-learn's.
# With the vectorizer's default L2 normalisation the kernel value would be
# the cosine similarity -- confirm against the file's imports.
tfidf = TfidfVectorizer().fit_transform(documents)
# Convert the percentage into a fraction comparable with the kernel scores.
min_percent = int(min_percent) / 100.0
row_length = len(documents)

result = {}
for i in range(row_length):
    # Scores of document i against every document, as a flat 1-D array.
    row = linear_kernel(tfidf[i:i + 1], tfidf).flatten()
    for n in range(row_length):
        # Skip the self-comparison.
        if i != n and row[n] >= min_percent:
            result.setdefault(i, []).append((n, row[n]))

# Postprocessing: `result` lists a matching pair under both indices; drop
# the mirrored entry so each pair survives under one key only.
clean_result = deepcopy(result)
# .items() (not the Python-2-only .iteritems()) runs on Python 2 and 3.
for key, values in result.items():
    if key not in clean_result:
        # Every match of this key was already claimed by an earlier key.
        continue
    for value in values:
        other = value[0]
        if value not in clean_result[key] or other not in clean_result:
            continue
        # Build a filtered list instead of calling .remove() on the list
        # being iterated.
        remaining = [pair for pair in clean_result[other] if pair[0] != key]
        if remaining:
            clean_result[other] = remaining
        else:
            # No matches left for `other`: drop its key entirely.
            del clean_result[other]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement