Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import math
- from collections import Counter, defaultdict
- import string
def preprocess_text(text):
    """Normalise *text* and split it into a list of word tokens.

    The text is lowercased, every ASCII punctuation character listed in
    ``string.punctuation`` is removed, and the result is split on
    whitespace.

    Args:
        text: The raw input string.

    Returns:
        A list of lowercase tokens with punctuation stripped; an empty
        list when *text* contains no tokens.
    """
    # str.lower() returns a new string, so its result must be kept.
    lowered = text.lower()
    # Strip all punctuation in one pass; the stdlib constant is only read,
    # never reassigned.
    without_punctuation = lowered.translate(
        str.maketrans('', '', string.punctuation)
    )
    return without_punctuation.split()
def calc_tf(term, text):
    """Return the augmented term frequency of *term* within *text*.

    The text is tokenised with ``preprocess_text``. A term that occurs is
    scored as ``0.5 + 0.5 * count(term) / count(most frequent token)``; a
    term that does not occur scores ``0``.

    Args:
        term: The token to look up. It is compared against the tokens
            as-is, so it should already be in the form that
            ``preprocess_text`` produces.
        text: The raw document string.

    Returns:
        ``0`` when *term* is absent (including when *text* has no tokens),
        otherwise a float in the range (0.5, 1.0].
    """
    frequency = Counter(preprocess_text(text))
    term_frequency = frequency.get(term, 0)
    if not term_frequency:
        # Also covers a text with no tokens, where there is no most
        # common token to normalise by.
        return 0
    # The Counter is non-empty here, so max() always has a value.
    most_common_frequency = max(frequency.values())
    return 0.5 + 0.5 * term_frequency / most_common_frequency
def calc_idfs(corpus):
    """Compute a smoothed inverse document frequency for each corpus token.

    Each document is tokenised with ``preprocess_text`` and contributes at
    most once per distinct token. Each token's document count is then
    replaced by ``log(len(corpus) / (1 + count))``.

    Args:
        corpus: A sized collection of raw document strings.

    Returns:
        A ``defaultdict(float)`` mapping each token to its IDF value;
        looking up a token that was never seen yields ``0.0``.
    """
    idfs = defaultdict(float)

    # First pass: count how many documents contain each distinct token.
    for document in corpus:
        for token in set(preprocess_text(document)):
            idfs[token] += 1

    # Second pass: overwrite each count with its log-scaled IDF. Only the
    # values change, so iterating over the keys is safe.
    for token in idfs:
        document_count = idfs[token]
        idfs[token] = math.log(len(corpus) / (1 + document_count))

    return idfs
def calc_tfidf(term, text, idfs):
    """Return the TF-IDF weight of *term* for the document *text*.

    Args:
        term: The token to score.
        text: The raw document string, passed to ``calc_tf``.
        idfs: A mapping indexed by term that gives its IDF value
            (presumably the result of ``calc_idfs`` -- confirm with caller).

    Returns:
        The product of ``calc_tf(term, text)`` and ``idfs[term]``.
    """
    tf_weight = calc_tf(term, text)
    idf_weight = idfs[term]
    return tf_weight * idf_weight
Add Comment
Please, Sign In to add comment