Advertisement
gubichas

Посчитать TF-IDF

Nov 2nd, 2022 (edited)
62
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.04 KB | None | 0 0
  1. import math
  2.  
  3. from collections import Counter, defaultdict
  4.  
  5. import string
  6.  
  7.  
  8. def preprocess_text(text):
  9.     string.punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
  10.     text.lower()
  11.     for i in string.punctuation:
  12.         if i in text:
  13.             text = text.replace(i, '')
  14.     return text.split()
  15.  
  16.  
  17. def calc_tf(term, text):
  18.     tokens = preprocess_text(text)
  19.     frequency = Counter(tokens)
  20.     most_common_term, most_common_frequency = frequency.most_common(1)[0]
  21.     term_frequency = frequency.get(term)
  22.     if term_frequency:
  23.         return 0.5 + 0.5* term_frequency / most_common_frequency
  24.     else:
  25.         return 0
  26.  
  27.  
  28. def calc_idfs(corpus):
  29.     corpus1 = []
  30.     for i in corpus:
  31.         corpus1.append(set(preprocess_text(i)))
  32.  
  33.     idfs = defaultdict(float)
  34.  
  35.     for i in corpus1:
  36.         for j in i:
  37.             idfs[j] += 1
  38.  
  39.     for token, count in idfs.items():
  40.         idfs[token]= math.log(len(corpus) / (1 + count))
  41.  
  42.     return idfs
  43.  
  44. def calc_tfidf(term, text, idfs):
  45.     return calc_tf(term, text) * idfs[term]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement