Guest User

Untitled

a guest
Apr 20th, 2018
73
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.03 KB | None | 0 0
# The text must already have gone through word tokenization before these functions are called.
  2.  
  3. def tf(sudahDiTokenize): #Term Frequency
  4. wordlist = sudahDiTokenize
  5. #flat_list = [item for sublist in wordlist for item in sublist] #bila memakai tf normalized
  6. #jumkata = len(flat_list) # bila memakai tf normalized
  7. wordfreq = {}
  8. for w in wordlist:
  9. for o in w:
  10. wordfreq[o] = wordfreq.get(o,0) + 1
  11. #wordfreq.update((x, y/jumkata) for x, y in wordfreq.items()) #Gunakan ini apabila memakai TF Normalized
  12. print(wordfreq)
  13. return wordfreq
  14.  
  15. def idf(sudahDiTokenize): #Inverse Term Frequency
  16. idf_values = {}
  17. jumdok = len(sudahDiTokenize)
  18. all_tokens_set = set([item for sublist in sudahDiTokenize for item in sublist])
  19. print(all_tokens_set)
  20. for tkn in all_tokens_set:
  21. contains_token = map(lambda doc: tkn in doc, sudahDiTokenize)
  22. idf_values[tkn] = math.log10(jumdok/(sum(contains_token)))
  23. print(idf_values)
  24. return idf_values
  25.  
  26. def tfxidf(tf,idf):
  27. hasil = {k: tf[k]*idf[k] for k in tf}
  28. print(hasil)
Add Comment
Please, Sign In to add comment