Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Input text must already be word-tokenized before it is passed to these functions.
def tf(sudahDiTokenize):  # Term Frequency
    """Count how often each token occurs across all documents.

    Args:
        sudahDiTokenize: An iterable of documents, where each document is an
            iterable of hashable tokens (text that has already been
            word-tokenized).

    Returns:
        A plain ``dict`` mapping every token to its raw occurrence count
        summed over all documents, keyed in first-seen order.

    Side effects:
        Prints the resulting dict to stdout (kept for backward compatibility).
    """
    from collections import Counter
    from itertools import chain

    # Flatten the documents into one token stream and count it in a single
    # pass; convert back to a plain dict so the printed and returned value
    # keeps the same type as before.
    # For a normalized TF, each count would additionally be divided by the
    # total number of tokens in the corpus.
    wordfreq = dict(Counter(chain.from_iterable(sudahDiTokenize)))
    print(wordfreq)
    return wordfreq
def idf(sudahDiTokenize):  # Inverse Document Frequency
    """Compute the inverse document frequency of every token in the corpus.

    For each distinct token the value is ``log10(N / df)``, where ``N`` is
    the number of documents and ``df`` is the number of documents that
    contain the token at least once.

    Args:
        sudahDiTokenize: A sized collection (e.g. a list) of documents, where
            each document is an iterable of hashable tokens.

    Returns:
        A dict mapping each distinct token to its IDF value. A token that
        appears in every document gets ``0.0``. An empty corpus yields an
        empty dict.

    Side effects:
        Prints the set of distinct tokens and then the resulting dict to
        stdout (kept for backward compatibility).
    """
    import math
    from collections import Counter

    jumdok = len(sudahDiTokenize)

    # Count, for every token, the number of documents containing it. Each
    # document is reduced to a set first so repeated tokens inside a single
    # document are counted once, and the corpus is traversed only one time
    # instead of once per vocabulary entry.
    document_frequency = Counter()
    for document in sudahDiTokenize:
        document_frequency.update(set(document))

    print(set(document_frequency))

    # Every token here was seen in at least one document, so the divisor is
    # never zero.
    idf_values = {
        token: math.log10(jumdok / count)
        for token, count in document_frequency.items()
    }
    print(idf_values)
    return idf_values
def tfxidf(tf, idf):
    """Multiply term frequencies by their inverse document frequencies.

    Args:
        tf: A dict mapping each token to its term frequency.
        idf: A dict mapping each token to its inverse document frequency.
            It must contain every key present in ``tf``.

    Returns:
        A dict mapping each token in ``tf`` to ``tf[token] * idf[token]``.

    Raises:
        KeyError: If a token in ``tf`` has no entry in ``idf``.

    Side effects:
        Prints the resulting dict to stdout (kept for backward compatibility).
    """
    # NOTE: inside this function the names ``tf`` and ``idf`` are the
    # argument dicts, not the module-level functions of the same name.
    hasil = {token: frequency * idf[token] for token, frequency in tf.items()}
    print(hasil)
    # Return the scores so callers can use them; previously the result was
    # only printed and the function returned None.
    return hasil
Add Comment
Please, Sign In to add comment