Advertisement
DragonNecromancer

Untitled

Feb 17th, 2019
97
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.79 KB | None | 0 0
  1. # manual
  2. import csv
  3. import nltk
  4. import re
  5. import pandas as pd
  6. import string
  7. from collections import Counter
  8. from nltk.corpus import stopwords
  9. from nltk.tokenize import word_tokenize
  10. from nltk.lm import counter
  11. from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
  12. from pandas import ExcelWriter
  13. from pandas import ExcelFile
  14.  
  15.  
  16. files = csv.reader(open('2017 ganjil TI.csv', 'rt'), delimiter = ',')
  17. all_documents = []
  18. hasil = []
  19. for i in files :
  20.     # case folding: mengubah menjadi huruf kecil
  21.     all_documents = i[5].lower()
  22.     # cleansing: menghilangkan tanda baca
  23.     result = re.sub('[^A-Za-z]+', ' ', all_documents)
  24.     # melakukan stemming: merubah menjadi kata dasar
  25.     factory = StemmerFactory()
  26.     stemmer = factory.create_stemmer()
  27.     katadasar = stemmer.stem(result)
  28.     # tokenisasi: memisahkan setiap kata pada kalimat
  29.     word_list = set(stopwords.words('indonesian'))
  30.     word_list2 = [w.strip() for w in word_list if w.strip() not in nltk.corpus.stopwords.words('indonesian')]
  31.     # custom kata untuk dilakukan stop word
  32.     CustomListofWordstoExclude = ['pak', 'bapak', 'terimakasih', 'terima', 'kasih', 'hehehe', 'hehe', 'haha', 'hahaha', 'hihihi', 'hihi','bu']
  33.     # menambahkan list kata ke dalam file stopword
  34.     word_list2.extend(CustomListofWordstoExclude)
  35.     # tokenisasi: memisahkan setiap kata pada sebuah kalimat
  36.     tokenize = katadasar.split(" ")
  37.     z = tokenize
  38.     # print(z)
  39.     akhir = [i for i in z if not i in word_list2 and i != '']
  40.     if akhir != []:
  41.         test = ' '.join(akhir)
  42.         hasil.append(test)
  43.  
  44.         # print(akhir)
  45. # Cek hasil yang terakhir
  46. print(hasil)
  47. df = pd.DataFrame(hasil, columns=['test%'])
  48. writer = pd.ExcelWriter('hasilpreprop.xlsx', engine='xlsxwriter')
  49. df.to_excel(writer, sheet_name='Sheet1')
  50. writer.save()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement