Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # manual
- import csv
- import nltk
- import re
- import pandas as pd
- import string
- from collections import Counter
- from nltk.corpus import stopwords
- from nltk.tokenize import word_tokenize
- from nltk.lm import counter
- from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
- from pandas import ExcelWriter
- from pandas import ExcelFile
# Preprocess the free-text column of the source CSV: case-fold, strip
# non-letters, stem with Sastrawi, drop stopwords, then export the cleaned
# sentences to an Excel sheet.

# Zero-based index of the CSV column that holds the text to preprocess.
TEXT_COLUMN = 5

# Any run of characters that are not ASCII letters becomes a single space.
NON_LETTER_RE = re.compile(r'[^A-Za-z]+')

# Custom words to treat as stopwords in addition to NLTK's Indonesian list.
CustomListofWordstoExclude = ['pak', 'bapak', 'terimakasih', 'terima', 'kasih', 'hehehe', 'hehe', 'haha', 'hahaha', 'hihihi', 'hihi', 'bu']

# Build the stopword set once, outside the row loop.  The previous code kept
# only stopwords that were NOT in the stopword list, which always produced an
# empty list, so only the custom words were ever removed.  A set also gives
# O(1) membership tests in the filter below.
stop_words = {word.strip() for word in stopwords.words('indonesian')}
stop_words.update(CustomListofWordstoExclude)

# The stemmer does not depend on the row, so create it once instead of on
# every iteration.
stemmer = StemmerFactory().create_stemmer()

hasil = []

# newline='' is the mode the csv module documents for reading files; the
# `with` block guarantees the handle is closed.
# NOTE(review): if the CSV has a header row, its label in this column is
# processed like any other row - confirm whether it should be skipped.
with open('2017 ganjil TI.csv', 'rt', newline='') as csv_file:
    for row in csv.reader(csv_file, delimiter=','):
        # Blank or short rows have no text column; skip them rather than
        # aborting the whole run with an IndexError.
        if len(row) <= TEXT_COLUMN:
            continue

        # Case folding: convert to lowercase.
        lowered = row[TEXT_COLUMN].lower()

        # Cleansing: remove punctuation, digits and other non-letters.
        cleaned = NON_LETTER_RE.sub(' ', lowered)

        # Stemming: reduce every word to its root form.
        stemmed = stemmer.stem(cleaned)

        # Tokenisation + stopword removal.  split() without an argument
        # already discards the empty tokens produced by repeated spaces.
        kept = [token for token in stemmed.split() if token not in stop_words]

        # Only keep rows that still contain at least one word.
        if kept:
            hasil.append(' '.join(kept))

# Inspect the final result.
print(hasil)

df = pd.DataFrame(hasil, columns=['test%'])

# ExcelWriter.save() was removed in pandas 2.0; the context manager writes
# and closes the workbook on exit.
with pd.ExcelWriter('hasilpreprop.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Sheet1')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement