SHARE
TWEET

Untitled

DragonNecromancer Feb 17th, 2019 61 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. # manual
  2. import csv
  3. import nltk
  4. import re
  5. import pandas as pd
  6. import string
  7. from collections import Counter
  8. from nltk.corpus import stopwords
  9. from nltk.tokenize import word_tokenize
  10. from nltk.lm import counter
  11. from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
  12. from pandas import ExcelWriter
  13. from pandas import ExcelFile
  14.  
  15.  
  16. files = csv.reader(open('2017 ganjil TI.csv', 'rt'), delimiter = ',')
  17. all_documents = []
  18. hasil = []
  19. for i in files :
  20.     # case folding: mengubah menjadi huruf kecil
  21.     all_documents = i[5].lower()
  22.     # cleansing: menghilangkan tanda baca
  23.     result = re.sub('[^A-Za-z]+', ' ', all_documents)
  24.     # melakukan stemming: merubah menjadi kata dasar
  25.     factory = StemmerFactory()
  26.     stemmer = factory.create_stemmer()
  27.     katadasar = stemmer.stem(result)
  28.     # tokenisasi: memisahkan setiap kata pada kalimat
  29.     word_list = set(stopwords.words('indonesian'))
  30.     word_list2 = [w.strip() for w in word_list if w.strip() not in nltk.corpus.stopwords.words('indonesian')]
  31.     # custom kata untuk dilakukan stop word
  32.     CustomListofWordstoExclude = ['pak', 'bapak', 'terimakasih', 'terima', 'kasih', 'hehehe', 'hehe', 'haha', 'hahaha', 'hihihi', 'hihi','bu']
  33.     # menambahkan list kata ke dalam file stopword
  34.     word_list2.extend(CustomListofWordstoExclude)
  35.     # tokenisasi: memisahkan setiap kata pada sebuah kalimat
  36.     tokenize = katadasar.split(" ")
  37.     z = tokenize
  38.     # print(z)
  39.     akhir = [i for i in z if not i in word_list2 and i != '']
  40.     if akhir != []:
  41.         test = ' '.join(akhir)
  42.         hasil.append(test)
  43.  
  44.         # print(akhir)
  45. # Cek hasil yang terakhir
  46. print(hasil)
  47. df = pd.DataFrame(hasil, columns=['test%'])
  48. writer = pd.ExcelWriter('hasilpreprop.xlsx', engine='xlsxwriter')
  49. df.to_excel(writer, sheet_name='Sheet1')
  50. writer.save()
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top