alfi_gusman

tensorflow.python.framework.errors_impl.UnimplementedError: Cast string to float is not supported on

Oct 17th, 2021
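
The error in the title is most likely raised when a raw Python string reaches model.predict(), which expects a numeric tensor: TensorFlow then tries to cast the string to float and fails. In the code below, the form text is wrapped with np.asarray() and handed to the model directly (see the commented-out predict call near the end of analyse()). A minimal sketch of the missing conversion step, assuming the same tokenizer and maxlen that the script fits and uses for training:

    seq = tokenizer.texts_to_sequences([rawtext])        # text -> list of word indices
    padded = sequence.pad_sequences(seq, maxlen=maxlen)  # pad to the model's input length
    y = model.predict(padded)                            # numeric input, no string cast needed

Ideally the raw text would also go through the same preprocessing and stemming pipeline as the training data before being tokenized.
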
from flask import Flask, render_template, request, url_for
from flask_bootstrap import Bootstrap

from textblob import TextBlob, Word
import random
import time
import pandas as pd
import numpy as np

# NLTK
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
import re
import string
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter

# Keras // follow the steps below in the Jupyter notebook if you want to run it
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import load_model

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


app = Flask(__name__)
Bootstrap(app)

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/analyse', methods=['POST'])
def analyse():
    # Read the dataset file
    df = pd.read_csv('hadist2.csv')
    df.columns = ['id', 'terjemah_hadist', 'no_hadist', 'sumber', 'kitab', 'kelas', 'label']
    df = df.drop(['id', 'no_hadist', 'sumber', 'kitab', 'kelas'], axis=1)
    text = df['terjemah_hadist']
    print(text)

    df['terjemah_hadist'] = df.iloc[:, 0]
    df['label'] = df.iloc[:, 1]

    label = to_categorical(df['label'])
    print(label.shape)
    print(label)

    # Text preprocessing
    # Convert text to lower case (case folding)
    df['terjemah_hadist'] = df['terjemah_hadist'].str.lower()
    print('Hasil Case Folding : \n')
    print(df['terjemah_hadist'])
    print('\n\n')

    # Tokenization
    # Clean the text
    # remove numbers
    def remove_number(text):
        return re.sub(r"\d+", "", text)
    df['terjemah_hadist'] = df['terjemah_hadist'].apply(remove_number)

    # remove punctuation
    def remove_punctuation(text):
        return text.translate(str.maketrans("", "", string.punctuation))
    df['terjemah_hadist'] = df['terjemah_hadist'].apply(remove_punctuation)

    # remove leading & trailing whitespace
    def remove_whitespace_LT(text):
        return text.strip()
    df['terjemah_hadist'] = df['terjemah_hadist'].apply(remove_whitespace_LT)

    # collapse multiple whitespace into a single space
    def remove_whitespace_multiple(text):
        return re.sub(r'\s+', ' ', text)
    df['terjemah_hadist'] = df['terjemah_hadist'].apply(remove_whitespace_multiple)

    # remove single characters
    def remove_singl_char(text):
        return re.sub(r"\b[a-zA-Z]\b", "", text)
    df['terjemah_hadist'] = df['terjemah_hadist'].apply(remove_singl_char)

    # NLTK word tokenize
    def word_tokenize_wrapper(text):
        return word_tokenize(text)
    df['tokenized_text'] = df['terjemah_hadist'].apply(word_tokenize_wrapper)
    print('Hasil Tokenizing : \n')
    print(df['tokenized_text'].head())
    print('\n\n')

    # NLTK calc frequency distribution
    def freqDist_wrapper(text):
        return FreqDist(text)
    df['tokenized_text_fdist'] = df['tokenized_text'].apply(freqDist_wrapper)
    print('Frequency Tokens : \n')
    print(df['tokenized_text_fdist'].head().apply(lambda x: x.most_common()))
    print('\n\n')

    # Filtering (stopword removal)
    # ----- get stopwords from NLTK -----
    list_stopwords = stopwords.words('indonesian')

    # ------ manually add stopwords ------
    # append additional stopwords
    list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo',
                           'kalo', 'amp', 'biar', 'bikin', 'bilang',
                           'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
                           'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
                           'jd', 'jgn', 'sdh', 'aja', 'n', 't',
                           'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                           '&amp', 'yah'])

    # ------ add stopwords from txt file ------
    # read the stopword txt file using pandas
    txt_stopword = pd.read_csv("stopwords.txt", names=["stopwords"], header=None)

    # convert the stopword string to a list & append additional stopwords
    list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))

    # ------------------------------------------
    # convert the list to a set for fast lookup
    list_stopwords = set(list_stopwords)

    # remove stopwords from the token list
    def stopwords_removal(words):
        return [word for word in words if word not in list_stopwords]
    df['tokenized_text_WSW'] = df['tokenized_text'].apply(stopwords_removal)
    print('Hasil Stopword : \n')
    print(df['tokenized_text_WSW'].head())
    print('\n\n')

    # Normalization
    normalizad_word = pd.read_excel('normalisasi.xlsx', engine='openpyxl')

    normalizad_word_dict = {}

    for index, row in normalizad_word.iterrows():
        if row.iloc[0] not in normalizad_word_dict:
            normalizad_word_dict[row.iloc[0]] = row.iloc[1]

    def normalized_term(document):
        return [normalizad_word_dict[term] if term in normalizad_word_dict else term for term in document]
    df['text_normalized'] = df['tokenized_text_WSW'].apply(normalized_term)
    print('Hasil Normalisasi : \n')
    print(df['text_normalized'].head(10))
    print('\n\n')

    # Stemmer
    # create stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    # stemmed
    def stemmed_wrapper(term):
        return stemmer.stem(term)

    term_dict = {}

    for document in df['text_normalized']:
        for term in document:
            if term not in term_dict:
                term_dict[term] = ' '

    print(len(term_dict))
    print("------------------------")

    for term in term_dict:
        term_dict[term] = stemmed_wrapper(term)
        print(term, ":", term_dict[term])

    print(term_dict)
    print("------------------------")

    # apply stemmed terms to the dataframe
    def get_stemmed_term(document):
        return [term_dict[term] for term in document]
    df['text_tokens_stemmed'] = df['text_normalized'].swifter.apply(get_stemmed_term)
    print('Hasil Stemming : \n')
    print(df['text_tokens_stemmed'])
    print('\n\n')


    # Split the data into training and test sets
    data_train, data_testing, Y_train, Y_test = train_test_split(df['text_tokens_stemmed'], label, test_size=0.2, random_state=42)
    print('Data latih')
    print(data_train.shape, Y_train.shape)
    print('=====================')
    print('Data uji')
    print(data_testing.shape, Y_test.shape)

    # Tokenizing and padding
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data_train)
    jum_vocab = len(tokenizer.word_index)
    padding_x_train = tokenizer.texts_to_sequences(data_train)
    padding_x_testing = tokenizer.texts_to_sequences(data_testing)

    maxlen = 600
    X_train = sequence.pad_sequences(padding_x_train, maxlen=maxlen)
    X_testing = sequence.pad_sequences(padding_x_testing, maxlen=maxlen)

    # RNN
    jum_embedding = 20
    model = models.Sequential()
    model.add(Embedding(jum_vocab + 1, jum_embedding, input_length=maxlen))  # input layer
    model.add(layers.LSTM(32))  # RNN (LSTM) layer
    model.add(layers.Dense(2, activation='sigmoid'))  # output layer
    model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    #model.fit(X_train, Y_train, epochs=10, batch_size=1, validation_split=0.1)
    model.fit(X_train, Y_train, epochs=1, batch_size=1, validation_split=0.1, verbose=1)
    model.summary()

    if request.method == 'POST':
        rawtext = request.form['rawtext']
        # The model cannot take a raw string (that is what raises
        # "Cast string to float is not supported"); convert the text into a
        # padded sequence of word indices using the tokenizer fitted above.
        # Ideally the text should first pass through the same preprocessing
        # and stemming steps as the training data.
        seq = tokenizer.texts_to_sequences([rawtext])
        padded = sequence.pad_sequences(seq, maxlen=maxlen)
        y = model.predict(padded)
        print(y)
        data = y.tolist()


    return render_template('index.html', hasil=data)


if __name__ == '__main__':
    app.run(debug=True)
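
For a quick check of the /analyse route once the app is running locally, something like the following should work (assuming Flask's default development address http://127.0.0.1:5000 and the 'rawtext' form field used above; the posted text is just a placeholder):

    import requests

    resp = requests.post('http://127.0.0.1:5000/analyse', data={'rawtext': 'contoh teks hadist'})
    print(resp.status_code)
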