Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- import pandas as pd
- import nltk
- import numpy as np
- import string
- from nltk import pos_tag
- from nltk.corpus import stopwords
- from nltk.tokenize import WhitespaceTokenizer
- from nltk.stem import WordNetLemmatizer
- from nltk.corpus import wordnet
- # nltk.download('stopwords')
- # nltk.download('averaged_perceptron_tagger')
- # nltk.download('wordnet')
- # nltk.download('vader_lexicon')
# Load the raw reviews CSV: two unnamed columns -> review text and numeric rating.
colnames = ['Review', 'Rating']
# NOTE(review): the original passed BOTH sep='"' and delimiter="," — pandas
# treats these as aliases (modern pandas raises on the conflict; older pandas
# let delimiter win), so only the comma delimiter ever took effect. The
# conflicting sep argument is dropped here; parsing is unchanged.
reviews = pd.read_csv("j:/Desktop/PNJReviews/EXCELSIORGRANDHOTEL1.csv",
                      names=colnames, header=None, delimiter=",",
                      engine='python', encoding="utf-8")
print(reviews.head(10))
def get_wordnet_pos(pos_tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Tags starting with J/V/N/R map to adjective/verb/noun/adverb; anything
    else falls back to NOUN (the WordNet lemmatizer's own default).

    NOTE(review): the parameter shadows the module-level `pos_tag` import
    from nltk; kept as-is so keyword callers are not broken.
    """
    tag_to_wordnet = {
        'J': wordnet.ADJ,   # adjective
        'V': wordnet.VERB,  # verb
        'N': wordnet.NOUN,  # noun
        'R': wordnet.ADV,   # adverb
    }
    # pos_tag[:1] is '' for an empty tag, which also falls through to NOUN,
    # matching the original startswith() chain.
    return tag_to_wordnet.get(pos_tag[:1], wordnet.NOUN)
def clean_text(text):
    """Normalize a raw review string into a lemmatized, space-joined string.

    Pipeline: lowercase -> split on spaces and strip surrounding punctuation
    -> drop tokens containing digits -> drop empty tokens and English
    stopwords -> POS-tag -> lemmatize each token with its POS -> drop
    single-character tokens -> re-join with spaces.
    """
    text = text.lower()
    # Tokenize on single spaces; strip leading/trailing punctuation per token.
    tokens = [word.strip(string.punctuation) for word in text.split(" ")]
    # Remove any token that contains a digit.
    tokens = [word for word in tokens if not any(c.isdigit() for c in word)]
    # Fix: use a set for O(1) membership (the original tested against a list),
    # and fold the empty-token filter into the same pass. Output is identical.
    stop = set(stopwords.words('english'))
    tokens = [t for t in tokens if t and t not in stop]
    # Fix: build ONE lemmatizer instead of constructing a new
    # WordNetLemmatizer for every single token as the original did.
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(tag))
              for word, tag in pos_tag(tokens)]
    # Drop single-character leftovers.
    tokens = [t for t in tokens if len(t) > 1]
    return " ".join(tokens)
- reviews["Review_clean"] = reviews["Review"].apply(lambda x: clean_text(x)) # "czyści" recenzje
- print(reviews[["Review","Review_clean"]].head(10))
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=10: keep only terms that appear in at least 10 DOCUMENTS (the
# original comment claimed "occurring minimum 10 times", which is not what
# min_df means).
tfidf = TfidfVectorizer(min_df=10)
# One TF-IDF weight per (review, vocabulary term), as a dense array.
tfidf_result = tfidf.fit_transform(reviews["Review_clean"]).toarray()
# Fix: get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2; get_feature_names_out() returns the same vocabulary terms. (Requires
# sklearn >= 1.0 — confirm against the project's pinned version.)
tfidf_df = pd.DataFrame(tfidf_result, columns=tfidf.get_feature_names_out())
# (The original also reassigned tfidf_df.columns to a copy of itself — a
# no-op, removed.)
tfidf_df.index = reviews.index  # align row indices so concat matches rows
reviews = pd.concat([reviews, tfidf_df], axis=1)
print(reviews.head())
- label = "Rating" # zmienna objaśniana
- ignore_cols = [label, "Review", "Review_clean"]
- features = [c for c in reviews.columns if c not in ignore_cols] # zmienne objaśniające
- from sklearn.model_selection import train_test_split # podział na zbiór uczący i testowy
- X_train, X_test, y_train, y_test = train_test_split(reviews[features], reviews[label], test_size = 0.2, random_state=42)
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the TF-IDF features.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_test_predict = lr.predict(X_test)

# Clamp predictions into the valid rating range [10, 50]. np.clip replaces
# the original pair of np.where calls (upper then lower bound) — identical
# result, one call. (A dead commented-out np.round line was removed.)
y_test_predict = np.clip(y_test_predict, 10, 50)

# Persist predictions and ground truth side outputs for external comparison.
pd.DataFrame(y_test_predict).to_csv('j:/Desktop/PNJReviews/PredictedOutput.csv', index=None, header=None, sep=';')
pd.DataFrame(y_test).to_csv('j:/Desktop/PNJReviews/TestOutput.csv', index=None, header=None, sep=';')

from sklearn.metrics import mean_squared_error
from math import sqrt

# Root-mean-squared error on the held-out test set.
rms = sqrt(mean_squared_error(y_test, y_test_predict))
print(rms)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement