Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- import pandas as pd
- import nltk
- import numpy as np
- import string
- from nltk import pos_tag
- from nltk.corpus import stopwords
- from nltk.tokenize import WhitespaceTokenizer
- from nltk.stem import WordNetLemmatizer
- from nltk.corpus import wordnet
- # nltk.download('stopwords')
- # nltk.download('averaged_perceptron_tagger')
- # nltk.download('wordnet')
- # nltk.download('vader_lexicon')
# Load the raw reviews CSV: two unnamed columns -> review text and numeric rating.
colnames = ['Review', 'Rating']
# NOTE(review): the original passed BOTH sep='"' and delimiter="," — pandas
# treats these as aliases (modern pandas raises on the conflict; older pandas
# let delimiter win), so only the comma delimiter ever took effect. The
# conflicting sep argument is dropped here; parsing is unchanged.
reviews = pd.read_csv("j:/Desktop/PNJReviews/EXCELSIORGRANDHOTEL1.csv",
                      names=colnames, header=None, delimiter=",",
                      engine='python', encoding="utf-8")
print(reviews.head(10))
def get_wordnet_pos(pos_tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Tags starting with J/V/N/R map to adjective/verb/noun/adverb; anything
    else falls back to NOUN (the WordNet lemmatizer's own default).

    NOTE(review): the parameter shadows the module-level `pos_tag` import
    from nltk; kept as-is so keyword callers are not broken.
    """
    tag_to_wordnet = {
        'J': wordnet.ADJ,   # adjective
        'V': wordnet.VERB,  # verb
        'N': wordnet.NOUN,  # noun
        'R': wordnet.ADV,   # adverb
    }
    # pos_tag[:1] is '' for an empty tag, which also falls through to NOUN,
    # matching the original startswith() chain.
    return tag_to_wordnet.get(pos_tag[:1], wordnet.NOUN)
def clean_text(text):
    """Normalize a raw review string into a lemmatized, space-joined string.

    Pipeline: lowercase -> split on spaces and strip surrounding punctuation
    -> drop tokens containing digits -> drop empty tokens and English
    stopwords -> POS-tag -> lemmatize each token with its POS -> drop
    single-character tokens -> re-join with spaces.
    """
    text = text.lower()
    # Tokenize on single spaces; strip leading/trailing punctuation per token.
    tokens = [word.strip(string.punctuation) for word in text.split(" ")]
    # Remove any token that contains a digit.
    tokens = [word for word in tokens if not any(c.isdigit() for c in word)]
    # Fix: use a set for O(1) membership (the original tested against a list),
    # and fold the empty-token filter into the same pass. Output is identical.
    stop = set(stopwords.words('english'))
    tokens = [t for t in tokens if t and t not in stop]
    # Fix: build ONE lemmatizer instead of constructing a new
    # WordNetLemmatizer for every single token as the original did.
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(tag))
              for word, tag in pos_tag(tokens)]
    # Drop single-character leftovers.
    tokens = [t for t in tokens if len(t) > 1]
    return " ".join(tokens)
- reviews["Review_clean"] = reviews["Review"].apply(lambda x: clean_text(x)) # "czyści" recenzje
- print(reviews[["Review","Review_clean"]].head(10))
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=10: keep only terms that appear in at least 10 DOCUMENTS (the
# original comment claimed "occurring minimum 10 times", which is not what
# min_df means).
tfidf = TfidfVectorizer(min_df=10)
# One TF-IDF weight per (review, vocabulary term), as a dense array.
tfidf_result = tfidf.fit_transform(reviews["Review_clean"]).toarray()
# Fix: get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2; get_feature_names_out() returns the same vocabulary terms. (Requires
# sklearn >= 1.0 — confirm against the project's pinned version.)
tfidf_df = pd.DataFrame(tfidf_result, columns=tfidf.get_feature_names_out())
# (The original also reassigned tfidf_df.columns to a copy of itself — a
# no-op, removed.)
tfidf_df.index = reviews.index  # align row indices so concat matches rows
reviews = pd.concat([reviews, tfidf_df], axis=1)
print(reviews.head())
- label = "Rating" # zmienna objaśniana
- ignore_cols = [label, "Review", "Review_clean"]
- features = [c for c in reviews.columns if c not in ignore_cols] # zmienne objaśniające
- from sklearn.model_selection import train_test_split # podział na zbiór uczący i testowy
- X_train, X_test, y_train, y_test = train_test_split(reviews[features], reviews[label], test_size = 0.2, random_state=42)
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the TF-IDF features.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_test_predict = lr.predict(X_test)

# Clamp predictions into the valid rating range [10, 50]. np.clip replaces
# the original pair of np.where calls (upper then lower bound) — identical
# result, one call. (A dead commented-out np.round line was removed.)
y_test_predict = np.clip(y_test_predict, 10, 50)

# Persist predictions and ground truth side outputs for external comparison.
pd.DataFrame(y_test_predict).to_csv('j:/Desktop/PNJReviews/PredictedOutput.csv', index=None, header=None, sep=';')
pd.DataFrame(y_test).to_csv('j:/Desktop/PNJReviews/TestOutput.csv', index=None, header=None, sep=';')

from sklearn.metrics import mean_squared_error
from math import sqrt

# Root-mean-squared error on the held-out test set.
rms = sqrt(mean_squared_error(y_test, y_test_predict))
print(rms)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement