# -*- coding: utf-8 -*-

import pandas as pd
import nltk
import numpy as np
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('vader_lexicon')


colnames = ['Review', 'Rating']
reviews = pd.read_csv("j:/Desktop/PNJReviews/EXCELSIORGRANDHOTEL1.csv", names=colnames,
                      header=None, sep=",", engine='python', encoding="utf-8")
print(reviews.head(10))

def get_wordnet_pos(pos_tag):  # map a Treebank POS tag to a WordNet part of speech
    if pos_tag.startswith('J'):
        return wordnet.ADJ   # adjective
    elif pos_tag.startswith('V'):
        return wordnet.VERB  # verb
    elif pos_tag.startswith('N'):
        return wordnet.NOUN  # noun
    elif pos_tag.startswith('R'):
        return wordnet.ADV   # adverb
    else:
        return wordnet.NOUN

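# For reference: pos_tag returns Penn Treebank tags, e.g. pos_tag(['lovely', 'rooms'])
# would typically give [('lovely', 'JJ'), ('rooms', 'NNS')], which get_wordnet_pos
# maps to wordnet.ADJ and wordnet.NOUN.
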
def clean_text(text):
    text = text.lower()  # lowercase everything
    text = [word.strip(string.punctuation) for word in text.split(" ")]  # tokenize and strip punctuation
    text = [word for word in text if not any(c.isdigit() for c in word)]  # drop words that contain digits
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]  # drop stop words ("a", "the", ...)
    text = [t for t in text if len(t) > 0]  # drop empty tokens
    pos_tags = pos_tag(text)  # tag each token with its part of speech
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]  # lemmatize to base forms
    text = [t for t in text if len(t) > 1]  # drop single-character tokens
    text = " ".join(text)
    return text

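# Rough illustration of the cleaning step (assumed behaviour, not verified against the data):
# clean_text("The staff were very helpful!") -> "staff helpful"
# (lowercased, punctuation stripped, stop words removed, remaining tokens lemmatized).
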
reviews["Review_clean"] = reviews["Review"].apply(clean_text)  # clean every review
print(reviews[["Review", "Review_clean"]].head(10))

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df=10)  # keep only words that occur in at least 10 reviews
tfidf_result = tfidf.fit_transform(reviews["Review_clean"]).toarray()  # TF-IDF vectors (term frequency weighted by inverse document frequency)
tfidf_df = pd.DataFrame(tfidf_result, columns=tfidf.get_feature_names())  # one column per vocabulary word (newer scikit-learn uses get_feature_names_out())
tfidf_df.index = reviews.index  # align indices with the reviews frame
reviews = pd.concat([reviews, tfidf_df], axis=1)  # append the TF-IDF columns to the reviews frame

print(reviews.head())

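# Optional sanity check (a sketch): how many distinct words survive the min_df=10
# cut-off and become model features?
print(len(tfidf.get_feature_names()))
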
label = "Rating"  # target (dependent) variable
ignore_cols = [label, "Review", "Review_clean"]
features = [c for c in reviews.columns if c not in ignore_cols]  # explanatory variables

from sklearn.model_selection import train_test_split  # split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(reviews[features], reviews[label], test_size=0.2, random_state=42)

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_test_predict = lr.predict(X_test)
# y_test_predict = np.round(y_test_predict, -1)  # optional rounding to the nearest 10
y_test_predict = np.where(y_test_predict > 50, 50, y_test_predict)  # cap at the upper end of the rating scale
y_test_predict = np.where(y_test_predict < 10, 10, y_test_predict)  # floor at the lower end of the rating scale

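# Optional follow-up sketch: the linear model's coefficients line up with the
# `features` list, so sorting them shows which words push the predicted rating
# down or up the most.
coef = pd.Series(lr.coef_, index=features).sort_values()
print(coef.head(10))  # words with the most negative effect on the rating
print(coef.tail(10))  # words with the most positive effect on the rating
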
pd.DataFrame(y_test_predict).to_csv('j:/Desktop/PNJReviews/PredictedOutput.csv', index=None, header=None, sep=';')
pd.DataFrame(y_test).to_csv('j:/Desktop/PNJReviews/TestOutput.csv', index=None, header=None, sep=';')

from sklearn.metrics import mean_squared_error
from math import sqrt

rms = sqrt(mean_squared_error(y_test, y_test_predict))  # root mean squared error (RMSE)
print(rms)
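
# Optional follow-up sketch: put a few predictions next to the true ratings for a
# quick eyeball check alongside the RMSE.
comparison = pd.DataFrame({"actual": y_test.values, "predicted": y_test_predict})
print(comparison.head(10))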