Advertisement
Guest User

Untitled

a guest
Feb 20th, 2019
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.77 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {},
  6. "source": [
  7. "# Restaurant Reviews (Natural Language Processing)\n",
  8. "This notebook uses a dataset of 1000 restaurant reviews written by customers. Each review is labelled 0 (negative) or 1 (positive). The goal is to predict whether a new review is positive or negative."
  9. ]
  10. },
  11. {
  12. "cell_type": "code",
  13. "execution_count": 4,
  14. "metadata": {},
  15. "outputs": [
  16. {
  17. "data": {
  18. "text/plain": [
  19. "'\\nCreated on Mon Feb 11 07:14:09 2018\\n\\n@author: Pankaj Singh\\n'"
  20. ]
  21. },
  22. "execution_count": 4,
  23. "metadata": {},
  24. "output_type": "execute_result"
  25. }
  26. ],
  27. "source": [
  28. "# -*- coding: utf-8 -*-\n",
  29. "\"\"\"\n",
  30. "Created on Mon Feb 11 07:14:09 2018\n",
  31. "\n",
  32. "@author: Pankaj Singh\n",
  33. "\"\"\""
  34. ]
  35. },
  36. {
  37. "cell_type": "code",
  38. "execution_count": 5,
  39. "metadata": {},
  40. "outputs": [],
  41. "source": [
  42. "#Natural Language Processing\n",
  43. "\n",
  44. "#Importing the libraries\n",
  45. "import numpy as np\n",
  46. "import matplotlib.pyplot as plt\n",
  47. "import pandas as pd"
  48. ]
  49. },
  50. {
  51. "cell_type": "code",
  52. "execution_count": 6,
  53. "metadata": {},
  54. "outputs": [],
  55. "source": [
  56. "# Load the reviews: tab-separated file, quoting=3 (csv.QUOTE_NONE) so quote characters are kept as text\n",
  57. "dataset = pd.read_csv(\"Restaurant_Reviews.tsv\", sep='\\t', quoting=3)"
  58. ]
  59. },
  60. {
  61. "cell_type": "code",
  62. "execution_count": 7,
  63. "metadata": {},
  64. "outputs": [
  65. {
  66. "name": "stdout",
  67. "output_type": "stream",
  68. "text": [
  69. "[nltk_data] Downloading package stopwords to C:\\Users\\Pankaj\n",
  70. "[nltk_data] Singh\\AppData\\Roaming\\nltk_data...\n",
  71. "[nltk_data] Package stopwords is already up-to-date!\n"
  72. ]
  73. }
  74. ],
  75. "source": [
  76. "# Cleaning the texts: keep letters only, lowercase, drop English stopwords, stem each word\n",
  77. "import re\n",
  78. "import nltk\n",
  79. "nltk.download('stopwords')\n",
  80. "from nltk.corpus import stopwords\n",
  81. "from nltk.stem.porter import PorterStemmer\n",
  82. "\n",
  83. "ps = PorterStemmer()\n",
  84. "# Build the stopword set once, outside the loop, instead of once per word\n",
  85. "stop_words = set(stopwords.words('english'))\n",
  86. "corpus = []\n",
  87. "for raw_review in dataset['Review']:  # every row, so corpus always lines up with the label column\n",
  88. "    # The range must be 'A-Z': 'A-z' also lets characters such as '_' and '^' through\n",
  89. "    words = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()\n",
  90. "    corpus.append(' '.join(ps.stem(word) for word in words if word not in stop_words))"
  91. ]
  92. },
  93. {
  94. "cell_type": "code",
  95. "execution_count": 8,
  96. "metadata": {},
  97. "outputs": [],
  98. "source": [
  99. "# Create the bag-of-words model: one column per distinct token in the corpus\n",
  100. "from sklearn.feature_extraction.text import CountVectorizer\n",
  101. "cv = CountVectorizer()\n",
  102. "word_counts = cv.fit_transform(corpus)\n",
  103. "X = word_counts.toarray()\n",
  104. "y = dataset.iloc[:, 1].values"
  105. ]
  106. },
  107. {
  108. "cell_type": "code",
  109. "execution_count": 9,
  110. "metadata": {},
  111. "outputs": [
  112. {
  113. "name": "stderr",
  114. "output_type": "stream",
  115. "text": [
  116. "C:\\Users\\Pankaj Singh\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
  117. " \"This module will be removed in 0.20.\", DeprecationWarning)\n"
  118. ]
  119. }
  120. ],
  121. "source": [
  122. "# Splitting the dataset into training and test sets (fixed seed so the split is reproducible)\n",
  123. "from sklearn.model_selection import train_test_split  # sklearn.cross_validation is removed in scikit-learn 0.20\n",
  124. "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)"
  125. ]
  126. },
  127. {
  128. "cell_type": "code",
  129. "execution_count": 10,
  130. "metadata": {},
  131. "outputs": [
  132. {
  133. "data": {
  134. "text/plain": [
  135. "GaussianNB(priors=None)"
  136. ]
  137. },
  138. "execution_count": 10,
  139. "metadata": {},
  140. "output_type": "execute_result"
  141. }
  142. ],
  143. "source": [
  144. "# Fit a Gaussian Naive Bayes classifier to the training set\n",
  145. "from sklearn.naive_bayes import GaussianNB\n",
  146. "classifier = GaussianNB()\n",
  147. "classifier.fit(X=X_train, y=y_train)"
  148. ]
  149. },
  150. {
  151. "cell_type": "code",
  152. "execution_count": 11,
  153. "metadata": {},
  154. "outputs": [],
  155. "source": [
  156. "# Predict the test-set labels, then tabulate them against the true labels\n",
  157. "from sklearn.metrics import confusion_matrix\n",
  158. "\n",
  159. "y_pred = classifier.predict(X_test)\n",
  160. "cm = confusion_matrix(y_true=y_test, y_pred=y_pred)"
  161. ]
  162. }
  163. ],
  164. "metadata": {
  165. "kernelspec": {
  166. "display_name": "Python 3",
  167. "language": "python",
  168. "name": "python3"
  169. },
  170. "language_info": {
  171. "codemirror_mode": {
  172. "name": "ipython",
  173. "version": 3
  174. },
  175. "file_extension": ".py",
  176. "mimetype": "text/x-python",
  177. "name": "python",
  178. "nbconvert_exporter": "python",
  179. "pygments_lexer": "ipython3",
  180. "version": "3.7.0"
  181. }
  182. },
  183. "nbformat": 4,
  184. "nbformat_minor": 2
  185. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement