Advertisement
Guest User

Untitled

a guest
Aug 17th, 2019
224
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 26.78 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 52,
  6. "metadata": {},
  7. "outputs": [],
  8. "source": [
  9. "import nltk\n",
  10. "import pandas as pd\n",
  11. "import numpy as np"
  12. ]
  13. },
  14. {
  15. "cell_type": "code",
  16. "execution_count": 53,
  17. "metadata": {},
  18. "outputs": [
  19. {
  20. "data": {
  21. "text/html": [
  22. "<div>\n",
  23. "<style scoped>\n",
  24. " .dataframe tbody tr th:only-of-type {\n",
  25. " vertical-align: middle;\n",
  26. " }\n",
  27. "\n",
  28. " .dataframe tbody tr th {\n",
  29. " vertical-align: top;\n",
  30. " }\n",
  31. "\n",
  32. " .dataframe thead th {\n",
  33. " text-align: right;\n",
  34. " }\n",
  35. "</style>\n",
  36. "<table border=\"1\" class=\"dataframe\">\n",
  37. " <thead>\n",
  38. " <tr style=\"text-align: right;\">\n",
  39. " <th></th>\n",
  40. " <th>review</th>\n",
  41. " <th>sentiment</th>\n",
  42. " </tr>\n",
  43. " </thead>\n",
  44. " <tbody>\n",
  45. " <tr>\n",
  46. " <th>0</th>\n",
  47. " <td>One of the other reviewers has mentioned that ...</td>\n",
  48. " <td>positive</td>\n",
  49. " </tr>\n",
  50. " <tr>\n",
  51. " <th>1</th>\n",
  52. " <td>A wonderful little production. <br /><br />The...</td>\n",
  53. " <td>positive</td>\n",
  54. " </tr>\n",
  55. " <tr>\n",
  56. " <th>2</th>\n",
  57. " <td>I thought this was a wonderful way to spend ti...</td>\n",
  58. " <td>positive</td>\n",
  59. " </tr>\n",
  60. " <tr>\n",
  61. " <th>3</th>\n",
  62. " <td>Basically there's a family where a little boy ...</td>\n",
  63. " <td>negative</td>\n",
  64. " </tr>\n",
  65. " <tr>\n",
  66. " <th>4</th>\n",
  67. " <td>Petter Mattei's \"Love in the Time of Money\" is...</td>\n",
  68. " <td>positive</td>\n",
  69. " </tr>\n",
  70. " </tbody>\n",
  71. "</table>\n",
  72. "</div>"
  73. ],
  74. "text/plain": [
  75. " review sentiment\n",
  76. "0 One of the other reviewers has mentioned that ... positive\n",
  77. "1 A wonderful little production. <br /><br />The... positive\n",
  78. "2 I thought this was a wonderful way to spend ti... positive\n",
  79. "3 Basically there's a family where a little boy ... negative\n",
  80. "4 Petter Mattei's \"Love in the Time of Money\" is... positive"
  81. ]
  82. },
  83. "execution_count": 53,
  84. "metadata": {},
  85. "output_type": "execute_result"
  86. }
  87. ],
  88. "source": [
  89. "df=pd.read_csv('../IMDB_Dataset.csv')\n",
  90. "df.head()"
  91. ]
  92. },
  93. {
  94. "cell_type": "code",
  95. "execution_count": 54,
  96. "metadata": {},
  97. "outputs": [
  98. {
  99. "data": {
  100. "text/html": [
  101. "<div>\n",
  102. "<style scoped>\n",
  103. " .dataframe tbody tr th:only-of-type {\n",
  104. " vertical-align: middle;\n",
  105. " }\n",
  106. "\n",
  107. " .dataframe tbody tr th {\n",
  108. " vertical-align: top;\n",
  109. " }\n",
  110. "\n",
  111. " .dataframe thead th {\n",
  112. " text-align: right;\n",
  113. " }\n",
  114. "</style>\n",
  115. "<table border=\"1\" class=\"dataframe\">\n",
  116. " <thead>\n",
  117. " <tr style=\"text-align: right;\">\n",
  118. " <th></th>\n",
  119. " <th>review</th>\n",
  120. " <th>sentiment</th>\n",
  121. " <th>Positively Rated</th>\n",
  122. " </tr>\n",
  123. " </thead>\n",
  124. " <tbody>\n",
  125. " <tr>\n",
  126. " <th>0</th>\n",
  127. " <td>One of the other reviewers has mentioned that ...</td>\n",
  128. " <td>positive</td>\n",
  129. " <td>1</td>\n",
  130. " </tr>\n",
  131. " <tr>\n",
  132. " <th>1</th>\n",
  133. " <td>A wonderful little production. <br /><br />The...</td>\n",
  134. " <td>positive</td>\n",
  135. " <td>1</td>\n",
  136. " </tr>\n",
  137. " <tr>\n",
  138. " <th>2</th>\n",
  139. " <td>I thought this was a wonderful way to spend ti...</td>\n",
  140. " <td>positive</td>\n",
  141. " <td>1</td>\n",
  142. " </tr>\n",
  143. " <tr>\n",
  144. " <th>3</th>\n",
  145. " <td>Basically there's a family where a little boy ...</td>\n",
  146. " <td>negative</td>\n",
  147. " <td>0</td>\n",
  148. " </tr>\n",
  149. " <tr>\n",
  150. " <th>4</th>\n",
  151. " <td>Petter Mattei's \"Love in the Time of Money\" is...</td>\n",
  152. " <td>positive</td>\n",
  153. " <td>1</td>\n",
  154. " </tr>\n",
  155. " <tr>\n",
  156. " <th>5</th>\n",
  157. " <td>Probably my all-time favorite movie, a story o...</td>\n",
  158. " <td>positive</td>\n",
  159. " <td>1</td>\n",
  160. " </tr>\n",
  161. " <tr>\n",
  162. " <th>6</th>\n",
  163. " <td>I sure would like to see a resurrection of a u...</td>\n",
  164. " <td>positive</td>\n",
  165. " <td>1</td>\n",
  166. " </tr>\n",
  167. " <tr>\n",
  168. " <th>7</th>\n",
  169. " <td>This show was an amazing, fresh & innovative i...</td>\n",
  170. " <td>negative</td>\n",
  171. " <td>0</td>\n",
  172. " </tr>\n",
  173. " <tr>\n",
  174. " <th>8</th>\n",
  175. " <td>Encouraged by the positive comments about this...</td>\n",
  176. " <td>negative</td>\n",
  177. " <td>0</td>\n",
  178. " </tr>\n",
  179. " <tr>\n",
  180. " <th>9</th>\n",
  181. " <td>If you like original gut wrenching laughter yo...</td>\n",
  182. " <td>positive</td>\n",
  183. " <td>1</td>\n",
  184. " </tr>\n",
  185. " </tbody>\n",
  186. "</table>\n",
  187. "</div>"
  188. ],
  189. "text/plain": [
  190. " review sentiment \\\n",
  191. "0 One of the other reviewers has mentioned that ... positive \n",
  192. "1 A wonderful little production. <br /><br />The... positive \n",
  193. "2 I thought this was a wonderful way to spend ti... positive \n",
  194. "3 Basically there's a family where a little boy ... negative \n",
  195. "4 Petter Mattei's \"Love in the Time of Money\" is... positive \n",
  196. "5 Probably my all-time favorite movie, a story o... positive \n",
  197. "6 I sure would like to see a resurrection of a u... positive \n",
  198. "7 This show was an amazing, fresh & innovative i... negative \n",
  199. "8 Encouraged by the positive comments about this... negative \n",
  200. "9 If you like original gut wrenching laughter yo... positive \n",
  201. "\n",
  202. " Positively Rated \n",
  203. "0 1 \n",
  204. "1 1 \n",
  205. "2 1 \n",
  206. "3 0 \n",
  207. "4 1 \n",
  208. "5 1 \n",
  209. "6 1 \n",
  210. "7 0 \n",
  211. "8 0 \n",
  212. "9 1 "
  213. ]
  214. },
  215. "execution_count": 54,
  216. "metadata": {},
  217. "output_type": "execute_result"
  218. }
  219. ],
  220. "source": [
  221. "# Drop missing values\n",
  222. "df.dropna(inplace=True)\n",
  223. "\n",
  224. "# Encode reviews with 'positive' sentiment as 1 (rated positively)\n",
  225. "# Encode all other reviews as 0 (rated poorly)\n",
  226. "df['Positively Rated'] = np.where(df['sentiment'] == 'positive', 1, 0)\n",
  227. "df.head(10)"
  228. ]
  229. },
  230. {
  231. "cell_type": "code",
  232. "execution_count": 55,
  233. "metadata": {},
  234. "outputs": [
  235. {
  236. "data": {
  237. "text/plain": [
  238. "0.5"
  239. ]
  240. },
  241. "execution_count": 55,
  242. "metadata": {},
  243. "output_type": "execute_result"
  244. }
  245. ],
  246. "source": [
  247. "df['Positively Rated'].mean()\n"
  248. ]
  249. },
  250. {
  251. "cell_type": "code",
  252. "execution_count": 56,
  253. "metadata": {},
  254. "outputs": [],
  255. "source": [
  256. "from sklearn.model_selection import train_test_split\n",
  257. "\n",
  258. "# Split data into training and test sets\n",
  259. "X_train, X_test, y_train, y_test = train_test_split(df['review'], \n",
  260. " df['Positively Rated'], \n",
  261. " random_state=0)"
  262. ]
  263. },
  264. {
  265. "cell_type": "code",
  266. "execution_count": 57,
  267. "metadata": {},
  268. "outputs": [
  269. {
  270. "data": {
  271. "text/plain": [
  272. "39758 This is a story of two dogs and a cat looking ...\n",
  273. "18457 A paranoid scientist creates a wolfman by tran...\n",
  274. "33239 Class Reunion is a very underated comedy gem. ...\n",
  275. "35006 This film IS brilliant...... without a doubt. ...\n",
  276. "30982 La Chute de la Maison Usher, or The Fall of th...\n",
  277. "23507 Doctor Feinstone is a dentist.He has a beautif...\n",
  278. "39796 Please, help the economy - spend your money el...\n",
  279. "6586 Dahl seems to have been under the influence of...\n",
  280. "22051 What an excellent movie, made even more so by ...\n",
  281. "32938 This movie is about a young girl who goes to l...\n",
  282. "32875 Wow, was this version of THE RACKETEER tough t...\n",
  283. "11942 I come from Bangladesh, and here, C.C.Costigan...\n",
  284. "25154 Updated from a previous comment. The great and...\n",
  285. "41573 A slick production which holds the interest fr...\n",
  286. "49277 Beautiful attracts excellent idea, but ruined ...\n",
  287. "44277 I hired the DVD yesterday and first of all it ...\n",
  288. "1127 Let's start from this point: This is not a mov...\n",
  289. "25515 At first glance, it would seem natural to comp...\n",
  290. "48553 The Tooth Fairy is about the ghost of an old d...\n",
  291. "18901 Apart from the DA (James Eckhouse), and a brie...\n",
  292. "19015 \"Everything is Illuminated\" is like viewing a ...\n",
  293. "39309 The Comebacks is a spoof on inspirational spor...\n",
  294. "7444 Don't get fooled with all the big names like B...\n",
  295. "4220 \"Happy Days\" was produced and broadcast from t...\n",
  296. "32462 I can honestly say I never expected this movie...\n",
  297. "35374 I suppose that in 1997 Hollywood wasn't quite ...\n",
  298. "48646 Let me just start out by saying that Tourist T...\n",
  299. "18176 The scripting of the subtle comedy is unmatche...\n",
  300. "48348 Yes, my summary just about tells it all.<br />...\n",
  301. "7286 Everything this film tried to do is done bette...\n",
  302. " ... \n",
  303. "7877 There are other movies about boarding schools ...\n",
  304. "37619 If this is all the Watchowski's have to offer ...\n",
  305. "5072 This movie is based on a Stephen King novel in...\n",
  306. "2163 I'm from Belgium and therefore my English writ...\n",
  307. "38804 And so it started with \"Shreik\" a send up of h...\n",
  308. "6921 Micro-phonies is a classic Stooge short. The g...\n",
  309. "38984 Holy cow, what a piece of sh*t this movie is. ...\n",
  310. "27469 First of all, let me say this film isn't for e...\n",
  311. "16921 This movie contains personalities that so deli...\n",
  312. "35665 one may ask why? the characters snarl, yell, a...\n",
  313. "24152 This movie is about human relationships. Charm...\n",
  314. "43095 I was a huge fan of the original Robocop.<br /...\n",
  315. "18983 \"GEORGE LOPEZ,\" in my opinion, is an absolute ...\n",
  316. "32230 See Dick work.<br /><br />See Jane work.<br />...\n",
  317. "17089 I went in not knowing anything about this movi...\n",
  318. "14650 I loved this movie and will watch it again. Or...\n",
  319. "39512 One of the best war films I have ever seen, if...\n",
  320. "48600 Well, you'd better if you plan on sitting thro...\n",
  321. "15430 One would think that a film based on the life ...\n",
  322. "14935 This is another of Hollywood's anti-communist ...\n",
  323. "46884 Brothers with psychokinetic powers (yes, reall...\n",
  324. "20757 Susie Q is a great romantic prom Movie. Amy Jo...\n",
  325. "41993 Yeah, it is. In fact, it's somewhere in my top...\n",
  326. "32103 This isn't a dreadful film, merely insipid. Th...\n",
  327. "30403 I also saw this upon its release in '56, and h...\n",
  328. "21243 I did not set very high expectations for this ...\n",
  329. "45891 THE BLOB is a great horror movie, not merely b...\n",
  330. "42613 After too many years of waiting, Anne Rivers S...\n",
  331. "43567 I am a massive fan of the LoG. I thought the f...\n",
  332. "2732 AG was an excellent presentation of drama, sus...\n",
  333. "Name: review, Length: 37500, dtype: object"
  334. ]
  335. },
  336. "execution_count": 57,
  337. "metadata": {},
  338. "output_type": "execute_result"
  339. }
  340. ],
  341. "source": [
  342. "X_train\n"
  343. ]
  344. },
  345. {
  346. "cell_type": "code",
  347. "execution_count": 58,
  348. "metadata": {},
  349. "outputs": [
  350. {
  351. "name": "stdout",
  352. "output_type": "stream",
  353. "text": [
  354. "X_train first entry:\n",
  355. "\n",
  356. " This is a story of two dogs and a cat looking for their way back home.Old and wise Golden Retriever Shadow, young American Bulldog Chance and Himalayan cat Sassy flee from the ranch and go into the wilderness to be reunited with their family.Homeward Bound: The Incredible Journey (1993) is a family adventure directed by Duwayne Dunham.It's a remake of a 1963 film.This movie got a sequel three years later.Michael J. Fox is the perfect man to do the voice-over for Chance.Fox has some youthful energy he brings to the role.Sally Field does great voice work as Sassy.Don Ameche is fantastic as Shadow.This was this veteran actor's second last movie.Also the visible actors are great.Kim Greist plays Laura Burnford-Seaver.Robert Hays is Bob Seaver.Benji Thall plays Peter Burnford.Veronica Lauren is Hope Burnford.Kevin Chevalia is Jamie Seaver.Jean Smart portrays Kate.It's quite amazing to watch these pets trying to survive in the wilderness.We see Sassy taken by the river and she seems like a goner.The bear scene is exiting and funny.Chance has no chance with that big, hungry bear.And his meeting with the porcupine looks painful.This is some great fun for the whole family.\n",
  357. "\n",
  358. "\n",
  359. "X_train shape: (37500,)\n"
  360. ]
  361. }
  362. ],
  363. "source": [
  364. "print('X_train first entry:\\n\\n', X_train.iloc[0])\n",
  365. "print('\\n\\nX_train shape: ', X_train.shape)"
  366. ]
  367. },
  368. {
  369. "cell_type": "code",
  370. "execution_count": 59,
  371. "metadata": {},
  372. "outputs": [],
  373. "source": [
  374. "from sklearn.feature_extraction.text import CountVectorizer\n",
  375. "\n",
  376. "# Fit the CountVectorizer to the training data\n",
  377. "vect = CountVectorizer().fit(X_train)"
  378. ]
  379. },
  380. {
  381. "cell_type": "code",
  382. "execution_count": 60,
  383. "metadata": {},
  384. "outputs": [
  385. {
  386. "data": {
  387. "text/plain": [
  388. "['00',\n",
  389. " 'actionless',\n",
  390. " 'andlaurel',\n",
  391. " 'audiobooks',\n",
  392. " 'befits',\n",
  393. " 'bolo',\n",
  394. " 'bushwhackers',\n",
  395. " 'chalie',\n",
  396. " 'cochlear',\n",
  397. " 'cornered',\n",
  398. " 'danube',\n",
  399. " 'diabolik',\n",
  400. " 'dozing',\n",
  401. " 'emile',\n",
  402. " 'expressively',\n",
  403. " 'flashiness',\n",
  404. " 'gake',\n",
  405. " 'goths',\n",
  406. " 'hark',\n",
  407. " 'hoodwinks',\n",
  408. " 'indecent',\n",
  409. " 'janaya',\n",
  410. " 'kidnappers',\n",
  411. " 'leaches',\n",
  412. " 'luján',\n",
  413. " 'mathematically',\n",
  414. " 'mirages',\n",
  415. " 'myabe',\n",
  416. " 'nunn',\n",
  417. " 'oxbow',\n",
  418. " 'petulant',\n",
  419. " 'powerhouses',\n",
  420. " 'quartmaster',\n",
  421. " 'rejenacyn',\n",
  422. " 'romania',\n",
  423. " 'schfrin',\n",
  424. " 'shin',\n",
  425. " 'snippers',\n",
  426. " 'static',\n",
  427. " 'surmising',\n",
  428. " 'teuton',\n",
  429. " 'transvestive',\n",
  430. " 'unended',\n",
  431. " 'via',\n",
  432. " 'whisk',\n",
  433. " 'zakk']"
  434. ]
  435. },
  436. "execution_count": 60,
  437. "metadata": {},
  438. "output_type": "execute_result"
  439. }
  440. ],
  441. "source": [
  442. "vect.get_feature_names()[::2000]\n"
  443. ]
  444. },
  445. {
  446. "cell_type": "code",
  447. "execution_count": 61,
  448. "metadata": {},
  449. "outputs": [
  450. {
  451. "data": {
  452. "text/plain": [
  453. "90506"
  454. ]
  455. },
  456. "execution_count": 61,
  457. "metadata": {},
  458. "output_type": "execute_result"
  459. }
  460. ],
  461. "source": [
  462. "len(vect.get_feature_names())\n"
  463. ]
  464. },
  465. {
  466. "cell_type": "code",
  467. "execution_count": 62,
  468. "metadata": {},
  469. "outputs": [
  470. {
  471. "data": {
  472. "text/plain": [
  473. "<37500x90506 sparse matrix of type '<class 'numpy.int64'>'\n",
  474. "\twith 5111856 stored elements in Compressed Sparse Row format>"
  475. ]
  476. },
  477. "execution_count": 62,
  478. "metadata": {},
  479. "output_type": "execute_result"
  480. }
  481. ],
  482. "source": [
  483. "# transform the documents in the training data to a document-term matrix\n",
  484. "X_train_vectorized = vect.transform(X_train)\n",
  485. "\n",
  486. "X_train_vectorized"
  487. ]
  488. },
  489. {
  490. "cell_type": "code",
  491. "execution_count": 63,
  492. "metadata": {},
  493. "outputs": [
  494. {
  495. "name": "stderr",
  496. "output_type": "stream",
  497. "text": [
  498. "/usr/local/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
  499. " FutureWarning)\n"
  500. ]
  501. },
  502. {
  503. "data": {
  504. "text/plain": [
  505. "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
  506. " intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
  507. " multi_class='warn', n_jobs=None, penalty='l2',\n",
  508. " random_state=None, solver='warn', tol=0.0001, verbose=0,\n",
  509. " warm_start=False)"
  510. ]
  511. },
  512. "execution_count": 63,
  513. "metadata": {},
  514. "output_type": "execute_result"
  515. }
  516. ],
  517. "source": [
  518. "from sklearn.linear_model import LogisticRegression\n",
  519. "\n",
  520. "# Train the model\n",
  521. "model = LogisticRegression()\n",
  522. "model.fit(X_train_vectorized, y_train)"
  523. ]
  524. },
  525. {
  526. "cell_type": "code",
  527. "execution_count": 64,
  528. "metadata": {},
  529. "outputs": [
  530. {
  531. "name": "stdout",
  532. "output_type": "stream",
  533. "text": [
  534. "AUC: 0.8841970005800444\n"
  535. ]
  536. }
  537. ],
  538. "source": [
  539. "from sklearn.metrics import roc_auc_score\n",
  540. "\n",
  541. "# Predict the transformed test documents\n",
  542. "predictions = model.predict(vect.transform(X_test))\n",
  543. "\n",
  544. "print('AUC: ', roc_auc_score(y_test, predictions))"
  545. ]
  546. },
  547. {
  548. "cell_type": "code",
  549. "execution_count": 65,
  550. "metadata": {},
  551. "outputs": [
  552. {
  553. "name": "stdout",
  554. "output_type": "stream",
  555. "text": [
  556. "Smallest Coefs:\n",
  557. "['worst' 'waste' 'forgettable' 'awful' 'disappointing' 'disappointment'\n",
  558. " 'stinker' 'poorly' 'fails' 'uninteresting']\n",
  559. "\n",
  560. "Largest Coefs: \n",
  561. "['refreshing' 'hooked' 'wonderfully' 'raunchy' 'adr' 'superb' 'perfect'\n",
  562. " 'delightful' 'squirrel' 'funniest']\n"
  563. ]
  564. }
  565. ],
  566. "source": [
  567. "# get the feature names as numpy array\n",
  568. "feature_names = np.array(vect.get_feature_names())\n",
  569. "\n",
  570. "# Sort the coefficients from the model\n",
  571. "sorted_coef_index = model.coef_[0].argsort()\n",
  572. "\n",
  573. "# Find the 10 smallest and 10 largest coefficients\n",
  574. "# The 10 largest coefficients are being indexed using [:-11:-1] \n",
  575. "# so the list returned is in order of largest to smallest\n",
  576. "print('Smallest Coefs:\\n{}\\n'.format(feature_names[sorted_coef_index[:10]]))\n",
  577. "print('Largest Coefs: \\n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))"
  578. ]
  579. },
  580. {
  581. "cell_type": "code",
  582. "execution_count": 66,
  583. "metadata": {},
  584. "outputs": [],
  585. "source": [
  586. "# Tfidf\n"
  587. ]
  588. },
  589. {
  590. "cell_type": "code",
  591. "execution_count": 67,
  592. "metadata": {},
  593. "outputs": [
  594. {
  595. "data": {
  596. "text/plain": [
  597. "32673"
  598. ]
  599. },
  600. "execution_count": 67,
  601. "metadata": {},
  602. "output_type": "execute_result"
  603. }
  604. ],
  605. "source": [
  606. "from sklearn.feature_extraction.text import TfidfVectorizer\n",
  607. "\n",
  608. "# Fit the TfidfVectorizer to the training data specifying a minimum document frequency of 5\n",
  609. "vect = TfidfVectorizer(min_df=5).fit(X_train)\n",
  610. "len(vect.get_feature_names())"
  611. ]
  612. },
  613. {
  614. "cell_type": "code",
  615. "execution_count": 68,
  616. "metadata": {},
  617. "outputs": [
  618. {
  619. "name": "stderr",
  620. "output_type": "stream",
  621. "text": [
  622. "/usr/local/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
  623. " FutureWarning)\n"
  624. ]
  625. },
  626. {
  627. "name": "stdout",
  628. "output_type": "stream",
  629. "text": [
  630. "AUC: 0.8933488824184665\n"
  631. ]
  632. }
  633. ],
  634. "source": [
  635. "X_train_vectorized = vect.transform(X_train)\n",
  636. "\n",
  637. "model = LogisticRegression()\n",
  638. "model.fit(X_train_vectorized, y_train)\n",
  639. "\n",
  640. "predictions = model.predict(vect.transform(X_test))\n",
  641. "\n",
  642. "print('AUC: ', roc_auc_score(y_test, predictions))"
  643. ]
  644. },
  645. {
  646. "cell_type": "code",
  647. "execution_count": 69,
  648. "metadata": {},
  649. "outputs": [
  650. {
  651. "name": "stdout",
  652. "output_type": "stream",
  653. "text": [
  654. "Smallest tfidf:\n",
  655. "['cavalryman' 'horace' 'ershadi' 'ebrahimi' 'homayoun' 'mahmoodzada'\n",
  656. " 'rueful' 'musclebound' 'décor' 'bails']\n",
  657. "\n",
  658. "Largest tfidf: \n",
  659. "['pokemon' 'ghoulies' 'robot' 'ernest' 'cycle' 'lupin' 'rodrigues'\n",
  660. " 'gamera' 'wei' 'steve']\n"
  661. ]
  662. }
  663. ],
  664. "source": [
  665. "feature_names = np.array(vect.get_feature_names())\n",
  666. "\n",
  667. "sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()\n",
  668. "\n",
  669. "print('Smallest tfidf:\\n{}\\n'.format(feature_names[sorted_tfidf_index[:10]]))\n",
  670. "print('Largest tfidf: \\n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))"
  671. ]
  672. },
  673. {
  674. "cell_type": "code",
  675. "execution_count": 70,
  676. "metadata": {},
  677. "outputs": [
  678. {
  679. "name": "stdout",
  680. "output_type": "stream",
  681. "text": [
  682. "Smallest Coefs:\n",
  683. "['worst' 'bad' 'waste' 'awful' 'boring' 'terrible' 'poor' 'nothing' 'dull'\n",
  684. " 'worse']\n",
  685. "\n",
  686. "Largest Coefs: \n",
  687. "['great' 'excellent' 'best' 'perfect' 'wonderful' 'amazing' 'today'\n",
  688. " 'loved' 'fun' 'favorite']\n"
  689. ]
  690. }
  691. ],
  692. "source": [
  693. "sorted_coef_index = model.coef_[0].argsort()\n",
  694. "\n",
  695. "print('Smallest Coefs:\\n{}\\n'.format(feature_names[sorted_coef_index[:10]]))\n",
  696. "print('Largest Coefs: \\n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))"
  697. ]
  698. },
  699. {
  700. "cell_type": "code",
  701. "execution_count": 71,
  702. "metadata": {},
  703. "outputs": [
  704. {
  705. "name": "stdout",
  706. "output_type": "stream",
  707. "text": [
  708. "[1 0]\n"
  709. ]
  710. }
  711. ],
  712. "source": [
  713. "print(model.predict(vect.transform(['People with bias are reviewing the movie. Watch it, it is mind blowing. Great acting, awesome music scores and really awesome direction. Cheers!',\n",
  714. " 'Here the hero is glorified while he slaps his girlfriend, treats her like a courtesan n the girlfriend is shown as a docile animal. The acting by Shahid is getting monotonous. He acted the same like he did in Uddta Punjab. This film is nothing but a z grade version of Devdas n Dev D.'])))"
  715. ]
  716. },
  717. {
  718. "cell_type": "code",
  719. "execution_count": 72,
  720. "metadata": {},
  721. "outputs": [],
  722. "source": [
  723. "## n-grams\n"
  724. ]
  725. },
  726. {
  727. "cell_type": "code",
  728. "execution_count": 73,
  729. "metadata": {},
  730. "outputs": [
  731. {
  732. "data": {
  733. "text/plain": [
  734. "213759"
  735. ]
  736. },
  737. "execution_count": 73,
  738. "metadata": {},
  739. "output_type": "execute_result"
  740. }
  741. ],
  742. "source": [
  743. "# Fit the CountVectorizer to the training data specifying a minimum \n",
  744. "# document frequency of 5 and extracting 1-grams and 2-grams\n",
  745. "vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)\n",
  746. "\n",
  747. "X_train_vectorized = vect.transform(X_train)\n",
  748. "\n",
  749. "len(vect.get_feature_names())"
  750. ]
  751. },
  752. {
  753. "cell_type": "code",
  754. "execution_count": 74,
  755. "metadata": {},
  756. "outputs": [
  757. {
  758. "name": "stderr",
  759. "output_type": "stream",
  760. "text": [
  761. "/usr/local/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
  762. " FutureWarning)\n"
  763. ]
  764. },
  765. {
  766. "name": "stdout",
  767. "output_type": "stream",
  768. "text": [
  769. "AUC: 0.9041957619987435\n"
  770. ]
  771. }
  772. ],
  773. "source": [
  774. "model = LogisticRegression()\n",
  775. "model.fit(X_train_vectorized, y_train)\n",
  776. "\n",
  777. "predictions = model.predict(vect.transform(X_test))\n",
  778. "\n",
  779. "print('AUC: ', roc_auc_score(y_test, predictions))"
  780. ]
  781. },
  782. {
  783. "cell_type": "code",
  784. "execution_count": 75,
  785. "metadata": {},
  786. "outputs": [
  787. {
  788. "name": "stdout",
  789. "output_type": "stream",
  790. "text": [
  791. "Smallest Coefs:\n",
  792. "['worst' 'awful' 'boring' 'waste' 'disappointment' 'terrible'\n",
  793. " 'disappointing' 'horrible' 'not worth' 'poorly']\n",
  794. "\n",
  795. "Largest Coefs: \n",
  796. "['perfect' 'excellent' 'hilarious' 'gem' 'amazing' 'superb' 'loved this'\n",
  797. " 'today' 'incredible' 'well worth']\n"
  798. ]
  799. }
  800. ],
  801. "source": [
  802. "feature_names = np.array(vect.get_feature_names())\n",
  803. "\n",
  804. "sorted_coef_index = model.coef_[0].argsort()\n",
  805. "\n",
  806. "print('Smallest Coefs:\\n{}\\n'.format(feature_names[sorted_coef_index[:10]]))\n",
  807. "print('Largest Coefs: \\n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))"
  808. ]
  809. },
  810. {
  811. "cell_type": "code",
  812. "execution_count": 76,
  813. "metadata": {},
  814. "outputs": [
  815. {
  816. "name": "stdout",
  817. "output_type": "stream",
  818. "text": [
  819. "[1 0]\n"
  820. ]
  821. }
  822. ],
  823. "source": [
  824. "# These reviews are now correctly identified\n",
  825. "print(model.predict(vect.transform(['People with bias are reviewing the movie. Watch it, it is mind blowing. Great acting, awesome music scores and really awesome direction. Cheers!',\n",
  826. " 'Here the hero is glorified while he slaps his girlfriend, treats her like a courtesan n the girlfriend is shown as a docile animal. The acting by Shahid is getting monotonous. He acted the same like he did in Uddta Punjab. This film is nothing but a z grade version of Devdas n Dev D.'])))"
  827. ]
  828. }
  829. ],
  830. "metadata": {
  831. "kernelspec": {
  832. "display_name": "Python 3",
  833. "language": "python",
  834. "name": "python3"
  835. },
  836. "language_info": {
  837. "codemirror_mode": {
  838. "name": "ipython",
  839. "version": 3
  840. },
  841. "file_extension": ".py",
  842. "mimetype": "text/x-python",
  843. "name": "python",
  844. "nbconvert_exporter": "python",
  845. "pygments_lexer": "ipython3",
  846. "version": "3.7.3"
  847. }
  848. },
  849. "nbformat": 4,
  850. "nbformat_minor": 2
  851. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement