Untitled

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>review</th>\n",
       "      <th>sentiment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>One of the other reviewers has mentioned that ...</td>\n",
       "      <td>positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A wonderful little production. <br /><br />The...</td>\n",
       "      <td>positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>I thought this was a wonderful way to spend ti...</td>\n",
       "      <td>positive</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Basically there's a family where a little boy ...</td>\n",
       "      <td>negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Petter Mattei's \"Love in the Time of Money\" is...</td>\n",
       "      <td>positive</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              review sentiment\n",
       "0  One of the other reviewers has mentioned that ...  positive\n",
       "1  A wonderful little production. <br /><br />The...  positive\n",
       "2  I thought this was a wonderful way to spend ti...  positive\n",
       "3  Basically there's a family where a little boy ...  negative\n",
       "4  Petter Mattei's \"Love in the Time of Money\" is...  positive"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df=pd.read_csv('../IMDB_Dataset.csv')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>review</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>Positively Rated</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>One of the other reviewers has mentioned that ...</td>\n",
       "      <td>positive</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A wonderful little production. <br /><br />The...</td>\n",
       "      <td>positive</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>I thought this was a wonderful way to spend ti...</td>\n",
       "      <td>positive</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Basically there's a family where a little boy ...</td>\n",
       "      <td>negative</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Petter Mattei's \"Love in the Time of Money\" is...</td>\n",
       "      <td>positive</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Probably my all-time favorite movie, a story o...</td>\n",
       "      <td>positive</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>I sure would like to see a resurrection of a u...</td>\n",
       "      <td>positive</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>This show was an amazing, fresh & innovative i...</td>\n",
       "      <td>negative</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Encouraged by the positive comments about this...</td>\n",
       "      <td>negative</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>If you like original gut wrenching laughter yo...</td>\n",
       "      <td>positive</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              review sentiment  \\\n",
       "0  One of the other reviewers has mentioned that ...  positive   \n",
       "1  A wonderful little production. <br /><br />The...  positive   \n",
       "2  I thought this was a wonderful way to spend ti...  positive   \n",
       "3  Basically there's a family where a little boy ...  negative   \n",
       "4  Petter Mattei's \"Love in the Time of Money\" is...  positive   \n",
       "5  Probably my all-time favorite movie, a story o...  positive   \n",
       "6  I sure would like to see a resurrection of a u...  positive   \n",
       "7  This show was an amazing, fresh & innovative i...  negative   \n",
       "8  Encouraged by the positive comments about this...  negative   \n",
       "9  If you like original gut wrenching laughter yo...  positive   \n",
       "\n",
       "   Positively Rated  \n",
       "0                 1  \n",
       "1                 1  \n",
       "2                 1  \n",
       "3                 0  \n",
       "4                 1  \n",
       "5                 1  \n",
       "6                 1  \n",
       "7                 0  \n",
       "8                 0  \n",
       "9                 1  "
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Drop missing values\n",
    "df.dropna(inplace=True)\n",
    "\n",
    "# Encode 4s and 5s as 1 (rated positively)\n",
    "# Encode 1s and 2s as 0 (rated poorly)\n",
    "df['Positively Rated'] = np.where(df['sentiment'] == 'positive', 1, 0)\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.5"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['Positively Rated'].mean()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Split data into training and test sets\n",
    "X_train, X_test, y_train, y_test = train_test_split(df['review'], \n",
    "                                                    df['Positively Rated'], \n",
    "                                                    random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "39758    This is a story of two dogs and a cat looking ...\n",
       "18457    A paranoid scientist creates a wolfman by tran...\n",
       "33239    Class Reunion is a very underated comedy gem. ...\n",
       "35006    This film IS brilliant...... without a doubt. ...\n",
       "30982    La Chute de la Maison Usher, or The Fall of th...\n",
       "23507    Doctor Feinstone is a dentist.He has a beautif...\n",
       "39796    Please, help the economy - spend your money el...\n",
       "6586     Dahl seems to have been under the influence of...\n",
       "22051    What an excellent movie, made even more so by ...\n",
       "32938    This movie is about a young girl who goes to l...\n",
       "32875    Wow, was this version of THE RACKETEER tough t...\n",
       "11942    I come from Bangladesh, and here, C.C.Costigan...\n",
       "25154    Updated from a previous comment. The great and...\n",
       "41573    A slick production which holds the interest fr...\n",
       "49277    Beautiful attracts excellent idea, but ruined ...\n",
       "44277    I hired the DVD yesterday and first of all it ...\n",
       "1127     Let's start from this point: This is not a mov...\n",
       "25515    At first glance, it would seem natural to comp...\n",
       "48553    The Tooth Fairy is about the ghost of an old d...\n",
       "18901    Apart from the DA (James Eckhouse), and a brie...\n",
       "19015    \"Everything is Illuminated\" is like viewing a ...\n",
       "39309    The Comebacks is a spoof on inspirational spor...\n",
       "7444     Don't get fooled with all the big names like B...\n",
       "4220     \"Happy Days\" was produced and broadcast from t...\n",
       "32462    I can honestly say I never expected this movie...\n",
       "35374    I suppose that in 1997 Hollywood wasn't quite ...\n",
       "48646    Let me just start out by saying that Tourist T...\n",
       "18176    The scripting of the subtle comedy is unmatche...\n",
       "48348    Yes, my summary just about tells it all.<br />...\n",
       "7286     Everything this film tried to do is done bette...\n",
       "                               ...                        \n",
       "7877     There are other movies about boarding schools ...\n",
       "37619    If this is all the Watchowski's have to offer ...\n",
       "5072     This movie is based on a Stephen King novel in...\n",
       "2163     I'm from Belgium and therefore my English writ...\n",
       "38804    And so it started with \"Shreik\" a send up of h...\n",
       "6921     Micro-phonies is a classic Stooge short. The g...\n",
       "38984    Holy cow, what a piece of sh*t this movie is. ...\n",
       "27469    First of all, let me say this film isn't for e...\n",
       "16921    This movie contains personalities that so deli...\n",
       "35665    one may ask why? the characters snarl, yell, a...\n",
       "24152    This movie is about human relationships. Charm...\n",
       "43095    I was a huge fan of the original Robocop.<br /...\n",
       "18983    \"GEORGE LOPEZ,\" in my opinion, is an absolute ...\n",
       "32230    See Dick work.<br /><br />See Jane work.<br />...\n",
       "17089    I went in not knowing anything about this movi...\n",
       "14650    I loved this movie and will watch it again. Or...\n",
       "39512    One of the best war films I have ever seen, if...\n",
       "48600    Well, you'd better if you plan on sitting thro...\n",
       "15430    One would think that a film based on the life ...\n",
       "14935    This is another of Hollywood's anti-communist ...\n",
       "46884    Brothers with psychokinetic powers (yes, reall...\n",
       "20757    Susie Q is a great romantic prom Movie. Amy Jo...\n",
       "41993    Yeah, it is. In fact, it's somewhere in my top...\n",
       "32103    This isn't a dreadful film, merely insipid. Th...\n",
       "30403    I also saw this upon its release in '56, and h...\n",
       "21243    I did not set very high expectations for this ...\n",
       "45891    THE BLOB is a great horror movie, not merely b...\n",
       "42613    After too many years of waiting, Anne Rivers S...\n",
       "43567    I am a massive fan of the LoG. I thought the f...\n",
       "2732     AG was an excellent presentation of drama, sus...\n",
       "Name: review, Length: 37500, dtype: object"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X_train first entry:\n",
      "\n",
      " This is a story of two dogs and a cat looking for their way back home.Old and wise Golden Retriever Shadow, young American Bulldog Chance and Himalayan cat Sassy flee from the ranch and go into the wilderness to be reunited with their family.Homeward Bound: The Incredible Journey (1993) is a family adventure directed by Duwayne Dunham.It's a remake of a 1963 film.This movie got a sequel three years later.Michael J. Fox is the perfect man to do the voice-over for Chance.Fox has some youthful energy he brings to the role.Sally Field does great voice work as Sassy.Don Ameche is fantastic as Shadow.This was this veteran actor's second last movie.Also the visible actors are great.Kim Greist plays Laura Burnford-Seaver.Robert Hays is Bob Seaver.Benji Thall plays Peter Burnford.Veronica Lauren is Hope Burnford.Kevin Chevalia is Jamie Seaver.Jean Smart portrays Kate.It's quite amazing to watch these pets trying to survive in the wilderness.We see Sassy taken by the river and she seems like a goner.The bear scene is exiting and funny.Chance has no chance with that big, hungry bear.And his meeting with the porcupine looks painful.This is some great fun for the whole family.\n",
      "\n",
      "\n",
      "X_train shape:  (37500,)\n"
     ]
    }
   ],
   "source": [
    "print('X_train first entry:\\n\\n', X_train.iloc[0])\n",
    "print('\\n\\nX_train shape: ', X_train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "# Fit the CountVectorizer to the training data\n",
    "vect = CountVectorizer().fit(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['00',\n",
       " 'actionless',\n",
       " 'andlaurel',\n",
       " 'audiobooks',\n",
       " 'befits',\n",
       " 'bolo',\n",
       " 'bushwhackers',\n",
       " 'chalie',\n",
       " 'cochlear',\n",
       " 'cornered',\n",
       " 'danube',\n",
       " 'diabolik',\n",
       " 'dozing',\n",
       " 'emile',\n",
       " 'expressively',\n",
       " 'flashiness',\n",
       " 'gake',\n",
       " 'goths',\n",
       " 'hark',\n",
       " 'hoodwinks',\n",
       " 'indecent',\n",
       " 'janaya',\n",
       " 'kidnappers',\n",
       " 'leaches',\n",
       " 'luján',\n",
       " 'mathematically',\n",
       " 'mirages',\n",
       " 'myabe',\n",
       " 'nunn',\n",
       " 'oxbow',\n",
       " 'petulant',\n",
       " 'powerhouses',\n",
       " 'quartmaster',\n",
       " 'rejenacyn',\n",
       " 'romania',\n",
       " 'schfrin',\n",
       " 'shin',\n",
       " 'snippers',\n",
       " 'static',\n",
       " 'surmising',\n",
       " 'teuton',\n",
       " 'transvestive',\n",
       " 'unended',\n",
       " 'via',\n",
       " 'whisk',\n",
       " 'zakk']"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vect.get_feature_names()[::2000]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "90506"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(vect.get_feature_names())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<37500x90506 sparse matrix of type '<class 'numpy.int64'>'\n",
       "\twith 5111856 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# transform the documents in the training data to a document-term matrix\n",
    "X_train_vectorized = vect.transform(X_train)\n",
    "\n",
    "X_train_vectorized"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
       "                   intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
       "                   multi_class='warn', n_jobs=None, penalty='l2',\n",
       "                   random_state=None, solver='warn', tol=0.0001, verbose=0,\n",
       "                   warm_start=False)"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "# Train the model\n",
    "model = LogisticRegression()\n",
    "model.fit(X_train_vectorized, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "AUC:  0.8841970005800444\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "# Predict the transformed test documents\n",
    "predictions = model.predict(vect.transform(X_test))\n",
    "\n",
    "print('AUC: ', roc_auc_score(y_test, predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Smallest Coefs:\n",
      "['worst' 'waste' 'forgettable' 'awful' 'disappointing' 'disappointment'\n",
      " 'stinker' 'poorly' 'fails' 'uninteresting']\n",
      "\n",
      "Largest Coefs: \n",
      "['refreshing' 'hooked' 'wonderfully' 'raunchy' 'adr' 'superb' 'perfect'\n",
      " 'delightful' 'squirrel' 'funniest']\n"
     ]
    }
   ],
   "source": [
    "# get the feature names as numpy array\n",
    "feature_names = np.array(vect.get_feature_names())\n",
    "\n",
    "# Sort the coefficients from the model\n",
    "sorted_coef_index = model.coef_[0].argsort()\n",
    "\n",
    "# Find the 10 smallest and 10 largest coefficients\n",
    "# The 10 largest coefficients are being indexed using [:-11:-1] \n",
    "# so the list returned is in order of largest to smallest\n",
    "print('Smallest Coefs:\\n{}\\n'.format(feature_names[sorted_coef_index[:10]]))\n",
    "print('Largest Coefs: \\n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tfidf\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "32673"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "# Fit the TfidfVectorizer to the training data specifiying a minimum document frequency of 5\n",
    "vect = TfidfVectorizer(min_df=5).fit(X_train)\n",
    "len(vect.get_feature_names())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "AUC:  0.8933488824184665\n"
     ]
    }
   ],
   "source": [
    "X_train_vectorized = vect.transform(X_train)\n",
    "\n",
    "model = LogisticRegression()\n",
    "model.fit(X_train_vectorized, y_train)\n",
    "\n",
    "predictions = model.predict(vect.transform(X_test))\n",
    "\n",
    "print('AUC: ', roc_auc_score(y_test, predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Smallest tfidf:\n",
      "['cavalryman' 'horace' 'ershadi' 'ebrahimi' 'homayoun' 'mahmoodzada'\n",
      " 'rueful' 'musclebound' 'décor' 'bails']\n",
      "\n",
      "Largest tfidf: \n",
      "['pokemon' 'ghoulies' 'robot' 'ernest' 'cycle' 'lupin' 'rodrigues'\n",
      " 'gamera' 'wei' 'steve']\n"
     ]
    }
   ],
   "source": [
    "feature_names = np.array(vect.get_feature_names())\n",
    "\n",
    "sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()\n",
    "\n",
    "print('Smallest tfidf:\\n{}\\n'.format(feature_names[sorted_tfidf_index[:10]]))\n",
    "print('Largest tfidf: \\n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Smallest Coefs:\n",
      "['worst' 'bad' 'waste' 'awful' 'boring' 'terrible' 'poor' 'nothing' 'dull'\n",
      " 'worse']\n",
      "\n",
      "Largest Coefs: \n",
      "['great' 'excellent' 'best' 'perfect' 'wonderful' 'amazing' 'today'\n",
      " 'loved' 'fun' 'favorite']\n"
     ]
    }
   ],
   "source": [
    "sorted_coef_index = model.coef_[0].argsort()\n",
    "\n",
    "print('Smallest Coefs:\\n{}\\n'.format(feature_names[sorted_coef_index[:10]]))\n",
    "print('Largest Coefs: \\n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1 0]\n"
     ]
    }
   ],
   "source": [
    "print(model.predict(vect.transform(['People with bias are reviewing the movie. Watch it, it is mind blowing. Great acting, awesome music scores and really awesome direction. Cheers!',\n",
    "                                    'Here the hero is glorified while he slaps his girlfriend, treats her like a courtesan n the girlfriend is shown as a docile animal. The acting by Shahid is getting monotonous. He acted the same like he did in Uddta Punjab. This film is nothing but a z grade version of Devdas n Dev D.'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "##n-grams¶\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "213759"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fit the CountVectorizer to the training data specifiying a minimum \n",
    "# document frequency of 5 and extracting 1-grams and 2-grams\n",
    "vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)\n",
    "\n",
    "X_train_vectorized = vect.transform(X_train)\n",
    "\n",
    "len(vect.get_feature_names())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "AUC:  0.9041957619987435\n"
     ]
    }
   ],
   "source": [
    "model = LogisticRegression()\n",
    "model.fit(X_train_vectorized, y_train)\n",
    "\n",
    "predictions = model.predict(vect.transform(X_test))\n",
    "\n",
    "print('AUC: ', roc_auc_score(y_test, predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Smallest Coefs:\n",
      "['worst' 'awful' 'boring' 'waste' 'disappointment' 'terrible'\n",
      " 'disappointing' 'horrible' 'not worth' 'poorly']\n",
      "\n",
      "Largest Coefs: \n",
      "['perfect' 'excellent' 'hilarious' 'gem' 'amazing' 'superb' 'loved this'\n",
      " 'today' 'incredible' 'well worth']\n"
     ]
    }
   ],
   "source": [
    "feature_names = np.array(vect.get_feature_names())\n",
    "\n",
    "sorted_coef_index = model.coef_[0].argsort()\n",
    "\n",
    "print('Smallest Coefs:\\n{}\\n'.format(feature_names[sorted_coef_index[:10]]))\n",
    "print('Largest Coefs: \\n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1 0]\n"
     ]
    }
   ],
   "source": [
    "# These reviews are now correctly identified\n",
    "print(model.predict(vect.transform(['People with bias are reviewing the movie. Watch it, it is mind blowing. Great acting, awesome music scores and really awesome direction. Cheers!',\n",
    "                                    'Here the hero is glorified while he slaps his girlfriend, treats her like a courtesan n the girlfriend is shown as a docile animal. The acting by Shahid is getting monotonous. He acted the same like he did in Uddta Punjab. This film is nothing but a z grade version of Devdas n Dev D.'])))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}