dl_imdb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from keras.datasets import imdb\n",
    "from keras import models\n",
    "from keras import layers\n",
    "from keras import optimizers\n",
    "from keras import losses\n",
    "from keras import metrics\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the data, keeping only 10,000 of the most frequently occuring words\n",
    "(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words = 10000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check the first label\n",
    "train_labels[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Here is a list of maximum indexes in every review --- we search the maximum index in this\n",
    "print(type([max(sequence) for sequence in train_data]))\n",
    "# Find the maximum of all max indexes\n",
    "max([max(sequence) for sequence in train_data])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's quickly decode a review\n",
    "# step 1: load the dictionary mappings from word to integer index\n",
    "word_index = imdb.get_word_index()\n",
    "# step 2: reverse word index to map integer indexes to their respective words\n",
    "reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])\n",
    "# Step 3: decode the review, mapping integer indices to words\n",
    "#\n",
    "# indices are off by 3 because 0, 1, and 2 are reserverd indices for \"padding\", \"Start of se\n",
    "decoded_review = ' '.join([reverse_word_index.get(i-3, '?') for i in train_data[0]])\n",
    "decoded_review"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(reverse_word_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def vectorize_sequences(sequences, dimension=10000):\n",
    "    results = np.zeros((len(sequences), dimension)) # Creates an all zero matrix of shape\n",
    "    for i,sequence in enumerate(sequences):\n",
    "        results[i,sequence] = 1\n",
    "    return results\n",
    "\n",
    "# Vectorize training Data\n",
    "X_train = vectorize_sequences(train_data)\n",
    "# Vectorize testing Data\n",
    "X_test = vectorize_sequences(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#vectorize labels\n",
    "y_train = np.asarray(train_labels).astype('float32')\n",
    "y_test = np.asarray(test_labels).astype('float32')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#model definition\n",
    "model = models.Sequential()\n",
    "model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))\n",
    "model.add(layers.Dense(16, activation='relu'))\n",
    "model.add(layers.Dense(1, activation='sigmoid'))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#compiling model\n",
    "model.compile(\n",
    "optimizer=optimizers.RMSprop(learning_rate=0.001),\n",
    "loss = losses.binary_crossentropy,\n",
    "metrics = [metrics.binary_accuracy]\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input for Validation\n",
    "X_val = X_train[:10000]\n",
    "partial_X_train = X_train[10000:]\n",
    "# Labels for validation\n",
    "y_val = y_train[:10000]\n",
    "partial_y_train = y_train[10000:]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "history = model.fit(\n",
    "partial_X_train,\n",
    "partial_y_train,\n",
    "epochs=20,\n",
    "batch_size=512,\n",
    "validation_data=(X_val, y_val)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "history_dict = history.history\n",
    "history_dict.keys()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plotting losses\n",
    "loss_values = history_dict['loss']\n",
    "val_loss_values = history_dict['val_loss']\n",
    "epochs = range(1, len(loss_values) + 1)\n",
    "plt.plot(epochs, loss_values, 'g', label=\"Training Loss\")\n",
    "plt.plot(epochs, val_loss_values, 'b', label=\"Validation Loss\")\n",
    "plt.title('Training and Validation Loss')\n",
    "plt.xlabel('Epochs')\n",
    "plt.ylabel('Loss Value')\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Training and Validation Accuracy\n",
    "acc_values = history_dict['binary_accuracy']\n",
    "val_acc_values = history_dict['val_binary_accuracy']\n",
    "epochs = range(1, len(loss_values) + 1)\n",
    "plt.plot(epochs, acc_values, 'g', label=\"Training Accuracy\")\n",
    "plt.plot(epochs, val_acc_values, 'b', label=\"Validation Accuracy\")\n",
    "plt.title('Training and Validation Accuraccy')\n",
    "plt.xlabel('Epochs')\n",
    "plt.ylabel('Accuracy')\n",
    "plt.legend()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Making Predictions for testing data\n",
    "np.set_printoptions(suppress=True)\n",
    "result = model.predict(X_test)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}