{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Helper functions:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ohe(X, idx2word):\n",
    "    \"\"\"One-hot encode a list of token ids (or lists of token ids).\"\"\"\n",
    "    ncol = len(idx2word.keys())\n",
    "    nrow = len(X)\n",
    "    OHE_X = np.zeros((nrow, ncol))\n",
    "    for r in range(len(X)):\n",
    "        if not isinstance(X[r], list):\n",
    "            OHE_X[r, X[r]] = 1\n",
    "        else:\n",
    "            row_val = X[r]\n",
    "            for c in row_val:\n",
    "                OHE_X[r, c] = 1\n",
    "\n",
    "    return OHE_X\n",
    "\n",
    "\n",
    "def tokenize(x_list):\n",
    "    # unique tokens:\n",
    "    unique_x = list(set([j for i in x_list for j in i]))\n",
    "    idx2word = dict(enumerate(unique_x))\n",
    "    word2idx = {i[1]: i[0] for i in idx2word.items()}\n",
    "    # Encode:\n",
    "    tokened_x_list = []\n",
    "    for sentence in x_list:\n",
    "        temp_sent = []\n",
    "        for word in sentence:\n",
    "            token = word2idx.get(word, -1)\n",
    "            temp_sent.append(token)\n",
    "\n",
    "        tokened_x_list.append(temp_sent)\n",
    "    return tokened_x_list, idx2word, word2idx\n",
    "\n",
    "\n",
    "def skipgram_prep(x_list, context_window=2):\n",
    "    \"\"\"\n",
    "    Use the skip-gram method to prepare the data.\n",
    "\n",
    "    Arguments:\n",
    "    x_list(list): tokenized training data\n",
    "\n",
    "    context_window: the context window on each side.\n",
    "        For example, if context_window=2, we will be looking at 2 tokens on the left and\n",
    "        2 tokens on the right.\n",
    "\n",
    "    Returns:\n",
    "    processed_data(list): a list of tuples representing the processed data. Each tuple is an (x, y) pair.\n",
    "    \"\"\"\n",
    "    processed_data = []\n",
    "\n",
    "    for row in x_list:\n",
    "        row_len = len(row)\n",
    "        for i in range(row_len):\n",
    "            x = row[i]\n",
    "            start_idx = max(i - context_window, 0)\n",
    "            end_idx = min(row_len, i + context_window + 1)\n",
    "            y = row[start_idx:i] + row[i + 1:end_idx]  # skip the word itself\n",
    "\n",
    "            temp_xy_pair = zip([x] * len(y), y)\n",
    "            processed_data.extend(temp_xy_pair)\n",
    "\n",
    "    return processed_data\n",
    "\n",
    "\n",
    "def softmax(x):\n",
    "    # subtract the row-wise max for numerical stability (does not change the result)\n",
    "    e_x = np.exp(x - x.max(axis=1, keepdims=True))\n",
    "    return np.divide(e_x, e_x.sum(axis=1).reshape(-1, 1))"
   ]
  },
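  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick illustrative check of `skipgram_prep` (hypothetical toy input, not part of the dataset below): for a single tokenized sentence `[0, 1, 2, 3]` with `context_window=2`, each token should be paired with its neighbours on both sides."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Toy sanity check (hypothetical example, not the training data):\n",
    "# each (x, y) pair is (centre token, one context token).\n",
    "skipgram_prep([[0, 1, 2, 3]], context_window=2)\n",
    "# Expected: [(0, 1), (0, 2), (1, 0), (1, 2), (1, 3),\n",
    "#            (2, 0), (2, 1), (2, 3), (3, 1), (3, 2)]"
   ]
  },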
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = [\n",
    "    'apple banana are delicious food',\n",
    "    'video game go play in game studio',\n",
    "    'lunch food is fruit apple banana icecream',\n",
    "    'warcraft or starcraft or overwatch best game',\n",
    "    'chocolate or banana or icecream the most delicious food',\n",
    "    'banana apple smoothie is the best for lunch or dinner',\n",
    "    'video game is good for geeks',\n",
    "    'what to eat for dinner banana or chocolate',\n",
    "    'which game company is better ubisoft or blizzard',\n",
    "    'play game on ps4 or xbox',\n",
    "    'banana is less sweet icecream is more sweet',\n",
    "    'chocolate icecream taste more delicious than banana'\n",
    "]\n",
    "\n",
    "data = [i.split(\" \") for i in data]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Tokenization:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized_data_list, idx2word, word2idx = tokenize(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Use Skipgram to Prepare the Training Data:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "prep_data = skipgram_prep(tokenized_data_list, context_window=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = [i[0] for i in prep_data]\n",
    "Y = [i[1] for i in prep_data]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### One-Hot Encoding (OHE):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "ohe_X = ohe(X, idx2word)\n",
    "ohe_Y = ohe(Y, idx2word)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Naive Word2vec Model:\n",
    "\n",
    "First, let's build a naive Word2vec model, meaning we compute a softmax across the entire vocabulary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hyperparameters:\n",
    "N_NEGATIVE = 3  # not used by the naive model; intended for the negative-sampling section below\n",
    "LEARNING_RATE = 0.01\n",
    "N_VOCAB = len(idx2word)\n",
    "N_DIM = 16\n",
    "BATCH_SIZE = len(ohe_X)\n",
    "\n",
    "# Weights initialization:\n",
    "embedding_mat = np.random.normal(size=(N_VOCAB, N_DIM))\n",
    "dense_w = np.random.normal(size=(N_DIM, N_VOCAB))"
   ]
  },
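  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For reference (the standard softmax cross-entropy derivation, not specific to this notebook): with a one-hot centre-word row vector $x$, a one-hot context vector $y$, embedding matrix $E$ and output weights $W$, the forward pass below computes\n",
    "\n",
    "$$h = xE, \\qquad z = hW, \\qquad \\hat{y} = \\mathrm{softmax}(z), \\qquad L = -\\sum_j y_j \\log \\hat{y}_j,$$\n",
    "\n",
    "and the usual identity $\\partial L / \\partial z = \\hat{y} - y$ is why the backward pass starts from `output_layer - input_y`."
   ]
  },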
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loss: 9.378688108625873\n",
      "Loss: 2.005298530982752\n",
      "Loss: 1.9282251934894516\n",
      "Loss: 1.915202004986637\n",
      "Loss: 1.9103938902718987\n",
      "Loss: 1.9079612745844892\n",
      "Loss: 1.9067267756355368\n",
      "Loss: 1.9116364898998748\n",
      "Loss: 1.9093111376997192\n",
      "Loss: 1.9079385961499742\n",
      "Loss: 1.9070468916300318\n",
      "Loss: 1.9064129969572752\n",
      "Loss: 1.9059291852573528\n",
      "Loss: 1.9055409406255481\n",
      "Loss: 1.9052187142658792\n"
     ]
    }
   ],
   "source": [
    "all_loss = []\n",
    "for i in range(1500):\n",
    "\n",
    "    # Forward pass:\n",
    "    input_x = ohe_X\n",
    "    input_y = ohe_Y\n",
    "    x_embedding_layer = input_x.dot(embedding_mat)  # look up the embedding of the query word\n",
    "    dense_layer = x_embedding_layer.dot(dense_w)\n",
    "    output_layer = softmax(dense_layer)\n",
    "\n",
    "    # Cross-entropy loss (with a small epsilon for numerical stability):\n",
    "    loss = -np.sum(input_y * np.log(output_layer + 1e-9)) / BATCH_SIZE\n",
    "    if i % 100 == 0:\n",
    "        print(f\"Loss: {loss}\")\n",
    "    all_loss.append(loss)\n",
    "\n",
    "    # Backward pass\n",
    "\n",
    "    # d_loss/d_dense_layer = d_loss/d_output_layer * d_output_layer/d_dense_layer\n",
    "    d_dense = output_layer - input_y\n",
    "\n",
    "    # d_loss/d_dense_w = d_loss/d_dense_layer * d_dense_layer/d_dense_w\n",
    "    d_dense_w = d_dense.T.dot(x_embedding_layer).T\n",
    "\n",
    "    # d_loss/d_x_embedding_layer = d_loss/d_dense_layer * d_dense_layer/d_x_embedding_layer\n",
    "    d_emb_layer = d_dense.dot(dense_w.T)\n",
    "\n",
    "    # d_loss/d_embedding_mat = d_loss/d_x_embedding_layer * d_x_embedding_layer/d_embedding_mat\n",
    "    d_embedding_mat = d_emb_layer.T.dot(input_x)\n",
    "\n",
    "    # Gradient descent update:\n",
    "    embedding_mat -= LEARNING_RATE * d_embedding_mat.T\n",
    "    dense_w -= LEARNING_RATE * d_dense_w"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now that the mini word2vec model is trained, let's build a query function to check it:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_word_vector(word, embedding=embedding_mat, word2idx=word2idx, vector_dim=N_DIM):\n",
    "    \"\"\"Return the embedding of `word`, or a vector of -999 if it is out of vocabulary.\"\"\"\n",
    "    query_id = word2idx.get(word, -1)\n",
    "    if query_id >= 0:\n",
    "        return embedding[query_id, :]\n",
    "    else:\n",
    "        return np.zeros((vector_dim,)) - 999."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-1.10479476,  0.69152043,  0.0222014 ,  0.47398416,  0.85253254,\n",
       "        1.29816081,  0.46473506, -0.17165976,  0.02458933, -0.58116457,\n",
       "       -0.40560783,  2.78396632, -0.96417779,  2.04935327,  0.82896536,\n",
       "       -0.92053599])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Try with an in-vocabulary word:\n",
    "query_word = 'xbox'\n",
    "get_word_vector(query_word)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-999., -999., -999., -999., -999., -999., -999., -999., -999.,\n",
       "       -999., -999., -999., -999., -999., -999., -999.])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Try with an out-of-vocabulary word:\n",
    "query_word = 'lol'\n",
    "get_word_vector(query_word)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Now find the most similar word to our query word:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from numpy import dot\n",
    "from numpy.linalg import norm\n",
    "\n",
    "\n",
    "def cosine_sim(vx, vy):\n",
    "    return dot(vx, vy) / (norm(vx) * norm(vy))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_similar(query_word, word2idx=word2idx):\n",
    "    \"\"\"Return {word: cosine similarity to query_word} for every word in the vocabulary.\n",
    "\n",
    "    Note: the query word itself is included, so it appears with similarity 1.0.\n",
    "    \"\"\"\n",
    "    query_vector = get_word_vector(query_word)\n",
    "\n",
    "    result = {}\n",
    "    for word in word2idx:\n",
    "        temp_vector = get_word_vector(word)\n",
    "        sim = cosine_sim(query_vector, temp_vector)\n",
    "        result[word] = sim\n",
    "\n",
    "    return result\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('food', 1.0), ('apple', 0.5053182732410727), ('less', 0.34573572640324446)]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result = find_similar('food')\n",
    "\n",
    "sorted(list(result.items()), key=lambda x: x[1], reverse=True)[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('warcraft', 0.9999999999999999),\n",
       " ('ubisoft', 0.4935534265673933),\n",
       " ('overwatch', 0.467640114458129)]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result = find_similar('warcraft')\n",
    "\n",
    "sorted(list(result.items()), key=lambda x: x[1], reverse=True)[:3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### So what's the problem here? The algorithm above will never scale to a real-world problem!\n",
    "\n",
    "We only have 44 words in the vocabulary of this toy example. What if we have millions of words? The softmax over the full vocabulary becomes very expensive. To tackle this issue, several algorithms have been proposed to approximate the softmax, such as hierarchical softmax, negative sampling, and noise-contrastive estimation (NCE).\n"
   ]
  },
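  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To make the cost concrete (standard skip-gram formulation): the probability of a context word $c$ given a centre word $w$ is\n",
    "\n",
    "$$p(c \\mid w) = \\frac{\\exp(u_c^\\top v_w)}{\\sum_{k \\in V} \\exp(u_k^\\top v_w)},$$\n",
    "\n",
    "so every update has to touch all $|V|$ output vectors in the denominator, roughly $O(|V| \\cdot d)$ work per example. Negative sampling instead scores only the true context word plus a handful of sampled \"negative\" words."
   ]
  },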
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Coming Soon...."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Negative Sampling:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Work in progress: a sampler that draws negative examples for each (x, y) pair.\n",
    "# def sample_negative(xy_pairs, n_negative, idx2word):\n",
    "#\n",
    "#     pos_context = {}\n",
    "#     grand_negative_samples = []\n",
    "#     for x, y in xy_pairs:\n",
    "#\n",
    "#         if x not in pos_context:\n",
    "#             good_pair = [i[1] for i in xy_pairs if i[0] == x]\n",
    "#             pos_context[x] = good_pair\n",
    "#\n",
    "#         ## Sample:\n",
    "#         temp_neg_samples = []\n",
    "#         while len(temp_neg_samples) < n_negative:\n",
    "#             temp_idx = np.random.choice(list(idx2word.keys()))\n",
    "#             if temp_idx != x and temp_idx not in pos_context[x]:\n",
    "#                 temp_neg_samples.append(temp_idx)\n",
    "#\n",
    "#         grand_negative_samples.append(temp_neg_samples)\n",
    "#     return np.array(grand_negative_samples)"
   ]
  }
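,
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of what the negative-sampling objective could look like with the matrices above. This is an illustrative outline, not a tested implementation: `negative_sampling_loss` and `sigmoid` are hypothetical helpers, and `neg_ids` is assumed to come from a sampler such as the commented-out `sample_negative` above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sigmoid(x):\n",
    "    return 1.0 / (1.0 + np.exp(-x))\n",
    "\n",
    "\n",
    "def negative_sampling_loss(center_id, context_id, neg_ids,\n",
    "                           embedding=embedding_mat, out_w=dense_w):\n",
    "    \"\"\"Skip-gram negative-sampling loss for one (centre, context) pair (sketch).\n",
    "\n",
    "    Instead of a softmax over all |V| words, we score the true context word\n",
    "    against a few sampled negatives, so the cost per pair is roughly\n",
    "    O((1 + len(neg_ids)) * N_DIM) rather than O(N_VOCAB * N_DIM).\n",
    "    \"\"\"\n",
    "    v_c = embedding[center_id]    # (N_DIM,)   centre-word vector\n",
    "    u_pos = out_w[:, context_id]  # (N_DIM,)   output vector of the true context word\n",
    "    u_neg = out_w[:, neg_ids]     # (N_DIM, k) output vectors of the sampled negatives\n",
    "\n",
    "    pos_term = np.log(sigmoid(u_pos.dot(v_c)) + 1e-9)\n",
    "    neg_term = np.sum(np.log(sigmoid(-u_neg.T.dot(v_c)) + 1e-9))\n",
    "    return -(pos_term + neg_term)\n",
    "\n",
    "\n",
    "# Example call shape (hypothetical negatives, just for illustration):\n",
    "# negative_sampling_loss(word2idx['game'], word2idx['play'], neg_ids=[0, 5, 7])"
   ]
  }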
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}