Untitled

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import json\n",
    "\n",
    "import numpy as np\n",
    "import faiss\n",
    "\n",
    "import sys\n",
    "\n",
    "def ivecs_read(fname):\n",
    "    a = np.fromfile(fname, dtype='int32')\n",
    "    d = a[0]\n",
    "    return a.reshape(-1, d + 1)[:, 1:].copy()\n",
    "\n",
    "def fvecs_read(fname):\n",
    "    return ivecs_read(fname).view('float32')\n",
    "\n",
    "simdir = '/mnt/vol/gfsai-east/ai-group/datasets/simsearch/'\n",
    "\n",
    "def load_sift1M(root_dir=simdir):\n",
    "    print(\"Loading sift1M...\")\n",
    "    xt = fvecs_read(\"%s/sift1M/sift_learn.fvecs\" %(root_dir))\n",
    "    xb = fvecs_read(\"%s/sift1M/sift_base.fvecs\" % (root_dir))\n",
    "    xq = fvecs_read(\"%s/sift1M/sift_query.fvecs\" % (root_dir))\n",
    "    gt = ivecs_read(\"%s/sift1M/sift_groundtruth.ivecs\" %(root_dir))\n",
    "\n",
    "    return xb, xq, xt, gt\n",
    "\n",
    "\n",
    "def load_random():\n",
    "    print(\"Loading random...\")\n",
    "    np.random.seed(1234)             # make reproducible\n",
    "    xb = np.random.random((1000 * 1000, 128)).astype('float32')\n",
    "    xb[:, 0] += np.arange(1000 * 1000) / 1000.\n",
    "    xq = xb[:100]\n",
    "    xt = xb\n",
    "    gt = xb\n",
    "    return xb, xq, xt, gt\n",
    "\n",
    "def test_with(index, xb, xq):\n",
    "    index.train(xb)\n",
    "\n",
    "    index.add(xb)\n",
    "    \n",
    "    stats = faiss.cvar.indexIVF_stats\n",
    "    stats.reset()    \n",
    "    total_times = []\n",
    "    for j in range(100):\n",
    "        t0 = time.time()\n",
    "        D, I = index.search(xq[:10], 10)\n",
    "        t1 = time.time()\n",
    "        total_times.append((t1 - t0) * 1000.0)\n",
    "    print(np.median(total_times))\n",
    "\n",
    "def test_with1(index, xb, xq):\n",
    "\n",
    "    stats = faiss.cvar.indexIVF_stats\n",
    "    stats.reset()    \n",
    "    total_times = []\n",
    "    for j in range(100):\n",
    "        t0 = time.time()\n",
    "        D, I = index.search(xq[:10], 10)\n",
    "        t1 = time.time()\n",
    "        total_times.append((t1 - t0) * 1000.0)\n",
    "    print(np.median(total_times), stats.ndis)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading sift1M...\n",
      "(10000, 512) (1000000, 512)\n",
      "47.14250564575195 137396900\n"
     ]
    }
   ],
   "source": [
    "\n",
    "d = 512\n",
    "\n",
    "# SIFT1M, with every vector zero-padded on the right up to d dimensions\n",
    "xb, xq, xt, gt = load_sift1M()\n",
    "xb = np.concatenate((xb, np.zeros((xb.shape[0], d - xb.shape[1]), dtype=xb.dtype)), axis=1)\n",
    "xq = np.concatenate((xq, np.zeros((xq.shape[0], d - xq.shape[1]), dtype=xq.dtype)), axis=1)\n",
    "\n",
    "print(xq.shape, xb.shape)\n",
    "\n",
    "# IVF-Flat index over an IndexFlatL2 quantizer, built with 128 lists and nprobe = 16\n",
    "faiss.omp_set_num_threads(16)\n",
    "index = faiss.IndexIVFFlat(faiss.IndexFlatL2(d), d, 128)\n",
    "index.nprobe = 16\n",
    "test_with(index, xb, xq)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading random...\n",
      "(100, 512) (1000000, 512)\n",
      "29.09719944000244 88811000\n"
     ]
    }
   ],
   "source": [
    "# Synthetic random data, zero-padded on the right up to d dimensions\n",
    "# (d is defined in the SIFT1M cell above)\n",
    "xb, xq, xt, gt = load_random()\n",
    "xb = np.concatenate((xb, np.zeros((xb.shape[0], d - xb.shape[1]), dtype=xb.dtype)), axis=1)\n",
    "xq = np.concatenate((xq, np.zeros((xq.shape[0], d - xq.shape[1]), dtype=xq.dtype)), axis=1)\n",
    "\n",
    "print(xq.shape, xb.shape)\n",
    "\n",
    "# Same index configuration as the SIFT1M run: 128 lists, nprobe = 16\n",
    "faiss.omp_set_num_threads(16)\n",
    "index2 = faiss.IndexIVFFlat(faiss.IndexFlatL2(d), d, 128)\n",
    "index2.nprobe = 16\n",
    "test_with(index2, xb, xq)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ask the coarse quantizer directly for the nearest centroids of the first 10 queries.\n",
    "# Use index2.nprobe rather than repeating the literal 16 so this stays in sync with the index.\n",
    "D, I = index2.quantizer.search(xq[:10], index2.nprobe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "57.5098991394043 151247400\n"
     ]
    }
   ],
   "source": [
    "# Benchmark with base vectors drawn at random instead of the leading rows.\n",
    "# The row count comes from xb itself rather than a hard-coded 1000000;\n",
    "# test_with1 only searches the first 10 of the 100 sampled rows.\n",
    "test_with1(index2, xb, xb[np.random.choice(xb.shape[0], size=100)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([5658, 5176, 4474, 4336, 4191, 4410, 4866, 5219, 5097, 5018, 5732,\n",
       "       6245, 6441, 6860, 7450, 7638])"
      ]
     },
     "execution_count": 30,
     "metadata": {
      "bento_obj_id": "139687776509584"
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Size of every inverted list (nlist is read from the index instead of hard-coding 128),\n",
    "# then the sizes of the lists the quantizer returned for the first query (I[0]).\n",
    "list_sizes = np.array([index2.invlists.list_size(i) for i in range(index2.nlist)])\n",
    "list_sizes[I[0]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7812.5"
      ]
     },
     "execution_count": 29,
     "metadata": {
      "bento_obj_id": "139693108448184"
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Average inverted-list size, for comparison with the per-query sizes shown above\n",
    "np.mean(list_sizes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "bento_stylesheets": {
   "bento/extensions/flow/main.css": true,
   "bento/extensions/kernel_selector/main.css": true,
   "bento/extensions/kernel_ui/main.css": true,
   "bento/extensions/new_kernel/main.css": true,
   "bento/extensions/system_usage/main.css": true,
   "bento/extensions/theme/main.css": true
  },
  "kernelspec": {
   "display_name": "faiss",
   "language": "python",
   "name": "bento_kernel_faiss"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3rc1+"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}