Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [],
- "source": [
- "import time\n",
- "import json\n",
- "\n",
- "import numpy as np\n",
- "import faiss\n",
- "\n",
- "import sys\n",
- "\n",
- "def ivecs_read(fname):\n",
- "    \"\"\"Read an .ivecs file into a 2-D int32 array.\n",
- "\n",
- "    The file is loaded as a flat int32 stream. Its first value is used as the\n",
- "    vector dimension d; the stream is reshaped to rows of d + 1 values and the\n",
- "    first column of every row is dropped.\n",
- "    \"\"\"\n",
- "    raw = np.fromfile(fname, dtype='int32')\n",
- "    dim = raw[0]\n",
- "    rows = raw.reshape(-1, dim + 1)\n",
- "    # .copy() returns an independent array rather than a view into raw.\n",
- "    return rows[:, 1:].copy()\n",
- "\n",
- "def fvecs_read(fname):\n",
- "    \"\"\"Read an .fvecs file: parse it with ivecs_read, then reinterpret as float32.\"\"\"\n",
- "    as_int32 = ivecs_read(fname)\n",
- "    return as_int32.view('float32')\n",
- "\n",
- "# Default dataset root used by load_sift1M (hard-coded absolute path).\n",
- "simdir = '/mnt/vol/gfsai-east/ai-group/datasets/simsearch/'\n",
- "\n",
- "def load_sift1M(root_dir=simdir):\n",
- "    \"\"\"Load the four sift1M files found under <root_dir>/sift1M/.\n",
- "\n",
- "    Returns (xb, xq, xt, gt): the base, query and learn sets read with\n",
- "    fvecs_read, and the ground-truth file read with ivecs_read.\n",
- "    \"\"\"\n",
- "    print(\"Loading sift1M...\")\n",
- "    learn_path = \"%s/sift1M/sift_learn.fvecs\" % root_dir\n",
- "    xt = fvecs_read(learn_path)\n",
- "    base_path = \"%s/sift1M/sift_base.fvecs\" % root_dir\n",
- "    xb = fvecs_read(base_path)\n",
- "    query_path = \"%s/sift1M/sift_query.fvecs\" % root_dir\n",
- "    xq = fvecs_read(query_path)\n",
- "    gt_path = \"%s/sift1M/sift_groundtruth.ivecs\" % root_dir\n",
- "    gt = ivecs_read(gt_path)\n",
- "\n",
- "    return xb, xq, xt, gt\n",
- "\n",
- "\n",
- "def load_random():\n",
- "    \"\"\"Build a seeded random float32 dataset of 1,000,000 x 128 values.\n",
- "\n",
- "    Returns (xb, xq, xt, gt). xq is the first 100 rows of xb; xt and gt are\n",
- "    xb itself, so no separate training set or real ground truth is produced.\n",
- "    \"\"\"\n",
- "    print(\"Loading random...\")\n",
- "    np.random.seed(1234)  # make reproducible\n",
- "    n_base = 1000 * 1000\n",
- "    xb = np.random.random((n_base, 128)).astype('float32')\n",
- "    # Add a ramp (row index / 1000) to the first column of every row.\n",
- "    xb[:, 0] += np.arange(n_base) / 1000.\n",
- "    return xb, xb[:100], xb, xb\n",
- "\n",
- "def test_with(index, xb, xq, nq=10, k=10, nrun=100):\n",
- "    \"\"\"Train and fill `index` with xb, then time repeated searches.\n",
- "\n",
- "    Args:\n",
- "        index: faiss index; it is trained on xb and xb is added to it.\n",
- "        xb: base vectors used for both training and adding.\n",
- "        xq: query vectors; only the first `nq` rows are searched.\n",
- "        nq: number of queries per search call (default 10).\n",
- "        k: number of neighbours requested per query (default 10).\n",
- "        nrun: number of timed search calls (default 100).\n",
- "\n",
- "    Prints the median search time in milliseconds followed by stats.ndis,\n",
- "    the IVF distance counter accumulated over all `nrun` calls.\n",
- "    \"\"\"\n",
- "    index.train(xb)\n",
- "    index.add(xb)\n",
- "\n",
- "    # Reset the global IVF counters so ndis only reflects the searches below.\n",
- "    stats = faiss.cvar.indexIVF_stats\n",
- "    stats.reset()\n",
- "    total_times = []\n",
- "    for _ in range(nrun):\n",
- "        t0 = time.perf_counter()\n",
- "        index.search(xq[:nq], k)\n",
- "        t1 = time.perf_counter()\n",
- "        total_times.append((t1 - t0) * 1000.0)\n",
- "    print(np.median(total_times), stats.ndis)\n",
- "\n",
- "def test_with1(index, xb, xq, nq=10, k=10, nrun=100):\n",
- "    \"\"\"Time repeated searches on `index` without training or adding.\n",
- "\n",
- "    Args:\n",
- "        index: faiss index to search; this function neither trains nor fills it.\n",
- "        xb: unused here; kept so the signature matches test_with.\n",
- "        xq: query vectors; only the first `nq` rows are searched.\n",
- "        nq: number of queries per search call (default 10).\n",
- "        k: number of neighbours requested per query (default 10).\n",
- "        nrun: number of timed search calls (default 100).\n",
- "\n",
- "    Prints the median search time in milliseconds followed by stats.ndis,\n",
- "    the IVF distance counter accumulated over all `nrun` calls.\n",
- "    \"\"\"\n",
- "    # Reset the global IVF counters so ndis only reflects the searches below.\n",
- "    stats = faiss.cvar.indexIVF_stats\n",
- "    stats.reset()\n",
- "    total_times = []\n",
- "    for _ in range(nrun):\n",
- "        t0 = time.perf_counter()\n",
- "        index.search(xq[:nq], k)\n",
- "        t1 = time.perf_counter()\n",
- "        total_times.append((t1 - t0) * 1000.0)\n",
- "    print(np.median(total_times), stats.ndis)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Loading sift1M...\n",
- "(10000, 512) (1000000, 512)\n",
- "47.14250564575195 137396900\n"
- ]
- }
- ],
- "source": [
- "\n",
- "d = 512\n",
- "\n",
- "# sift1M, zero-padded on the right up to d columns\n",
- "xb, xq, xt, gt = load_sift1M()\n",
- "xb = np.concatenate(\n",
- "    [xb, np.zeros((xb.shape[0], d - xb.shape[1]), dtype=xb.dtype)], axis=1)\n",
- "xq = np.concatenate(\n",
- "    [xq, np.zeros((xq.shape[0], d - xq.shape[1]), dtype=xq.dtype)], axis=1)\n",
- "\n",
- "print(xq.shape, xb.shape)\n",
- "\n",
- "# IVFFlat index built on an IndexFlatL2 quantizer, searched with nprobe = 16\n",
- "faiss.omp_set_num_threads(16)\n",
- "index = faiss.IndexIVFFlat(faiss.IndexFlatL2(d), d, 128)\n",
- "index.nprobe = 16\n",
- "test_with(index, xb, xq)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Loading random...\n",
- "(100, 512) (1000000, 512)\n",
- "29.09719944000244 88811000\n"
- ]
- }
- ],
- "source": [
- "# random data, zero-padded on the right up to d columns (d comes from the sift1M cell)\n",
- "xb, xq, xt, gt = load_random()\n",
- "xb = np.concatenate(\n",
- "    [xb, np.zeros((xb.shape[0], d - xb.shape[1]), dtype=xb.dtype)], axis=1)\n",
- "xq = np.concatenate(\n",
- "    [xq, np.zeros((xq.shape[0], d - xq.shape[1]), dtype=xq.dtype)], axis=1)\n",
- "\n",
- "print(xq.shape, xb.shape)\n",
- "\n",
- "# Same index configuration as the sift1M run, kept in a separate variable\n",
- "faiss.omp_set_num_threads(16)\n",
- "index2 = faiss.IndexIVFFlat(faiss.IndexFlatL2(d), d, 128)\n",
- "index2.nprobe = 16\n",
- "test_with(index2, xb, xq)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Ask the coarse quantizer for the nprobe nearest centroids of the first 10 queries;\n",
- "# using index2.nprobe keeps this in step with the index setting instead of repeating 16.\n",
- "D, I = index2.quantizer.search(xq[:10], index2.nprobe)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "57.5098991394043 151247400\n"
- ]
- }
- ],
- "source": [
- "# Re-time index2 using 100 randomly chosen rows of xb as queries\n",
- "# (test_with1 searches only the first 10 of them by default).\n",
- "test_with1(index2, xb, xb[np.random.choice(xb.shape[0], size=100)])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([5658, 5176, 4474, 4336, 4191, 4410, 4866, 5219, 5097, 5018, 5732,\n",
- " 6245, 6441, 6860, 7450, 7638])"
- ]
- },
- "execution_count": 30,
- "metadata": {
- "bento_obj_id": "139687776509584"
- },
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Sizes of all inverted lists of index2 (count taken from index2.nlist rather than\n",
- "# a hard-coded 128), then the sizes of the lists in I[0].\n",
- "# I comes from the earlier index2.quantizer.search cell, which must have been run.\n",
- "list_sizes = np.array([index2.invlists.list_size(i) for i in range(index2.nlist)])\n",
- "list_sizes[I[0]]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "7812.5"
- ]
- },
- "execution_count": 29,
- "metadata": {
- "bento_obj_id": "139693108448184"
- },
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Average inverted-list size across all lists\n",
- "np.mean(list_sizes)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "bento_stylesheets": {
- "bento/extensions/flow/main.css": true,
- "bento/extensions/kernel_selector/main.css": true,
- "bento/extensions/kernel_ui/main.css": true,
- "bento/extensions/new_kernel/main.css": true,
- "bento/extensions/system_usage/main.css": true,
- "bento/extensions/theme/main.css": true
- },
- "kernelspec": {
- "display_name": "faiss",
- "language": "python",
- "name": "bento_kernel_faiss"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.3rc1+"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement