Advertisement
Guest User

Untitled

a guest
Feb 6th, 2016
67
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.04 KB | None | 0 0
  1.  
  2. {
  3. "cells": [
  4. {
  5. "cell_type": "markdown",
  6. "metadata": {},
  7. "source": [
  8. "## Experiment in reverse video search\n",
  9. "\n",
  10. "Below I attempt to use minhash and opencv to build a usable minhash for finding duplicate or similar videos. The test modifies a video in several ways and checks how well each modified version still matches the original, and also checks how well the original matches against 6 different videos."
  11. ]
  12. },
  13. {
  14. "cell_type": "code",
  15. "execution_count": 112,
  16. "metadata": {
  17. "collapsed": false
  18. },
  19. "outputs": [],
  20. "source": [
  21. "import numpy as np\n",
  22. "import cv2\n",
  23. "import sys\n",
  24. "import time\n",
  25. "from datasketch import MinHash\n",
  26. "from hashlib import sha1\n",
  27. "\n",
  28. "\n",
  29. "def minhash_of(filename):\n",
  30. " \"\"\" Compute the minhash of a video file. This function computes a color\n",
  31. " histogram for each frame of a video, then performs a simple bucket\n",
  32. " operations (making a slightly fuzzy frame hash). This value is fed into\n",
  33. " minhash and then we compute the jaccard similarity of the videos. \"\"\"\n",
  34. " cap = cv2.VideoCapture(filename)\n",
  35. " m1 = MinHash()\n",
  36. " total = int(cap.get(cv2.cv.CV_CAP_PROP_FRAME_COUNT)) # Total frame count\n",
  37. " \n",
  38. " while 1:\n",
  39. " ret, frame = cap.read()\n",
  40. " if not ret:\n",
  41. " break\n",
  42. " hist = cv2.calcHist([frame],[0],None,[512],[0,512])\n",
  43. " # Normalize. This is because the histogram's scale depends on the resolution of the image.\n",
  44. " hist = np.true_divide(hist, hist.max())\n",
  45. " # Bucket the number slightly to account for some variance.\n",
  46. " # A larger divisor here will increase how many matches are made.\n",
  47. " hist = np.sum(hist)\n",
  48. " # add our frame to the minhash\n",
  49. " m1.digest(sha1(str(round(hist))))\n",
  50. " m1.digest(sha1(str(round(hist * 3))))\n",
  51. " current = cap.get(cv2.cv.CV_CAP_PROP_POS_FRAMES)\n",
  52. " sys.stdout.write(\"{} / {}\\r\".format(current, total)) # Print a progress bar\n",
  53. "\n",
  54. " cap.release()\n",
  55. " cv2.destroyAllWindows()\n",
  56. " \n",
  57. " return m1, total\n",
  58. "\n",
  59. "def compare(filename, mh, desc):\n",
  60. " start = time.time()\n",
  61. " other, total = minhash_of(filename)\n",
  62. " print(\"{:.2f}\\t{}\\t({:.2f} fps)\".format(mh.jaccard(other), desc, total / (time.time() - start)))"
  63. ]
  64. },
  65. {
  66. "cell_type": "code",
  67. "execution_count": 113,
  68. "metadata": {
  69. "collapsed": false
  70. },
  71. "outputs": [
  72. {
  73. "name": "stdout",
  74. "output_type": "stream",
  75. "text": [
  76. "0.89\tcompressed 144p\t(4776.13 fps)\n",
  77. "0.85\tvisible gamma change\t(993.65 fps)\n",
  78. "0.89\twatermark added\t(1001.44 fps)\n",
  79. "0.85\tsubclip 15% length\t(990.81 fps)\n",
  80. "0.27\tsubclip 0.5% length\t(698.35 fps)\n",
  81. "0.90\tfps 29 vs 23\t(994.44 fps)\n",
  82. "0.91\tfps 29 vs 10\t(906.13 fps)\n"
  83. ]
  84. }
  85. ],
  86. "source": [
  87. "vid1, _ = minhash_of('vid1_full.mp4') # 480p 29.97fps\n",
  88. "compare('vid1_compressed.mp4', vid1, \"compressed 144p\")\n",
  89. "compare('vid1_gamma_neg20.mp4', vid1, \"visible gamma change\")\n",
  90. "compare('vid1_watermark.mp4', vid1, \"watermark added\")\n",
  91. "compare('vid1_clip_30.mp4', vid1, \"subclip 15% length\")\n",
  92. "compare('vid1_clip_2.mp4', vid1, \"subclip 0.5% length\")\n",
  93. "compare('vid1_change_fps.mp4', vid1, \"fps 29 vs 23\") # from 29 to 23\n",
  94. "compare('vid1_change_fps_low.mp4', vid1, \"fps 29 vs 10\")"
  95. ]
  96. },
  97. {
  98. "cell_type": "code",
  99. "execution_count": 114,
  100. "metadata": {
  101. "collapsed": false
  102. },
  103. "outputs": [
  104. {
  105. "name": "stdout",
  106. "output_type": "stream",
  107. "text": [
  108. "0.66\tdifferent video 2 compressed 144p\t(2611.61 fps)\n",
  109. "0.70\tdifferent video 2\t(1484.69 fps)\n",
  110. "0.05\tdifferent video 3\t(1464.13 fps)\n",
  111. "0.91\tdifferent video 4\t(1314.44 fps)\n",
  112. "0.89\tdifferent video 5\t(1602.97 fps)\n",
  113. "0.80\tdifferent video 6\t(1203.70 fps)\n"
  114. ]
  115. }
  116. ],
  117. "source": [
  118. "compare('vid2_compressed.mp4', vid1, \"different video 2 compressed 144p\")\n",
  119. "for i in range(2, 7):\n",
  120. " compare('vid{}.mp4'.format(i), vid1, \"different video {}\".format(i))"
  121. ]
  122. },
  123. {
  124. "cell_type": "markdown",
  125. "metadata": {},
  126. "source": [
  127. "As can be seen above, this simple technique did a reasonable job of finding duplicates after a range of changes. It handled subclips poorly, and had one or two false positives. By isolating more features and tuning the bucketing technique, better results could be obtained."
  128. ]
  129. }
  130. ],
  131. "metadata": {
  132. "kernelspec": {
  133. "display_name": "Python 2",
  134. "language": "python",
  135. "name": "python2"
  136. },
  137. "language_info": {
  138. "codemirror_mode": {
  139. "name": "ipython",
  140. "version": 2
  141. },
  142. "file_extension": ".py",
  143. "mimetype": "text/x-python",
  144. "name": "python",
  145. "nbconvert_exporter": "python",
  146. "pygments_lexer": "ipython2",
  147. "version": "2.7.10"
  148. }
  149. },
  150. "nbformat": 4,
  151. "nbformat_minor": 0
  152. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement