Guest User

Untitled

a guest
Feb 25th, 2018
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.52 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "metadata": {},
  7. "outputs": [],
  8. "source": [
  9. "from sklearn.feature_extraction.text import TfidfVectorizer\n",
  10. "import numpy as np\n",
  11. "np.set_printoptions(precision=2)\n",
  12. "tfidf = TfidfVectorizer(token_pattern=u'(?u)\\\\b\\\\w+\\\\b')"
  13. ]
  14. },
  15. {
  16. "cell_type": "code",
  17. "execution_count": 2,
  18. "metadata": {},
  19. "outputs": [],
  20. "source": [
  21. "result = tfidf.fit_transform([\n",
  22. " '亀 亀 世界',\n",
  23. " 'うさぎ 小山 亀',\n",
  24. " '桃太郎 桃太郎 腰 きびだんご']) # うさぎとかめの歌と桃太郎の歌(名詞のみ)を学習"
  25. ]
  26. },
  27. {
  28. "cell_type": "code",
  29. "execution_count": 3,
  30. "metadata": {},
  31. "outputs": [
  32. {
  33. "data": {
  34. "text/plain": [
  35. "{'うさぎ': 0, 'きびだんご': 1, '世界': 2, '亀': 3, '小山': 4, '桃太郎': 5, '腰': 6}"
  36. ]
  37. },
  38. "execution_count": 3,
  39. "metadata": {},
  40. "output_type": "execute_result"
  41. }
  42. ],
  43. "source": [
  44. "tfidf.vocabulary_ # この学習によって取り込んだ語彙"
  45. ]
  46. },
  47. {
  48. "cell_type": "code",
  49. "execution_count": 4,
  50. "metadata": {},
  51. "outputs": [
  52. {
  53. "data": {
  54. "text/plain": [
  55. "array([[0. , 0. , 0.55, 0.84, 0. , 0. , 0. ],\n",
  56. " [0.62, 0. , 0. , 0.47, 0.62, 0. , 0. ],\n",
  57. " [0. , 0.41, 0. , 0. , 0. , 0.82, 0.41]])"
  58. ]
  59. },
  60. "execution_count": 4,
  61. "metadata": {},
  62. "output_type": "execute_result"
  63. }
  64. ],
  65. "source": [
  66. "result.toarray() # TFIDF値"
  67. ]
  68. },
  69. {
  70. "cell_type": "code",
  71. "execution_count": 5,
  72. "metadata": {},
  73. "outputs": [
  74. {
  75. "data": {
  76. "text/plain": [
  77. "array([[0. , 0. , 0.55, 0.84, 0. , 0. , 0. ],\n",
  78. " [0.62, 0. , 0. , 0.47, 0.62, 0. , 0. ]])"
  79. ]
  80. },
  81. "execution_count": 5,
  82. "metadata": {},
  83. "output_type": "execute_result"
  84. }
  85. ],
  86. "source": [
  87. "usagi_arrays = result.toarray()[[True, True, False]] # うさぎとかめの歌だけ抽出\n",
  88. "usagi_arrays"
  89. ]
  90. },
  91. {
  92. "cell_type": "code",
  93. "execution_count": 6,
  94. "metadata": {},
  95. "outputs": [
  96. {
  97. "data": {
  98. "text/plain": [
  99. "array([0.31, 0. , 0.27, 0.65, 0.31, 0. , 0. ])"
  100. ]
  101. },
  102. "execution_count": 6,
  103. "metadata": {},
  104. "output_type": "execute_result"
  105. }
  106. ],
  107. "source": [
  108. "usagi_weight = np.mean(usagi_arrays, axis=0) # 重みを計算\n",
  109. "usagi_weight"
  110. ]
  111. },
  112. {
  113. "cell_type": "code",
  114. "execution_count": 7,
  115. "metadata": {},
  116. "outputs": [
  117. {
  118. "data": {
  119. "text/plain": [
  120. "array([0. , 0.41, 0. , 0. , 0. , 0.82, 0.41])"
  121. ]
  122. },
  123. "execution_count": 7,
  124. "metadata": {},
  125. "output_type": "execute_result"
  126. }
  127. ],
  128. "source": [
  129. "momotaro_arrays = result.toarray()[[False, False, True]] # ももたろうの歌だけ抽出\n",
  130. "momotaro_arrays\n",
  131. "momotaro_weight = np.mean(momotaro_arrays, axis=0) # 重みを計算\n",
  132. "momotaro_weight"
  133. ]
  134. },
  135. {
  136. "cell_type": "code",
  137. "execution_count": 8,
  138. "metadata": {},
  139. "outputs": [
  140. {
  141. "data": {
  142. "text/plain": [
  143. "array([[0., 0., 0., 1., 0., 0., 0.]])"
  144. ]
  145. },
  146. "execution_count": 8,
  147. "metadata": {},
  148. "output_type": "execute_result"
  149. }
  150. ],
  151. "source": [
  152. "new_song = tfidf.transform(['浦島 亀 竜宮城 絵']).toarray() # 新たな歌が与えられた\n",
  153. "new_song"
  154. ]
  155. },
  156. {
  157. "cell_type": "code",
  158. "execution_count": 10,
  159. "metadata": {},
  160. "outputs": [
  161. {
  162. "data": {
  163. "text/plain": [
  164. "array([0.65])"
  165. ]
  166. },
  167. "execution_count": 10,
  168. "metadata": {},
  169. "output_type": "execute_result"
  170. }
  171. ],
  172. "source": [
  173. "score = new_song.dot(usagi_weight)\n",
  174. "score"
  175. ]
  176. },
  177. {
  178. "cell_type": "code",
  179. "execution_count": null,
  180. "metadata": {},
  181. "outputs": [],
  182. "source": [
  183. "score = new_song.dot(momotaro_weight)\n",
  184. "score"
  185. ]
  186. },
  187. {
  188. "cell_type": "code",
  189. "execution_count": null,
  190. "metadata": {},
  191. "outputs": [],
  192. "source": []
  193. }
  194. ],
  195. "metadata": {
  196. "kernelspec": {
  197. "display_name": "Python 3",
  198. "language": "python",
  199. "name": "python3"
  200. },
  201. "language_info": {
  202. "codemirror_mode": {
  203. "name": "ipython",
  204. "version": 3
  205. },
  206. "file_extension": ".py",
  207. "mimetype": "text/x-python",
  208. "name": "python",
  209. "nbconvert_exporter": "python",
  210. "pygments_lexer": "ipython3",
  211. "version": "3.6.0"
  212. }
  213. },
  214. "nbformat": 4,
  215. "nbformat_minor": 2
  216. }
Add Comment
Please, Sign In to add comment