Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.feature_extraction.text import TfidfVectorizer\n",
- "import numpy as np\n",
- "np.set_printoptions(precision=2)\n",
- "tfidf = TfidfVectorizer(token_pattern=u'(?u)\\\\b\\\\w+\\\\b')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "result = tfidf.fit_transform([\n",
- " '亀 亀 世界',\n",
- " 'うさぎ 小山 亀',\n",
- " '桃太郎 桃太郎 腰 きびだんご']) # うさぎとかめの歌と桃太郎の歌(名詞のみ)を学習"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'うさぎ': 0, 'きびだんご': 1, '世界': 2, '亀': 3, '小山': 4, '桃太郎': 5, '腰': 6}"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "tfidf.vocabulary_ # この学習によって取り込んだ語彙"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[0. , 0. , 0.55, 0.84, 0. , 0. , 0. ],\n",
- " [0.62, 0. , 0. , 0.47, 0.62, 0. , 0. ],\n",
- " [0. , 0.41, 0. , 0. , 0. , 0.82, 0.41]])"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "result.toarray() # TFIDF値"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[0. , 0. , 0.55, 0.84, 0. , 0. , 0. ],\n",
- " [0.62, 0. , 0. , 0.47, 0.62, 0. , 0. ]])"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "usagi_arrays = result.toarray()[[True, True, False]] # うさぎとかめの歌だけ抽出\n",
- "usagi_arrays"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([0.31, 0. , 0.27, 0.65, 0.31, 0. , 0. ])"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "usagi_weight = np.mean(usagi_arrays, axis=0) # 重みを計算\n",
- "usagi_weight"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([0. , 0.41, 0. , 0. , 0. , 0.82, 0.41])"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "momotaro_arrays = result.toarray()[[False, False, True]] # ももたろうの歌だけ抽出\n",
- "momotaro_arrays\n",
- "momotaro_weight = np.mean(momotaro_arrays, axis=0) # 重みを計算\n",
- "momotaro_weight"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[0., 0., 0., 1., 0., 0., 0.]])"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "new_song = tfidf.transform(['浦島 亀 竜宮城 絵']).toarray() # 新たな歌が与えられた\n",
- "new_song"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([0.65])"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "score = new_song.dot(usagi_weight)\n",
- "score"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "score = new_song.dot(momotaro_weight)\n",
- "score"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Add Comment
Please, Sign In to add comment