Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "from gensim.models import word2vec\n",
- "from collections import defaultdict\n",
- "import json\n",
- "\n",
- "# Pre-trained wiki2vec embeddings stored in word2vec text format.\n",
- "# NOTE(review): absolute, machine-specific path -- change it here when running elsewhere.\n",
- "wiki2vec_path = '/www/magenta/home/gorosz/work/concepts/data/wiki2vec/wiki_vec.txt'\n",
- "\n",
- "# Load the embeddings once; the lookup helpers defined further down read from `model`.\n",
- "model = word2vec.Word2Vec.load_word2vec_format(wiki2vec_path, binary=False)\n",
- "\n",
- "# Map each category to the set of pages listed under it.\n",
- "# Every non-blank line is read as '<page> <category>' separated by single spaces;\n",
- "# any further fields on the line are ignored.\n",
- "pages_per_category = defaultdict(set)\n",
- "with open('concept-data-sorted') as f:\n",
- "    for line in f:\n",
- "        stripped = line.strip()\n",
- "        if not stripped:\n",
- "            # A blank line (e.g. a trailing newline) used to raise IndexError.\n",
- "            continue\n",
- "        fields = stripped.split(' ')\n",
- "        # A line with a single field still raises IndexError, so bad data stays visible.\n",
- "        pages_per_category[fields[1]].add(fields[0])\n",
- "\n",
- "# Parse the child-beat records; a later cell iterates over `child_beats`.\n",
- "with open('child_beats.json') as beats_file:\n",
- "    child_beats = json.loads(beats_file.read())\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "import numpy\n",
- "def vector_of_page(page):\n",
- "    \"\"\"Return the embedding stored in `model` under 'DBPEDIA_ID/' + page.\n",
- "\n",
- "    A page that is missing from the model yields a 500-element float32 zero\n",
- "    vector, so it contributes nothing when vectors are summed.\n",
- "    \"\"\"\n",
- "    try:\n",
- "        return model['DBPEDIA_ID/' + page]\n",
- "    except KeyError:\n",
- "        # Only an unknown key is expected here. A bare `except:` would also hide\n",
- "        # KeyboardInterrupt and real bugs such as `model` not being loaded.\n",
- "        return numpy.zeros(500, dtype='float32')\n",
- "\n",
- "def vector_of_word(word):\n",
- "    \"\"\"Return the embedding stored in `model` for `word`.\n",
- "\n",
- "    A word that is missing from the model yields a 500-element float32 zero\n",
- "    vector, so it contributes nothing when vectors are summed.\n",
- "    \"\"\"\n",
- "    try:\n",
- "        return model[word]\n",
- "    except KeyError:\n",
- "        # Only an unknown key is expected here. A bare `except:` would also hide\n",
- "        # KeyboardInterrupt and real bugs such as `model` not being loaded.\n",
- "        return numpy.zeros(500, dtype='float32')\n",
- "\n",
- "def vector_of_text(text):\n",
- "    \"\"\"Return the sum of the word vectors of `text`.\n",
- "\n",
- "    The text is lower-cased and split on whitespace; each token is looked up\n",
- "    with `vector_of_word`. Text without tokens gives a 500-element float32\n",
- "    zero vector.\n",
- "    \"\"\"\n",
- "    total = numpy.zeros(500, dtype='float32')\n",
- "    for token in text.lower().split():\n",
- "        total = total + vector_of_word(token)\n",
- "    return total\n",
- "\n",
- "# Represent every category by the sum of the vectors of its pages.\n",
- "vectors_per_category = dict()\n",
- "for cat, pages in pages_per_category.items():\n",
- "    v = numpy.zeros(500, dtype='float32')\n",
- "    # A set of strings iterates in a hash-dependent order that can change between\n",
- "    # kernel restarts, and float32 addition is order-sensitive. Sorting the pages\n",
- "    # keeps the sums reproducible from run to run.\n",
- "    for page in sorted(pages):\n",
- "        v = numpy.add(v, vector_of_page(page))\n",
- "    vectors_per_category[cat] = v\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Dump the category vectors as text: a '<count> 500' header line, then one\n",
- "# '<category> <v1> <v2> ...' line per category. A later cell reads this file\n",
- "# back with load_word2vec_format(..., binary=False).\n",
- "with open('vectors_per_category.txt','w+') as out:\n",
- "    header = str(len(vectors_per_category)) + ' 500'\n",
- "    out.write(header + '\\n')\n",
- "    for cat, vec in vectors_per_category.items():\n",
- "        components = ' '.join(map(str, vec))\n",
- "        out.write(cat + ' ' + components + '\\n')\n",
- " "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "collapsed": false,
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "# Load the per-category vectors from 'vectors_per_category.txt' as a model of\n",
- "# their own, so that `most_similar_cat` can query it for the nearest categories.\n",
- "cat_model = word2vec.Word2Vec.load_word2vec_format(\n",
- "    'vectors_per_category.txt', binary=False)\n",
- "\n",
- "def most_similar_cat(vector):\n",
- "    \"\"\"Return the names of the first five matches `cat_model` finds for `vector`.\n",
- "\n",
- "    The first nine characters of every name are dropped.\n",
- "    NOTE(review): presumably that removes a 'Category:' prefix -- confirm against the data.\n",
- "    \"\"\"\n",
- "    neighbours = cat_model.most_similar([vector])\n",
- "    top_five = neighbours[:5]\n",
- "    return [name[9:] for name, _score in top_five]\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# One row per child beat: parent name, beat name, its recorded Wikipedia\n",
- "# category, then the categories suggested for the beat's name.\n",
- "result = []\n",
- "for beat in child_beats:\n",
- "    known = [beat['parentName'], beat['name'], beat['wikipediaCategory']]\n",
- "    suggested = most_similar_cat(vector_of_text(beat['name']))\n",
- "    result.append(known + suggested)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Write `result` to 'x.txt', one tab-separated line per row.\n",
- "with open ('x.txt','w+') as out:\n",
- "    out.writelines('\\t'.join(row) + '\\n' for row in result)\n"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.5.1"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement