Advertisement
Guest User

Untitled

a guest
May 24th, 2016
58
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.56 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": null,
  6. "metadata": {
  7. "collapsed": false
  8. },
  9. "outputs": [],
  10. "source": [
  11. "from gensim.models import word2vec\n",
  12. "from collections import defaultdict\n",
  13. "import json\n",
  14. "\n",
  "# Load the vectors stored in word2vec text format (binary=False).\n",
  "# NOTE(review): absolute, machine-specific path -- this cell only runs where\n",
  "# that file exists; consider making the location configurable.\n",
  "model = word2vec.Word2Vec.load_word2vec_format(\n",
  "    '/www/magenta/home/gorosz/work/concepts/data/wiki2vec/wiki_vec.txt', binary=False)\n",
  "\n",
  "# Map each category (second space-separated field of a line) to the set of\n",
  "# pages (first field) that appear with it in 'concept-data-sorted'.\n",
  "pages_per_category=defaultdict(set)\n",
  "with open('concept-data-sorted') as f:\n",
  "    for line in f:\n",
  "        # Split once instead of stripping/splitting the same line twice.\n",
  "        fields = line.strip().split(' ')\n",
  "        if fields == ['']:\n",
  "            # Blank line: there is no page/category pair to record, and\n",
  "            # indexing fields[1] would raise IndexError.\n",
  "            continue\n",
  "        pages_per_category[fields[1]].add(fields[0])\n",
  "\n",
  "# Parsed JSON content of 'child_beats.json'.\n",
  "with open('child_beats.json') as f:\n",
  "    child_beats = json.load(f)\n"
  25. ]
  26. },
  27. {
  28. "cell_type": "code",
  29. "execution_count": null,
  30. "metadata": {
  31. "collapsed": false
  32. },
  33. "outputs": [],
  34. "source": [
  35. "import numpy\n",
  "def vector_of_page(page, dim=500):\n",
  "    \"\"\"Return the model vector stored under the key 'DBPEDIA_ID/' + page.\n",
  "\n",
  "    When the model has no entry for that key, a float32 zero vector of\n",
  "    length ``dim`` is returned instead. ``dim`` defaults to 500, the length\n",
  "    that used to be hard-coded here -- presumably the model's vector size;\n",
  "    TODO confirm against the loaded model.\n",
  "    \"\"\"\n",
  "    try:\n",
  "        return model[\"DBPEDIA_ID/\"+page]\n",
  "    except KeyError:\n",
  "        # Only a missing key is an expected miss. Any other failure (for\n",
  "        # example a non-string page) should surface instead of silently\n",
  "        # turning into zeros, which a bare except would have done.\n",
  "        return numpy.zeros(dim,dtype=\"float32\")\n",
  41. "\n",
  "def vector_of_word(word, dim=500):\n",
  "    \"\"\"Return the model vector for ``word``.\n",
  "\n",
  "    When the model has no entry for ``word``, a float32 zero vector of\n",
  "    length ``dim`` is returned instead. ``dim`` defaults to 500, the length\n",
  "    that used to be hard-coded here -- presumably the model's vector size;\n",
  "    TODO confirm against the loaded model.\n",
  "    \"\"\"\n",
  "    try:\n",
  "        return model[word]\n",
  "    except KeyError:\n",
  "        # Only an unknown word is an expected miss; other errors should\n",
  "        # propagate rather than be masked as a zero vector.\n",
  "        return numpy.zeros(dim,dtype=\"float32\")\n",
  47. "\n",
  "def vector_of_text(text):\n",
  "    \"\"\"Sum the vectors of the lower-cased, whitespace-separated words of ``text``.\n",
  "\n",
  "    The sum starts from a 500-element float32 zero vector, so a text with\n",
  "    no words yields all zeros. Each word is looked up with vector_of_word.\n",
  "    \"\"\"\n",
  "    total = numpy.zeros(500, dtype=\"float32\")\n",
  "    for token in text.lower().split():\n",
  "        total = numpy.add(total, vector_of_word(token))\n",
  "    return total\n",
  54. "\n",
  "# For every category, add up the vectors of all pages recorded under it.\n",
  "vectors_per_category = {}\n",
  "for cat, pages in pages_per_category.items():\n",
  "    category_sum = numpy.zeros(500, dtype=\"float32\")\n",
  "    for page in pages:\n",
  "        category_sum = numpy.add(category_sum, vector_of_page(page))\n",
  "    vectors_per_category[cat] = category_sum\n",
  61. "\n"
  62. ]
  63. },
  64. {
  65. "cell_type": "code",
  66. "execution_count": null,
  67. "metadata": {
  68. "collapsed": true
  69. },
  70. "outputs": [],
  71. "source": [
  "# Write the category vectors as text: a `<count> 500` header line, then one\n",
  "# `<category> <value> <value> ...` line per category.\n",
  "with open('vectors_per_category.txt','w+') as out:\n",
  "    header = str(len(vectors_per_category)) + ' 500' + '\\n'\n",
  "    out.write(header)\n",
  "    for cat, vec in vectors_per_category.items():\n",
  "        values = ' '.join(map(str, vec))\n",
  "        out.write(cat + ' ' + values + '\\n')"
  77. ]
  78. },
  79. {
  80. "cell_type": "code",
  81. "execution_count": 1,
  82. "metadata": {
  83. "collapsed": false,
  84. "scrolled": true
  85. },
  86. "outputs": [],
  87. "source": [
  "# Load 'vectors_per_category.txt' (word2vec text format) as its own model so\n",
  "# that similarity queries run over its entries.\n",
  "cat_model = word2vec.Word2Vec.load_word2vec_format('vectors_per_category.txt', binary=False)\n",
  "\n",
  "def most_similar_cat(vector, topn=5, prefix_len=9):\n",
  "    \"\"\"Return the names of the ``topn`` entries of cat_model most similar to ``vector``.\n",
  "\n",
  "    The first ``prefix_len`` characters of every returned name are dropped.\n",
  "    The defaults (5 results, 9 characters) reproduce the values that were\n",
  "    hard-coded here before. NOTE(review): 9 is presumably the length of a\n",
  "    fixed prefix on the category keys -- confirm against the keys in\n",
  "    vectors_per_category.txt.\n",
  "    \"\"\"\n",
  "    # Ask for exactly topn neighbours instead of fetching the default\n",
  "    # number and slicing the list afterwards.\n",
  "    neighbours = cat_model.most_similar([vector], topn=topn)\n",
  "    return [entry[0][prefix_len:] for entry in neighbours]\n"
  92. ]
  93. },
  94. {
  95. "cell_type": "code",
  96. "execution_count": null,
  97. "metadata": {
  98. "collapsed": false
  99. },
  100. "outputs": [],
  101. "source": [
  "# One row per child beat: parent name, own name, its wikipediaCategory value,\n",
  "# then the category names returned by most_similar_cat for the beat's name.\n",
  "result = []\n",
  "for chb in child_beats:\n",
  "    row = [chb['parentName'], chb['name'], chb['wikipediaCategory']]\n",
  "    row.extend(most_similar_cat(vector_of_text(chb['name'])))\n",
  "    result.append(row)\n"
  105. ]
  106. },
  107. {
  108. "cell_type": "code",
  109. "execution_count": null,
  110. "metadata": {
  111. "collapsed": true
  112. },
  113. "outputs": [],
  114. "source": [
  "# Write each result row to 'x.txt' as one tab-separated line.\n",
  "with open('x.txt', 'w+') as out:\n",
  "    out.writelines('\\t'.join(row) + '\\n' for row in result)"
  118. ]
  119. }
  120. ],
  121. "metadata": {
  122. "kernelspec": {
  123. "display_name": "Python 3",
  124. "language": "python",
  125. "name": "python3"
  126. },
  127. "language_info": {
  128. "codemirror_mode": {
  129. "name": "ipython",
  130. "version": 3
  131. },
  132. "file_extension": ".py",
  133. "mimetype": "text/x-python",
  134. "name": "python",
  135. "nbconvert_exporter": "python",
  136. "pygments_lexer": "ipython3",
  137. "version": "3.5.1"
  138. }
  139. },
  140. "nbformat": 4,
  141. "nbformat_minor": 0
  142. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement