{
"cells": [
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from nltk.tokenize import RegexpTokenizer\n",
"from stop_words import get_stop_words\n",
"from nltk.stem.porter import PorterStemmer\n",
"from gensim import corpora, models\n",
"import gensim\n",
"import re\n",
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Reading data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First, we'll create a data structure to store all the information about the different ToS documents with which we'll train our LDA model to extract the different topics.\n",
"\n",
"The information will be cleaned (stop-words removed, words stemmed, etc.) and stored as a list of paragraphs."
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#utility function to read a file (ToS) into a list of paragraphs.\n",
"def read_file_to_paragraphs(file_path):\n",
"    file = open(file_path, 'r')\n",
"    doc = file.read()\n",
"    file.close()\n",
"    pars = re.split('\\n\\n+', doc) #some documents have an end of line within what could be considered the same\n",
"    #paragraph; I believe 2 or more \\n's is a better split for this.\n",
"    print('reading %s which has %d paragraphs' % (file_path, len(pars)))\n",
"    return(pars)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Reading all the files..."
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"reading ./data/twitter_tos.txt which has 80 paragraphs\n"
]
}
],
"source": [
"pars = read_file_to_paragraphs('./data/twitter_tos.txt')"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"reading ./data/facebook_tos.txt which has 35 paragraphs\n",
"reading ./data/github_tos.txt which has 60 paragraphs\n",
"reading ./data/google_privacy_tos.txt which has 71 paragraphs\n",
"reading ./data/google_tos.txt which has 52 paragraphs\n",
"reading ./data/snaptchat_tos.txt which has 83 paragraphs\n",
"reading ./data/squarespace_tos.txt which has 153 paragraphs\n",
"reading ./data/youtube_tos.txt which has 13 paragraphs\n"
]
}
],
"source": [
"pars.extend(read_file_to_paragraphs('./data/facebook_tos.txt'))\n",
"pars.extend(read_file_to_paragraphs('./data/github_tos.txt'))\n",
"pars.extend(read_file_to_paragraphs('./data/google_privacy_tos.txt'))\n",
"pars.extend(read_file_to_paragraphs('./data/google_tos.txt'))\n",
"pars.extend(read_file_to_paragraphs('./data/snaptchat_tos.txt'))\n",
"pars.extend(read_file_to_paragraphs('./data/squarespace_tos.txt'))\n",
"pars.extend(read_file_to_paragraphs('./data/youtube_tos.txt'))"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"547"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(pars)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We'll train our model with 547 paragraphs."
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"tokenizer = RegexpTokenizer(r'\\w+')\n",
"en_stop = get_stop_words('en')\n",
"p_stemmer = PorterStemmer()\n",
"\n",
"#utility function that will tokenize, remove stop-words and stem the paragraphs.\n",
"def tokenize_and_stem(text, tokenizer, stemmer, stop_words):\n",
"    return([stemmer.stem(word) for word in tokenizer.tokenize(text.lower()) if word not in stop_words])"
]
},
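{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check of the cleaning pipeline, here is a minimal, unexecuted sketch; the sample sentence below is invented for the example:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# tokenize, remove stop-words, and stem an invented sample sentence\n",
"tokenize_and_stem('These Terms govern your use of the Services', tokenizer, p_stemmer, en_stop)"
]
},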
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"norm_texts = [tokenize_and_stem(par, tokenizer, p_stemmer, en_stop) for par in pars]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"An example of the normalized text:"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[['skip',\n",
"  'main',\n",
"  'content',\n",
"  'twitter',\n",
"  'languag',\n",
"  'english',\n",
"  'sign',\n",
"  'download',\n",
"  'thetwitteruseragr',\n",
"  'pdf'],\n",
" ['live',\n",
"  'unit',\n",
"  'state',\n",
"  'twitter',\n",
"  'user',\n",
"  'agreement',\n",
"  'compris',\n",
"  'term',\n",
"  'servic',\n",
"  'privaci',\n",
"  'polici',\n",
"  'twitter',\n",
"  'rule',\n",
"  'incorpor',\n",
"  'polici']]"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"norm_texts[:2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create Dictionary object"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To train an LDA model, we first need to map words to numeric ids with a _Dictionary_ model."
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"gensim.corpora.dictionary.Dictionary"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# turn our tokenized documents into an id <-> term dictionary\n",
"dictionary = corpora.Dictionary(norm_texts)\n",
"type(dictionary)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#saving the dictionary object to be used later (in the web app).\n",
"dictionary.save('lda_dictionary')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create BOW object"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We'll also need a bag-of-words representation of our text to train the LDA model."
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"547"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# convert tokenized documents into a document-term matrix\n",
"bows = [dictionary.doc2bow(text) for text in norm_texts]\n",
"len(bows)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"An example of what this BOW structure looks like (for the first paragraph):"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(0, 1),\n",
" (1, 1),\n",
" (2, 1),\n",
" (3, 1),\n",
" (4, 1),\n",
" (5, 1),\n",
" (6, 1),\n",
" (7, 1),\n",
" (8, 1),\n",
" (9, 1)]"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bows[0]"
]
},
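{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the pairs above readable, a minimal unexecuted sketch that maps each token id in the first BOW back to its stemmed token, assuming gensim's standard `Dictionary` item lookup:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# map each (token_id, count) pair of the first paragraph back to its stemmed token\n",
"[(dictionary[token_id], count) for token_id, count in bows[0]]"
]
},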
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Train the LDA model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we're ready to train our LDA model. Reading through the documents and trying different parameters, we concluded that a good number of topics for our model would be 10 (we're classifying our text into only 5 categories, but with this we're saying that the complete documents cover an average of 10 different topics). When we tried 15 or 20, the topics seemed to repeat."
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
"  '0.059*\"servic\" + 0.027*\"content\" + 0.024*\"use\" + 0.020*\"term\" + 0.018*\"will\" + 0.015*\"parti\" + 0.014*\"third\" + 0.012*\"damag\" + 0.012*\"includ\" + 0.011*\"access\"'),\n",
" (1,\n",
"  '0.037*\"account\" + 0.036*\"twitter\" + 0.021*\"servic\" + 0.018*\"use\" + 0.017*\"term\" + 0.015*\"com\" + 0.012*\"may\" + 0.010*\"inform\" + 0.010*\"provid\" + 0.010*\"agreement\"'),\n",
" (2,\n",
"  '0.051*\"privaci\" + 0.049*\"polici\" + 0.034*\"inform\" + 0.033*\"use\" + 0.029*\"servic\" + 0.025*\"term\" + 0.023*\"payment\" + 0.021*\"user\" + 0.019*\"end\" + 0.019*\"site\"'),\n",
" (3,\n",
"  '0.072*\"inform\" + 0.046*\"use\" + 0.039*\"googl\" + 0.026*\"servic\" + 0.023*\"share\" + 0.017*\"may\" + 0.017*\"content\" + 0.017*\"facebook\" + 0.016*\"advertis\" + 0.016*\"collect\"'),\n",
" (4,\n",
"  '0.037*\"servic\" + 0.019*\"chang\" + 0.017*\"domain\" + 0.017*\"renew\" + 0.017*\"may\" + 0.016*\"13\" + 0.016*\"term\" + 0.016*\"copyright\" + 0.015*\"fee\" + 0.014*\"notic\"'),\n",
" (5,\n",
"  '0.023*\"right\" + 0.021*\"account\" + 0.021*\"will\" + 0.017*\"s\" + 0.015*\"may\" + 0.015*\"person\" + 0.014*\"state\" + 0.014*\"user\" + 0.013*\"inform\" + 0.013*\"includ\"'),\n",
" (6,\n",
"  '0.045*\"term\" + 0.042*\"arbitr\" + 0.039*\"will\" + 0.026*\"servic\" + 0.026*\"snap\" + 0.025*\"inc\" + 0.019*\"agreement\" + 0.017*\"provis\" + 0.016*\"right\" + 0.014*\"use\"'),\n",
" (7,\n",
"  '0.061*\"servic\" + 0.037*\"content\" + 0.028*\"use\" + 0.023*\"may\" + 0.016*\"right\" + 0.014*\"youtub\" + 0.014*\"provid\" + 0.013*\"copyright\" + 0.011*\"user\" + 0.010*\"infring\"'),\n",
" (8,\n",
"  '0.030*\"servic\" + 0.021*\"account\" + 0.020*\"may\" + 0.014*\"will\" + 0.013*\"googl\" + 0.013*\"us\" + 0.013*\"parti\" + 0.013*\"facebook\" + 0.011*\"law\" + 0.011*\"respons\"'),\n",
" (9,\n",
"  '0.029*\"ecommerc\" + 0.019*\"parti\" + 0.019*\"agreement\" + 0.018*\"payment\" + 0.018*\"may\" + 0.017*\"squarespac\" + 0.017*\"third\" + 0.017*\"privaci\" + 0.016*\"processor\" + 0.015*\"us\"')]"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ldamodel = gensim.models.ldamodel.LdaModel(bows, num_topics=10, id2word=dictionary, passes=20)\n",
"ldamodel.print_topics()"
]
},
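{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a complement to eyeballing the topics when choosing the number of topics, topic coherence gives a rough numeric signal (higher scores usually mean more interpretable topics). This is an unexecuted sketch and assumes a gensim version that ships `CoherenceModel`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from gensim.models.coherencemodel import CoherenceModel\n",
"\n",
"# score the trained model's topics against the tokenized paragraphs\n",
"cm = CoherenceModel(model=ldamodel, texts=norm_texts, dictionary=dictionary, coherence='c_v')\n",
"cm.get_coherence()"
]
},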
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We'll save the model for later use (in our web app)."
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"ldamodel.save('lda_model')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Running the model in a new doc"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The above topics appear to make sense, but they are not labeled nicely for our use. For example `(1,\n",
" '0.057*\"servic\" + 0.036*\"inform\" + 0.036*\"use\" + 0.023*\"privaci\" + 0.022*\"polici\" + 0.015*\"provid\" + 0.013*\"access\" + 0.013*\"term\" + 0.012*\"user\" + 0.010*\"collect\"')` seems to be talking about privacy and the use of information. We'll need to take these topics and mark them with a more readable label.\n",
"\n",
"In the following dictionary we define the topics of interest and which words belong to those topics."
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#this topic list can be expanded with more topics and more words related to those topics.\n",
"topic_dic = {'privacy': ['privacy'], 'copyright': ['copyright'], 'content sharing/use': ['share'], 'cancelation/termination': ['cancelation', 'termination'], 'modification/pricing': ['modification', 'pricing'], 'special': ['law', 'jurisdiction', 'governing']}"
]
},
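{
"cell_type": "markdown",
"metadata": {},
"source": [
"The topic strings printed by the model contain stemmed tokens (e.g. `\"privaci\"`, `\"copyright\"`), so the function below stems each label word before checking membership. A small, unexecuted sketch of that matching step:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# label words must be stemmed to match the stemmed tokens in the topic strings\n",
"print(p_stemmer.stem('privacy')) # expected: 'privaci'\n",
"print(p_stemmer.stem('termination')) # expected: 'termin'\n",
"# this is why 'privacy' itself would never match a topic string like '0.051*\"privaci\" + ...'"
]
},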
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we need to use our trained model to analyze a new document. For that, we'll apply the model to each paragraph to find its most relevant topic and label the paragraph. We'll create a list whose elements are lists with the original paragraph (not cleaned) and its label; to analyze the paragraph with LDA we do clean the words in the text."
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def create_topic_pars(pars, tokenizer, stemmer, stop_words, ldamodel, word_dictionary, topic_dictionary):\n",
"    norm_pars = [tokenize_and_stem(par, tokenizer, stemmer, stop_words) for par in pars]\n",
"    print('created normalized paragraphs object of length %d' % len(norm_pars))\n",
"    bows = [word_dictionary.doc2bow(text) for text in norm_pars]\n",
"    print('created bag-of-words object of length %d' % len(bows))\n",
"    topic_pars = []\n",
"    for idx, val in enumerate(bows):\n",
"        lda_vector = ldamodel[val]\n",
"        #original LDA model topic (most relevant) and paragraph:\n",
"        topic_pars.append([ldamodel.print_topic(max(lda_vector, key=lambda item: item[1])[0]), pars[idx]]) #we attach the original paragraph here, not the cleaned version that we used for LDA.\n",
"\n",
"    #now we'll create a nicely labeled structure.\n",
"    tagged_pars = []\n",
"    for topic_name in topic_dictionary:\n",
"        topic_words = topic_dictionary[topic_name]\n",
"        for topic_par in topic_pars:\n",
"            topic = topic_par[0]\n",
"            par = topic_par[1]\n",
"            if(len(par) > 50):\n",
"                for word in topic_words:\n",
"                    if stemmer.stem(word) in topic:\n",
"                        tagged_pars.append([par, topic_name])\n",
"                        break\n",
"    return(tagged_pars)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We'll use Medium's ToS to try our model (obviously this was not part of the training data):"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"reading ./data/medium_tos.txt which has 32 paragraphs\n"
]
}
],
"source": [
"new_pars = read_file_to_paragraphs('./data/medium_tos.txt')"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"created normalized paragraphs object of length 32\n",
"created bag-of-words object of length 32\n"
]
}
],
"source": [
"topic_pars = create_topic_pars(new_pars, tokenizer, p_stemmer, en_stop, ldamodel, dictionary, topic_dic)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[['These Terms of Service (“Terms”) are a contract between you and A Medium Corporation. They govern your use of Medium’s sites, services, mobile apps, products, and content (“Services”). By using Medium, you agree to these Terms. If you don’t agree to any of the Terms, you can’t use Medium. We can change these Terms at any time. We keep a historical record of all changes to our Terms on GitHub. If a change is material, we’ll let you know before they take effect. By using Medium on or after that effective date, you agree to the new Terms. If you don’t agree to them, you should delete your account before they take effect, otherwise your use of the site and content will be subject to the new Terms.',\n",
"  'copyright'],\n",
" ['You own the rights to the content you create and post on Medium.',\n",
"  'copyright'],\n",
" ['By posting content to Medium, you give us a nonexclusive license to publish it on Medium Services, including anything reasonably related to publishing it (like storing, displaying, reformatting, and distributing it). In consideration for Medium granting you access to and use of the Services, you agree that Medium may enable advertising on the Services, including in connection with the display of your content or other information. We may also use your content to promote Medium, including its products and content. We will never sell your content to third parties without your explicit permission.',\n",
"  'copyright'],\n",
" ['You’re responsible for the content you post. This means you assume all risks related to it, including someone else’s reliance on its accuracy, or claims relating to intellectual property or other legal rights.',\n",
"  'copyright'],\n",
" ['You’re welcome to post content on Medium that you’ve published elsewhere, as long as you have the rights you need to do so. By posting content to Medium, you represent that doing so doesn’t conflict with any other agreement you’ve made.',\n",
"  'copyright'],\n",
" ['By posting content you didn’t create to Medium, you are representing that you have the right to do so. For example, you are posting a work that’s in the public domain, used under license (including a free license, such as Creative Commons), or a fair use.',\n",
"  'copyright'],\n",
" ['You can delete any of your posts, or your account, anytime. Processing the deletion may take a little time, but we’ll do it as quickly as possible. We may keep backup copies of your deleted post or account on our servers for up to 14 days after you delete it.',\n",
"  'copyright'],\n",
" ['We reserve all rights in Medium’s look and feel. Some parts of Medium are licensed under third-party open source licenses. We also make some of our own code available under open source licenses. As for other parts of Medium, you may not copy or adapt any portion of our code or visual design elements (including logos) without express written permission from Medium unless otherwise permitted by law.',\n",
"  'copyright'],\n",
" ['We may change, terminate, or restrict access to any aspect of the service, at any time, without notice.',\n",
"  'copyright'],\n",
" ['Medium is only for people 13 years old and over. By using Medium, you affirm that you are over 13. If we learn someone under 13 is using Medium, we’ll terminate their account.',\n",
"  'copyright'],\n",
" ['To enable a functioning community, we have Rules. To ensure usernames are distributed and used fairly, we have a Username Policy. Under our DMCA Policy, we’ll remove material after receiving a valid takedown notice. Under our Trademark Policy, we’ll investigate any use of another’s trademark and respond appropriately.',\n",
"  'copyright'],\n",
" ['By using Medium, you agree to follow these Rules and Policies. If you don’t, we may remove content, or suspend or delete your account.',\n",
"  'copyright'],\n",
" ['Disclaimer of warranty. Medium provides the Services to you as is. You use them at your own risk and discretion. That means they don’t come with any warranty. None express, none implied. No implied warranty of merchantability, fitness for a particular purpose, availability, security, title or non-infringement.',\n",
"  'copyright'],\n",
" ['By using the Services, you agree to let Medium collect and use information as detailed in our Privacy Policy. If you’re outside the United States, you consent to letting Medium transfer, store, and process your information (including your personal information and content) in and out of the United States.',\n",
"  'privacy']]"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"topic_pars"
]
},
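{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick way to summarize the labeled output; a small unexecuted sketch using the standard library's `Counter`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from collections import Counter\n",
"\n",
"# count how many paragraphs received each label\n",
"Counter(label for par, label in topic_pars)"
]
},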
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The results appear to make sense. To test the validity of our model, we'll survey several people on the results for different ToSs and gather their opinions."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This part is the same as above, but it was used to test whether we could run our model from the different saved parts, in order to use it in our web app."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Running a saved model"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"gensim.corpora.dictionary.Dictionary"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dictionary2 = corpora.Dictionary.load('lda_dictionary')\n",
"type(dictionary2)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"gensim.models.ldamodel.LdaModel"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ldamodel2 = gensim.models.ldamodel.LdaModel.load('lda_model')\n",
"type(ldamodel2)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"created normalized paragraphs object of length 32\n",
"created bag-of-words object of length 32\n"
]
}
],
"source": [
"topic_pars2 = create_topic_pars(new_pars, tokenizer, p_stemmer, en_stop, ldamodel2, dictionary2, topic_dic)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[['These Terms of Service (“Terms”) are a contract between you and A Medium Corporation. They govern your use of Medium’s sites, services, mobile apps, products, and content (“Services”). By using Medium, you agree to these Terms. If you don’t agree to any of the Terms, you can’t use Medium. We can change these Terms at any time. We keep a historical record of all changes to our Terms on GitHub. If a change is material, we’ll let you know before they take effect. By using Medium on or after that effective date, you agree to the new Terms. If you don’t agree to them, you should delete your account before they take effect, otherwise your use of the site and content will be subject to the new Terms.',\n",
"  'copyright'],\n",
" ['You own the rights to the content you create and post on Medium.',\n",
"  'copyright'],\n",
" ['By posting content to Medium, you give us a nonexclusive license to publish it on Medium Services, including anything reasonably related to publishing it (like storing, displaying, reformatting, and distributing it). In consideration for Medium granting you access to and use of the Services, you agree that Medium may enable advertising on the Services, including in connection with the display of your content or other information. We may also use your content to promote Medium, including its products and content. We will never sell your content to third parties without your explicit permission.',\n",
"  'copyright'],\n",
" ['You’re responsible for the content you post. This means you assume all risks related to it, including someone else’s reliance on its accuracy, or claims relating to intellectual property or other legal rights.',\n",
"  'copyright'],\n",
" ['You’re welcome to post content on Medium that you’ve published elsewhere, as long as you have the rights you need to do so. By posting content to Medium, you represent that doing so doesn’t conflict with any other agreement you’ve made.',\n",
"  'copyright'],\n",
" ['By posting content you didn’t create to Medium, you are representing that you have the right to do so. For example, you are posting a work that’s in the public domain, used under license (including a free license, such as Creative Commons), or a fair use.',\n",
"  'copyright'],\n",
" ['You can delete any of your posts, or your account, anytime. Processing the deletion may take a little time, but we’ll do it as quickly as possible. We may keep backup copies of your deleted post or account on our servers for up to 14 days after you delete it.',\n",
"  'copyright'],\n",
" ['We reserve all rights in Medium’s look and feel. Some parts of Medium are licensed under third-party open source licenses. We also make some of our own code available under open source licenses. As for other parts of Medium, you may not copy or adapt any portion of our code or visual design elements (including logos) without express written permission from Medium unless otherwise permitted by law.',\n",
"  'copyright'],\n",
" ['We may change, terminate, or restrict access to any aspect of the service, at any time, without notice.',\n",
"  'copyright'],\n",
" ['Medium is only for people 13 years old and over. By using Medium, you affirm that you are over 13. If we learn someone under 13 is using Medium, we’ll terminate their account.',\n",
"  'copyright'],\n",
" ['To enable a functioning community, we have Rules. To ensure usernames are distributed and used fairly, we have a Username Policy. Under our DMCA Policy, we’ll remove material after receiving a valid takedown notice. Under our Trademark Policy, we’ll investigate any use of another’s trademark and respond appropriately.',\n",
"  'copyright'],\n",
" ['By using Medium, you agree to follow these Rules and Policies. If you don’t, we may remove content, or suspend or delete your account.',\n",
"  'copyright'],\n",
" ['Disclaimer of warranty. Medium provides the Services to you as is. You use them at your own risk and discretion. That means they don’t come with any warranty. None express, none implied. No implied warranty of merchantability, fitness for a particular purpose, availability, security, title or non-infringement.',\n",
"  'copyright'],\n",
" ['By using the Services, you agree to let Medium collect and use information as detailed in our Privacy Policy. If you’re outside the United States, you consent to letting Medium transfer, store, and process your information (including your personal information and content) in and out of the United States.',\n",
"  'privacy']]"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"topic_pars2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [Root]",
"language": "python",
"name": "Python [Root]"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}