Advertisement
Guest User

Untitled

a guest
May 25th, 2016
57
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.67 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "metadata": {
  7. "collapsed": true
  8. },
  9. "outputs": [],
  10. "source": [
  11. "import sklearn.datasets\n",
  12. "import re\n",
  13. "import sklearn.feature_extraction.text\n",
  14. "import numpy\n",
  15. "import sklearn.linear_model\n",
  16. "import sklearn.metrics\n",
  17. "import sklearn.grid_search\n",
  18. "from os import listdir\n",
  19. "import pymorphy2"
  20. ]
  21. },
  22. {
  23. "cell_type": "code",
  24. "execution_count": 2,
  25. "metadata": {
  26. "collapsed": true
  27. },
  28. "outputs": [],
  29. "source": [
  30. "CLEAN = '/home/max/PycharmProjects/FULL_DATA/CLEAN_DATA/'\n",
  31. "MARK = '/home/max/PycharmProjects/FULL_DATA/MARKS_DATA/'\n",
  32. "WORD = '/home/max/PycharmProjects/FULL_DATA/WORDS_DATA/'\n",
  33. "ALL_DATA = sorted([int(i.split('.')[0]) for i in listdir(CLEAN)])\n",
  34. "WORD_DATA = sorted([int(i.split('.')[0]) for i in listdir(WORD)])\n",
  35. "AllowedWords = '[а-я]+'\n",
  36. "\n",
  37. "data = []\n",
  38. "wordNormal = pymorphy2.MorphAnalyzer()\n",
  39. "interesting, boring = [], []\n",
  40. "marked = []\n",
  41. "unmarked = []\n",
  42. "markedD = []\n",
  43. "unmarkedD = []\n",
  44. "\n",
  45. "vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()\n",
  46. "metric = sklearn.metrics.roc_auc_score\n",
  47. "cls = sklearn.linear_model.SGDClassifier(loss='log')\n",
  48. "searcher = sklearn.grid_search.GridSearchCV"
  49. ]
  50. },
  51. {
  52. "cell_type": "code",
  53. "execution_count": 3,
  54. "metadata": {
  55. "collapsed": true
  56. },
  57. "outputs": [],
  58. "source": [
  59. "def clean_by_words():\n",
  60. " for i in ALL_DATA:\n",
  61. " with open(CLEAN + str(i) + '.txt', 'r') as file:\n",
  62. " text = file.read()\n",
  63. " with open(WORD + str(i) + '.txt', 'w') as wordfile:\n",
  64. " data.append('')\n",
  65. " for word in re.findall(AllowedWords, text.lower()):\n",
  66. " normal = wordNormal.normal_forms(word)[0]\n",
  67. " wordfile.write(normal + ' ')\n",
  68. " data[-1] += normal + ' '"
  69. ]
  70. },
  71. {
  72. "cell_type": "code",
  73. "execution_count": 4,
  74. "metadata": {
  75. "collapsed": true
  76. },
  77. "outputs": [],
  78. "source": [
  79. "# clean_by_words вызывается каждый раз после выполнения clean"
  80. ]
  81. },
  82. {
  83. "cell_type": "code",
  84. "execution_count": 5,
  85. "metadata": {
  86. "collapsed": true
  87. },
  88. "outputs": [],
  89. "source": [
  90. "def check():\n",
  91. " for i in WORD_DATA:\n",
  92. " text = open(WORD + str(i) + '.txt', 'r').read()\n",
  93. " try:\n",
  94. " with open(MARK + str(i) + '.txt', 'r') as mark:\n",
  95. " if mark.read() == '1':\n",
  96. " interesting.append(i)\n",
  97. " else:\n",
  98. " boring.append(i)\n",
  99. " marked.append(i)\n",
  100. " markedD.append(text)\n",
  101. " except:\n",
  102. " unmarked.append(i)\n",
  103. " unmarkedD.append(text)\n",
  104. " data = markedD + unmarkedD\n",
  105. " X = vectorizer.fit_transform(data)\n",
  106. " X_train = X[:len(markedD), :]\n",
  107. " X_test = X[len(markedD):, :]\n",
  108. " Y_train = numpy.array([1 if t in interesting else 0 for t in marked])\n",
  109. " return [X_train, Y_train, X_test]"
  110. ]
  111. },
  112. {
  113. "cell_type": "code",
  114. "execution_count": 6,
  115. "metadata": {
  116. "collapsed": true
  117. },
  118. "outputs": [],
  119. "source": [
  120. "def Grid_anal(X_train, Y_train, X_test): # название функции исторически так сложилось\n",
  121. " def scorer(estimator, X, Y):\n",
  122. " return metric(Y, estimator.predict_proba(X)[:, 1])\n",
  123. "\n",
  124. " grid = {\n",
  125. " 'penalty': ['elasticnet'],\n",
  126. " 'alpha': [0.001, 0.0001, 0.00001, 0.000001, 0.0000001],\n",
  127. " 'l1_ratio': [0.0, 0.01, 0.05, 0.10, 0.2, 0.3, 0.4, 0.5],\n",
  128. " }\n",
  129. "\n",
  130. " searcher = sklearn.grid_search.GridSearchCV(\n",
  131. " estimator=cls,\n",
  132. " param_grid=grid,\n",
  133. " scoring=scorer,\n",
  134. " cv=5,\n",
  135. " n_jobs=1\n",
  136. " )\n",
  137. " searcher.fit(X_train, Y_train)\n",
  138. " print(\"best score is \", end='')\n",
  139. " print(searcher.best_score_)\n",
  140. " every = [[unmarked[i[0]], searcher.predict_proba(i[1])[0][0]] for i in enumerate(X_test)]\n",
  141. " sor_every = sorted(every, key=lambda i: i[1])\n",
  142. " return sor_every\n",
  143. "# выводит сортированный по возрастанию список предсказаний для неразмеченных текстов"
  144. ]
  145. },
  146. {
  147. "cell_type": "code",
  148. "execution_count": 9,
  149. "metadata": {
  150. "collapsed": true
  151. },
  152. "outputs": [],
  153. "source": [
  154. "# Вообще, пока я разметил статьи вот так: \n",
  155. "# если 'клиент' или 'кассир' или 'продажа' в тексте, то интересно. Если нет, то нет"
  156. ]
  157. },
  158. {
  159. "cell_type": "code",
  160. "execution_count": 10,
  161. "metadata": {
  162. "collapsed": true
  163. },
  164. "outputs": [],
  165. "source": [
  166. "# посмотрим что получилось"
  167. ]
  168. },
  169. {
  170. "cell_type": "code",
  171. "execution_count": 11,
  172. "metadata": {
  173. "collapsed": true
  174. },
  175. "outputs": [],
  176. "source": [
  177. "a = check()"
  178. ]
  179. },
  180. {
  181. "cell_type": "code",
  182. "execution_count": 12,
  183. "metadata": {
  184. "collapsed": false
  185. },
  186. "outputs": [
  187. {
  188. "name": "stdout",
  189. "output_type": "stream",
  190. "text": [
  191. "best score is 0.832676697897\n",
  192. "[[267, 0.14814952467268638], [298, 0.2166617416736395], [252, 0.24473145596548596], [254, 0.26935462832632096], [232, 0.27283091354834865], [244, 0.28140093345841333], [276, 0.28947516950620178], [218, 0.29387222489191678], [207, 0.30707113979568001], [287, 0.31877048978812561], [292, 0.36731871723393728], [296, 0.37302046686667301], [205, 0.38205666504341418], [203, 0.38852198660623471], [281, 0.4025823163378931], [237, 0.41930907365357484], [206, 0.4280537538224205], [220, 0.44046959512779293], [221, 0.44814932978273203], [238, 0.44897073417062461], [209, 0.47191844440196373], [256, 0.47511814554681608], [235, 0.47555399504439011], [229, 0.47612651590939192], [259, 0.48483799987533849], [278, 0.48921829500850178], [263, 0.49757905956173765], [216, 0.50149374685657788], [249, 0.50269324098690671], [201, 0.50308278468703782], [219, 0.50408912918333293], [283, 0.51592650364622961], [288, 0.5172713798542109], [294, 0.52362343818718848], [234, 0.52422251293222666], [239, 0.53146968307160236], [299, 0.53168978106135323], [295, 0.53489004026208109], [290, 0.541030785346163], [271, 0.54224302595328966], [285, 0.54299920477807162], [266, 0.54814198165685979], [231, 0.55068725569906096], [277, 0.5608893890236033], [230, 0.56297478131202039], [240, 0.56614902920451837], [204, 0.56758415762862402], [236, 0.56782892286609277], [270, 0.57269026357913921], [210, 0.57384941836522496], [245, 0.57538615471925847], [225, 0.58171786780125767], [274, 0.5828350343677341], [297, 0.58297955139509039], [272, 0.58369657317829238], [243, 0.58564651804901513], [273, 0.58966822172867861], [222, 0.59088783808307799], [253, 0.59160771669018142], [289, 0.59270254053977234], [246, 0.59558636146822796], [248, 0.59624588835668524], [258, 0.59677337006723996], [268, 0.59734468376091177], [260, 0.59811350795753637], [264, 0.59892437495551509], [251, 0.60183526797396003], [233, 0.60358659442466411], [261, 0.60805553104182108], [265, 0.60952683929229257], [211, 0.6115376552127203], [208, 
0.61217634692205958], [280, 0.61469232376076122], [255, 0.61637215828609659], [269, 0.61700412991059994], [286, 0.6185465176882663], [250, 0.62756469952015226], [262, 0.63334979506975264], [257, 0.64040372810246016], [202, 0.64451325288979922], [226, 0.64486976029194198], [200, 0.65194649573052132], [282, 0.65436902791761997], [247, 0.6617526472728168], [224, 0.66331512219290034], [279, 0.67356869890173887], [223, 0.6894785765767022]]\n"
  193. ]
  194. }
  195. ],
  196. "source": [
  197. "print(Grid_anal(a[0], a[1], a[2]))"
  198. ]
  199. },
  200. {
  201. "cell_type": "code",
  202. "execution_count": 13,
  203. "metadata": {
  204. "collapsed": true
  205. },
  206. "outputs": [],
  207. "source": [
  208. "# Такие дела"
  209. ]
  210. },
  211. {
  212. "cell_type": "code",
  213. "execution_count": null,
  214. "metadata": {
  215. "collapsed": true
  216. },
  217. "outputs": [],
  218. "source": []
  219. }
  220. ],
  221. "metadata": {
  222. "kernelspec": {
  223. "display_name": "Python 3",
  224. "language": "python",
  225. "name": "python3"
  226. },
  227. "language_info": {
  228. "codemirror_mode": {
  229. "name": "ipython",
  230. "version": 3
  231. },
  232. "file_extension": ".py",
  233. "mimetype": "text/x-python",
  234. "name": "python",
  235. "nbconvert_exporter": "python",
  236. "pygments_lexer": "ipython3",
  237. "version": "3.5.1"
  238. }
  239. },
  240. "nbformat": 4,
  241. "nbformat_minor": 0
  242. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement