Untitled

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import sklearn.datasets\n",
    "import re\n",
    "import sklearn.feature_extraction.text\n",
    "import numpy\n",
    "import sklearn.linear_model\n",
    "import sklearn.metrics\n",
    "import sklearn.grid_search\n",
    "from os import listdir\n",
    "import pymorphy2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Root of the corpus; the three working folders below live inside it.\n",
    "BASE = '/home/max/PycharmProjects/FULL_DATA/'\n",
    "CLEAN = BASE + 'CLEAN_DATA/'  # source texts, read by clean_by_words\n",
    "MARK = BASE + 'MARKS_DATA/'   # per-article label files, read by check\n",
    "WORD = BASE + 'WORDS_DATA/'   # lemmatized texts, written by clean_by_words\n",
    "\n",
    "\n",
    "def _numeric_ids(folder):\n",
    "    \"\"\"Return the sorted integer ids of the '<id>.txt' files in folder.\n",
    "\n",
    "    Entries whose name is not a decimal number followed by '.txt' are\n",
    "    skipped, so a stray file no longer breaks the int() conversion.\n",
    "    \"\"\"\n",
    "    return sorted(\n",
    "        int(name[:-4]) for name in listdir(folder)\n",
    "        if name.endswith('.txt') and name[:-4].isdecimal()\n",
    "    )\n",
    "\n",
    "\n",
    "ALL_DATA = _numeric_ids(CLEAN)\n",
    "WORD_DATA = _numeric_ids(WORD)\n",
    "# 'ё' (U+0451) lies outside the а-я code-point range, so it has to be listed\n",
    "# explicitly; otherwise every word containing it is cut into fragments.\n",
    "AllowedWords = '[а-яё]+'\n",
    "\n",
    "data = []  # lemmatized article texts, filled by clean_by_words\n",
    "wordNormal = pymorphy2.MorphAnalyzer()\n",
    "# Accumulators filled by check(): article ids and their texts.\n",
    "interesting, boring = [], []\n",
    "marked = []\n",
    "unmarked = []\n",
    "markedD = []\n",
    "unmarkedD = []\n",
    "\n",
    "vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()\n",
    "metric = sklearn.metrics.roc_auc_score\n",
    "# SGD is stochastic; a fixed seed makes the grid search repeatable between runs.\n",
    "cls = sklearn.linear_model.SGDClassifier(loss='log', random_state=42)\n",
    "searcher = sklearn.grid_search.GridSearchCV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def clean_by_words():\n",
    "    \"\"\"Lemmatize every article listed in ALL_DATA and store the result.\n",
    "\n",
    "    For each id the file CLEAN + '<id>.txt' is read and lower-cased, the\n",
    "    tokens matching AllowedWords are extracted, and each token is replaced\n",
    "    by the first normal form returned by wordNormal. The words, each\n",
    "    followed by a single space, are written to WORD + '<id>.txt' and the\n",
    "    same string is appended to the module-level list ``data``.\n",
    "\n",
    "    ``data`` is emptied first, so calling the function again does not leave\n",
    "    duplicate entries from an earlier run.\n",
    "    \"\"\"\n",
    "    # In-place reset keeps the same list object that other cells refer to.\n",
    "    del data[:]\n",
    "    for i in ALL_DATA:\n",
    "        with open(CLEAN + str(i) + '.txt', 'r') as file:\n",
    "            text = file.read()\n",
    "        normals = [wordNormal.normal_forms(word)[0]\n",
    "                   for word in re.findall(AllowedWords, text.lower())]\n",
    "        # Every word, including the last one, is followed by one space.\n",
    "        line = ''.join(normal + ' ' for normal in normals)\n",
    "        # The output file is opened only once the whole text is ready, so a\n",
    "        # failure during lemmatization cannot leave a truncated file behind.\n",
    "        with open(WORD + str(i) + '.txt', 'w') as wordfile:\n",
    "            wordfile.write(line)\n",
    "        data.append(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# clean_by_words вызывается каждый раз после выполнения clean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def check():\n",
    "    \"\"\"Split the lemmatized corpus into labelled and unlabelled parts and vectorize it.\n",
    "\n",
    "    For every id in WORD_DATA the text WORD + '<id>.txt' is read. If a file\n",
    "    MARK + '<id>.txt' exists, the article counts as marked: content '1'\n",
    "    (surrounding whitespace ignored) puts the id into ``interesting``,\n",
    "    anything else into ``boring``. Articles without a mark file go to\n",
    "    ``unmarked``. The module-level lists interesting, boring, marked,\n",
    "    unmarked, markedD and unmarkedD are reset and refilled on every call.\n",
    "\n",
    "    Returns:\n",
    "        [X_train, Y_train, X_test]: ``vectorizer`` is fitted on the marked\n",
    "        texts followed by the unmarked ones; X_train holds the rows of the\n",
    "        marked texts, X_test the rows of the unmarked texts, and Y_train is\n",
    "        a numpy array with 1 for interesting and 0 for every other marked id.\n",
    "    \"\"\"\n",
    "    # Reset in place so that a repeated call does not duplicate rows while\n",
    "    # other cells keep seeing the same list objects.\n",
    "    for bucket in (interesting, boring, marked, unmarked, markedD, unmarkedD):\n",
    "        del bucket[:]\n",
    "    for i in WORD_DATA:\n",
    "        with open(WORD + str(i) + '.txt', 'r') as wordfile:\n",
    "            text = wordfile.read()\n",
    "        try:\n",
    "            with open(MARK + str(i) + '.txt', 'r') as mark:\n",
    "                # strip() so that a trailing newline does not turn '1' into 'boring'\n",
    "                label = mark.read().strip()\n",
    "        except FileNotFoundError:\n",
    "            # No mark file means the article has not been labelled yet.\n",
    "            unmarked.append(i)\n",
    "            unmarkedD.append(text)\n",
    "            continue\n",
    "        if label == '1':\n",
    "            interesting.append(i)\n",
    "        else:\n",
    "            boring.append(i)\n",
    "        marked.append(i)\n",
    "        markedD.append(text)\n",
    "    # Marked texts come first so the matrix can be split by row count below.\n",
    "    corpus = markedD + unmarkedD\n",
    "    X = vectorizer.fit_transform(corpus)\n",
    "    X_train = X[:len(markedD), :]\n",
    "    X_test = X[len(markedD):, :]\n",
    "    interesting_ids = set(interesting)  # constant-time membership test\n",
    "    Y_train = numpy.array([1 if t in interesting_ids else 0 for t in marked])\n",
    "    return [X_train, Y_train, X_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def Grid_anal(X_train, Y_train, X_test):  # the function name is a historical leftover\n",
    "    \"\"\"Tune ``cls`` by grid search and score the unlabelled texts.\n",
    "\n",
    "    A 5-fold GridSearchCV over an elastic-net grid (alpha, l1_ratio) is fitted\n",
    "    on X_train / Y_train. Candidates are scored with ``metric`` applied to the\n",
    "    second predict_proba column. The best cross-validated score is printed.\n",
    "\n",
    "    Args:\n",
    "        X_train: feature matrix of the labelled texts.\n",
    "        Y_train: labels for the rows of X_train.\n",
    "        X_test: feature matrix of the unlabelled texts; row k must belong to\n",
    "            the id stored at position k of the module-level list ``unmarked``.\n",
    "\n",
    "    Returns:\n",
    "        A list of [id, probability] pairs sorted by ascending probability.\n",
    "        The probability is the FIRST predict_proba column, i.e. the first\n",
    "        class of the fitted model (label 0 when the labels are 0/1), so the\n",
    "        texts least likely to belong to that class come first.\n",
    "    \"\"\"\n",
    "    def scorer(estimator, X, Y):\n",
    "        return metric(Y, estimator.predict_proba(X)[:, 1])\n",
    "\n",
    "    grid = {\n",
    "        'penalty': ['elasticnet'],\n",
    "        'alpha': [0.001, 0.0001, 0.00001, 0.000001, 0.0000001],\n",
    "        'l1_ratio': [0.0, 0.01, 0.05, 0.10, 0.2, 0.3, 0.4, 0.5],\n",
    "    }\n",
    "\n",
    "    # Local name differs from the module-level ``searcher`` to avoid shadowing it.\n",
    "    search = sklearn.grid_search.GridSearchCV(\n",
    "        estimator=cls,\n",
    "        param_grid=grid,\n",
    "        scoring=scorer,\n",
    "        cv=5,\n",
    "        n_jobs=1\n",
    "    )\n",
    "    search.fit(X_train, Y_train)\n",
    "    print(\"best score is \", end='')\n",
    "    print(search.best_score_)\n",
    "    if X_test.shape[0] == 0:\n",
    "        # Nothing to score; skip the model call on an empty matrix.\n",
    "        return []\n",
    "    # One call for the whole matrix instead of one model call per row.\n",
    "    first_class_proba = search.predict_proba(X_test)[:, 0]\n",
    "    every = [[unmarked[row], proba] for row, proba in enumerate(first_class_proba)]\n",
    "    return sorted(every, key=lambda pair: pair[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Вообще, пока я разметил статьи вот так: \n",
    "# если 'клиент' или 'кассир' или 'продажа' в тексте, то интересно. Если нет, то нет"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# посмотрим что получилось"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# check() returns the list [X_train, Y_train, X_test]; it is kept whole in `a`.\n",
    "a = check()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "best score is 0.832676697897\n",
      "[[267, 0.14814952467268638], [298, 0.2166617416736395], [252, 0.24473145596548596], [254, 0.26935462832632096], [232, 0.27283091354834865], [244, 0.28140093345841333], [276, 0.28947516950620178], [218, 0.29387222489191678], [207, 0.30707113979568001], [287, 0.31877048978812561], [292, 0.36731871723393728], [296, 0.37302046686667301], [205, 0.38205666504341418], [203, 0.38852198660623471], [281, 0.4025823163378931], [237, 0.41930907365357484], [206, 0.4280537538224205], [220, 0.44046959512779293], [221, 0.44814932978273203], [238, 0.44897073417062461], [209, 0.47191844440196373], [256, 0.47511814554681608], [235, 0.47555399504439011], [229, 0.47612651590939192], [259, 0.48483799987533849], [278, 0.48921829500850178], [263, 0.49757905956173765], [216, 0.50149374685657788], [249, 0.50269324098690671], [201, 0.50308278468703782], [219, 0.50408912918333293], [283, 0.51592650364622961], [288, 0.5172713798542109], [294, 0.52362343818718848], [234, 0.52422251293222666], [239, 0.53146968307160236], [299, 0.53168978106135323], [295, 0.53489004026208109], [290, 0.541030785346163], [271, 0.54224302595328966], [285, 0.54299920477807162], [266, 0.54814198165685979], [231, 0.55068725569906096], [277, 0.5608893890236033], [230, 0.56297478131202039], [240, 0.56614902920451837], [204, 0.56758415762862402], [236, 0.56782892286609277], [270, 0.57269026357913921], [210, 0.57384941836522496], [245, 0.57538615471925847], [225, 0.58171786780125767], [274, 0.5828350343677341], [297, 0.58297955139509039], [272, 0.58369657317829238], [243, 0.58564651804901513], [273, 0.58966822172867861], [222, 0.59088783808307799], [253, 0.59160771669018142], [289, 0.59270254053977234], [246, 0.59558636146822796], [248, 0.59624588835668524], [258, 0.59677337006723996], [268, 0.59734468376091177], [260, 0.59811350795753637], [264, 0.59892437495551509], [251, 0.60183526797396003], [233, 0.60358659442466411], [261, 0.60805553104182108], [265, 0.60952683929229257], [211, 0.6115376552127203], [208, 
0.61217634692205958], [280, 0.61469232376076122], [255, 0.61637215828609659], [269, 0.61700412991059994], [286, 0.6185465176882663], [250, 0.62756469952015226], [262, 0.63334979506975264], [257, 0.64040372810246016], [202, 0.64451325288979922], [226, 0.64486976029194198], [200, 0.65194649573052132], [282, 0.65436902791761997], [247, 0.6617526472728168], [224, 0.66331512219290034], [279, 0.67356869890173887], [223, 0.6894785765767022]]\n"
     ]
    }
   ],
   "source": [
    "# Unpack [X_train, Y_train, X_test] straight into the grid-search routine.\n",
    "print(Grid_anal(*a))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Такие дела"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}