Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "import sklearn.datasets\n",
- "import re\n",
- "import sklearn.feature_extraction.text\n",
- "import numpy\n",
- "import sklearn.linear_model\n",
- "import sklearn.metrics\n",
- "import sklearn.grid_search\n",
- "from os import listdir\n",
- "import pymorphy2"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Corpus directories. Each path ends with '/' because file names are appended by plain string concatenation.\n",
- "CLEAN = '/home/max/PycharmProjects/FULL_DATA/CLEAN_DATA/'\n",
- "MARK = '/home/max/PycharmProjects/FULL_DATA/MARKS_DATA/'\n",
- "WORD = '/home/max/PycharmProjects/FULL_DATA/WORDS_DATA/'\n",
- "# Document ids: the integer before the first '.' of every file name in the directory.\n",
- "ALL_DATA = sorted([int(i.split('.')[0]) for i in listdir(CLEAN)])\n",
- "WORD_DATA = sorted([int(i.split('.')[0]) for i in listdir(WORD)])\n",
- "# 'ё' (U+0451) is outside the а-я range and must be listed explicitly,\n",
- "# otherwise every word containing it is split into fragments.\n",
- "AllowedWords = '[а-яё]+'\n",
- "# Seed for the stochastic SGD solver so that repeated runs give the same scores.\n",
- "SEED = 42\n",
- "\n",
- "data = []  # normalized documents, appended to by clean_by_words\n",
- "wordNormal = pymorphy2.MorphAnalyzer()  # used to obtain the normal form of each word\n",
- "interesting, boring = [], []  # ids of documents marked '1' / marked otherwise\n",
- "marked = []  # ids of documents that have a mark file\n",
- "unmarked = []  # ids of documents without a mark file\n",
- "markedD = []  # texts of the marked documents, same order as marked\n",
- "unmarkedD = []  # texts of the unmarked documents, same order as unmarked\n",
- "\n",
- "vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()\n",
- "metric = sklearn.metrics.roc_auc_score\n",
- "cls = sklearn.linear_model.SGDClassifier(loss='log', random_state=SEED)\n",
- "searcher = sklearn.grid_search.GridSearchCV  # alias of the grid-search class"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "def clean_by_words():\n",
- "    \"\"\"Normalize every cleaned document and store the result.\n",
- "\n",
- "    For each id in ALL_DATA the file CLEAN + '<id>.txt' is read, lower-cased\n",
- "    and scanned with the AllowedWords pattern. Every match is replaced by the\n",
- "    first normal form returned by wordNormal.normal_forms. The forms, each\n",
- "    followed by a single space, are written to WORD + '<id>.txt' and appended\n",
- "    to the global list data as one string per document.\n",
- "\n",
- "    Note: each call appends to data again, so a second call leaves two\n",
- "    entries per document.\n",
- "    NOTE(review): files are opened with the platform default encoding -\n",
- "    confirm that it matches the encoding of the corpus.\n",
- "    \"\"\"\n",
- "    for i in ALL_DATA:\n",
- "        with open(CLEAN + str(i) + '.txt', 'r') as file:\n",
- "            text = file.read()\n",
- "        # Build the whole line once instead of growing a string word by word.\n",
- "        normalized = ''.join(\n",
- "            wordNormal.normal_forms(word)[0] + ' '\n",
- "            for word in re.findall(AllowedWords, text.lower())\n",
- "        )\n",
- "        with open(WORD + str(i) + '.txt', 'w') as wordfile:\n",
- "            wordfile.write(normalized)\n",
- "        data.append(normalized)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# clean_by_words вызывается каждый раз после выполнения clean"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "def check():\n",
- "    \"\"\"Build the TF-IDF train/test matrices from the normalized documents.\n",
- "\n",
- "    Every id in WORD_DATA is read from WORD + '<id>.txt'. When the file\n",
- "    MARK + '<id>.txt' can be opened the document counts as marked: content\n",
- "    '1' (surrounding whitespace ignored) means interesting, anything else\n",
- "    means boring. Documents whose mark file cannot be opened are unmarked.\n",
- "\n",
- "    The global lists interesting, boring, marked, unmarked, markedD and\n",
- "    unmarkedD are emptied and refilled in place, so repeated calls do not\n",
- "    accumulate duplicates.\n",
- "\n",
- "    Returns:\n",
- "        [X_train, Y_train, X_test] where X_train holds the TF-IDF rows of the\n",
- "        marked documents, Y_train is a numpy array with 1 for interesting and\n",
- "        0 otherwise, and X_test holds the rows of the unmarked documents in\n",
- "        the order of the global list unmarked. The vectorizer is fitted on\n",
- "        marked and unmarked texts together.\n",
- "    \"\"\"\n",
- "    # Start from a clean state; clearing in place keeps the module-level names valid.\n",
- "    for bucket in (interesting, boring, marked, unmarked, markedD, unmarkedD):\n",
- "        del bucket[:]\n",
- "\n",
- "    for i in WORD_DATA:\n",
- "        with open(WORD + str(i) + '.txt', 'r') as wordfile:\n",
- "            text = wordfile.read()\n",
- "        try:\n",
- "            with open(MARK + str(i) + '.txt', 'r') as mark:\n",
- "                label = mark.read().strip()\n",
- "        except OSError:\n",
- "            # The mark file cannot be opened: the document has no label.\n",
- "            unmarked.append(i)\n",
- "            unmarkedD.append(text)\n",
- "            continue\n",
- "        if label == '1':\n",
- "            interesting.append(i)\n",
- "        else:\n",
- "            boring.append(i)\n",
- "        marked.append(i)\n",
- "        markedD.append(text)\n",
- "\n",
- "    corpus = markedD + unmarkedD\n",
- "    X = vectorizer.fit_transform(corpus)\n",
- "    X_train = X[:len(markedD), :]\n",
- "    X_test = X[len(markedD):, :]\n",
- "    interesting_ids = set(interesting)  # constant-time membership test\n",
- "    Y_train = numpy.array([1 if t in interesting_ids else 0 for t in marked])\n",
- "    return [X_train, Y_train, X_test]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "def Grid_anal(X_train, Y_train, X_test, doc_ids=None):  # the function name is kept for historical reasons\n",
- "    \"\"\"Tune the SGD classifier by grid search and score the unlabelled documents.\n",
- "\n",
- "    Args:\n",
- "        X_train: feature matrix of the labelled documents.\n",
- "        Y_train: labels for the rows of X_train.\n",
- "        X_test: feature matrix of the unlabelled documents.\n",
- "        doc_ids: ids matching the rows of X_test, in the same order.\n",
- "            Defaults to the global list unmarked.\n",
- "\n",
- "    Returns:\n",
- "        A list of [doc_id, probability] pairs sorted by ascending\n",
- "        probability, where probability is column 0 of predict_proba.\n",
- "        Prints the best cross-validated score as a side effect.\n",
- "    \"\"\"\n",
- "    def scorer(estimator, X, Y):\n",
- "        # Score a candidate with metric on column 1 of predict_proba.\n",
- "        return metric(Y, estimator.predict_proba(X)[:, 1])\n",
- "\n",
- "    grid = {\n",
- "        'penalty': ['elasticnet'],\n",
- "        'alpha': [0.001, 0.0001, 0.00001, 0.000001, 0.0000001],\n",
- "        'l1_ratio': [0.0, 0.01, 0.05, 0.10, 0.2, 0.3, 0.4, 0.5],\n",
- "    }\n",
- "\n",
- "    searcher = sklearn.grid_search.GridSearchCV(\n",
- "        estimator=cls,\n",
- "        param_grid=grid,\n",
- "        scoring=scorer,\n",
- "        cv=5,\n",
- "        n_jobs=1\n",
- "    )\n",
- "    searcher.fit(X_train, Y_train)\n",
- "    print(\"best score is \", searcher.best_score_, sep='')\n",
- "\n",
- "    if doc_ids is None:\n",
- "        doc_ids = unmarked\n",
- "    # Nothing to score; predict_proba would reject an input with zero rows.\n",
- "    if X_test.shape[0] == 0:\n",
- "        return []\n",
- "    # One batched call instead of one predict_proba call per row.\n",
- "    # NOTE(review): column 0 is used here while the scorer above uses\n",
- "    # column 1 - confirm that column 0 is the intended one.\n",
- "    proba = searcher.predict_proba(X_test)[:, 0]\n",
- "    every = [[doc_ids[row], p] for row, p in enumerate(proba)]\n",
- "    return sorted(every, key=lambda pair: pair[1])\n",
- "# returns the predictions for the unlabelled texts, sorted in ascending order"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Вообще, пока я разметил статьи вот так: \n",
- "# если 'клиент' или 'кассир' или 'продажа' в тексте, то интересно. Если нет, то нет"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# посмотрим что получилось"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "a = check()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "best score is 0.832676697897\n",
- "[[267, 0.14814952467268638], [298, 0.2166617416736395], [252, 0.24473145596548596], [254, 0.26935462832632096], [232, 0.27283091354834865], [244, 0.28140093345841333], [276, 0.28947516950620178], [218, 0.29387222489191678], [207, 0.30707113979568001], [287, 0.31877048978812561], [292, 0.36731871723393728], [296, 0.37302046686667301], [205, 0.38205666504341418], [203, 0.38852198660623471], [281, 0.4025823163378931], [237, 0.41930907365357484], [206, 0.4280537538224205], [220, 0.44046959512779293], [221, 0.44814932978273203], [238, 0.44897073417062461], [209, 0.47191844440196373], [256, 0.47511814554681608], [235, 0.47555399504439011], [229, 0.47612651590939192], [259, 0.48483799987533849], [278, 0.48921829500850178], [263, 0.49757905956173765], [216, 0.50149374685657788], [249, 0.50269324098690671], [201, 0.50308278468703782], [219, 0.50408912918333293], [283, 0.51592650364622961], [288, 0.5172713798542109], [294, 0.52362343818718848], [234, 0.52422251293222666], [239, 0.53146968307160236], [299, 0.53168978106135323], [295, 0.53489004026208109], [290, 0.541030785346163], [271, 0.54224302595328966], [285, 0.54299920477807162], [266, 0.54814198165685979], [231, 0.55068725569906096], [277, 0.5608893890236033], [230, 0.56297478131202039], [240, 0.56614902920451837], [204, 0.56758415762862402], [236, 0.56782892286609277], [270, 0.57269026357913921], [210, 0.57384941836522496], [245, 0.57538615471925847], [225, 0.58171786780125767], [274, 0.5828350343677341], [297, 0.58297955139509039], [272, 0.58369657317829238], [243, 0.58564651804901513], [273, 0.58966822172867861], [222, 0.59088783808307799], [253, 0.59160771669018142], [289, 0.59270254053977234], [246, 0.59558636146822796], [248, 0.59624588835668524], [258, 0.59677337006723996], [268, 0.59734468376091177], [260, 0.59811350795753637], [264, 0.59892437495551509], [251, 0.60183526797396003], [233, 0.60358659442466411], [261, 0.60805553104182108], [265, 0.60952683929229257], [211, 0.6115376552127203], [208, 
0.61217634692205958], [280, 0.61469232376076122], [255, 0.61637215828609659], [269, 0.61700412991059994], [286, 0.6185465176882663], [250, 0.62756469952015226], [262, 0.63334979506975264], [257, 0.64040372810246016], [202, 0.64451325288979922], [226, 0.64486976029194198], [200, 0.65194649573052132], [282, 0.65436902791761997], [247, 0.6617526472728168], [224, 0.66331512219290034], [279, 0.67356869890173887], [223, 0.6894785765767022]]\n"
- ]
- }
- ],
- "source": [
- "# Unpack the three matrices stored in a and show the sorted predictions.\n",
- "print(Grid_anal(*a))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Такие дела"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.5.1"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement