from __future__ import absolute_import

from celery import shared_task
from celery.exceptions import SoftTimeLimitExceeded, TimeLimitExceeded
import time

from django.db.models import Count
from django.db.models import Q

from content.models import *
from django.conf import settings

import sys
import traceback

import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import os
import mimetypes
import re
import unicodedata
import random
import pprint
import logging
import json
import datetime
from pytz import timezone
import urllib.parse
import nltk
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import NaiveBayesClassifier, classify
import imghdr
import copy
import magic
from datetime import timedelta
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression as LR
import nltk.data
import pymongo
from random import shuffle
import base64
from influxdb import InfluxDBClient
import math
from celery.signals import celeryd_init
from celery import Task

def text_to_wordlist(text):
    # Expand the English "n't" contraction so e.g. "don't" becomes "do not".
    text = re.sub('n\'t', ' not', text)

    # Keep only Latin and Cyrillic letters, replacing everything else with spaces.
    text = re.sub('[^a-zA-Zа-яА-Я]', ' ', text)
    words = text.lower().split()

    # stops = set(stopwords.words("english"))

    return words


def clean_text(text):
    # Replace non-breaking spaces with ordinary spaces.
    text = text.replace('\xa0', ' ')
    return text

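# Illustrative note (not in the original paste): with the rules above,
# text_to_wordlist("They didn't arrive on time!") would return
# ['they', 'did', 'not', 'arrive', 'on', 'time'] -- the "n't" expansion runs
# before the letter filter, which keeps Latin and Cyrillic characters and
# drops digits and punctuation.
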
def text_to_sentences(text):
    # text = BeautifulSoup(text).get_text()

    # Split the cleaned text into sentences with NLTK's pre-trained Punkt model,
    # then turn every non-empty sentence into a word list.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(clean_text(text.strip()))

    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(text_to_wordlist(raw_sentence))

    return sentences

def text_to_vec(words, model, size):
    # Average the word vectors of all words found in the model's vocabulary.
    text_vec = np.zeros((size,), dtype="float32")
    n_words = 0

    index2word_set = set(model.index2word)
    for word in words:
        if word in index2word_set:
            n_words = n_words + 1
            text_vec = np.add(text_vec, model[word])

    if n_words != 0:
        text_vec /= n_words
    return text_vec

def texts_to_vecs(texts, model, size):
    # Stack one averaged vector per text into a (len(texts), size) matrix.
    texts_vecs = np.zeros((len(texts), size), dtype="float32")

    for i, text in enumerate(texts):
        texts_vecs[i] = text_to_vec(text, model, size)

    return texts_vecs

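# Illustrative note (not in the original paste): these helpers expect a
# gensim-style word2vec model that exposes `index2word` and word lookup by
# key, roughly:
#
#     from gensim.models import Word2Vec           # assumed dependency
#     model = Word2Vec.load('news_w2v.model')      # hypothetical model file
#     sentences = text_to_sentences(article_text)
#     vec = text_to_vec(sum(sentences, []), model, model.vector_size)
#
# Newer gensim releases move `index2word` and per-word access onto `model.wv`.
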
# Celery task base class: trains the classifier when the task is instantiated
# and keeps the fitted model and vectorizer in memory for later calls.
class NewsClassify(Task):
    _model = None
    _vectorizer = None

    def __init__(self):
        self.train_model()

    def train_model(self):
        print("let's train news classification model")
        # Load the labelled news corpus from the local MongoDB instance.
        client = pymongo.MongoClient('127.0.0.1:27017')
        db = client['news']
        news_obj_cursor = db['yandex_news'].find()
        count = 0
        count_categories = {}
        count_main_categories = {}
        all = []
        news_objs = list(news_obj_cursor)

        # Count how many documents fall into each main and child category.
        for news_obj in news_objs:
            count += 1
            if news_obj['main_category']:
                if count_main_categories.get(news_obj['main_category']) is None:
                    count_main_categories[news_obj['main_category']] = 1
                else:
                    count_main_categories[news_obj['main_category']] += 1
            if news_obj['child_category']:
                if count_categories.get(news_obj['child_category']) is None:
                    count_categories[news_obj['child_category']] = 1
                else:
                    count_categories[news_obj['child_category']] += 1

        # Build a class-balanced sample capped at the size of the smallest main
        # category. Note: this balanced split is only printed below and is then
        # discarded in favour of the full, unbalanced corpus.
        min_value = min(count_main_categories.values())
        count_samples = {}
        for news_obj in news_objs:
            if count_samples.get(news_obj['main_category']) is None:
                count_samples[news_obj['main_category']] = 1
                all.append(news_obj)
            elif count_samples[news_obj['main_category']] < min_value:
                count_samples[news_obj['main_category']] += 1
                all.append(news_obj)
        shuffle(all)
        train = all[:int(len(all) / 2 - 1)]
        test = all[int(len(all) / 2 - 1):]
        print(len(train), len(test))
        print(count_categories)
        print(count_main_categories)
        target = 'title'

        # Train on half of the full corpus: TF-IDF features over word 1-3-grams
        # of the titles, classified into main categories with logistic regression.
        all = []
        for news_obj in news_objs:
            all.append(news_obj)
        shuffle(all)
        train = all[:int(len(all) / 2 - 1)]
        test = all[int(len(all) / 2 - 1):]
        self._vectorizer = TfidfVectorizer(max_features=40000, ngram_range=(1, 3), sublinear_tf=True)
        train_like_word_str = [" ".join(sum(text_to_sentences(text[target]), [])).strip() for text in train]
        train_x = self._vectorizer.fit_transform(train_like_word_str)
        self._model = LR()
        self._model.fit(train_x, [train_cur['main_category'] for train_cur in train])

    def classify(self, test_like_word_str):
        # Vectorise the incoming strings with the fitted TF-IDF vocabulary and predict.
        test_x = self._vectorizer.transform(test_like_word_str)
        predict = self._model.predict(test_x)
        return predict

@shared_task(ignore_result=False, result_expires=timedelta(minutes=60), time_limit=1800, base=NewsClassify)
def news_classify(payload):
    # Flatten the title into a single whitespace-joined word string, then
    # predict its main category with the model held by the NewsClassify base task.
    test_like_word_str = [" ".join(sum(text_to_sentences(payload['title']), [])).strip()]
    predict = news_classify.classify(test_like_word_str)
    # print(predict)
    return list(predict)
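
# Illustrative note (not in the original paste): from other Django code the
# task would typically be queued and its prediction read back, for example:
#
#     result = news_classify.delay({'title': "Some news headline"})  # hypothetical payload
#     main_category = result.get(timeout=60)[0]
#
# Because base=NewsClassify, each worker process trains the model once when
# the task class is instantiated and then reuses it across invocations.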