from __future__ import absolute_import

from celery import shared_task, Task
from celery.exceptions import SoftTimeLimitExceeded, TimeLimitExceeded
from celery.signals import celeryd_init

import time
import sys
import traceback
import os
import mimetypes
import re
import unicodedata
import random
import pprint
import logging
import json
import datetime
import urllib.parse
import imghdr
import copy
import math
import base64
from datetime import timedelta
from random import shuffle

from django.db.models import Count, Q
from django.conf import settings
from content.models import *

from pytz import timezone

import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

import nltk
import nltk.data
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import NaiveBayesClassifier, classify

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression as LR

import pymongo
import magic
from influxdb import InfluxDBClient
def text_to_wordlist(text):
    # Expand the "n't" contraction, then keep only Latin and Cyrillic letters.
    text = re.sub('n\'t', ' not', text)
    text = re.sub('[^a-zA-Zа-яА-Я]', ' ', text)
    words = text.lower().split()
    # stops = set(stopwords.words("english"))
    return words
def clean_text(text):
    # Replace non-breaking spaces with regular spaces.
    text = text.replace('\xa0', ' ')
    return text
def text_to_sentences(text):
    # Split the text into sentences with the NLTK Punkt tokenizer,
    # then turn each sentence into a list of lower-cased words.
    # text = BeautifulSoup(text).get_text()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(clean_text(text.strip()))
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(text_to_wordlist(raw_sentence))
    return sentences
def text_to_vec(words, model, size):
    # Average the word vectors of all in-vocabulary words
    # (expects a gensim word2vec-style model exposing `index2word`).
    text_vec = np.zeros((size,), dtype="float32")
    n_words = 0
    index2word_set = set(model.index2word)
    for word in words:
        if word in index2word_set:
            n_words += 1
            text_vec = np.add(text_vec, model[word])
    if n_words != 0:
        text_vec /= n_words
    return text_vec
def texts_to_vecs(texts, model, size):
    # Stack the averaged vectors of several texts into one matrix.
    texts_vecs = np.zeros((len(texts), size), dtype="float32")
    for i, text in enumerate(texts):
        texts_vecs[i] = text_to_vec(text, model, size)
    return texts_vecs
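
# Hypothetical usage sketch (an assumption, not part of the original paste):
# text_to_vec/texts_to_vecs expect a gensim word2vec-style model that exposes
# `index2word` and per-word vector lookup, as in gensim < 4.0, e.g.:
#
#   from gensim.models import Word2Vec
#   model = Word2Vec(text_to_sentences(raw_text), size=300)  # old gensim API
#   vec = text_to_vec(['some', 'words'], model.wv, 300)
#   mat = texts_to_vecs(text_to_sentences(raw_text), model.wv, 300)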
class NewsClassify(Task):
    # Custom Celery task base class; Celery instantiates it once per worker
    # process, so the model is trained when the worker starts, not per call.
    _model = None
    _vectorizer = None

    def __init__(self):
        self.train_model()

    def train_model(self):
        print("let's train news classification model")
        client = pymongo.MongoClient('127.0.0.1:27017')
        db = client['news']
        news_obj_cursor = db['yandex_news'].find()
        count = 0
        count_categories = {}
        count_main_categories = {}
        samples = []
        news_objs = list(news_obj_cursor)

        # Count how many documents fall into each main/child category.
        for news_obj in news_objs:
            count += 1
            if news_obj['main_category']:
                if count_main_categories.get(news_obj['main_category']) is None:
                    count_main_categories[news_obj['main_category']] = 1
                else:
                    count_main_categories[news_obj['main_category']] += 1
            if news_obj['child_category']:
                if count_categories.get(news_obj['child_category']) is None:
                    count_categories[news_obj['child_category']] = 1
                else:
                    count_categories[news_obj['child_category']] += 1

        # Build a class-balanced sample: cap every main category at the size
        # of the smallest one. (Note: this balanced split is discarded below,
        # as in the original code; the final split uses all documents.)
        min_value = min(count_main_categories.values())
        count_samples = {}
        for news_obj in news_objs:
            if count_samples.get(news_obj['main_category']) is None:
                count_samples[news_obj['main_category']] = 1
                samples.append(news_obj)
            elif count_samples[news_obj['main_category']] < min_value:
                count_samples[news_obj['main_category']] += 1
                samples.append(news_obj)
        shuffle(samples)
        train = samples[:int(len(samples) / 2 - 1)]
        test = samples[int(len(samples) / 2 - 1):]
        print(len(train), len(test))
        print(count_categories)
        print(count_main_categories)

        # Classify by headline; re-split over the full (unbalanced) data set.
        target = 'title'
        samples = []
        for news_obj in news_objs:
            samples.append(news_obj)
        shuffle(samples)
        train = samples[:int(len(samples) / 2 - 1)]
        test = samples[int(len(samples) / 2 - 1):]

        # TF-IDF over word 1-3-grams, then a logistic regression classifier.
        self._vectorizer = TfidfVectorizer(max_features=40000, ngram_range=(1, 3), sublinear_tf=True)
        train_like_word_str = [" ".join(sum(text_to_sentences(text[target]), [])).strip() for text in train]
        train_x = self._vectorizer.fit_transform(train_like_word_str)
        self._model = LR()
        self._model.fit(train_x, [train_cur['main_category'] for train_cur in train])

    def classify(self, test_like_word_str):
        test_x = self._vectorizer.transform(test_like_word_str)
        predict = self._model.predict(test_x)
        return predict
@shared_task(ignore_result=False, result_expires=timedelta(minutes=60), time_limit=1800, base=NewsClassify)
def news_classify(payload):
    # Because base=NewsClassify, the registered task object itself carries
    # the trained vectorizer/model, so classify() is called on the task name.
    test_like_word_str = [" ".join(sum(text_to_sentences(payload['title']), [])).strip()]
    predict = news_classify.classify(test_like_word_str)
    # print(predict)
    return list(predict)
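
# Hypothetical invocation sketch (an assumption, not part of the original
# paste): the task expects a dict payload with a 'title' key and returns the
# predicted main_category label(s) as a list, e.g.:
#
#   result = news_classify.delay({'title': 'Some news headline'})
#   print(result.get(timeout=30))  # e.g. ['politics']; labels depend on data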