import os
import re
import csv
import sys
import zipfile
from warnings import filterwarnings
from functools import reduce
from collections import OrderedDict

import numpy as np
import pandas as pd
import tensorflow as tf
import requests
import html2text
import nltk
from nltk.corpus import stopwords
from pymystem3 import Mystem
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm_notebook
from urllib3 import exceptions
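# Overview: this script trains/serves a web-page topic classifier. It reads a labelled
# CSV of texts, lemmatizes Russian/English text with Mystem and removes NLTK stopwords,
# builds a vocabulary and pads the encoded sentences, initialises an embedding matrix
# from pretrained fastText (Russian) and GloVe (English) vectors, and feeds everything
# into a bidirectional LSTM classifier built with the TF 1.x tf.estimator API. A small
# `wrapper` class at the bottom downloads a URL, converts the HTML to plain text and
# predicts its category.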
class Preproc(object):
    def __init__(self, corpus):
        self.corpus = corpus
        self.lens = []
        self.word2id = {}

    def preprocess(self, pattern):
        mystem = Mystem()
        self.corpus = self.corpus.apply(lambda x: x.lower())
        self.corpus = self.corpus.apply(lambda x: re.sub(pattern, ' ', x))
        self.corpus = self.corpus.apply(lambda x: mystem.lemmatize(x))
        self.corpus = self.corpus.apply(lambda x: [word for word in x if word not in stopwords_set])
        return self

    def make_vocab(self, min_count=5, unk=1):
        for text in self.corpus:
            for word in text:
                if word in self.word2id:
                    self.word2id[word] += 1
                else:
                    self.word2id[word] = 1
        self.word2id = OrderedDict(sorted(self.word2id.items(), key=lambda x: x[1], reverse=True))
        self.word2id = {word: index + 1 + unk for index, word in enumerate(self.word2id.keys())
                        if self.word2id[word] >= min_count}
        return self

    def encode_sentences(self, sentence_len, unk=1, pad=0, vocab=None):
        if vocab is None:
            vocab = self.word2id
        assert len(vocab) != 0
        self.corpus = self.corpus.apply(lambda x: [vocab[word] if word in vocab else unk for word in x])
        self.corpus.apply(lambda x: self.lens.append(len(x)))
        self.corpus = pad_sequences(self.corpus, maxlen=sentence_len, padding='post', truncating='post', value=pad)
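# Typical usage (mirrors the chained call further below): each method mutates
# self.corpus in place and returns self, so the calls can be chained, e.g.
#   Preproc(corpus).preprocess(pattern).make_vocab(min_count=20).encode_sentences(700)
# After encode_sentences, self.corpus is an int array of shape (n_texts, sentence_len)
# and self.lens holds the original (unpadded) lengths.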
class get_pretrained_embedos(object):
    def __init__(self, path_to_embedos):
        assert os.path.exists(path_to_embedos)
        self.path = path_to_embedos
        self.embeddings = {}

    def fill_embeddings(self, vocab):
        assert len(self.embeddings) == 0

        def gen_line_from_file():
            with open(self.path) as file:
                file.readline()
                for line in file:
                    line = line.split()
                    yield line[0], list(map(np.float32, line[1:]))

        gen = gen_line_from_file()
        index = 0
        for word, emb in gen:
            if word in vocab:
                self.embeddings[word] = emb
                index += 1
        print('loaded {n} russian embeddings out of {m}'.format(n=index, m=len(vocab)))
        return self

    def add_embeddings(self, vocab, path_to_embedos):
        assert os.path.exists(path_to_embedos)

        def gen_line_from_file():
            with open(path_to_embedos) as file:
                for line in file:
                    line = line.split()
                    yield line[0], list(map(np.float32, line[1:]))

        gen = gen_line_from_file()
        index = 0
        for word, emb in gen:
            if word in vocab:
                self.embeddings[word] = emb
                index += 1
        print('loaded {n} english embeddings'.format(n=index))
        return self

    def get_embedos_matrix(self, vocab, embed_dim):
        matrix = np.random.normal(size=(len(vocab) + 2, embed_dim))
        for word, index in vocab.items():
            if word in self.embeddings:
                matrix[index] = self.embeddings[word]
        return np.asarray(matrix, dtype=np.float32)
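# Note on the matrix layout: get_embedos_matrix allocates len(vocab) + 2 rows so that
# row 0 (padding) and row 1 (unknown words) keep their random initialisation, while the
# rows for vocabulary ids (assigned from 2 upwards in Preproc.make_vocab) are overwritten
# with pretrained vectors where available. Every vector copied in must have length
# embed_dim, otherwise the row assignment raises a shape error.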
class Site_does_not_exist_exception(Exception):
    def __init__(self, message):
        super().__init__(message)
        self.message = message
# nltk.download('stopwords')
print(tf.__version__)
print('SSS1')
stopwords_set = set(stopwords.words('russian'))
stopwords_set.update(set(stopwords.words('english')))
filterwarnings('ignore')
# csv.field_size_limit(sys.maxsize)
train = pd.read_csv('content/file.csv')
train = train.sample(frac=1)
train.info()
print('SSS2')
corpus, labels = train['text'], train['label']
unique_cat = pd.unique(labels)
category_to_id = {category: index for index, category in enumerate(unique_cat)}
id_to_category = {index: category for category, index in category_to_id.items()}
labels = labels.replace(category_to_id)
pattern = re.compile(r'[^a-zA-Zа-яА-ЯёЁ ]')
sentence_len = 700
one = Preproc(corpus)
(one
 .preprocess(pattern)
 .make_vocab(min_count=20)
 .encode_sentences(sentence_len))
print('SSS3')
embedos = get_pretrained_embedos('content/ft_native_300_ru_twitter_nltk_word_tokenize.vec')
matrix = (embedos
          .fill_embeddings(vocab=one.word2id)
          .add_embeddings(vocab=one.word2id, path_to_embedos='content/glove.twitter.27B.100d.txt')
          .get_embedos_matrix(vocab=one.word2id, embed_dim=100))
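# my_initializer below follows the initializer signature that TF 1.x layers expect
# (shape, dtype, partition_info) but deliberately ignores the requested shape and simply
# returns the precomputed `matrix`, so tf.contrib.layers.embed_sequence starts from the
# pretrained embedding rows instead of a random initialisation.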
def my_initializer(shape=None, dtype=tf.float32, partition_info=None):
    assert dtype is tf.float32
    return matrix


def set_new_session():
    sess = tf.get_default_session()
    if sess is not None:
        sess.close()
    tf.reset_default_graph()
    config = tf.ConfigProto()
    s = tf.InteractiveSession(config=config)
    return s
def generator():
    for text, length, label in zip(one.corpus, one.lens, labels):
        yield text, length, label


def generator_eval():
    for text, length, label in zip(one.corpus[:100000], one.lens[:100000], labels[:100000]):
        yield text, length, label


def my_input_fn(params, training=True):
    data = tf.data.Dataset.from_generator(generator, (tf.int32, tf.int32, tf.int32),
                                          output_shapes=([sentence_len, ], [], []))
    if training:
        data = data.shuffle(buffer_size=params['buffer_size'])
        data = data.repeat(params['num_of_epochs'])
    data = data.batch(params['batch_size'], drop_remainder=True)
    iterator = data.make_one_shot_iterator()
    text, length, label = iterator.get_next()
    return {'sentences': text, 'lens': length}, label


def my_input_fn_eval(params, training=False):
    data = tf.data.Dataset.from_generator(generator_eval, (tf.int32, tf.int32, tf.int32),
                                          output_shapes=([sentence_len, ], [], []))
    if training:
        data = data.shuffle(buffer_size=params['buffer_size'])
        data = data.repeat(params['num_of_epochs'])
    data = data.batch(params['batch_size'], drop_remainder=True)
    iterator = data.make_one_shot_iterator()
    text, length, label = iterator.get_next()
    return {'sentences': text, 'lens': length}, label
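# Both input functions wrap the in-memory corpus in a tf.data pipeline built with
# Dataset.from_generator: they yield (sentence, length, label) triples, apply shuffle and
# repeat only in training mode, and batch with drop_remainder=True so every batch is
# exactly params['batch_size'] wide. When training, params must also provide
# 'buffer_size' and 'num_of_epochs'.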
def my_model_fn(features, labels, mode, params):
    input_layer = tf.contrib.layers.embed_sequence(features['sentences'],
                                                   vocab_size=params['vocab_size'],
                                                   embed_dim=params['embed_size'],
                                                   initializer=params['embed_init'])
    forward_cell = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.LSTMCell(64), tf.nn.rnn_cell.LSTMCell(128)])
    backward_cell = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.LSTMCell(64), tf.nn.rnn_cell.LSTMCell(128)])
    outputs, final_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=forward_cell, cell_bw=backward_cell,
                                                            inputs=input_layer,
                                                            sequence_length=features['lens'], dtype=tf.float32)
    outputs = tf.concat(outputs, axis=2)
    final_states = tf.concat((final_states[0][1].h, final_states[1][1].h), axis=1)
    max_pool = tf.reduce_max(outputs, axis=1)
    mean_pool = tf.reduce_mean(outputs, axis=1)
    concat_pooling = tf.concat((final_states, max_pool, mean_pool), axis=1)
    logits = tf.layers.dense(concat_pooling, units=params['num_of_classes'])
    predictions = {'class_id': tf.argmax(logits, axis=1)}

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    loss = tf.losses.sparse_softmax_cross_entropy(labels, logits)
    accuracy = tf.metrics.accuracy(labels, predictions['class_id'])

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer(learning_rate=3e-04)
        gradients, variables = zip(*optimizer.compute_gradients(loss))
        gradients = [None if gradient is None else tf.clip_by_value(gradient, clip_value_min=-5, clip_value_max=5)
                     for gradient in gradients]
        train_op = optimizer.apply_gradients(zip(gradients, variables), global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops={'accuracy': accuracy})
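# Architecture summary: pretrained embeddings -> two-layer bidirectional LSTM (64 and
# 128 units per direction) -> concatenation of the final hidden states with max- and
# mean-pooling over time -> a single dense layer producing the class logits. Training
# uses Adam (lr = 3e-4) with gradients clipped to [-5, 5]; evaluation reports accuracy.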
vocab_size = len(one.word2id) + 2
embed_size = 100
embed_init = my_initializer
num_of_classes = 55
batch_size = 100
s = set_new_session()
print('SSS4')
params = {'vocab_size': vocab_size, 'embed_size': embed_size, 'embed_init': embed_init,
          'num_of_classes': num_of_classes, 'batch_size': batch_size}
classifier = tf.estimator.Estimator(model_fn=my_model_fn,
                                    model_dir='content/pretrained_embedos_bilstm_concat', params=params)
to_text = html2text.HTML2Text()
to_text.escape_snob = True
to_text.ignore_images = True
to_text.ignore_tables = True
to_text.ignore_links = True
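# The html2text converter is configured to emit plain text only (no images, tables or
# links). Note that `params` above covers the model_fn and batching, but the training
# path of my_input_fn would additionally need 'buffer_size' and 'num_of_epochs' keys
# before calling classifier.train().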
class wrapper(object):
    def __init__(self, url, to_text=to_text, pattern=pattern, analyzer=analyzer, sentence_len=sentence_len,
                 vocab=one.word2id, unk=1, pad=0, model=classifier):
        self.url = url
        self.to_text = to_text
        self.pattern = pattern
        self.analyzer = analyzer
        self.sentence_len = sentence_len
        self.vocab = vocab
        self.unk = unk
        self.pad = pad
        self.len = None
        self.classifier = model
        self.prediction = None

    def parse(self):
        if 'http' not in self.url and 'www' not in self.url:
            self.url = 'http://' + self.url
        try:
            response = requests.get(self.url, allow_redirects=True, timeout=30)
            if response.status_code == requests.codes.ok or response.status_code in [300, 301, 302, 303, 304,
                                                                                     305, 306, 307, 308]:
                raw = response.text
                text = re.sub('\n', ' ', self.to_text.handle(raw))
                text = text.lower()
                text = re.sub(self.pattern, ' ', text)
                text = [word for word in text.split() if word not in stopwords_set]
                text = [word.lemma for word in self.analyzer.analyze(text)]
                text = [self.vocab[word] if word in self.vocab else self.unk for word in text]
                length = len(text)
                text = pad_sequences([text], maxlen=self.sentence_len, padding='post', truncating='post',
                                     value=self.pad)[0]

                def generator_predict(corpus=text, lens=length):
                    yield corpus, lens

                def predict_input_fn():
                    data = tf.data.Dataset.from_generator(generator_predict, (tf.int32, tf.int32),
                                                          output_shapes=([self.sentence_len, ], []))
                    data = data.repeat(1)
                    data = data.batch(1)
                    iterator = data.make_one_shot_iterator()
                    text, length = iterator.get_next()
                    return {'sentences': text, 'lens': length}

                predictions = self.classifier.predict(input_fn=lambda: predict_input_fn())
                for elem in predictions:
                    self.prediction = elem['class_id']
                return id_to_category[self.prediction]
            else:
                raise Site_does_not_exist_exception('The site {s} does not even exist'.format(s=self.url))
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError, AssertionError,
                requests.exceptions.ContentDecodingError,
                exceptions.DecodeError, Site_does_not_exist_exception) as e:
            print('The exception was caught with message {m}'.format(m=e))
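# Note: wrapper.__init__ takes analyzer=analyzer as a default argument, but `analyzer`
# is never defined in this paste; it is assumed to be a morphological analyzer created
# elsewhere whose .analyze() returns tokens exposing a .lemma attribute (as used in
# parse()). Without it, defining the class raises a NameError. parse() returns the
# predicted category name on success and None after a caught request error.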
two = wrapper('https://vk.com/')
# print(two.parse())


def get_cat():
    return two.parse()
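# Minimal usage sketch (assumes the `analyzer` dependency and the data/embedding files
# under content/ are actually available in the running environment):
if __name__ == '__main__':
    site = wrapper('https://vk.com/')   # wrap a URL in the predictor
    print(site.parse())                 # fetch, preprocess and classify the page
    print(get_cat())                    # same prediction via the module-level helper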