import os
import re
import csv
import sys
import zipfile
from warnings import filterwarnings
from functools import reduce
from collections import OrderedDict

import numpy as np
import pandas as pd
import tensorflow as tf
import requests
import html2text
import nltk
from nltk.corpus import stopwords
from pymystem3 import Mystem
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm_notebook
from urllib3 import exceptions
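# Overview: this script trains/serves a web-page topic classifier. It reads a labelled
# CSV of texts, lemmatizes Russian/English text with Mystem and removes NLTK stopwords,
# builds a vocabulary and pads the encoded sentences, initialises an embedding matrix
# from pretrained fastText (Russian) and GloVe (English) vectors, and feeds everything
# into a bidirectional LSTM classifier built with the TF 1.x tf.estimator API. A small
# `wrapper` class at the bottom downloads a URL, converts the HTML to plain text and
# predicts its category.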
class Preproc(object):
    def __init__(self, corpus):
        self.corpus = corpus
        self.lens = []
        self.word2id = {}

    def preprocess(self, pattern):
        mystem = Mystem()
        self.corpus = self.corpus.apply(lambda x: x.lower())
        self.corpus = self.corpus.apply(lambda x: re.sub(pattern, ' ', x))
        self.corpus = self.corpus.apply(lambda x: mystem.lemmatize(x))
        self.corpus = self.corpus.apply(lambda x: [word for word in x if word not in stopwords_set])
        return self

    def make_vocab(self, min_count=5, unk=1):
        for text in self.corpus:
            for word in text:
                if word in self.word2id:
                    self.word2id[word] += 1
                else:
                    self.word2id[word] = 1
        self.word2id = OrderedDict(sorted(self.word2id.items(), key=lambda x: x[1], reverse=True))
        self.word2id = {word: index + 1 + unk for index, word in enumerate(self.word2id.keys())
                        if self.word2id[word] >= min_count}
        return self

    def encode_sentences(self, sentence_len, unk=1, pad=0, vocab=None):
        if vocab is None:
            vocab = self.word2id
        assert len(vocab) != 0
        self.corpus = self.corpus.apply(lambda x: [vocab[word] if word in vocab else unk for word in x])
        self.corpus.apply(lambda x: self.lens.append(len(x)))
        self.corpus = pad_sequences(self.corpus, maxlen=sentence_len, padding='post', truncating='post', value=pad)
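# Typical usage (mirrors the chained call further below): each method mutates
# self.corpus in place and returns self, so the calls can be chained, e.g.
#   Preproc(corpus).preprocess(pattern).make_vocab(min_count=20).encode_sentences(700)
# After encode_sentences, self.corpus is an int array of shape (n_texts, sentence_len)
# and self.lens holds the original (unpadded) lengths.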
class get_pretrained_embedos(object):
    def __init__(self, path_to_embedos):
        assert os.path.exists(path_to_embedos)
        self.path = path_to_embedos
        self.embeddings = {}

    def fill_embeddings(self, vocab):
        assert len(self.embeddings) == 0

        def gen_line_from_file():
            with open(self.path) as file:
                file.readline()
                for line in file:
                    line = line.split()
                    yield line[0], list(map(np.float32, line[1:]))

        gen = gen_line_from_file()
        index = 0
        for word, emb in gen:
            if word in vocab:
                self.embeddings[word] = emb
                index += 1
        print('loaded {n} russian embeddings out of {m}'.format(n=index, m=len(vocab)))
        return self

    def add_embeddings(self, vocab, path_to_embedos):
        assert os.path.exists(path_to_embedos)

        def gen_line_from_file():
            with open(path_to_embedos) as file:
                for line in file:
                    line = line.split()
                    yield line[0], list(map(np.float32, line[1:]))

        gen = gen_line_from_file()
        index = 0
        for word, emb in gen:
            if word in vocab:
                self.embeddings[word] = emb
                index += 1
        print('loaded {n} english embeddings'.format(n=index))
        return self

    def get_embedos_matrix(self, vocab, embed_dim):
        matrix = np.random.normal(size=(len(vocab) + 2, embed_dim))
        for word, index in vocab.items():
            if word in self.embeddings:
                matrix[index] = self.embeddings[word]
        return np.asarray(matrix, dtype=np.float32)
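# Note on the matrix layout: get_embedos_matrix allocates len(vocab) + 2 rows so that
# row 0 (padding) and row 1 (unknown words) keep their random initialisation, while the
# rows for vocabulary ids (assigned from 2 upwards in Preproc.make_vocab) are overwritten
# with pretrained vectors where available. Every vector copied in must have length
# embed_dim, otherwise the row assignment raises a shape error.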
class Site_does_not_exist_exception(Exception):
    def __init__(self, message):
        super().__init__(message)
        self.message = message
# nltk.download('stopwords')
print(tf.__version__)
print('SSS1')
stopwords_set = set(stopwords.words('russian'))
stopwords_set.update(set(stopwords.words('english')))
filterwarnings('ignore')
# csv.field_size_limit(sys.maxsize)
train = pd.read_csv('content/file.csv')
train = train.sample(frac=1)
train.info()
print('SSS2')
corpus, labels = train['text'], train['label']
unique_cat = pd.unique(labels)
category_to_id = {category: index for index, category in enumerate(unique_cat)}
id_to_category = {index: category for category, index in category_to_id.items()}
labels = labels.replace(category_to_id)
pattern = re.compile(r'[^a-zA-Zа-яА-ЯёЁ ]')
sentence_len = 700
one = Preproc(corpus)
(one
 .preprocess(pattern)
 .make_vocab(min_count=20)
 .encode_sentences(sentence_len))
print('SSS3')
embedos = get_pretrained_embedos('content/ft_native_300_ru_twitter_nltk_word_tokenize.vec')
matrix = (embedos
          .fill_embeddings(vocab=one.word2id)
          .add_embeddings(vocab=one.word2id, path_to_embedos='content/glove.twitter.27B.100d.txt')
          .get_embedos_matrix(vocab=one.word2id, embed_dim=100))
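# my_initializer below follows the initializer signature that TF 1.x layers expect
# (shape, dtype, partition_info) but deliberately ignores the requested shape and simply
# returns the precomputed `matrix`, so tf.contrib.layers.embed_sequence starts from the
# pretrained embedding rows instead of a random initialisation.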
def my_initializer(shape=None, dtype=tf.float32, partition_info=None):
    assert dtype is tf.float32
    return matrix


def set_new_session():
    sess = tf.get_default_session()
    if sess is not None:
        sess.close()
    tf.reset_default_graph()
    config = tf.ConfigProto()
    s = tf.InteractiveSession(config=config)
    return s
def generator():
    for text, length, label in zip(one.corpus, one.lens, labels):
        yield text, length, label


def generator_eval():
    for text, length, label in zip(one.corpus[:100000], one.lens[:100000], labels[:100000]):
        yield text, length, label


def my_input_fn(params, training=True):
    data = tf.data.Dataset.from_generator(generator, (tf.int32, tf.int32, tf.int32),
                                          output_shapes=([sentence_len, ], [], []))
    if training:
        data = data.shuffle(buffer_size=params['buffer_size'])
        data = data.repeat(params['num_of_epochs'])
    data = data.batch(params['batch_size'], drop_remainder=True)
    iterator = data.make_one_shot_iterator()
    text, length, label = iterator.get_next()
    return {'sentences': text, 'lens': length}, label


def my_input_fn_eval(params, training=False):
    data = tf.data.Dataset.from_generator(generator_eval, (tf.int32, tf.int32, tf.int32),
                                          output_shapes=([sentence_len, ], [], []))
    if training:
        data = data.shuffle(buffer_size=params['buffer_size'])
        data = data.repeat(params['num_of_epochs'])
    data = data.batch(params['batch_size'], drop_remainder=True)
    iterator = data.make_one_shot_iterator()
    text, length, label = iterator.get_next()
    return {'sentences': text, 'lens': length}, label
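# Both input functions wrap the in-memory corpus in a tf.data pipeline built with
# Dataset.from_generator: they yield (sentence, length, label) triples, apply shuffle and
# repeat only in training mode, and batch with drop_remainder=True so every batch is
# exactly params['batch_size'] wide. When training, params must also provide
# 'buffer_size' and 'num_of_epochs'.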
def my_model_fn(features, labels, mode, params):
    input_layer = tf.contrib.layers.embed_sequence(features['sentences'],
                                                   vocab_size=params['vocab_size'],
                                                   embed_dim=params['embed_size'],
                                                   initializer=params['embed_init'])
    forward_cell = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.LSTMCell(64), tf.nn.rnn_cell.LSTMCell(128)])
    backward_cell = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.LSTMCell(64), tf.nn.rnn_cell.LSTMCell(128)])
    outputs, final_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=forward_cell, cell_bw=backward_cell,
                                                            inputs=input_layer,
                                                            sequence_length=features['lens'], dtype=tf.float32)
    outputs = tf.concat(outputs, axis=2)
    final_states = tf.concat((final_states[0][1].h, final_states[1][1].h), axis=1)
    max_pool = tf.reduce_max(outputs, axis=1)
    mean_pool = tf.reduce_mean(outputs, axis=1)
    concat_pooling = tf.concat((final_states, max_pool, mean_pool), axis=1)
    logits = tf.layers.dense(concat_pooling, units=params['num_of_classes'])
    predictions = {'class_id': tf.argmax(logits, axis=1)}

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    loss = tf.losses.sparse_softmax_cross_entropy(labels, logits)
    accuracy = tf.metrics.accuracy(labels, predictions['class_id'])

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer(learning_rate=3e-04)
        gradients, variables = zip(*optimizer.compute_gradients(loss))
        gradients = [None if gradient is None else tf.clip_by_value(gradient, clip_value_min=-5, clip_value_max=5)
                     for gradient in gradients]
        train_op = optimizer.apply_gradients(zip(gradients, variables), global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops={'accuracy': accuracy})
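# Architecture summary: pretrained embeddings -> two-layer bidirectional LSTM (64 and
# 128 units per direction) -> concatenation of the final hidden states with max- and
# mean-pooling over time -> a single dense layer producing the class logits. Training
# uses Adam (lr = 3e-4) with gradients clipped to [-5, 5]; evaluation reports accuracy.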
vocab_size = len(one.word2id) + 2
embed_size = 100
embed_init = my_initializer
num_of_classes = 55
batch_size = 100
s = set_new_session()
print('SSS4')
params = {'vocab_size': vocab_size, 'embed_size': embed_size, 'embed_init': embed_init,
          'num_of_classes': num_of_classes, 'batch_size': batch_size}
classifier = tf.estimator.Estimator(model_fn=my_model_fn,
                                    model_dir='content/pretrained_embedos_bilstm_concat', params=params)
to_text = html2text.HTML2Text()
to_text.escape_snob = True
to_text.ignore_images = True
to_text.ignore_tables = True
to_text.ignore_links = True
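# The html2text converter is configured to emit plain text only (no images, tables or
# links). Note that `params` above covers the model_fn and batching, but the training
# path of my_input_fn would additionally need 'buffer_size' and 'num_of_epochs' keys
# before calling classifier.train().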
class wrapper(object):
    def __init__(self, url, to_text=to_text, pattern=pattern, analyzer=analyzer, sentence_len=sentence_len,
                 vocab=one.word2id, unk=1, pad=0, model=classifier):
        self.url = url
        self.to_text = to_text
        self.pattern = pattern
        self.analyzer = analyzer
        self.sentence_len = sentence_len
        self.vocab = vocab
        self.unk = unk
        self.pad = pad
        self.len = None
        self.classifier = model
        self.prediction = None

    def parse(self):
        if 'http' not in self.url and 'www' not in self.url:
            self.url = 'http://' + self.url
        try:
            response = requests.get(self.url, allow_redirects=True, timeout=30)
            if response.status_code == requests.codes.ok or response.status_code in [300, 301, 302, 303, 304,
                                                                                     305, 306, 307, 308]:
                raw = response.text
                text = re.sub('\n', ' ', self.to_text.handle(raw))
                text = text.lower()
                text = re.sub(self.pattern, ' ', text)
                text = [word for word in text.split() if word not in stopwords_set]
                text = [word.lemma for word in self.analyzer.analyze(text)]
                text = [self.vocab[word] if word in self.vocab else self.unk for word in text]
                length = len(text)
                text = pad_sequences([text], maxlen=self.sentence_len, padding='post', truncating='post',
                                     value=self.pad)[0]

                def generator_predict(corpus=text, lens=length):
                    yield corpus, lens

                def predict_input_fn():
                    data = tf.data.Dataset.from_generator(generator_predict, (tf.int32, tf.int32),
                                                          output_shapes=([self.sentence_len, ], []))
                    data = data.repeat(1)
                    data = data.batch(1)
                    iterator = data.make_one_shot_iterator()
                    text, length = iterator.get_next()
                    return {'sentences': text, 'lens': length}

                predictions = self.classifier.predict(input_fn=lambda: predict_input_fn())
                for elem in predictions:
                    self.prediction = elem['class_id']
                return id_to_category[self.prediction]
            else:
                raise Site_does_not_exist_exception('The site {s} does not even exist'.format(s=self.url))
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError, AssertionError,
                requests.exceptions.ContentDecodingError,
                exceptions.DecodeError, Site_does_not_exist_exception) as e:
            print('The exception was caught with message {m}'.format(m=e))
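# Note: wrapper.__init__ takes analyzer=analyzer as a default argument, but `analyzer`
# is never defined in this paste; it is assumed to be a morphological analyzer created
# elsewhere whose .analyze() returns tokens exposing a .lemma attribute (as used in
# parse()). Without it, defining the class raises a NameError. parse() returns the
# predicted category name on success and None after a caught request error.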
two = wrapper('https://vk.com/')
# print(two.parse())


def get_cat():
    return two.parse()
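# Minimal usage sketch (assumes the `analyzer` dependency and the data/embedding files
# under content/ are actually available in the running environment):
if __name__ == '__main__':
    site = wrapper('https://vk.com/')   # wrap a URL in the predictor
    print(site.parse())                 # fetch, preprocess and classify the page
    print(get_cat())                    # same prediction via the module-level helper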