import os
import re
import csv
import sys
import zipfile
from collections import OrderedDict
from functools import reduce
from warnings import filterwarnings

import numpy as np
import pandas as pd
import tensorflow as tf
import requests
import html2text
import nltk
from nltk.corpus import stopwords
from pymystem3 import Mystem
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm_notebook
from urllib3 import exceptions

class Preproc(object):
    def __init__(self, corpus):
        self.corpus = corpus
        self.lens = []
        self.word2id = {}

    def preprocess(self, pattern):
        # Lowercase, strip characters matching `pattern`, lemmatize with Mystem, drop stopwords.
        mystem = Mystem()
        self.corpus = self.corpus.apply(lambda x: x.lower())
        self.corpus = self.corpus.apply(lambda x: re.sub(pattern, ' ', x))
        self.corpus = self.corpus.apply(lambda x: mystem.lemmatize(x))
        self.corpus = self.corpus.apply(lambda x: [word for word in x if word not in stopwords_set])
        return self

    def make_vocab(self, min_count=5, unk=1):
        # First pass: word2id temporarily holds raw frequency counts.
        for text in self.corpus:
            for word in text:
                if word in self.word2id:
                    self.word2id[word] += 1
                else:
                    self.word2id[word] = 1

        self.word2id = OrderedDict(sorted(self.word2id.items(), key=lambda x: x[1], reverse=True))

        # Second pass: keep words with count >= min_count and assign ids starting after the
        # pad (0) and unk tokens; since the dict is sorted by frequency, the kept ids stay contiguous.
        self.word2id = {word: index + 1 + unk for index, word in enumerate(self.word2id.keys()) if
                        self.word2id[word] >= min_count}
        return self

    def encode_sentences(self, sentence_len, unk=1, pad=0, vocab=None):
        if vocab is None:
            vocab = self.word2id
        assert len(vocab) != 0
        # Map words to ids (unknown words -> unk), record original lengths, then pad/truncate to sentence_len.
        self.corpus = self.corpus.apply(lambda x: [vocab[word] if word in vocab else unk for word in x])
        self.corpus.apply(lambda x: self.lens.append(len(x)))
        self.corpus = pad_sequences(self.corpus, maxlen=sentence_len, padding='post', truncating='post', value=pad)

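# Illustrative usage sketch (not from the original paste): shows the intended Preproc call chain on a
# tiny hand-made corpus. The sample texts and sentence length below are assumptions for demonstration
# only; the real script applies the same chain to the 'text' column of content/file.csv further down,
# and this helper is never called.
def _preproc_demo():
    toy_corpus = pd.Series(['Первый пример текста про спорт', 'Second example text about sport'])
    toy = Preproc(toy_corpus)
    # Requires `pattern` and `stopwords_set` from the main script to be defined before calling.
    toy.preprocess(pattern).make_vocab(min_count=1).encode_sentences(sentence_len=10)
    # toy.corpus is now a (2, 10) int array, toy.lens holds the pre-padding lengths,
    # and toy.word2id maps each kept lemma to an id >= 2 (0 is padding, 1 is unknown).
    return toy.corpus, toy.lens, toy.word2id
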
class get_pretrained_embedos(object):
    def __init__(self, path_to_embedos):
        assert os.path.exists(path_to_embedos)
        self.path = path_to_embedos
        self.embeddings = {}

    def fill_embeddings(self, vocab):
        assert len(self.embeddings) == 0

        def gen_line_from_file():
            # The first line of a fastText .vec file is a header, so skip it.
            with open(self.path) as file:
                file.readline()
                for line in file:
                    line = line.split()
                    yield line[0], list(map(np.float32, line[1:]))

        gen = gen_line_from_file()

        index = 0
        for word, emb in gen:
            if word in vocab:
                self.embeddings[word] = emb
                index += 1
        print('loaded {n} russian embeddings out of {m} vocabulary words'.format(n=index, m=len(vocab)))
        return self

    def add_embeddings(self, vocab, path_to_embedos):
        assert os.path.exists(path_to_embedos)

        def gen_line_from_file():
            # GloVe .txt files have no header line.
            with open(path_to_embedos) as file:
                for line in file:
                    line = line.split()
                    yield line[0], list(map(np.float32, line[1:]))

        gen = gen_line_from_file()

        index = 0
        for word, emb in gen:
            if word in vocab:
                self.embeddings[word] = emb
                index += 1
        print('loaded {n} english embeddings'.format(n=index))
        return self

    def get_embedos_matrix(self, vocab, embed_dim):
        # Rows 0 (padding), 1 (unknown) and any word without a pretrained vector stay randomly initialized.
        matrix = np.random.normal(size=(len(vocab) + 2, embed_dim))

        for word, index in vocab.items():
            if word in self.embeddings:
                matrix[index] = self.embeddings[word]

        return np.asarray(matrix, dtype=np.float32)

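# Illustrative sketch (not from the original paste): the loaders above assume whitespace-separated
# vector files where each line is a word followed by embed_dim floats (fastText .vec additionally has
# one header line). The toy vocabulary and the 300-dimensional setting below are assumptions for
# demonstration only; this helper is never called.
def _embedos_demo():
    toy_vocab = {'кот': 2, 'cat': 3}  # ids 0 and 1 are reserved for padding / unknown
    emb = get_pretrained_embedos('content/ft_native_300_ru_twitter_nltk_word_tokenize.vec')
    emb.fill_embeddings(vocab=toy_vocab)
    # Matrix rows line up with vocabulary ids, so matrix[2] would hold the vector for 'кот'.
    return emb.get_embedos_matrix(vocab=toy_vocab, embed_dim=300)
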
class Site_does_not_exist_exception(Exception):
    def __init__(self, message):
        super().__init__(message)
        self.message = message

#nltk.download('stopwords')
print(tf.__version__)
print('SSS1')
stopwords_set = set(stopwords.words('russian'))
stopwords_set.update(set(stopwords.words('english')))
filterwarnings('ignore')
#csv.field_size_limit(sys.maxsize)
train = pd.read_csv('content/file.csv')
# Shuffle the rows.
train = train.sample(frac=1)
train.info()
print('SSS2')
corpus, labels = train['text'], train['label']
# Map category names to integer ids and keep the reverse mapping for decoding predictions.
unique_cat = pd.unique(labels)
category_to_id = {category: index for category, index in zip(unique_cat, range(len(unique_cat)))}
id_to_category = {index: category for category, index in category_to_id.items()}
labels = labels.replace(category_to_id)
# Keep only Latin/Cyrillic letters and spaces.
pattern = re.compile(r'[^a-zA-Zа-яА-ЯёЁ ]')
sentence_len = 700
one = Preproc(corpus)
(one
 .preprocess(pattern)
 .make_vocab(min_count=20)
 .encode_sentences(sentence_len))
print('SSS3')
embedos = get_pretrained_embedos('content/ft_native_300_ru_twitter_nltk_word_tokenize.vec')
matrix = (embedos
          .fill_embeddings(vocab=one.word2id)
          .add_embeddings(vocab=one.word2id, path_to_embedos='content/glove.twitter.27B.100d.txt')
          .get_embedos_matrix(vocab=one.word2id, embed_dim=100))

def my_initializer(shape=None, dtype=tf.float32, partition_info=None):
    # Custom initializer that simply returns the pretrained embedding matrix built above.
    assert dtype is tf.float32
    return matrix

def set_new_session():
    sess = tf.get_default_session()
    if sess is not None:
        sess.close()
    tf.reset_default_graph()
    config = tf.ConfigProto()
    s = tf.InteractiveSession(config=config)
    return s

def generator():
    for text, length, label in zip(one.corpus, one.lens, labels):
        yield text, length, label

def generator_eval():
    # Evaluation uses only the first 100000 examples.
    for text, length, label in zip(one.corpus[:100000], one.lens[:100000], labels[:100000]):
        yield text, length, label

def my_input_fn(params, training=True):
    data = tf.data.Dataset.from_generator(generator, (tf.int32, tf.int32, tf.int32),
                                          output_shapes=([sentence_len, ], [], []))

    if training:
        data = data.shuffle(buffer_size=params['buffer_size'])
        data = data.repeat(params['num_of_epochs'])

    data = data.batch(params['batch_size'], drop_remainder=True)
    iterator = data.make_one_shot_iterator()
    text, length, label = iterator.get_next()

    return {'sentences': text, 'lens': length}, label

def my_input_fn_eval(params, training=False):
    data = tf.data.Dataset.from_generator(generator_eval, (tf.int32, tf.int32, tf.int32),
                                          output_shapes=([sentence_len, ], [], []))

    if training:
        data = data.shuffle(buffer_size=params['buffer_size'])
        data = data.repeat(params['num_of_epochs'])

    data = data.batch(params['batch_size'], drop_remainder=True)
    iterator = data.make_one_shot_iterator()
    text, length, label = iterator.get_next()

    return {'sentences': text, 'lens': length}, label

def my_model_fn(features, labels, mode, params):
    # Embedding lookup: (batch, sentence_len) -> (batch, sentence_len, embed_size),
    # initialized from the pretrained matrix via params['embed_init'].
    input_layer = tf.contrib.layers.embed_sequence(features['sentences'],
                                                   vocab_size=params['vocab_size'],
                                                   embed_dim=params['embed_size'],
                                                   initializer=params['embed_init'])

    # Two-layer LSTM stacks (64 -> 128 units) for the forward and backward directions.
    forward_cell, backward_cell = tf.nn.rnn_cell.MultiRNNCell(
        [tf.nn.rnn_cell.LSTMCell(64), tf.nn.rnn_cell.LSTMCell(128)]), \
                                  tf.nn.rnn_cell.MultiRNNCell(
                                      [tf.nn.rnn_cell.LSTMCell(64), tf.nn.rnn_cell.LSTMCell(128)])

    outputs, final_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=forward_cell, cell_bw=backward_cell,
                                                            inputs=input_layer,
                                                            sequence_length=features['lens'], dtype=tf.float32)
    # Concatenate forward and backward outputs over time: (batch, sentence_len, 256).
    outputs = tf.concat(outputs, axis=2)
    # Concatenate the top-layer final hidden states of both directions: (batch, 256).
    final_states = tf.concat((final_states[0][1].h, final_states[1][1].h), axis=1)

    # Max- and mean-pool over time, then concatenate with the final states: (batch, 768).
    max_pool = tf.reduce_max(outputs, axis=1)
    mean_pool = tf.reduce_mean(outputs, axis=1)
    concat_pooling = tf.concat((final_states, max_pool, mean_pool), axis=1)

    logits = tf.layers.dense(concat_pooling, units=params['num_of_classes'])

    predictions = {'class_id': tf.argmax(logits, axis=1)}

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode,
            predictions=predictions
        )

    loss = tf.losses.sparse_softmax_cross_entropy(labels, logits)

    accuracy = tf.metrics.accuracy(labels, predictions['class_id'])

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer(learning_rate=3e-04)
        # Clip each gradient to [-5, 5] before applying the update.
        gradients, variables = zip(*optimizer.compute_gradients(loss))
        gradients = [None if gradient is None else tf.clip_by_value(gradient, clip_value_min=-5, clip_value_max=5) for
                     gradient in gradients]
        train_op = optimizer.apply_gradients(zip(gradients, variables), global_step=tf.train.get_global_step())

        return tf.estimator.EstimatorSpec(
            mode,
            loss=loss,
            train_op=train_op
        )

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode,
            loss=loss,
            eval_metric_ops={'accuracy': accuracy}
        )

vocab_size = len(one.word2id) + 2
embed_size = 100
embed_init = my_initializer
num_of_classes = 55
batch_size = 100
s = set_new_session()
print('SSS4')
params = {'vocab_size': vocab_size, 'embed_size': embed_size, 'embed_init': embed_init,
          'num_of_classes': num_of_classes, 'batch_size': batch_size}
classifier = tf.estimator.Estimator(model_fn=my_model_fn,
                                    model_dir='content/pretrained_embedos_bilstm_concat', params=params)

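# Illustrative sketch (not from the original paste): the script itself never calls train()/evaluate(),
# presumably because content/pretrained_embedos_bilstm_concat already holds a trained checkpoint.
# If training from scratch were needed, it could look roughly like this; 'buffer_size' and
# 'num_of_epochs' below are assumed values, not taken from the original, and this helper is never called.
def _train_demo():
    train_params = dict(params, buffer_size=10000, num_of_epochs=5)
    classifier.train(input_fn=lambda: my_input_fn(train_params, training=True))
    return classifier.evaluate(input_fn=lambda: my_input_fn_eval(train_params, training=False))
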
to_text = html2text.HTML2Text()
to_text.escape_snob = True
to_text.ignore_images = True
to_text.ignore_tables = True
to_text.ignore_links = True

class wrapper(object):
    # Note: `analyzer` is referenced below but never defined anywhere in this paste; it must be
    # supplied before this class is defined and is expected to be a morphological analyzer whose
    # analyze() yields objects with a .lemma attribute.
    def __init__(self, url, to_text=to_text, pattern=pattern, analyzer=analyzer, sentence_len=sentence_len,
                 vocab=one.word2id, unk=1, pad=0, model=classifier):
        self.url = url
        self.to_text = to_text
        self.pattern = pattern
        self.analyzer = analyzer
        self.sentence_len = sentence_len
        self.vocab = vocab
        self.unk = unk
        self.pad = pad
        self.len = None
        self.classifier = model
        self.prediction = None

    def parse(self):
        if 'http' not in self.url and 'www' not in self.url:
            self.url = 'http://' + self.url
        try:
            response = requests.get(self.url, allow_redirects=True, timeout=30)
            if response.status_code == requests.codes.ok or response.status_code in [300, 301, 302, 303, 304, 305, 306,
                                                                                     307, 308]:
                # Strip HTML, normalize the text the same way the training corpus was preprocessed,
                # then encode it with the training vocabulary.
                raw = response.text
                text = re.sub('\n', ' ', self.to_text.handle(raw))
                text = text.lower()
                text = re.sub(self.pattern, ' ', text)
                text = [word for word in text.split() if word not in stopwords_set]
                text = [word.lemma for word in self.analyzer.analyze(text)]
                text = [self.vocab[word] if word in self.vocab else self.unk for word in text]
                length = len(text)
                text = pad_sequences([text], maxlen=self.sentence_len, padding='post', truncating='post',
                                     value=self.pad)[0]

                def generator_predict(corpus=text, lens=length):
                    yield corpus, lens

                def predict_input_fn():
                    data = tf.data.Dataset.from_generator(generator_predict, (tf.int32, tf.int32),
                                                          output_shapes=([self.sentence_len, ], []))

                    data = data.repeat(1)
                    data = data.batch(1)
                    iterator = data.make_one_shot_iterator()
                    text, length = iterator.get_next()
                    return {'sentences': text, 'lens': length}

                predictions = self.classifier.predict(input_fn=lambda: predict_input_fn())
                for elem in predictions:
                    self.prediction = elem['class_id']

                return id_to_category[self.prediction]
            else:
                raise Site_does_not_exist_exception('The site {s} does not even exist'.format(s=self.url))
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError, AssertionError,
                requests.exceptions.ContentDecodingError,
                exceptions.DecodeError, Site_does_not_exist_exception) as e:
            print('The exception was caught with message {m}'.format(m=e))

two = wrapper('https://vk.com/')
#print(two.parse())

def get_cat():
    return two.parse()
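
# Illustrative usage sketch (not from the original paste): classifying an arbitrary page.
# The URL below is an assumption for demonstration; parse() returns the predicted category name
# (via id_to_category) or None if one of the handled exceptions was caught.
def _classify_demo(url='https://example.com/'):
    site = wrapper(url)
    return site.parse()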