import re
import os
import nltk
import pymorphy2
import numpy as np
import tensorflow as tf
from collections import Counter


DESCRIPTION = [
    {
        'input_type': 'bag_of_char_ngrams', 'n_min': 3, 'n_max': 3, 'n_tokens_per_language': 4000,
        'zero_digit_flag': True, 'normalize_word_flag': False,
        'authorized_char_re': r'[^а-яёЁa-z0\-$?.]', 'lower_case_flag': True
    }
]
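
# How the keys above are consumed below: n_min/n_max bound the character
# n-gram sizes, n_tokens_per_language is the width of the dense input vector,
# zero_digit_flag collapses every digit to '0', normalize_word_flag lemmatizes
# words with pymorphy2, lower_case_flag lower-cases each word, and
# authorized_char_re strips every character matching the pattern (here,
# anything outside the whitelisted set).
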
BATCH_SIZE = 1000
LANGUAGE_PATH = '/tmp/export/language'
MODEL_FILE_NAME = '/tmp/export/tensorflow/model'


def list_directory(path):
    return sorted([os.path.join(path, file_name) for file_name in os.listdir(path)])


class DenseSemanticLayers:
    @staticmethod
    def run(input_tensor):
        # tf.layers.dropout defaults to training=False, so both dropout layers
        # act as identity at inference; the dense layer projects the bag of
        # n-grams to 100 dimensions and the result is L2-normalized.
        dropout00 = tf.layers.dropout(input_tensor[0], 0.2)
        dense0 = tf.layers.dense(dropout00, 100)
        dropout01 = tf.layers.dropout(dense0, 0.2)
        return tf.nn.l2_normalize(dropout01, 1)


class SentencesToVectors:
    def __init__(
        self, description=DESCRIPTION, batch_size=BATCH_SIZE, language_path=LANGUAGE_PATH,
        model_file_name=MODEL_FILE_NAME
    ):
        self.components = {'semantic_layers': DenseSemanticLayers()}
        self.session = tf.InteractiveSession()
        self.components['string_to_numpy'] = StringToNumpy(
            description=description, language_path=language_path, batch_size=batch_size
        )
        self.components['numpy_filter'] = NumpyFilter()
        self.components['numpy_to_placeholder'] = _NumpyToPlaceholder(description=description)
        self.placeholders = []
        self.values = []

        for entry in description:
            self.placeholders.append(tf.placeholder(
                dtype=tf.float32, shape=(batch_size, entry['n_tokens_per_language'])
            ))
            self.values.append(np.empty(
                shape=(batch_size, entry['n_tokens_per_language']), dtype=np.float32
            ))

        with tf.variable_scope('question_semantics', reuse=tf.AUTO_REUSE):
            self.vectors = self.components['semantic_layers'].run(self.placeholders)

        tf.train.Saver(tf.global_variables()).restore(self.session, model_file_name)

    def run(self, sentences):
        ids = []

        for sentence_index, sentence in enumerate(sentences):
            arrays = self.components['string_to_numpy'].run(sentence)

            # Skip sentences whose n-grams are all unknown to the vocabulary.
            if not self.components['numpy_filter'].run(arrays):
                continue

            for i, value in enumerate(self.components['numpy_to_placeholder'].run(arrays)):
                self.values[i][len(ids)] = value

            ids.append(sentence_index)

        # Feed each placeholder its full (batch_size, n_tokens) array; only
        # the first len(ids) rows were populated above, so only those rows of
        # the output embeddings are returned.
        feed_dict = dict(zip(self.placeholders, self.values))
        vectors = self.session.run(self.vectors, feed_dict=feed_dict)
        return ids, vectors[:len(ids)]


class StringToNumpy:
    def __init__(self, description, language_path, batch_size):
        """
        :param description: [{'input_type': string, **params}, ...]
        """
        self.components = {}
        self.description = description

        for entry_index, entry in enumerate(self.description):
            self.components[entry_index] = _BagOfCharNgramsToNumpy(
                description=entry, batch_size=batch_size,
                path=os.path.join(language_path, str(entry_index))
            )

    def run(self, string):
        arrays = []

        for i in range(len(self.description)):
            arrays.append(self.components[i].run(string))

        return arrays


class _BagOfCharNgramsToNumpy:
    def __init__(self, description, path, batch_size):
        """
        :param description: {'n_min': int, 'n_max': int, **string_processor_params}
        """
        self.components = {}
        self.description = description
        self.tokens = TokenRecordsToDict(path=path, batch_size=batch_size).run()
        self.components['string_to_tokens'] = StringToCharNgrams(
            description['n_min'], description['n_max']
        )
        self.components['string_processor'] = StringProcessor(
            zero_digit_flag=description['zero_digit_flag'],
            normalize_word_flag=description['normalize_word_flag'],
            authorized_char_re=description['authorized_char_re'],
            lower_case_flag=description['lower_case_flag']
        )

    def run(self, string):
        tokens = self.components['string_to_tokens'].run(
            self.components['string_processor'].run(string)
        )
        counter = Counter()

        for word_tokens in tokens:
            for token in word_tokens:
                index = self.tokens.get(token, None)

                if index is not None:
                    counter[index] += 1

        if len(counter) == 0:
            array = np.zeros(shape=(0, 2), dtype=np.int64)
        else:
            array = np.asarray(list(counter.items()))

        return array[:, 0], array[:, 1]
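
# The returned pair is a sparse bag: e.g. run('кот кот') yields an index array
# and a count array such as (array([17, 42, 318, 901, 2055]), array([2, 2, 2,
# 2, 2])) for the five character 3-grams of 'кот', each seen twice. The index
# values here are made up; real ones depend on the exported vocabulary.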


class WordToCharNgrams:
    def __init__(self, n_min, n_max):
        self.n_min = n_min
        self.n_max = n_max
        self.pad_symbol = '#'

    def run(self, word):
        return [''.join(ngram) for ngram in nltk.everygrams(
            word, self.n_min, self.n_max, pad_left=True, pad_right=True,
            left_pad_symbol=self.pad_symbol, right_pad_symbol=self.pad_symbol
        )]
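
# For example, WordToCharNgrams(3, 3).run('кот') returns the padded character
# 3-grams ['##к', '#ко', 'кот', 'от#', 'т##'].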


class StringToWordNgrams:
    def __init__(self, n_min, n_max):
        self.n_min = n_min
        self.n_max = n_max

    def run(self, string):
        words = nltk.word_tokenize(string, language='russian')
        return [' '.join(ngram) for ngram in nltk.everygrams(words, self.n_min, self.n_max)]


class StringToCharNgrams:
    def __init__(self, n_min, n_max):
        self.n_min = n_min
        self.n_max = n_max
        self.components = dict()
        self.components['String->WordNgrams'] = StringToWordNgrams(1, 1)
        self.components['Word->CharNgrams'] = WordToCharNgrams(n_min, n_max)

    def run(self, string):
        return [
            self.components['Word->CharNgrams'].run(word)
            for word in self.components['String->WordNgrams'].run(string)
        ]


class StringProcessor:
    def __init__(self, zero_digit_flag, normalize_word_flag, authorized_char_re, lower_case_flag):
        self.zero_digit_flag = zero_digit_flag
        self.normalize_word_flag = normalize_word_flag
        self.lower_case_flag = lower_case_flag
        self.components = dict()
        self.components['string_to_word_ngrams'] = StringToWordNgrams(1, 1)
        self.authorized_char_compiled_re = re.compile(authorized_char_re)

        if self.zero_digit_flag:
            self.zero_digit_compiled_re = re.compile(r'[0-9]')

        if self.normalize_word_flag:
            self.analyzer = pymorphy2.MorphAnalyzer()

    def run(self, string):
        words = []

        for word in self.components['string_to_word_ngrams'].run(string):
            if self.lower_case_flag:
                word = word.lower()

            if self.zero_digit_flag:
                word = self.zero_digit_compiled_re.sub('0', word)

            if self.normalize_word_flag:
                word = self.analyzer.normal_forms(word)[0]

            word = self.authorized_char_compiled_re.sub('', word)

            if len(word) > 0:
                words.append(word)

        return ' '.join(words)
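
# With the DESCRIPTION settings above, the processor lower-cases, collapses
# digits to '0' and strips unauthorized characters, so 'Привет, Мир 2018!'
# comes out as 'привет мир 0000'.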


class NumpyFilter:
    @staticmethod
    def run(arrays):
        value = 0

        for array_list in arrays:
            for array in array_list:
                value += np.sum(array)

        return value != 0


class _NumpyToPlaceholder:
    def __init__(self, description):
        """
        :param description: [{'input_type': string, **params}, ...]
        """
        self.components = dict()
        self.description = description

        for entry_index, entry in enumerate(self.description):
            # Dispatch on entry['input_type'] is hard-wired to the single
            # supported input type.
            component = _BagOfCharNgramsNumpyToPlaceholder
            self.components[entry_index] = component(description=entry)

    def run(self, arrays):
        values = []

        for i in range(len(self.description)):
            values.append(self.components[i].run(arrays[i]))

        return values


class _BagOfCharNgramsNumpyToPlaceholder:
    def __init__(self, description):
        self.description = description

    def run(self, arrays):
        # Scatter the sparse (indices, counts) pair into a dense vector of
        # width n_tokens_per_language.
        value = np.zeros((self.description['n_tokens_per_language'],), dtype=np.float32)
        value[arrays[0]] = arrays[1]
        return value


class TokenRecordsToDict:
    def __init__(self, path, batch_size):
        self.components = dict()
        self.path = path
        self.batch_size = batch_size
        self.components['record_to_values'] = TfRecordToDictValues()

    def run(self):
        tokens = {}

        for records in TfRecordIterator(batch_size=self.batch_size, path=self.path).run():
            for record in records:
                token, index = self.components['record_to_values'].run(
                    description=(('token', 'bytes'), ('index', 'int')), record=record
                )
                tokens[token[0].decode()] = index[0]

        return tokens
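

# A minimal sketch (not part of the original module) of the record format
# TokenRecordsToDict appears to expect: tf.train.Example protos with a bytes
# 'token' feature and an int64 'index' feature. The helper name and default
# file path below are illustrative assumptions.
def write_token_records(tokens, file_name='/tmp/export/language/0/part-0'):
    with tf.python_io.TFRecordWriter(file_name) as writer:
        for index, token in enumerate(tokens):
            example = tf.train.Example(features=tf.train.Features(feature={
                'token': tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[token.encode()])),
                'index': tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[index])),
            }))
            writer.write(example.SerializeToString())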


class TfRecordIterator:
    def __init__(self, path, batch_size):
        self.path = path
        self.batch_size = batch_size

    def run(self):
        records = []

        for file_name in list_directory(self.path):
            for record in tf.python_io.tf_record_iterator(file_name):
                records.append(record)

                if len(records) == self.batch_size:
                    yield records
                    records = []

        if len(records) > 0:
            yield records


class TfRecordToDictValues:
    @staticmethod
    def run(description, record):
        values = []
        example = tf.train.Example()
        example.ParseFromString(record)

        for entry in description:
            feature = example.features.feature[entry[0]]
            dtype = entry[1]

            if dtype == 'int':
                value_list = feature.int64_list
            elif dtype == 'float':
                value_list = feature.float_list
            else:  # dtype == 'bytes'
                value_list = feature.bytes_list

            values.append(value_list.value)

        return values
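

# Minimal usage sketch, assuming a trained checkpoint at MODEL_FILE_NAME and
# token vocabularies under LANGUAGE_PATH already exist; the sentences are
# made-up examples.
if __name__ == '__main__':
    sentences_to_vectors = SentencesToVectors()
    ids, vectors = sentences_to_vectors.run(['привет мир', 'как дела?'])
    # ids are the indices of sentences that produced at least one known
    # n-gram; vectors[k] is the 100-d embedding of sentences[ids[k]].
    print(ids, vectors.shape)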