# coding: utf-8
"""
Intent classifier trainer for a chatbot: a neural network on top of BERT.
13.07.2019 first implementation
13.07.2019 added grid search for tuning the network hyperparameters
20.07.2019 reworked to consume nlu.md directly
26.07.2019 switched the cross-validation metric to f1_weighted
"""

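# Example invocations (a sketch: the script file name is an assumption, and the
# default data/tmp paths come from the argparse defaults below):
#
#   python train_intent_classifier.py --run_mode gridsearch --dataset ../../../data/intents.txt
#   python train_intent_classifier.py --run_mode train --tmp ../../../tmp
#   python train_intent_classifier.py --run_mode query
#
# "gridsearch" tunes the network hyperparameters by cross-validation, "train"
# fits the final model and saves its weights and config, and "query" loads the
# trained model for interactive testing.
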
from __future__ import print_function

import argparse
import io
import json
import os

import numpy as np
import pandas as pd

import sklearn.metrics
from sklearn.model_selection import cross_val_score

from keras.callbacks import EarlyStopping
from keras.layers import Input
from keras.layers import Dropout
from keras.layers.core import Dense
from keras.models import Model
from keras.models import model_from_json
from keras.wrappers.scikit_learn import KerasClassifier

from bert_embedder2 import BERTEmbedder


NFOLD = 8


def get_params_str(model_params):
    return ' '.join('{}={}'.format(k, v) for (k, v) in model_params.items())


def prepare_phrase(phrase):
    # Set punctuation marks off with spaces so they become separate tokens.
    for delim in u'?,!«»"()':
        phrase = phrase.replace(delim, ' ' + delim + ' ')

    if phrase.endswith('.'):
        phrase = phrase[:-1]
    phrase = phrase.replace('  ', ' ').strip()
    return phrase


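# load_data() below expects a RASA-style nlu.md markdown file; a minimal
# illustrative fragment (the intent names and phrases are hypothetical):
#
#   ## intent:greeting
#   - привет
#   - добрый день
#
#   ## intent:goodbye
#   - пока
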
def load_data(dataset_path, embedder):
    samples = set()

    with io.open(dataset_path, 'r', encoding='utf-8') as rdr:
        current_intent = None
        for iline, line in enumerate(rdr):
            if line.startswith('#'):
                if line.startswith('##'):
                    if 'intent:' in line:
                        current_intent = line.split(':')[1].strip()
                    else:
                        raise RuntimeError('Unexpected section header in line #{}'.format(iline))
                else:
                    # skip comment lines
                    continue
            else:
                line = line.strip()
                if line.startswith('-'):  # in RASA files sample lines start with "-"
                    line = line[1:]
                if line:
                    if current_intent:
                        phrase = prepare_phrase(line)
                        samples.add((phrase, current_intent))
                    else:
                        print('line #{}: Current intent is "None"!'.format(iline))
                        exit(1)

    samples = list(samples)
    df = pd.DataFrame(samples, columns=['phrase', 'intent'])

    labels = df['intent'].values
    phrases = df['phrase'].values

    # One-hot encode the intent labels.
    label2index = dict((label, i) for (i, label) in enumerate(set(labels)))
    y_data = np.zeros((len(phrases), len(label2index)))
    for i, label in enumerate(labels):
        y_data[i, label2index[label]] = 1

    # Convert the phrases to fixed-size BERT sentence embeddings.
    X_data = embedder(phrases)

    return X_data, y_data, label2index


def scorer(estimator, X, y):
    # f1_weighted averages the per-class F1 scores weighted by class support,
    # which accounts for class imbalance better than plain accuracy.
    y_pred = estimator.predict(X)
    #return sklearn.metrics.accuracy_score(y_true=np.argmax(y, axis=-1), y_pred=y_pred)
    return sklearn.metrics.f1_score(y_true=np.argmax(y, axis=-1), y_pred=y_pred, average='weighted')


def create_model(x_dim, nb_labels, model_params):
    # A simple MLP classifier head on top of the precomputed BERT embeddings:
    # up to two dense layers with optional dropout, then a softmax output.
    inp = Input(shape=(x_dim,), dtype='float32', name='input')
    net = inp

    optimizer = model_params['optimizer']
    units1 = model_params['units1']
    units2 = model_params['units2']
    activ1 = model_params['activ1']
    dropout_rate = model_params['dropout_rate']

    if units1 > 0:
        net = Dense(units=units1, activation=activ1)(net)
        if dropout_rate > 0.0:
            net = Dropout(rate=dropout_rate)(net)

    if units2 > 0:
        net = Dense(units=units2, activation=activ1)(net)
        if dropout_rate > 0.0:
            net = Dropout(rate=dropout_rate)(net)

    net = Dense(units=nb_labels, activation='softmax')(net)

    model = Model(inputs=[inp], outputs=net)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return model


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--run_mode', type=str, default='gridsearch', choices='train query gridsearch'.split())
    parser.add_argument('--tmp', type=str, default='../../../tmp')
    parser.add_argument('--dataset', default='../../../data/intents.txt')

    args = parser.parse_args()
    tmp_dir = args.tmp
    run_mode = args.run_mode
    dataset_path = args.dataset

    weights_file = os.path.abspath(os.path.join(tmp_dir, 'intent_classifier_bert.weights'))
    arch_file = os.path.abspath(os.path.join(tmp_dir, 'intent_classifier_bert.arch'))
    config_file = os.path.abspath(os.path.join(tmp_dir, 'intent_classifier_bert.config'))

    # Maximum phrase length (in tokens) fed to the BERT embedder.
    max_seq_len = 40

    # It is assumed that the pretrained BERT model archive has been downloaded and unpacked.

    # Google's multilingual model:
    # bert_path = '/home/inkoziev/polygon/BERT_multilingual/model/multi_cased_L-12_H-768_A-12'

    # DeepPavlov ruBERT:
    bert_path = '/mnt/7383b08e-ace3-49d3-8991-5b9aa07d2596/EmbeddingModels/BERT_multilingual/model/rubert_cased_L-12_H-768_A-12_v1'

    embedder = BERTEmbedder(model_path=bert_path, seq_len=max_seq_len)

    if run_mode in ('gridsearch', 'train'):
        X_data, y_data, label2index = load_data(dataset_path, embedder)

    if run_mode == 'gridsearch':
        best_params = None
        best_score = 0.0

        # Exhaustive search over the hyperparameter grid, scoring each
        # combination with k-fold cross-validation.
        for epochs in [8, 10, 20]:
            for batch_size in [20, 50, 100]:  # 100, 50,
                for optimizer in ['nadam']:  # 'rmsprop', 'adam',
                    for units1 in [200, 390, 500]:
                        for units2 in [0]:
                            for activ1 in ['sigmoid']:
                                for dropout_rate in [0.0, 0.1]:
                                    sk_params = {'epochs': epochs,
                                                 'batch_size': batch_size,
                                                 'verbose': 0,
                                                 #'validation_split': 0.2,
                                                 #'callbacks': [EarlyStopping(monitor='val_loss', patience=10, mode='auto')],
                                                 }
                                    model_params = sk_params.copy()
                                    model_params['optimizer'] = optimizer
                                    model_params['units1'] = units1
                                    model_params['units2'] = units2
                                    model_params['activ1'] = activ1
                                    model_params['dropout_rate'] = dropout_rate

                                    estimator = KerasClassifier(build_fn=lambda: create_model(X_data.shape[1],
                                                                                              len(label2index),
                                                                                              model_params),
                                                                **sk_params)
                                    cv_res = cross_val_score(estimator, X_data, y_data,
                                                             scoring=scorer, cv=NFOLD, n_jobs=1,
                                                             verbose=1)
                                    cv_score = np.mean(cv_res)
                                    print('{} ==> cv score={}'.format(get_params_str(model_params), cv_score))
                                    if cv_score > best_score:
                                        print('!!! NEW BEST !!! score={} for {}'.format(cv_score, get_params_str(model_params)))
                                        best_score = cv_score
                                        best_params = model_params
                                    else:
                                        print('No improvement over current best_score={}'.format(best_score))

        print('best_score={} params={}'.format(best_score, get_params_str(best_params)))
    elif run_mode == 'train':
        # Hyperparameters picked from a grid search run:
        # epochs=50 batch_size=20 verbose=0 optimizer=nadam units1=390 units2=0 activ1=sigmoid dropout_rate=0.1
        model_params = {}
        model_params['optimizer'] = 'nadam'
        model_params['units1'] = 390
        model_params['units2'] = 0
        model_params['activ1'] = 'sigmoid'
        model_params['dropout_rate'] = 0.1
        epochs = 50
        batch_size = 20

        model = create_model(X_data.shape[1], len(label2index), model_params)
        with open(arch_file, 'w') as f:
            f.write(model.to_json())

        model.fit(X_data, y_data, epochs=epochs, batch_size=batch_size, verbose=2)
        model.save_weights(weights_file)

        # Save everything the "query" mode needs to restore the classifier.
        config = {'max_seq_len': max_seq_len,
                  'bert_path': bert_path,
                  'label2index': label2index,
                  'weights': weights_file,
                  'arch': arch_file}
        with open(config_file, 'w') as f:
            json.dump(config, f, indent=4)
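
        # For reference, a sketch of what the saved config file looks like
        # (illustrative values, not actual output):
        #
        #   {
        #       "max_seq_len": 40,
        #       "bert_path": ".../rubert_cased_L-12_H-768_A-12_v1",
        #       "label2index": {"greeting": 0, "goodbye": 1},
        #       "weights": ".../intent_classifier_bert.weights",
        #       "arch": ".../intent_classifier_bert.arch"
        #   }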

    elif run_mode == 'query':
        # Interactive mode: restore the trained model and classify phrases
        # typed at the console.
        with open(config_file, 'r') as f:
            model_config = json.load(f)
            max_seq_len = model_config['max_seq_len']
            label2index = model_config['label2index']

        index2label = dict((i, l) for (l, i) in label2index.items())

        with open(arch_file, 'r') as f:
            model = model_from_json(f.read())

        model.load_weights(weights_file)

        while True:
            phrase = input(':> ').strip()
            phrase = prepare_phrase(phrase)
            X_data = embedder([phrase])
            y_pred = model.predict(X_data, verbose=0)
            label = index2label[np.argmax(y_pred[0])]
            print(u'label={}'.format(label))