Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# coding: utf-8
"""
Trainer for the chatbot intent classifier - a neural net on top of BERT embeddings.
13.07.2019 first implementation
13.07.2019 added grid search for tuning the net's hyperparameters
20.07.2019 reworked for direct use of nlu.md
26.07.2019 f1_weighted is used as the cross-validation metric
"""
- from __future__ import print_function
- import numpy as np
- import argparse
- import platform
- import io
- import pandas as pd
- import csv
- import os
- import json
- from scipy.sparse import lil_matrix
- from sklearn.model_selection import train_test_split
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.model_selection import cross_val_score
- from sklearn.linear_model import LogisticRegression
- from sklearn.svm import LinearSVC
- from sklearn.svm import SVC
- from sklearn.linear_model import SGDClassifier
- from sklearn.svm import LinearSVC
- from sklearn.externals import joblib
- from sklearn.ensemble import GradientBoostingClassifier
- import sklearn.metrics
- import keras.callbacks
- from keras import backend as K
- from keras.callbacks import ModelCheckpoint, EarlyStopping
- from keras.layers import Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, AveragePooling1D
- from keras.layers import Input
- from keras.layers import Lambda
- from keras.layers import recurrent
- from keras.layers import Dropout
- from keras.layers.core import Dense
- from keras.layers.merge import concatenate, add, multiply
- from keras.layers.wrappers import Bidirectional
- from keras.models import Model
- from keras.models import model_from_json
- from keras.layers.normalization import BatchNormalization
- from keras.layers import Flatten
- import keras.regularizers
- from keras.wrappers.scikit_learn import KerasClassifier
- from bert_embedder2 import BERTEmbedder
- NFOLD = 8
def get_params_str(model_params):
    """Render a hyperparameter dict as a single 'key=value key=value ...' string."""
    pairs = []
    for name, value in model_params.items():
        pairs.append('{}={}'.format(name, value))
    return ' '.join(pairs)
def prepare_phrase(phrase):
    """Normalize an utterance before embedding.

    Pads punctuation marks with spaces so each becomes a separate token,
    strips a single trailing period, and collapses whitespace runs.

    :param phrase: raw utterance text
    :return: normalized phrase string
    """
    for delim in u'?,!«»"()':
        phrase = phrase.replace(delim, ' ' + delim + ' ')
    # Bug fix: guard against an empty phrase before peeking at the last char
    # (the original indexed phrase[-1] unconditionally and raised IndexError).
    if phrase and phrase[-1] == '.':
        phrase = phrase[:-1]
    # Collapse any whitespace runs introduced by the padding above and
    # strip the ends (single-pass replace('  ', ' ') missed triple spaces).
    phrase = u' '.join(phrase.split())
    return phrase
def load_data(dataset_path, embedder):
    """Load intent training samples from a RASA-style nlu.md file.

    File format: '## intent:NAME' headers open an intent section, lines
    starting with '-' below a header are sample phrases, other '#' lines
    are comments.

    :param dataset_path: path to the nlu.md-style dataset file
    :param embedder: callable mapping a list of phrases to a 2d feature matrix
    :return: tuple (X_data, y_data, label2index) with one-hot y_data rows
    :raises RuntimeError: on a '##' header without 'intent:', or a phrase
        encountered before any intent header
    """
    samples = set()  # de-duplicates (phrase, intent) pairs
    with io.open(dataset_path, 'r', encoding='utf-8') as rdr:
        current_intent = None
        for iline, line in enumerate(rdr):
            if line.startswith('#'):
                if line.startswith('##'):
                    if 'intent:' in line:
                        current_intent = line.split(':')[1].strip()
                    else:
                        raise RuntimeError('line #{}: "##" header without "intent:" marker'.format(iline))
                # plain '#' lines are comments - skip them
                continue
            line = line.strip()
            if line.startswith('-'):  # RASA sample lines start with '-'
                line = line[1:]
            if line:
                if current_intent is None:
                    # Fail loudly: the original exit(0) made a malformed
                    # dataset look like a successful run.
                    raise RuntimeError('line #{}: phrase found before any intent header'.format(iline))
                samples.add((prepare_phrase(line), current_intent))

    # Sort for run-to-run determinism: set iteration order is randomized,
    # which previously made label indices differ between runs.
    samples = sorted(samples)
    phrases = [phrase for phrase, _ in samples]
    labels = [intent for _, intent in samples]

    label2index = {label: i for i, label in enumerate(sorted(set(labels)))}
    y_data = np.zeros((len(phrases), len(label2index)))
    for i, label in enumerate(labels):
        y_data[i, label2index[label]] = 1

    X_data = embedder(phrases)
    return X_data, y_data, label2index
def scorer(estimator, X, y):
    """sklearn-compatible scoring callback: weighted F1 against one-hot targets.

    The estimator predicts class indices while y arrives one-hot encoded,
    so the targets are collapsed with argmax before comparing.
    """
    predictions = estimator.predict(X)
    true_classes = np.argmax(y, axis=-1)
    #return sklearn.metrics.accuracy_score(y_true=true_classes, y_pred=predictions)
    return sklearn.metrics.f1_score(y_true=true_classes, y_pred=predictions, average='weighted')
def create_model(x_dim, nb_labels, model_params):
    """Build and compile the intent classifier net.

    Up to two dense layers (each optionally followed by dropout) over the
    BERT sentence embedding, with a softmax output over intent labels.

    :param x_dim: dimensionality of the input embedding vector
    :param nb_labels: number of intent classes
    :param model_params: dict with keys 'optimizer', 'units1', 'units2',
        'activ1', 'dropout_rate'; a zero unit count skips that layer
    :return: compiled Keras model
    """
    optimizer = model_params['optimizer']
    activ1 = model_params['activ1']
    dropout_rate = model_params['dropout_rate']

    input_layer = Input(shape=(x_dim,), dtype='float32', name='input')
    net = input_layer
    # Stack the (at most two) hidden stages; zero units disables a stage.
    for units in (model_params['units1'], model_params['units2']):
        if units > 0:
            net = Dense(units=units, activation=activ1)(net)
            if dropout_rate > 0.0:
                net = Dropout(rate=dropout_rate)(net)
    net = Dense(units=nb_labels, activation='softmax')(net)

    model = Model(inputs=[input_layer], outputs=net)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return model
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--run_mode', type=str, default='gridsearch', choices='train query gridsearch'.split())
    parser.add_argument('--tmp', type=str, default='../../../tmp')
    parser.add_argument('--dataset', default='../../../data/intents.txt')
    args = parser.parse_args()

    tmp_dir = args.tmp
    run_mode = args.run_mode
    dataset_path = args.dataset

    # Artifact paths: weights, architecture JSON and the config that ties them together.
    weights_file = os.path.abspath(os.path.join(tmp_dir, 'intent_classifier_bert.weights'))
    arch_file = os.path.abspath(os.path.join(tmp_dir, 'intent_classifier_bert.arch'))
    config_file = os.path.abspath(os.path.join(tmp_dir, 'intent_classifier_bert.config'))

    max_seq_len = 40

    # The pretrained BERT model archive is assumed to be downloaded and unpacked.
    # Google multilingual:
    # bert_path = '/home/inkoziev/polygon/BERT_multilingual/model/multi_cased_L-12_H-768_A-12'
    # deeppavlov ruBERT:
    bert_path = '/mnt/7383b08e-ace3-49d3-8991-5b9aa07d2596/EmbeddingModels/BERT_multilingual/model/rubert_cased_L-12_H-768_A-12_v1'
    embedder = BERTEmbedder(model_path=bert_path, seq_len=max_seq_len)

    # Both gridsearch and train need the embedded dataset; query mode does not.
    if run_mode in ('gridsearch', 'train'):
        X_data, y_data, label2index = load_data(dataset_path, embedder)

    if run_mode == 'gridsearch':
        # Exhaustive grid search over training and architecture hyperparameters,
        # scored by NFOLD-fold cross-validated weighted F1 (see scorer()).
        best_params = None
        best_score = 0.0
        for epochs in [8, 10, 20]:
            for batch_size in [20, 50, 100]:  # 100, 50,
                for optimizer in ['nadam']:  # 'rmsprop', 'adam',
                    for units1 in [200, 390, 500]:
                        for units2 in [0]:
                            for activ1 in ['sigmoid']:
                                for dropout_rate in [0.0, 0.1]:
                                    # sk_params go to the Keras fit() wrapper; model_params
                                    # additionally carry the architecture knobs.
                                    sk_params = {'epochs': epochs, 'batch_size': batch_size, 'verbose': 0,
                                                 #, 'validation_split': 0.2
                                                 #'callbacks': [EarlyStopping(monitor='val_loss', patience=10, mode='auto')],
                                                 }
                                    model_params = sk_params.copy()
                                    model_params['optimizer'] = optimizer
                                    model_params['units1'] = units1
                                    model_params['units2'] = units2
                                    model_params['activ1'] = activ1
                                    model_params['dropout_rate'] = dropout_rate
                                    estimator = KerasClassifier(build_fn=lambda: create_model(X_data.shape[1],
                                                                                              len(label2index),
                                                                                              model_params), **sk_params)
                                    cv_res = cross_val_score(estimator, X_data, y_data,
                                                             scoring=scorer, cv=NFOLD, n_jobs=1,
                                                             verbose=1)
                                    cv_score = np.mean(cv_res)
                                    print('{} ==> cv score={}'.format(get_params_str(model_params), cv_score))
                                    if cv_score > best_score:
                                        print('!!! NEW BEST !!! score={} for {}'.format(cv_score, get_params_str(model_params)))
                                        best_score = cv_score
                                        best_params = model_params
                                    else:
                                        print('No improvement over current best_score={}'.format(best_score))
        print('best_score={} params={}'.format(best_score, get_params_str(best_params)))
    elif run_mode == 'train':
        # Train the final model with the hyperparameters found by gridsearch:
        # epochs=50 batch_size=20 verbose=0 optimizer=nadam units1=390 units2=0 activ1=sigmoid dropout_rate=0.1
        model_params = {}
        model_params['optimizer'] = 'nadam'
        model_params['units1'] = 390
        model_params['units2'] = 0
        model_params['activ1'] = 'sigmoid'
        model_params['dropout_rate'] = 0.1
        epochs = 50
        batch_size = 20
        model = create_model(X_data.shape[1], len(label2index), model_params)
        with open(arch_file, 'w') as f:
            f.write(model.to_json())
        model.fit(X_data, y_data, epochs=epochs, batch_size=batch_size, verbose=2)
        model.save_weights(weights_file)
        # Persist everything query mode needs to rebuild the pipeline.
        config = {'max_seq_len': max_seq_len,
                  'bert_path': bert_path,
                  'label2index': label2index,
                  'weights': weights_file,
                  'arch': arch_file}
        with open(config_file, 'w') as f:
            json.dump(config, f, indent=4)
    elif run_mode == 'query':
        # Interactive console loop: embed a typed phrase and print the predicted intent.
        with open(config_file, 'r') as f:
            model_config = json.load(f)
        max_seq_len = model_config['max_seq_len']
        label2index = model_config['label2index']
        index2label = dict((i, l) for (l, i) in label2index.items())
        with open(arch_file, 'r') as f:
            model = model_from_json(f.read())
        model.load_weights(weights_file)
        while True:
            phrase = input(':> ').strip()
            phrase = prepare_phrase(phrase)
            X_data = embedder([phrase])
            y_pred = model.predict(X_data, verbose=0)
            label = index2label[np.argmax(y_pred[0])]
            print(u'label={}'.format(label))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement