Advertisement
Guest User

Untitled

a guest
Feb 9th, 2016
58
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 17.19 KB | None | 0 0
  1. import numpy as np
  2. import time
  3. import os
  4. import theano
  5. import theano.tensor as T
  6. import lasagne
  7.  
  8. from sklearn.preprocessing import LabelEncoder
  9. from sklearn.metrics import accuracy_score
  10. import pickle
  11.  
  12. from scikits.audiolab import Sndfile
  13. from scikits.talkbox import segment_axis
  14.  
  15.  
  16. def feature_data_and_phoneme_matching(save=True):
  17.     """""
  18.    This function performs reading the audio files with
  19.    their corresponding phonetic transcriptions, then, given
  20.    the size of the hamming window and overlap, it segments the
  21.    audio data and stacks them on a matrix, whilst figuring out
  22.    which phoneme corresponds to which row (window) of the matrix.
  23.  
  24.    After all the files have been passed, the matrix and the list of
  25.    phonemes are saved as .npy or returned if stated.
  26.    """""
  27.  
  28.     # PATH TO TRAINING CORPORA.
  29.     msak_path = '/home/gunslinger/Desktop/Msak/'
  30.     fsew_path = '/home/gunslinger/Desktop/Fsew/'
  31.  
  32.     # Defining a sliding window of size 256.
  33.     # With 96 overlap.
  34.     window_size = 256
  35.     non_overlap = 160;
  36.     overlap = window_size - non_overlap
  37.  
  38.     # Rate at which audio file is sampled at.
  39.     sample_rate = 16e3
  40.  
  41.     # Phonemes will be stored here, this list
  42.     # will be of the same row size as the size of
  43.     # the matrix of frames. These will later be used
  44.     # for the supervised learning for the uppermost
  45.     # layer of the autoencoder.
  46.  
  47.     phoneme_list = []
  48.  
  49.     # If it's the first time that we're adding values to the matrix
  50.     # it needs to be initialized.
  51.     flag = True
  52.  
  53.     # A simple counter for how many files we've passed.
  54.     at_file = 0
  55.  
  56.     # Ordering is important.
  57.     for path in msak_path, fsew_path:
  58.         files = sorted(os.listdir(path))
  59.         # Audio files
  60.         wav_files = [f for f in files if f.endswith('wav')]
  61.         # Phonetic transcripts
  62.         lab_files = [f for f in files if f.endswith('lab')]
  63.         # Nevermind which, wav_files' shape equals lab_files'
  64.         num_files = len(wav_files)
  65.  
  66.         for i in range(0, num_files):
  67.             at_file += 1
  68.             print (at_file)
  69.             # After reading the audio file, the data
  70.             # is segmented w/ 96 overlap in a matrix.
  71.             if path == msak_path:
  72.                 wav_file = Sndfile(msak_path + wav_files[i], 'r')
  73.                 # Read the phonetic transcript
  74.                 # in line. File is of the following type:
  75.                 # sec.ms sec.ms ph where ph is a phoneme.
  76.                 lab_file = open(msak_path + lab_files[i], 'r').read().split()
  77.             else:
  78.                 wav_file = Sndfile(fsew_path + wav_files[i], 'r')
  79.                 lab_file = open(fsew_path + lab_files[i], 'r').read().split()
  80.  
  81.             # Audio data of file is stored as a numpy array.
  82.             audio_data = wav_file.read_frames(wav_file.nframes)
  83.             # Segmentation of the array with the given parameters.
  84.             segmented_audio_data = segment_axis(audio_data, length=window_size, overlap=overlap)
  85.  
  86.             if flag:
  87.                 features = segmented_audio_data
  88.                 flag = False
  89.             else:
  90.                 # Add the segmented audio data array to the features matrix:
  91.                 features = np.vstack((features, segmented_audio_data))
  92.  
  93.             # nul_el holds the number of phonemes (along with timestamps)
  94.             # that exists in the file.
  95.             num_el = len(lab_file)
  96.  
  97.             # Reiterating will be slow. This variable will
  98.             # tell us where we left off.
  99.             start_at = 0
  100.  
  101.             for j in range(0, num_el, 3):
  102.                 # Going over the timesamps and phonemes acquiring
  103.                 # a whole line in one iteration.
  104.                 start_time = float(lab_file[j])
  105.                 end_time = float(lab_file[j + 1])
  106.                 phoneme = lab_file[j + 2]
  107.  
  108.                 # The frame interval between which a
  109.                 # phoneme is found.
  110.                 start_frame_phoneme = sample_rate * start_time
  111.                 end_frame_phoneme = sample_rate * end_time
  112.  
  113.                 # Going through frames w/ 96 overlap and mapping them to phonemes
  114.                 for LHS_interval_of_frames in range(start_at, wav_file.nframes, non_overlap):
  115.                     RHS_interval_of_frames = LHS_interval_of_frames + window_size
  116.                     if LHS_interval_of_frames >= start_frame_phoneme:
  117.                         phoneme_list.append(phoneme)
  118.                     if RHS_interval_of_frames >= end_frame_phoneme:
  119.                         # Start where we left off.
  120.                         start_at = LHS_interval_of_frames
  121.                         break
  122.  
  123.             phoneme_list.append('sil')
  124.  
  125.     phonemes = np.asarray(phoneme_list)
  126.     if save:
  127.         np.save('features.npy', features)
  128.         np.save('phonemes.npy', phonemes)
  129.     else:
  130.         return features, phonemes
  131.  
  132.  
  133. def to_categorical(y, nb_classes=None):
  134.     '''Convert class vector (integers from 0 to nb_classes)
  135.    to binary class matrix, for use with categorical_crossentropy.
  136.    '''
  137.     y = np.asarray(y, dtype='int32')
  138.     if not nb_classes:
  139.         nb_classes = np.max(y)+1
  140.     Y = np.zeros((len(y), nb_classes))
  141.     for i in range(len(y)):
  142.         Y[i, y[i]] = 1.
  143.     return Y
  144.  
  145.  
  146. def float32(k):
  147.     return np.cast['float32'](k)
  148.  
  149.  
  150. def batch_gen_ae(X, y, N):
  151.     while True:
  152.         idx = np.random.choice(len(y), N)
  153.         yield X[idx].astype('float32'), y[idx].astype('float32')
  154.  
  155.  
  156. def batch_gen_clf(X, y, N):
  157.     while True:
  158.         idx = np.random.choice(len(y), N)
  159.         yield X[idx].astype('float32'), y[idx].astype('int32')
  160.  
  161.  
  162. def layerwise_model(input_var=None, sizes=None, gaussian_noise=False):
  163.     # Depth of the network.
  164.     no_layers = sizes.shape[0]
  165.  
  166.     # Nonlinearities selection
  167.     linear = lasagne.nonlinearities.linear
  168.     sigmoid = lasagne.nonlinearities.sigmoid
  169.  
  170.     # Input layer:
  171.     network = lasagne.layers.InputLayer(shape=(None, sizes[0]),
  172.                                         input_var=input_var)
  173.     if gaussian_noise:
  174.         network = lasagne.layers.GaussianNoiseLayer(network, sigma=0.1)
  175.     # Hidden layers and dropout:
  176.     network = lasagne.layers.DenseLayer(
  177.             network, sizes[1], nonlinearity=sigmoid)
  178.     # Output layer:
  179.     network = lasagne.layers.DenseLayer(network, sizes[no_layers-1],
  180.                                         nonlinearity=linear)
  181.     return network
  182.  
  183.  
  184. def fine_tuning_model(input_var=None, sizes=None) :
  185.     # Number of layers
  186.     no_layers = sizes.shape[0]
  187.     no_hidden = no_layers - 2
  188.     middle_index = no_layers / 2
  189.     # Nonlinearities
  190.     linear = lasagne.nonlinearities.linear
  191.     sigmoid = lasagne.nonlinearities.sigmoid
  192.     # Input layer:
  193.     model = lasagne.layers.InputLayer(shape=(None, sizes[0]),
  194.                                              input_var=input_var, name='input')
  195.     # Hidden layers:
  196.     for i in range(no_hidden):
  197.         if i < middle_index:
  198.             model = lasagne.layers.DenseLayer(model, num_units=sizes[i+1], nonlinearity=sigmoid)
  199.         else:
  200.             model = lasagne.layers.DenseLayer(model, num_units=sizes[i+1], nonlinearity=linear)
  201.     # Output layer:
  202.     model = lasagne.layers.DenseLayer(model, num_units=sizes[no_layers-1], nonlinearity=linear)
  203.     return model
  204.  
  205.  
  206. def softmax_classifier(input_var=None, input_size=0, output_size=0):
  207.     model = lasagne.layers.InputLayer(input_var=input_var, shape=(None, input_size))
  208.     model = lasagne.layers.DenseLayer(model, num_units=246, nonlinearity=lasagne.nonlinearities.sigmoid)
  209.     model = lasagne.layers.DenseLayer(model, num_units=output_size,
  210.                                       nonlinearity=lasagne.nonlinearities.softmax)
  211.     return model
  212.  
  213.  
  214. def train_classifier(X, y, network=None, lr=0.1, batch_size=128, num_epochs=300):
  215.     split_at = X.shape[0]*.2
  216.     X_train = X[:-split_at]
  217.     X_test = X[:split_at]
  218.     y_train = y[:-split_at]
  219.     y_test = y[:split_at]
  220.  
  221.     input_var = T.dmatrix()
  222.     target_var = T.ivector()
  223.  
  224.     # Loss function
  225.     prediction = lasagne.layers.get_output(network, inputs=input_var)
  226.     loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
  227.     loss = loss.mean()
  228.  
  229.     # Regularizer
  230.  
  231.     # Update expression
  232.     params = lasagne.layers.get_all_params(network, trainable=True)
  233.     updates = lasagne.updates.nesterov_momentum(loss, params=params, learning_rate=lr,
  234.                                                 momentum=0.9)
  235.  
  236.     # Expression for loss
  237.     test_prediction = lasagne.layers.get_output(network, inputs=input_var, deterministic=True)
  238.     test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var)
  239.     test_loss = test_loss.mean()
  240.  
  241.     f_train = theano.function([input_var, target_var], loss, updates=updates)
  242.     f_val = theano.function([input_var, target_var], test_loss)
  243.  
  244.     # Batch size choice and the number of batches per epoch
  245.     BATCH_SIZE = batch_size
  246.     N_BATCHES = len(X_train) // BATCH_SIZE
  247.     N_VAL_BATCHES = len(X_test) // BATCH_SIZE
  248.  
  249.     # Minibatch generators
  250.     train_batches_gen = batch_gen_clf(X_train, y_train, BATCH_SIZE)
  251.     val_batches_gen = batch_gen_clf(X_test, y_test, BATCH_SIZE)
  252.  
  253.     for epoch in range(num_epochs):
  254.         # In each epoch, we do a full pass over the training data:
  255.         train_err = 0
  256.         start_time = time.time()
  257.         for _ in range(N_BATCHES):
  258.             inputs, targets = next(train_batches_gen)
  259.             train_err += f_train(inputs, targets)
  260.  
  261.         # And a full pass over the validation data:
  262.         val_err = 0
  263.         for _ in range(N_VAL_BATCHES):
  264.             inputs, targets = next(val_batches_gen)
  265.             err = f_val(inputs, targets)
  266.             val_err += err
  267.  
  268.         # Then we print the results for this epoch:
  269.         print("Epoch {} of {} took {:.3f}s".format(
  270.             epoch + 1, num_epochs, time.time() - start_time))
  271.         print("  training loss:\t\t{:.6f}".format(train_err / N_BATCHES))
  272.         print("  validation loss:\t\t{:.6f}".format(val_err / N_VAL_BATCHES))
  273.  
  274.  
  275. def train_autoenc(X, network=None, lr=0.03, batch_size=128, num_epochs=300):
  276.     split_at = X.shape[0] * 0.2
  277.     X_train = X[:-split_at]
  278.     X_test = X[:split_at]
  279.  
  280.     input_var = T.dmatrix()
  281.     target_var = T.dmatrix()
  282.  
  283.     # Loss expression
  284.     prediction = lasagne.layers.get_output(network, input_var)
  285.     loss = lasagne.objectives.squared_error(prediction, target_var)
  286.     loss = loss.mean()
  287.  
  288.     # Weight decay
  289.  
  290.     # Retrieve the parameters
  291.     params = lasagne.layers.get_all_params(network, trainable=True)
  292.  
  293.     # Compute the gradient of the loss function with respect to the parameters.
  294.     grad = T.grad(loss, params)
  295.     # Learning rate and momentum will be variable.
  296.     updates = lasagne.updates.nesterov_momentum(grad, params, learning_rate=lr, momentum=0.9)
  297.  
  298.     # Expression for testing, set deterministic to true in case dropouts are activated.
  299.     test_prediction = lasagne.layers.get_output(network, input_var, deterministic=True)
  300.     test_loss = lasagne.objectives.squared_error(test_prediction,
  301.                                                             target_var)
  302.     test_loss = test_loss.mean()
  303.  
  304.     # Define a training function
  305.     f_train = theano.function([input_var, target_var], loss, updates=updates)
  306.  
  307.     # A validation function, similar but it doesn't alter the parameters
  308.     f_val = theano.function([input_var, target_var], test_loss)
  309.  
  310.     # Batch size choice and the number of batches per epoch
  311.     BATCH_SIZE = batch_size
  312.     N_BATCHES = len(X_train) // BATCH_SIZE
  313.     N_VAL_BATCHES = len(X_test) // BATCH_SIZE
  314.  
  315.     # Minibatch generators
  316.     train_batches_gen = batch_gen_ae(X_train, X_train, BATCH_SIZE)
  317.     val_batches_gen = batch_gen_ae(X_test, X_test, BATCH_SIZE)
  318.  
  319.     for epoch in range(num_epochs):
  320.         # In each epoch, we do a full pass over the training data:
  321.         train_err = 0
  322.         start_time = time.time()
  323.         for _ in range(N_BATCHES):
  324.             inputs, targets = next(train_batches_gen)
  325.             train_err += f_train(inputs, targets)
  326.  
  327.         # And a full pass over the validation data:
  328.         val_err = 0
  329.         for _ in range(N_VAL_BATCHES):
  330.             inputs, targets = next(val_batches_gen)
  331.             err = f_val(inputs, targets)
  332.             val_err += err
  333.  
  334.         # Then we print the results for this epoch:
  335.         print("Epoch {} of {} took {:.3f}s".format(
  336.             epoch + 1, num_epochs, time.time() - start_time))
  337.         print("  training loss:\t\t{:.6f}".format(train_err / N_BATCHES))
  338.         print("  validation loss:\t\t{:.6f}".format(val_err / N_VAL_BATCHES))
  339.  
  340.  
  341. def layerwise_training_f(X=None, no_epochs=0, input_shapes=[], hidden_shapes=[],
  342.                        output_shapes=[]):
  343.     """"
  344.    Given input, hidden and output shapes,
  345.    it performs greedy layer-wise training,
  346.    :return: No return, only the parameters are saved.
  347.    """""
  348.     i = 1
  349.     for input_num, hidden_num, output_num in zip(input_shapes, hidden_shapes, output_shapes):
  350.         input_var = T.dmatrix()
  351.         print ("Building model for layer " + str(i))
  352.         sizes = np.array([input_num, hidden_num, output_num])
  353.         model = layerwise_model(input_var=input_var, sizes=sizes, gaussian_noise=True)
  354.         print ("Training")
  355.         train_autoenc(X=X, network=model, num_epochs=no_epochs)
  356.         print ("Saving parameters")
  357.         parameters = lasagne.layers.get_all_param_values(model)
  358.         np.save('LasagneLayerParams' + str(i), parameters)
  359.         encode_layer = lasagne.layers.get_all_layers(model)[2]
  360.         X = lasagne.layers.get_output(layer_or_layers=encode_layer, inputs=X).eval()
  361.         i += 1
  362.  
  363.  
  364. def disperse(phonemes, no_frames=5):
  365.     """""
  366.    :param phonemes: Target(phoneme) array.
  367.    :param overlap: The number of features that will overlap.
  368.    :param no_frames: The number of frames that will be trained at once.
  369.    :return: List of the dominant phonemes corresponding to multiple frames
  370.     at once.
  371.    """""
  372.     # The phonemes are categorized as if used for categorical cross-entropy.
  373.     lab_enc = LabelEncoder()
  374.     phonemes_categorized = to_categorical(lab_enc.fit_transform(phonemes))
  375.     phoneme_classes = lab_enc.classes_
  376.     no_features = phonemes.shape[0]
  377.     dominant_phonemes = []
  378.     for frame_ind in range(0, no_features, no_frames):
  379.         array = sum(phonemes_categorized[frame_ind:frame_ind + no_frames])
  380.         dominant_phonemes.append(phoneme_classes[np.argmax(array)])
  381.     return np.asarray(dominant_phonemes)
  382.  
  383.  
  384. def proliferate_params(file_names, model=None):
  385.     """""
  386.    Given file names, sets the parameters to the
  387.    unwinded model.
  388.    """""
  389.     no_parameters = file_names.__len__() * 4
  390.     list = [None for _ in range(no_parameters)]
  391.     i = 0
  392.     for filename in file_names:
  393.         parameters = np.load(filename)
  394.         list.__setitem__(i, parameters[0])
  395.         list.__setitem__(i+1, parameters[1])
  396.         list.__setitem__(no_parameters - i - 1, parameters[3])
  397.         list.__setitem__(no_parameters - i - 2, parameters[2])
  398.         i += 2
  399.     lasagne.layers.set_all_param_values(model, list)
  400.  
  401.  
  402. def main(generate_features=False, layerwise_training=False,
  403.          save_encoded_features=False):
  404.     if generate_features:
  405.         X, phonemes = feature_data_and_phoneme_matching(save=False)
  406.     else:
  407.         X, phonemes = np.load('features_std.npy'), np.load('phonemes.npy')
  408.  
  409.     if layerwise_training:
  410.         input_shapes  = [256, 100, 70]
  411.         hidden_shapes = [100, 70,  50]
  412.         output_shapes = [256, 100, 70]
  413.         layerwise_training_f(X=X, no_epochs=300, input_shapes=input_shapes,
  414.                              hidden_shapes=hidden_shapes, output_shapes=output_shapes)
  415.  
  416.     # Process of fine-tuning.
  417.     file_names = np.array(['LasagneLayerParams1.npy', 'LasagneLayerParams2.npy', 'LasagneLayerParams3.npy'])
  418.     sizes = np.array([256, 100, 70, 50, 70, 100, 256])
  419.     ft_model = fine_tuning_model(sizes=sizes)
  420.     proliferate_params(file_names=file_names, model=ft_model)
  421.     train_autoenc(X=X, network=ft_model)
  422.  
  423.     # Careful which layer is picked
  424.     enc_layer = lasagne.layers.get_all_layers(ft_model)[3]
  425.     encoded_features = lasagne.layers.get_output(layer_or_layers=enc_layer, inputs=X)
  426.     if save_encoded_features
  427.         np.save('encoded_features.npy', encoded_features)
  428.  
  429.     # Finding the dominant phonemes in a selected number
  430.     # of frames. The features are also reshaped so one row
  431.     # corresponds to the dominant phoneme.
  432.     no_frames = 5
  433.     dispersed_phonemes = disperse(phonemes=phonemes, no_frames=no_frames)
  434.     encoded_features = encoded_features.reshape((encoded_features.shape[0]/no_frames,
  435.                                              encoded_features.shape[1]*no_frames))
  436.     # Instantiate a model for the classifier
  437.     input_var = T.dmatrix()
  438.     softmax_clf = softmax_classifier(input_var=input_var, input_size=no_frames*50,
  439.                                      output_size=46)
  440.  
  441.     # Feed data
  442.     train_classifier(X=encoded_features, y=dispersed_phonemes, network=softmax_clf)
  443.    
  444.     predicted = softmax_clf.get_output_for(input=encoded_features)
  445.     print "Accuracy score:"
  446.     print accuracy_score(dispersed_phonemes, predicted)
  447.  
  448.  
  449. main(generate_features=False, layerwise_training=False,
  450.      save_encoded_features=True)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement