#!/usr/bin/env python
"""Train a neural network to predict target features from spectrograms."""
import sys

# The script needs three arguments (argv[1] through argv[3]), so argv must have at least
# four entries including the program name.
if len(sys.argv) < 4:
    print("usage: python", sys.argv[0], "<spectrograms-npy-folder>", "<targets-npy-file>", "<serialized-model-folder>")
    sys.exit(1)
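
# The heavy imports are deferred until after the usage check, presumably so that a bad
# invocation fails fast instead of waiting for torch to load.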
import torch
import numpy
import os
import random
import datetime
import model_class
import scipy.stats

if torch.cuda.is_available():
    print("CUDA is available. Using GPU.")
    device = torch.device('cuda')
else:
    print("CUDA is not available. Using CPU.")
    device = torch.device('cpu')
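# Tensors only interact with tensors on the same device, so the model is moved to the GPU
# below and every batch is moved with .to(device) before its forward pass.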
- print("Reading the spectrogram files")
- spectrograms = numpy.empty((5000, 500,149))
- n_spectrograms = 0
- for root, _, files in os.walk(sys.argv[1]):
- for f in files:
- spectrogram = numpy.load(os.path.join(root, f))
- spectrograms[n_spectrograms] = spectrogram
- n_spectrograms = n_spectrograms + 1
- spectrograms = torch.from_numpy(spectrograms).float()
- spectrograms = spectrograms[:, None, :, :] # PyTorch requires an (empty) channel dimension. How the hell does "None-style syntax" work? O_o
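# Indexing with None works like numpy.newaxis: it inserts a new axis of length 1 at that
# position, turning (5000, 500, 149) into (5000, 1, 500, 149). PyTorch's 2D layers expect
# (batch, channels, height, width) input, hence the channel axis; spectrograms.unsqueeze(1)
# would have the same effect.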
- print("Reading the targets file")
- targets = numpy.load(sys.argv[2])
- targets = torch.from_numpy(targets).float()
- targets = targets * 0.1
- n_features = targets.size()[1]
- print("Defining the network")
- net = model_class.Net(n_features).to(device)
- optimizer = torch.optim.Adam(net.parameters(), lr=0.00005)
- criterion = torch.nn.MSELoss()
training_fraction = 0.08  # fraction of the samples used for training; the rest are held out for testing
n_epochs = 50
batch_size = 8
print("The network has", len(list(net.parameters())), "parameter tensors")
- print("Splitting data into %d/%d training/testing data" % (n_spectrograms*training_to_test_ratio, n_spectrograms*(1-training_to_test_ratio)))
- training_set_indices = random.sample(range(0, n_spectrograms), int(n_spectrograms*training_to_test_ratio))
- training_spectrograms = spectrograms[training_set_indices]
- training_targets = targets[training_set_indices]
- training_set = torch.utils.data.TensorDataset(training_spectrograms, training_targets)
- testing_set_indices = [x for x in range(0, n_spectrograms) if x not in training_set_indices]
- testing_spectrograms = spectrograms[testing_set_indices]
- testing_targets = targets[testing_set_indices]
- testing_set = torch.utils.data.TensorDataset(testing_spectrograms, testing_targets)
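# The RNG is not seeded, so the split differs between runs; the sampled indices are saved
# next to the model at the end of the script, which makes any particular split recoverable.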
- print("Training model")
- training_loss = numpy.zeros(n_epochs)
- testing_loss = numpy.zeros(n_epochs)
- correlation = numpy.zeros((n_epochs, n_features))
for epoch in range(0, n_epochs):
    # Train
    net.train()  # put layers such as dropout/batchnorm in training mode, in case the model has any
    # Batch iterator over the training set; shuffle=True reshuffles every epoch
    training_loader = torch.utils.data.DataLoader(training_set, batch_size=batch_size, shuffle=True)
    for batch_number, batch in enumerate(training_loader):
        # Get batch data
        batch_spectrograms, batch_targets = batch
        batch_spectrograms = batch_spectrograms.to(device)
        batch_targets = batch_targets.to(device)
        # Forward pass (call the module itself rather than .forward() so hooks are honored)
        batch_outputs = net(batch_spectrograms)
        # Compute loss
        loss = criterion(batch_outputs, batch_targets)
        # Zero the gradients left over from the previous step
        optimizer.zero_grad()
        # Compute gradients
        loss.backward()
        # Update parameters
        optimizer.step()
        # Record the loss of this batch
        training_loss[epoch] += loss.item()
    training_loss[epoch] /= len(training_loader)  # epoch loss is the average over all mini-batches
    # Evaluate on the held-out set (monitored every epoch, so strictly a validation set)
    net.eval()  # switch those same layers back to evaluation mode
    testing_loader = torch.utils.data.DataLoader(testing_set, batch_size=batch_size, shuffle=False)
    all_batch_outputs_cpu = numpy.zeros((testing_spectrograms.size(0), n_features))
    with torch.no_grad():  # no gradients are needed for evaluation
        for batch_number, batch in enumerate(testing_loader):
            batch_spectrograms, batch_targets = batch
            batch_spectrograms = batch_spectrograms.to(device)
            batch_targets = batch_targets.to(device)
            batch_outputs = net(batch_spectrograms)
            # Collect the predictions on the CPU so scipy can use them
            all_batch_outputs_cpu[batch_number * batch_size:(batch_number + 1) * batch_size] = batch_outputs.cpu().numpy()
            loss = criterion(batch_outputs, batch_targets)
            testing_loss[epoch] += loss.item()
    # Pearson's correlation coefficient between predictions and targets, for each feature
    for feature in range(0, n_features):
        corr, _ = scipy.stats.pearsonr(all_batch_outputs_cpu[:, feature], testing_targets.numpy()[:, feature])
        correlation[epoch, feature] = corr
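    # (pearsonr also returns a two-sided p-value, which is discarded here; a coefficient
    # near 1 means the predictions track that target feature almost linearly.)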
    testing_loss[epoch] /= len(testing_loader)  # epoch loss is the average over all mini-batches
- print("Epoch %d/%d" % (epoch, n_epochs))
- print(" Training loss: %3f" % training_loss[epoch])
- print(" Testing loss: %.3f" % testing_loss[epoch])
- print(" Correlation:", correlation[epoch])

serialized_model_folder_name = os.path.join(sys.argv[3], datetime.datetime.now().isoformat())
os.mkdir(serialized_model_folder_name)
print("Serializing model parameters to", serialized_model_folder_name)
torch.save(net.state_dict(), os.path.join(serialized_model_folder_name, "model.pth"))
numpy.save(os.path.join(serialized_model_folder_name, "training-set-indices.npy"), training_set_indices)
numpy.save(os.path.join(serialized_model_folder_name, "training-loss.npy"), training_loss)
numpy.save(os.path.join(serialized_model_folder_name, "testing-loss.npy"), testing_loss)
numpy.save(os.path.join(serialized_model_folder_name, "correlation.npy"), correlation)
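# To reload the parameters later (a sketch; assumes model_class.Net is importable and
# n_features is known):
#   net = model_class.Net(n_features)
#   net.load_state_dict(torch.load(os.path.join(serialized_model_folder_name, "model.pth")))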