Untitled

#!/usr/bin/env python
"""Derp"""

import sys
if (len(sys.argv) < 3):
    print("usage: python", sys.argv[0], "<spectrograms-npy-folder>", "<targets-npy-file>", "<serialized-model-folder>")
    exit(1)

import torch
import numpy
import os
import random
import datetime
import model_class
import scipy.stats

if torch.cuda.is_available():
    print("CUDA is available. Using GPU.")
    device = torch.device('cuda')
else:
    print("CUDA is not available. Using CPU")
    device = torch.device('cpu')

print("Reading the spectrogram files")
spectrograms = numpy.empty((5000, 500,149))
n_spectrograms = 0
for root, _, files in os.walk(sys.argv[1]):
    for f in files:
        spectrogram = numpy.load(os.path.join(root, f))
        spectrograms[n_spectrograms] = spectrogram
        n_spectrograms = n_spectrograms + 1

spectrograms = torch.from_numpy(spectrograms).float()
spectrograms = spectrograms[:, None, :, :] # PyTorch requires an (empty) channel dimension. How the hell does "None-style syntax" work? O_o

print("Reading the targets file")
targets = numpy.load(sys.argv[2])
targets = torch.from_numpy(targets).float()
targets = targets * 0.1
n_features = targets.size()[1]

print("Defining the network")
net = model_class.Net(n_features).to(device)
optimizer = torch.optim.Adam(net.parameters(), lr=0.00005)
criterion = torch.nn.MSELoss()

training_to_test_ratio = 0.08
n_epochs = 50
batch_size = 8

params = list(net.parameters())
print(len(params))
exit(0)

print("Splitting data into %d/%d training/testing data" % (n_spectrograms*training_to_test_ratio, n_spectrograms*(1-training_to_test_ratio)))
training_set_indices = random.sample(range(0, n_spectrograms), int(n_spectrograms*training_to_test_ratio))
training_spectrograms = spectrograms[training_set_indices]
training_targets = targets[training_set_indices]
training_set = torch.utils.data.TensorDataset(training_spectrograms, training_targets)

testing_set_indices = [x for x in range(0, n_spectrograms) if x not in training_set_indices]
testing_spectrograms = spectrograms[testing_set_indices]
testing_targets = targets[testing_set_indices]
testing_set = torch.utils.data.TensorDataset(testing_spectrograms, testing_targets)

print("Training model")
training_loss = numpy.zeros(n_epochs)
testing_loss = numpy.zeros(n_epochs)
correlation = numpy.zeros((n_epochs, n_features))
for epoch in range(0, n_epochs):

    # Define the PyTorch dataset batch iterator

    # Train
    training_loader = torch.utils.data.DataLoader(training_set, batch_size = batch_size, shuffle = True)
    for batch_number, batch in enumerate(training_loader):

        # Get batch data
        batch_spectrograms, batch_targets = batch
        batch_spectrograms = batch_spectrograms.to(device)
        batch_targets = batch_targets.to(device)

        # Forward
        batch_outputs = net.forward(batch_spectrograms)

        # Compute loss
        loss = criterion.forward(input = batch_outputs, target = batch_targets)

        # Zero the gradients from the previous run
        optimizer.zero_grad()

        # Compute gradients
        loss.backward()

        # Update gradients
        optimizer.step()

        # Record loss of this batch
        training_loss[epoch] += loss.item()

    training_loss[epoch] /= int(len(training_set)/batch_size) # epoch loss is the average loss of all the mini batches

    # Test (validation?)
    testing_loader = torch.utils.data.DataLoader(testing_set, batch_size = batch_size, shuffle = False)
    all_batch_outputs_cpu = numpy.zeros((testing_spectrograms.size()[0], n_features))
    for batch_number, batch in enumerate(testing_loader):
        batch_spectrograms, batch_targets = batch
        batch_spectrograms = batch_spectrograms.to(device)
        batch_targets = batch_targets.to(device)

        batch_outputs = net.forward(batch_spectrograms)
        all_batch_outputs_cpu[batch_number*batch_size:(batch_number+1)*batch_size] = batch_outputs.cpu().detach().numpy()

        loss = criterion.forward(input = batch_outputs, target = batch_targets)

        testing_loss[epoch] += loss.item()

    # Calculate Pearson's correlation coefficient for each feature
    for feature in range(0, n_features):
        corr, _ = scipy.stats.pearsonr(all_batch_outputs_cpu[:, feature], testing_targets.numpy()[:, feature])
        correlation[epoch, feature] = corr

    testing_loss[epoch] /= int(len(testing_set)/batch_size) # epoch loss is the average loss of all the mini batches

    print("Epoch %d/%d" % (epoch, n_epochs))
    print("    Training loss: %3f" % training_loss[epoch])
    print("    Testing loss: %.3f" % testing_loss[epoch])
    print("    Correlation:", correlation[epoch])


serialized_model_folder_name = os.path.join(sys.argv[3], datetime.datetime.now().isoformat())
os.mkdir(serialized_model_folder_name)
print("Serializing model parameters to", serialized_model_folder_name)
torch.save(net.state_dict(), os.path.join(serialized_model_folder_name, "model.pth"))
numpy.save(os.path.join(serialized_model_folder_name, "training-set-indicies.npy"), training_set_indices)
numpy.save(os.path.join(serialized_model_folder_name, "training-loss.npy"), training_loss)
numpy.save(os.path.join(serialized_model_folder_name, "testing-loss.npy"), testing_loss)
numpy.save(os.path.join(serialized_model_folder_name, "correlation.npy"), correlation)