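"""Untitled script: 2-D PCA scatter plot of a feature matrix.

Loads a feature file and a label file, projects the features onto two
principal components, and saves a scatter plot with one series per label.
"""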

from os.path import join, basename, splitext
import argparse

import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Marker styles cycled through so that successive labels get distinct glyphs.
MARKERS = ['o', '.', ',', 'x', '+', 'v', '^', '<', '>', 's', 'd']


def build_parser():
    """Return the command-line parser (positional args: X, y, out_fn)."""
    parser = argparse.ArgumentParser()
    parser.add_argument("X", help="filename of the feature file (`.npy`) to visualize")
    parser.add_argument("y", help="filename of the label file (`.csv` or `.npy`) to visualize classes")
    parser.add_argument("out_fn", help="filename for the outputing image (`.pdf`)")
    return parser


def load_labels(path):
    """Load class labels from a `.csv` or `.npy` file.

    Parameters
    ----------
    path : str
        Label file. A `.csv` file is read as one label per line (the
        trailing newline is stripped, nothing else is parsed); a `.npy`
        file is loaded with ``np.load``. The extension is matched
        case-insensitively.

    Returns
    -------
    numpy.ndarray
        The labels, in file order.

    Raises
    ------
    NotImplementedError
        If the file extension is neither `.csv` nor `.npy`.
    """
    ext = splitext(path)[1]
    kind = ext.lower()
    if kind == '.csv':
        with open(path) as f:
            return np.array([line.rstrip('\n') for line in f])
    if kind == '.npy':
        return np.load(path)
    raise NotImplementedError('{} is not supported!'.format(ext))


def pick_marker(k):
    """Return the marker for the k-th label, wrapping around MARKERS.

    The index is ``k % len(MARKERS)`` so the first ``len(MARKERS)`` labels
    all receive different markers and later ones cycle from the start.
    """
    return MARKERS[k % len(MARKERS)]


def main(argv=None):
    """Run the script: load data, project with PCA, plot, and save.

    Parameters
    ----------
    argv : list of str, optional
        Command-line arguments; ``None`` lets argparse read ``sys.argv``.

    Raises
    ------
    ValueError
        If the feature and label files hold different numbers of samples.
    NotImplementedError
        If the label file extension is not supported.
    """
    args = build_parser().parse_args(argv)

    # load the feature and label files
    X = np.load(args.X)
    y = load_labels(args.y)

    # check shape
    if X.shape[0] != len(y):
        raise ValueError('Feature & label should have same number of samples!')

    # run the PCA (2 components, one per plot axis)
    z = PCA(2).fit_transform(X)

    # visualize per label; np.unique yields a sorted, run-to-run stable
    # order, so each class keeps the same marker and legend position
    for k, label in enumerate(np.unique(y)):
        idx = np.where(y == label)[0]
        plt.scatter(z[idx, 0], z[idx, 1], label=label,
                    marker=pick_marker(k))

    # save fig
    plt.legend()
    plt.tight_layout()
    plt.savefig(args.out_fn)


if __name__ == "__main__":
    main()