Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ```python
%matplotlib notebook
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib import rcParams
import seaborn as sns
# Global plot styling applied to every figure in this notebook.
sns.set_style("white")
sns.set_context("paper", rc={"lines.linewidth": 1})
rcParams['axes.titlepad'] = 20
rcParams['axes.titlesize'] = "medium"
rcParams['axes.edgecolor'] = "red"
# Hide the top and right spines for a cleaner frame.
rcParams['axes.spines.right'] = rcParams['axes.spines.top'] = False
rcParams['xtick.labelsize'] = "small"
rcParams['xtick.major.pad'] = 10
rcParams['ytick.labelsize'] = "small"
rcParams['ytick.major.pad'] = 10
# Render axis offset/scale annotations with mathtext.
rcParams['axes.formatter.use_mathtext'] = True
rcParams['axes.labelpad'] = 10
- ```
- ```python
- root_path = "/home/fat-fighter/Documents/cs771-project/hybrid-method/"
- ```
- ## Description of Files
- ### Folder: features
- - **tracks-mfcc.csv** - Contains already extracted mfcc features from all tracks using 30-60 seconds of tracks
- - **tracks-cluster-probabilities.csv** - Contains the cluster probabilities and assignments for all tracks (based on their mfcc features)
- - **timbres-cluster-probabilities.csv** - Contains the cluster probabilities and assignments for all segment timbres of all tracks
- - **tracks-collective-timbres-clusters-features.csv** - Contains the extracted features of a track using its timbres' collective cluster probabilities
- ### Folder: million-song-subset
- - **tracks-features.csv** - Contains mfcc features extracted from tracks in the MSS
- - **tracks-timbres.csv** - Contains segment timbres for all tracks
- ### Folder: taste-profile-subset
- - **songs.txt** - A list of song ids
- - **users.txt** - A list of user ids
- - **train-triplets.txt** - User-song-count triplets
- - **song-to-tracks.txt** - A song-track id mapping
- # Collaborative Filtering
- ## Finding Optimal Number of Track Clusters (Based on Tracks' MFCC Features)
- ```python
- import numpy as np
- import pandas as pd
- from sklearn.cluster import KMeans
- from sklearn.decomposition import PCA
- ```
- ```python
- local_path = root_path + "data/"
- n_jobs = -1
- max_iter = 500
- algorithm = "full"
- n_init = 5
- ```
- ```python
- tracks_data = pd.read_csv(local_path + "features/tracks-mfcc.csv", sep="\t")
- cols = tracks_data.columns.tolist()[1:]
- tracks_features = tracks_data[cols]
- ```
- ```python
- estimators = [
- (n_clusters, KMeans(n_clusters=n_clusters, random_state=0, n_jobs=n_jobs, max_iter=max_iter, algorithm=algorithm, n_init=n_init))
- for n_clusters in range(5, 16, 1)
- ]
- ```
- ```python
- for n_clusters, estimator in estimators:
- estimator.fit(tracks_features)
- ```
- ```python
# Persist (n_clusters, inertia) pairs for the fitted K-Means estimators.
with open(local_path + "features/tracks-clustering-kmeans-inertias.csv", "w") as f:
    cluster_inertias = [[k, estimator.inertia_] for k, estimator in estimators]
    rows = [str(k) + "\t" + str(inertia) for k, inertia in cluster_inertias]
    f.write("\n".join(rows))
- ```
- ### Inertia Plot
- ```python
- with open(local_path + "features/tracks-clustering-kmeans-inertias.csv") as f:
- cluster_inertias = [line.strip(" \t\n\r").split("\t") for line in f.readlines()]
- cluster_inertias = [[int(cluster), float(inertia)] for cluster, inertia in cluster_inertias]
- cluster_inertias = np.array(cluster_inertias)
- ```
- ```python
- sns.pointplot(cluster_inertias[:, 0], cluster_inertias[:, 1])
- plt.title("Tracks Clustering: Inertia for K-Means")
- plt.xlabel("Number of Clusters")
- plt.ylabel("Variance")
- plt.savefig(local_path + "plots/tracks-clustering-kmeans-inertia.png", dpi=250)
- plt.show()
- ```
- ### PCA Plot of Tracks MFCC (for 10 Clusters)
- ```python
- decomposed_tracks_features = PCA(n_components=2).fit(tracks_features).transform(tracks_features)
- ```
- ```python
- n_clusters, estimator = estimators[5]
- cluster_assignments = estimator.labels_
- ```
- ```python
- plt.scatter(decomposed_tracks_features[:, 0], decomposed_tracks_features[:, 1], alpha=.8, s=0.7)
- plt.title("Tracks MFCC: PCA Plot")
- plt.savefig(local_path + "plots/tracks-mfcc-pca.png", dpi=250)
- plt.show()
- ```
- ## Clustering Tracks using GMM
- ```python
- import pandas as pd
- from sklearn.externals import joblib
- from sklearn.decomposition import PCA
- from sklearn.mixture import GaussianMixture
- from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
- ```
- ```python
- local_path = root_path + "data/"
- n_clusters = 10
- max_iter = 5000
- covariance_type = "diag"
- n_init = 3
- ```
- ```python
- tracks_data = pd.read_csv(local_path + "features/tracks-mfcc.csv", sep="\t")
- cols = tracks_data.columns[1:]
- tracks_mfcc = tracks_data[cols]
- ```
- ```python
- estimator = GaussianMixture(n_components=n_clusters, covariance_type=covariance_type, max_iter=max_iter, random_state=0, n_init=n_init)
- ```
- ```python
- estimator.fit(tracks_mfcc)
- ```
- ```python
- joblib.dump(estimator, local_path + "models/tracks-clustering-gmm-model.pkl")
- ```
- ```python
- estimator = joblib.load(local_path + "models/tracks-clustering-gmm-model.pkl")
- ```
- ```python
- probs = estimator.predict_proba(tracks_mfcc)
- cluster_assignments = estimator.predict(tracks_mfcc)
- ```
- ```python
# One row per track: id, the cluster probabilities, then the hard assignment.
with open(local_path + "tracks-cluster-probabilities.csv", "w") as f:
    for i, song_id in enumerate(tracks_data["id"]):
        row = [song_id] + list(probs[i]) + [cluster_assignments[i]]
        f.write("\t".join(str(field) for field in row) + "\n")
- ```
- ### LDA Plot of Tracks MFCC
- ```python
- decomposed_tracks_mfcc = LinearDiscriminantAnalysis(n_components=2).fit(tracks_mfcc, cluster_assignments).transform(tracks_mfcc)
- ```
- ```python
- for i in range(n_clusters):
- plt.scatter(decomposed_tracks_mfcc[cluster_assignments == i, 0], decomposed_tracks_mfcc[cluster_assignments == i, 1], alpha=.8, s=0.7)
- plt.gca().set_xlim([-16, 6])
- plt.gca().set_ylim([-5, 5])
- plt.title("Tracks MFCC: LDA Plot (After GMM)")
- plt.savefig(local_path + "plots/tracks-mfcc-gmm-clustering-pca.png", dpi=250)
- plt.show()
- ```
- ## Mapping Users to Tracks
- ```python
- local_path = root_path + "data/taste-profile-subset/"
- ```
- ```python
# Map each song id to the list of track ids it resolves to.
# NOTE(review): the file description above calls this "song-to-tracks.txt";
# confirm the actual filename on disk.
songs_to_tracks = dict()
with open(local_path + "songs-to-tracks.txt", "r") as f:
    for raw in f:
        fields = raw.strip(" \t\n\r").split()
        if len(fields) > 1:  # skip songs with no mapped tracks
            songs_to_tracks[fields[0]] = fields[1:]
# (removed the unused `count = 0` local from the original cell)
- ```
- ```python
- outfile = open(local_path + "user-track-counts-raw.txt", "w")
- ```
- ```python
# Expand each (user, song, count) triplet into one (user, track, count) row
# per track mapped to the song.
with open(local_path + "user-song-counts.txt", "r") as f:
    for raw in f:
        fields = raw.strip(" \t\n\r").split()
        if len(fields) == 3 and fields[1] in songs_to_tracks:
            for track in songs_to_tracks[fields[1]]:
                outfile.write("\t".join([fields[0], track, fields[2]]) + "\n")
- ```
- ```python
- outfile.close()
- ```
- ## Splitting Users into Training and Evaluation Sets
- ```python
- import random
- ```
- ```python
- local_path = root_path + "data/"
- ```
- ```bash
- %%bash -s "$local_path"
- cd $1/taste-profile-subset
- cut -f1 user-track-counts-raw.txt | sort | uniq -c > user-counts.txt
- cat user-counts.txt | sed 's/^ *\([0-9]*\) /\1\t/g' | awk '($1 > 49)' > t; mv t user-counts.txt
- ```
- ```bash
- %%bash -s "$local_path"
- cd $1/taste-profile-subset/
- awk 'BEGIN {
- FS = OFS = "\t"
- }
- NR == FNR {
- f[$2] = $0
- next
- }
- $1 in f {
- print $0
- }' user-counts.txt user-track-counts-raw.txt > t
- ```
- ```bash
- %%bash -s "$local_path"
- cd $1
- awk 'BEGIN {
- FS = OFS = "\t"
- }
- NR == FNR {
- f[$1] = 1
- next
- }
- $2 in f {
- print $0
- }' features/tracks-cluster-probabilities.csv taste-profile-subset/t > taste-profile-subset/user-track-counts.txt
- ```
- ```bash
%%bash -s "$local_path"
cd $1/taste-profile-subset/
cut -f2 -d$'\t' user-counts.txt | sort --random-sort > t
size=`cat user-counts.txt | wc -l`
vsize=$(( $size / 10 ))
head -$vsize t > users-validation.txt
# bug fix: start at line vsize+1 — "tail -n+$vsize" repeated the last
# validation user in the training split
tail -n+$(( $vsize + 1 )) t > users-train.txt
rm t
- ```
- ```bash
- %%bash -s "$local_path"
- cd $1/taste-profile-subset/
- awk 'BEGIN {
- FS = OFS = "\t"
- }
- NR == FNR {
- f[$1] = 1
- next
- }
- $1 in f {
- print $0
- }' users-train.txt user-track-counts.txt > user-track-counts-train.txt
- ```
- ```bash
- %%bash -s "$local_path"
- cd $1/taste-profile-subset/
- awk 'BEGIN {
- FS = OFS = "\t"
- }
- NR == FNR {
- f[$1] = 1
- next
- }
- $1 in f {
- print $0
- }' users-validation.txt user-track-counts.txt > user-track-counts-validation.txt
- ```
- ## Computing User Features (Based on Tracks' Cluster Probabilities)
- ```python
- import numpy as np
- ```
- ```python
- local_path = root_path + "data/"
- n_clusters = 10
- ```
- ```python
# track id -> np.array of its 10 cluster probabilities.
tracks_mfcc = dict()
with open(local_path + "features/tracks-cluster-probabilities.csv", "r") as f:
    # bug fix: the old readline()-before-processing loop discarded the first
    # record; this file is written without a header, so every line is data
    for line in f:
        fields = line.strip(" \t\n\r").split()
        if len(fields) == 12:  # id + 10 probabilities + hard assignment
            tracks_mfcc[fields[0]] = np.array([float(field) for field in fields[1:-1]])
- ```
- ```python
- with open(local_path + "taste-profile-subset/users-train.txt") as f:
- users_train = [user.strip(" \n\r") for user in f.readlines()]
- with open(local_path + "taste-profile-subset/users-validation.txt") as f:
- users_validation = [user.strip(" \n\r") for user in f.readlines()]
- ```
- ```python
- user_features = dict()
- user_track_counts = dict()
- ```
- ```python
- with open(local_path + "taste-profile-subset/user-track-counts.txt", "r") as f:
- for line in f:
- line = line.strip(" \t\n\r").split()
- if len(line) == 3:
- if line[0] not in user_track_counts:
- user_features[line[0]] = np.zeros(n_clusters)
- user_track_counts[line[0]] = 0
- user_features[line[0]] += tracks_mfcc[line[1]]
- user_track_counts[line[0]] += 1
- ```
- ```python
- outfile_train = local_path + "features/user-features-train.csv"
- outfile_validation = local_path + "features/user-features-validation.csv"
- ```
- ```python
# Write each user's mean cluster-probability vector, one file per split.
for path, user_ids in ((outfile_train, users_train), (outfile_validation, users_validation)):
    with open(path, "w") as f:
        for user in user_ids:
            mean_features = user_features[user] / float(user_track_counts[user])
            f.write("\t".join([user] + [str(field) for field in mean_features]) + "\n")
- ```
- ## Finding Optimal Number of Users Clusters (Based on Users' Computed Features)
- ```python
- import numpy as np
- import pandas as pd
- from sklearn.cluster import KMeans
- from sklearn.decomposition import PCA
- ```
- ```python
- local_path = root_path + "data/"
- ```
- ```python
- n_jobs = -1
- max_iter = 500
- algorithm = "full"
- n_init = 5
- ```
- ```python
- user_data = pd.read_csv(local_path + "features/user-features-train.csv", sep="\t", header=None)
- cols = user_data.columns.tolist()[1:]
- user_features = user_data[cols]
- ```
- ```python
- estimators = [
- (n_clusters, KMeans(n_clusters=n_clusters, random_state=0, n_jobs=n_jobs, max_iter=max_iter, algorithm=algorithm, n_init=n_init))
- for n_clusters in range(10, 30, 1)
- ]
- ```
- ```python
- for n_clusters, estimator in estimators:
- estimator.fit(user_features)
- ```
- ```python
# Persist (n_clusters, inertia) pairs for the fitted user K-Means estimators.
with open(local_path + "features/users-clustering-kmeans-inertias.csv", "w") as f:
    cluster_inertias = []
    for n_clusters, estimator in estimators:
        cluster_inertias.append([n_clusters, estimator.inertia_])
    # bug fix: the original generator unpacked into `n_cluster` but formatted
    # the stale outer `n_clusters`, so every row carried the last cluster count
    f.write("\n".join(str(k) + "\t" + str(inertia) for k, inertia in cluster_inertias))
    cluster_inertias = np.array(cluster_inertias)
- ```
- ### Inertia Plot
- ```python
- with open(local_path + "features/users-clustering-kmeans-inertias.csv") as f:
- cluster_inertias = [line.strip(" \t\n\r").split("\t") for line in f.readlines()]
- cluster_inertias = [[int(cluster), float(inertia)] for cluster, inertia in cluster_inertias]
- cluster_inertias = np.array(cluster_inertias)
- ```
- ```python
- sns.pointplot(cluster_inertias[:, 0].astype(int), cluster_inertias[:, 1])
- plt.title("Users Clustering: Inertia for K-Means")
- plt.xlabel("Number of Clusters")
- plt.ylabel("Variance")
- plt.savefig(local_path + "plots/users-clustering-kmeans-inertia.png", dpi=250)
- plt.show()
- ```
- ### PCA Plot of User Features (for 20 Clusters)
- ```python
- decomposed_user_features = PCA(n_components=2).fit(user_features).transform(user_features)
- ```
- ```python
- plt.scatter(decomposed_user_features[:, 0], decomposed_user_features[:, 1], alpha=.8, s=0.7)
- plt.title("User Features: PCA Plot")
- plt.savefig(local_path + "plots/user-features-pca.png", dpi=250)
- plt.show()
- ```
- ## Clustering Users using GMM
- ```python
- import pandas as pd
- from sklearn.externals import joblib
- from sklearn.decomposition import PCA
- from sklearn.mixture import GaussianMixture
- from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
- ```
- ```python
- local_path = root_path + "data/"
- n_clusters = 20
- max_iter = 5000
- covariance_type = "diag"
- n_init = 3
- ```
- ```python
- user_data = pd.read_csv(local_path + "/features/user-features-train.csv", sep="\t", header=None)
- cols = user_data.columns[1:]
- user_features = user_data[cols]
- ```
- ```python
- estimator = GaussianMixture(n_components=n_clusters, covariance_type=covariance_type, max_iter=max_iter, random_state=0, n_init=n_init)
- ```
- ```python
- estimator.fit(user_features)
- ```
- ```python
- joblib.dump(estimator, local_path + "/models/users-clustering-gmm-model.pkl")
- ```
- ```python
- estimator = joblib.load(local_path + "/models/users-clustering-gmm-model.pkl")
- ```
- ```python
- probs = estimator.predict_proba(user_features)
- cluster_assignments = estimator.predict(user_features)
- ```
- ```python
- for cluster in range(n_clusters):
- with open(local_path + "taste-profile-subset/clusters/user-ids-" + str(cluster + 1) + ".txt", "w") as f:
- f.write("\n".join(user_data[cluster_assignments == cluster][0]))
- ```
- ```python
- with open(local_path + "/features/user-cluster-probabilities.csv", "w") as f:
- for i, user_id in enumerate(user_data[user_data.columns[0]]):
- params = [user_id] + list(probs[i]) + [cluster_assignments[i]]
- params = [str(param) for param in params]
- f.write("\t".join(params) + "\n")
- ```
- ### LDA Plot of User Features
- ```python
- decomposed_user_features = LinearDiscriminantAnalysis(n_components=2).fit(user_features, cluster_assignments).transform(user_features)
- ```
- ```python
- for i in range(n_clusters):
- plt.scatter(decomposed_user_features[cluster_assignments == i, 0], decomposed_user_features[cluster_assignments == i, 1], alpha=.8, rasterized=True, s=0.7)
- plt.gca().set_ylim([-15, 5])
- plt.title("User Features: LDA Plot (After GMM)")
- plt.savefig(local_path + "plots/user-features-gmm-clustering-pca.png", dpi=250)
- plt.show()
- ```
- ## Distributing Users by their clusters
- ```python
- local_path = root_path + "data/taste-profile-subset/"
- ```
- ```bash
- %%bash -s "$local_path"
- cd $1/clusters/
- for cluster in {1..20}; do
- cat user-ids-$cluster.txt | sed "s/$/\t$cluster/g"
- echo ""
- done > user-clusters.txt
- ```
- ```python
- n_clusters = 20
- ```
- ```python
- cluster_files = [open(local_path + "clusters/user-track-counts-" + str(cluster + 1) + ".txt", "w") for cluster in range(n_clusters)]
- ```
- ```python
- user_clusters = dict()
- with open(local_path + "clusters/user-clusters.txt") as f:
- for line in f:
- line = line.strip("\t\n\r").split("\t")
- user_clusters[line[0]] = int(line[1])
- ```
- ```python
- with open(local_path + "user-track-counts.txt") as f:
- for line in f:
- line = line.strip("\t\n\r").split("\t")
- if line[0] in user_clusters:
- cluster_files[user_clusters[line[0]] - 1].write("\t".join(line) + "\n")
- ```
- ```python
- for f in cluster_files:
- f.close()
- ```
- ## Collaborative Filtering On User Clusters
- ```python
- from math import sqrt
- import numpy as np
- from scipy.sparse import csr_matrix
- ```
- ```python
- local_path = root_path + "data/taste-profile-subset/"
- n_clusters = 20
- ```
- ```python
- user_suggestions_file = open(local_path + "suggestions.csv", "w")
- ```
- ```python
# user id -> set of tracks the user listened to (cluster 0 only).
user_track_counts = dict()
with open(local_path + "clustered-user-track-counts/cluster-k0.txt") as f:
    for line in f:
        fields = line.strip(" \t\n\r").split("\t")
        # bug fix: str.split never returns [], so `if line != []` was always
        # true; test the first field instead to skip blank lines
        if fields[0]:
            user_track_counts[fields[0]] = set(fields[1:])
# bug fix: `[[0]*n]*n` aliases a single shared row list; build each row
# independently so writes to one row don't appear in all of them
similarity = [[0] * len(user_track_counts) for _ in range(len(user_track_counts))]
- ```
- ```python
# Distinct tracks across every user's listening set, plus the user id list.
tracks = list({t for track_set in user_track_counts.values() for t in track_set})
users = list(user_track_counts)
- ```
- ```python
- N, M = (len(users), len(tracks))
- ```
- ```python
- for i, user_i in enumerate(users):
- weights = dict()
- for track in tracks:
- weights[track] = 0
- for user_j in user_track_counts:
- if user_i != user_j:
- similarity = len(user_track_counts[user_i].intersection(user_track_counts[user_j]))
- similarity = similarity / (sqrt(len(user_track_counts[user_i])) * sqrt(len(user_track_counts[user_j])))
- for track in user_track_counts[user_j]:
- if track not in user_track_counts[user_i]:
- weights[track] += similarity
- keys = sorted(list(weights), key=lambda x: -weights[x])[:50]
- user_suggestions_file.write(user_i + "\t" + "\t".join(keys) + "\n")
- ```
- ```python
- user_suggestions_file.close()
- ```
- ## Generating Recommendations for Validation Users (User-User Localized Similarity)
- ```python
- import random
- from math import sqrt
- import numpy as np
- from sklearn.externals import joblib
- from multiprocessing import Pool
- ```
- ```python
- local_path = root_path + "data/"
- n_clusters = 20
- ```
- ```python
- user_features = dict()
- with open(local_path + "features/user-features-validation.csv") as f:
- for line in f:
- line = line.strip(" \t\n\r").split()
- user_features[line[0]] = line[1:]
- ```
- ```python
- users = list(user_features)
- ```
- ```python
- gmm_clustering_model = joblib.load(local_path + "models/users-clustering-gmm-model.pkl")
- ```
- ```python
# Assign each validation user to a GMM cluster.
clustered_users = dict()
for cluster in range(n_clusters):
    clustered_users[cluster] = []
# perf: one vectorized predict() over all users instead of one model call per
# user; the assignments are identical
assignments = gmm_clustering_model.predict([user_features[user] for user in users])
for user, cluster in zip(users, assignments):
    clustered_users[cluster].append(user)
- ```
- ```python
- user_tracks = dict()
- user_validation_tracks = dict()
- for user in users:
- user_tracks[user] = [set([]), 0]
- user_validation_tracks[user] = set([])
- with open(local_path + "taste-profile-subset/user-track-counts-validation.txt") as f:
- for line in f:
- line = line.strip(" \n\r").split("\t")
- if random.random() > 0.35:
- user_tracks[line[0]][0].add(line[1])
- else:
- user_validation_tracks[line[0]].add(line[1])
- for user in users:
- user_tracks[user][1] = sqrt(len(user_tracks[user][0]))
- ```
- ```python
def get_suggestions_for_cluster(cluster):
    """Write user-user collaborative-filtering suggestions for every
    validation user assigned to `cluster`.

    Scores each track unseen by the validation user with a (cosine**6)-weighted
    vote over the cluster's training users, then writes one tab-separated line
    per user: user id followed by up to 500 suggested track ids (zero-weight
    suggestions dropped).
    """
    global user_tracks, clustered_users, local_path
    outfile = open(local_path + "taste-profile-subset/suggestions-validation-" + str(cluster) + ".txt", "w")
    print "Starting for cluster", cluster  # Python 2 print statement
    tracks = set([])
    # user -> [set of training tracks, sqrt(|tracks|)] for this cluster's users
    cluster_user_tracks = dict()
    with open(local_path + "taste-profile-subset/clusters/user-ids-" + str(cluster + 1) + ".txt") as f:
        for line in f:
            cluster_user_tracks[line.strip(" \n\r")] = [set([]), 0]
    with open(local_path + "taste-profile-subset/clusters/user-track-counts-" + str(cluster + 1) + ".txt") as f:
        for line in f:
            line = line.strip(" \n\r").split("\t")
            cluster_user_tracks[line[0]][0].add(line[1])
            tracks.add(line[1])
    for user in cluster_user_tracks:
        # pre-compute the cosine denominator factor once per user
        cluster_user_tracks[user][1] = sqrt(len(cluster_user_tracks[user][0]))
    for i, user_v in enumerate(clustered_users[cluster]):
        if i % 10 == 0:
            print "\tStarting for user", i
        track_weights = dict()
        for track in tracks:
            track_weights[track] = 0
        for user_t in cluster_user_tracks:
            # binary cosine similarity between the two users' track sets
            similarity = len(user_tracks[user_v][0].intersection(cluster_user_tracks[user_t][0]))
            # NOTE(review): raises ZeroDivisionError if either user ended up
            # with an empty track set (sqrt factor 0) — confirm upstream
            # filtering guarantees non-empty sets
            similarity = similarity / (user_tracks[user_v][1] * cluster_user_tracks[user_t][1])
            similarity = pow(similarity, 6)  # sharpen: emphasise the most similar users
            for track in cluster_user_tracks[user_t][0].difference(user_tracks[user_v][0]):
                track_weights[track] += similarity
        # keep the 500 highest-weight tracks (list is sorted ascending) ...
        suggestions = np.array(sorted(tracks, key=lambda x: track_weights[x]))[-500:]
        # ... then cut off the zero-weight prefix via searchsorted on the
        # ascending weight list
        suggestions = set(suggestions[np.searchsorted([track_weights[track] for track in suggestions], 0, side="right"):])
        outfile.write(user_v + "\t" + "\t".join(suggestions) + "\n")
    outfile.close()
- ```
- ```python
process_pool = Pool(4)
# bug fix: map() already fans out over every cluster exactly once; the
# original wrapped it in `for i in range(n_clusters)`, re-running the entire
# 20-cluster sweep n_clusters times
process_pool.map(get_suggestions_for_cluster, range(n_clusters))
- ```
- ```bash
%%bash -s "$local_path"
cd $1/taste-profile-subset
# bug fix: the python writer names these files with 0-based cluster ids
# (str(cluster), 0..19); {1..20} skipped cluster 0 and failed on 20
for cluster in {0..19}; do
    cat suggestions-validation-$cluster.txt
done > suggestions-validation.txt
for cluster in {0..19}; do
    rm suggestions-validation-$cluster.txt
done
- ```
- ```python
- with open(local_path + "taste-profile-subset/user-tracks-used-validation.txt", "w") as f:
- for user in user_tracks:
- f.write(user + "\t" + "\t".join(user_tracks[user][0]) + "\n")
- ```
- ## Generating Recommendations for Validation Users (Item-Item Localized Similarity) — NOT USED
- ```python
- import random
- from math import sqrt
- import numpy as np
- from sklearn.externals import joblib
- from multiprocessing import Pool
- ```
- ```python
- local_path = root_path + "data/"
- n_clusters = 20
- ```
- ```python
- user_features = dict()
- with open(local_path + "features/user-features-validation.csv") as f:
- for line in f:
- line = line.strip(" \t\n\r").split()
- user_features[line[0]] = line[1:]
- ```
- ```python
- users = list(user_features)
- ```
- ```python
- gmm_clustering_model = joblib.load(local_path + "models/users-clustering-gmm-model.pkl")
- ```
- ```python
- user_tracks = dict()
- user_validation_tracks = dict()
- for user in users:
- user_tracks[user] = [set([]), 0]
- user_validation_tracks[user] = set([])
- track_users = dict()
- with open(local_path + "taste-profile-subset/user-track-counts-validation.txt") as f:
- for line in f:
- line = line.strip(" \n\r").split("\t")
- if random.random() > 0.35:
- if line[1] not in track_users:
- track_users[line[1]] = [set([]), 0]
- track_users[line[1]][0].add(line[0])
- user_tracks[line[0]][0].add(line[1])
- else:
- user_validation_tracks[line[0]].add(line[1])
- for user in users:
- user_tracks[user][1] = sqrt(len(user_tracks[user][0]))
- for track in track_users:
- track_users[track][1] = sqrt(len(track_users[track][0]))
- ```
- ```python
- clustered_users = dict()
- clustered_tracks = dict()
- for cluster in range(n_clusters):
- clustered_users[cluster] = []
- clustered_tracks[cluster] = set([])
- for user in users:
- cluster = gmm_clustering_model.predict([user_features[user]])[0]
- clustered_users[cluster].append(user)
- clustered_tracks[cluster] = clustered_tracks[cluster].union(user_tracks[user][0])
- ```
- ```python
- def get_suggestions_for_cluster(cluster):
- global track_users, clustered_users, clustered_tracks, local_path
- outfile = open(local_path + "taste-profile-subset/suggestions-validation-" + str(cluster) + ".txt", "w")
- print "Starting for cluster", cluster
- cluster_track_users = dict()
- with open(local_path + "taste-profile-subset/clusters/user-track-counts-" + str(cluster + 1) + ".txt") as f:
- for line in f:
- line = line.strip(" \n\r").split("\t")
- if line[1] not in cluster_track_users:
- cluster_track_users[line[1]] = [set([]), 0]
- cluster_track_users[line[1]][0].add(line[0])
- for track in cluster_track_users:
- cluster_track_users[track][1] = sqrt(len(cluster_track_users[track][0]))
- for i, user_v in enumerate(list(clustered_users[cluster])):
- if i % 10 == 9:
- print "\tStarting for user", i
- outfile.close()
- suggestions = []
- for j, track_t in enumerate(list(cluster_track_users)):
- similarity = 0
- for track_v in list(user_tracks[user_v][0]):
- similarity_t = len(track_users[track_v][0].intersection(cluster_track_users[track_t][0]))
- similarity_t = similarity / (track_users[track_v][1] * cluster_track_users[track_t][1])
- similarity_t = pow(similarity, 3)
- similarity += similarity_t
- suggestions.append((track_t, similarity))
- suggestions.sort(key=lambda x: -x[1])
- suggestions = suggestions[:500]
- suggestions = suggestions[:np.searchsorted([track[1] for track in suggestions], 0, side="left")]
- print suggestions
- outfile.write(user_v + "\t" + "\t".join(suggestions) + "\n")
- outfile.close()
- ```
- ```python
- # process_pool = Pool(1)
- # process_pool.map(get_suggestions_for_cluster, range(n_clusters))
- get_suggestions_for_cluster(0)
- ```
- ```bash
%%bash -s "$local_path"
cd $1/taste-profile-subset
# bug fix: the python writer names these files with 0-based cluster ids
# (str(cluster), 0..19); {1..20} skipped cluster 0 and failed on 20
for cluster in {0..19}; do
    cat suggestions-validation-$cluster.txt
done > suggestions-validation.txt
for cluster in {0..19}; do
    rm suggestions-validation-$cluster.txt
done
- ```
- ```python
- with open(local_path + "taste-profile-subset/user-tracks-used-validation.txt", "w") as f:
- for user in user_tracks:
- f.write(user + "\t" + "\t".join(user_tracks[user][0]) + "\n")
- ```
- ## Computing Truncated mAP on the Predicted Recommendations
- ```python
- import numpy as np
- ```
- ```python
- local_path = root_path + "data/taste-profile-subset/"
- ```
- ```python
- listened_user_tracks = dict()
- with open(local_path + "users-validation.txt") as f:
- for line in f:
- line = line.strip(" \n\r")
- listened_user_tracks[line] = set([])
- with open(local_path + "user-track-counts-validation.txt") as f:
- for line in f:
- line = line.strip(" \n\r").split("\t")
- listened_user_tracks[line[0]].add(line[1])
- for user in listened_user_tracks:
- listened_user_tracks[user] = set(listened_user_tracks[user])
- ```
- ```python
- with open(local_path + "user-tracks-used-validation-uu2.txt") as f:
- for line in f:
- line = line.strip(" \n\r").split("\t")
- listened_user_tracks[line[0]] = listened_user_tracks[line[0]].difference(line[1:])
- ```
- ```python
# Truncated mean average precision (mAP@500) over the predicted suggestions.
with open(local_path + "suggestions-uu2.txt") as f:
    aps = list()
    for line in f:
        if line.strip() == "":
            continue  # skip blank separator lines
        line = line.strip(" \t\n\r").split("\t")
        user = line[0]
        tracks = line[1:]
        tracks = np.array(tracks[:500])  # truncate the ranked list at 500
        k = 0    # rank of the current suggestion (1-based after increment)
        l = 0    # number of relevant (actually listened) tracks seen so far
        p = 0.0  # running sum of precision@k at each hit
        for i, track in enumerate(tracks):
            k += 1
            if track in listened_user_tracks[user]:
                l += 1
                p += float(l) / float(k)
        if l != 0:
            # NOTE(review): normalising by the hit count l differs from the
            # usual truncated-mAP denominator min(500, |relevant|) and inflates
            # the score — confirm this is the intended metric
            aps.append(p / l)
        else:
            aps.append(0)
print np.mean(aps)  # Python 2 print statement
- ```
- 0.275001780514
- ## LDA Plot of the User Suggestions
- ```python
- import numpy as np
- from sklearn.externals import joblib
- from sklearn.decomposition import PCA
- from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
- ```
- ```python
- local_path = root_path + "data/"
- n_clusters = 10
- ```
- ```python
- listened_user_tracks = dict()
- with open(local_path + "taste-profile-subset/users-validation.txt") as f:
- for line in f:
- line = line.strip(" \n\r")
- listened_user_tracks[line] = set([])
- with open(local_path + "taste-profile-subset/user-track-counts-validation.txt") as f:
- for line in f:
- line = line.strip(" \n\r").split("\t")
- listened_user_tracks[line[0]].add(line[1])
- for user in listened_user_tracks:
- listened_user_tracks[user] = set(listened_user_tracks[user])
- ```
- ```python
- with open(local_path + "taste-profile-subset/user-tracks-used-validation-uu2.txt") as f:
- for line in f:
- line = line.strip(" \n\r").split("\t")
- listened_user_tracks[line[0]] = listened_user_tracks[line[0]].difference(line[1:])
- ```
- ```python
# user id -> suggested tracks, minus the tracks that user already listened to.
user_suggestions = dict()
with open(local_path + "taste-profile-subset/suggestions-uu2.txt") as f:
    for line in f:
        fields = line.strip(" \t\n\r").split("\t")
        # bug fix: subtract THIS line's user's history (fields[0]); the
        # original reused the stale `user` variable left over from an earlier
        # cell, filtering every row against the same single user
        user_suggestions[fields[0]] = set(fields[1:]).difference(listened_user_tracks[fields[0]])
- ```
- ### Clustering Tracks
- ```python
- tracks_clustering_model = joblib.load(local_path + "models/tracks-clustering-gmm-model.pkl")
- ```
- ```python
- tracks_mfcc = []
- with open(local_path + "features/tracks-mfcc.csv") as f:
- f.readline()
- for line in f:
- line = line.strip(" \t\n\r").split()
- tracks_mfcc.append([float(field) for field in line[1:]])
- ```
- ```python
- cluster_assignments = tracks_clustering_model.predict(tracks_mfcc)
- ```
- ### Loading User Tracks
- ```python
- user = list(user_suggestions)[0]
- ```
- ```python
- user_tracks = listened_user_tracks[user]
- user_suggestions = user_suggestions[user]
- ```
- ```python
- user_tracks_mfcc = []
- user_suggestions_mfcc = []
- with open(local_path + "features/tracks-mfcc.csv") as f:
- f.readline()
- for line in f:
- line = line.strip(" \t\n\r").split()
- if line[0] in user_tracks:
- user_tracks_mfcc.append([float(field) for field in line[1:]])
- if line[0] in user_suggestions:
- user_suggestions_mfcc.append([float(field) for field in line[1:]])
- ```
- ### LDA Plot of User Tracks and Suggestions
- ```python
- lda_model = LinearDiscriminantAnalysis(n_components=2).fit(tracks_mfcc, cluster_assignments)
- ```
- ```python
- decomposed_tracks_mfcc = lda_model.transform(tracks_mfcc)
- decomposed_user_tracks_mfcc = lda_model.transform(user_tracks_mfcc)
- decomposed_user_suggestions_mfcc = lda_model.transform(user_suggestions_mfcc)
- ```
- ```python
- for i in range(n_clusters):
- plt.scatter(decomposed_tracks_mfcc[cluster_assignments == i, 0], decomposed_tracks_mfcc[cluster_assignments == i, 1], alpha=.8, rasterized=True, s=0.7)
- plt.scatter(decomposed_user_tracks_mfcc[:, 0], decomposed_user_tracks_mfcc[:, 1], alpha=1, s=8, c="blue")
- plt.scatter(decomposed_user_suggestions_mfcc[:, 0], decomposed_user_suggestions_mfcc[:, 1], alpha=1, s=8, c="black")
- plt.gca().set_xlim([-15, 5])
- plt.gca().set_ylim([-4, 4.5])
- plt.title("Tracks MFCC: LDA Plot (After GMM)")
- plt.savefig(local_path + "plots/tracks-mfcc-lda-exploited-suggestions.png", dpi=250)
- plt.show()
- ```
- # Exploration
- ## Generating Track Recommendations through Exploration
- ```python
- from math import sqrt
- import numpy as np
- from sklearn.externals import joblib
- from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
- ```
- ```python
- local_path = root_path + "data/"
- n_clusters = 10
- n_suggestions = 25
- ```
- ```python
- users = set()
- tracks = set()
- with open(local_path + "taste-profile-subset/user-track-counts.txt") as f:
- for line in f:
- line = line.strip(" \t\n\r").split("\t")
- users.add(line[0])
- tracks.add(line[1])
- ```
- ```python
- users = list(users)
- tracks = list(tracks)
- ```
- ```python
- user_indices = dict()
- track_indices = dict()
- for i, user in enumerate(users):
- user_indices[user] = i
- for i, track in enumerate(tracks):
- track_indices[track] = i
- ```
- ```python
- user_tracks = dict()
- track_features = dict()
- for i in range(len(users)):
- user_tracks[i] = set()
- for i in range(len(tracks)):
- track_features[i] = [0, -1, -1]
- ```
- ```python
- with open(local_path + "taste-profile-subset/user-track-counts.txt") as f:
- for line in f:
- line = line.strip(" \t\n\r").split("\t")
- user, track = user_indices[line[0]], track_indices[line[1]]
- user_tracks[user].add(track)
- track_features[track][0] += 1
- ```
- ```python
- clustered_tracks = dict()
- for cluster in range(n_clusters):
- clustered_tracks[cluster] = []
- with open(local_path + "features/tracks-cluster-probabilities.csv") as f:
- for track in f:
- track = track.strip(" \t\n\r").split("\t")
- if track[0] in track_indices:
- track[0] = track_indices[track[0]]
- clustered_tracks[int(track[-1])].append(track[0])
- track_features[track[0]][1] = int(track[-1])
- track_features[track[0]][2] = float(track[int(track[-1]) + 1])
- ```
- ```python
- track_features[list(track_features)[0]]
- ```
- ```python
- for cluster in clustered_tracks:
- clustered_tracks[cluster].sort(key=lambda track: -track_features[track][0] * track_features[track][2])
- ```
- ```python
- user_tracks_clusters = dict()
- for user in user_tracks:
- user_tracks_clusters[user] = []
- for cluster in range(n_clusters):
- user_tracks_clusters[user].append(1)
- for track in user_tracks[user]:
- user_tracks_clusters[user][track_features[track][1]] += 1
- ```
- ```python
- for user in user_tracks:
- normalization_const = 0
- for cluster in range(n_clusters):
- user_tracks_clusters[user][cluster] = sqrt(len(clustered_tracks[cluster])) / user_tracks_clusters[user][cluster]
- normalization_const += user_tracks_clusters[user][cluster]
- for cluster in range(n_clusters):
- user_tracks_clusters[user][cluster] = user_tracks_clusters[user][cluster] / normalization_const
- ```
- ```python
# Sample n_suggestions unseen tracks per user, drawing clusters from the
# user's exploration distribution and taking the best unplayed track of the
# drawn cluster each time.
outfile = open(local_path + "taste-profile-subset/suggestions-exploration.txt", "w")
user_suggestions = dict()
for user in user_tracks:
    suggestions = set([])
    # bug fix: one read pointer per *cluster*; the original sized this by
    # n_suggestions, which only worked because n_suggestions >= n_clusters
    cluster_indices = [0] * n_clusters
    while len(suggestions) < n_suggestions:
        cluster = np.argmax(np.random.multinomial(20, user_tracks_clusters[user], size=1))
        # advance past tracks the user already listened to
        # NOTE(review): raises IndexError if a cluster's ranked list is
        # exhausted — presumably clusters are large enough; confirm
        while clustered_tracks[cluster][cluster_indices[cluster]] in user_tracks[user]:
            cluster_indices[cluster] += 1
        suggestions.add(clustered_tracks[cluster][cluster_indices[cluster]])
        cluster_indices[cluster] += 1
    user_suggestions[user] = suggestions
    outfile.write(users[user] + "\t" + "\t".join([tracks[track] for track in suggestions]) + "\n")
outfile.close()
- ```
- ## Plotting User Suggestions
- ```python
- from sklearn.externals import joblib
- ```
- ```python
- local_path = root_path + "data/"
- ```
- ```python
- tracks_clustering_model = joblib.load(local_path + "models/tracks-clustering-gmm-model.pkl")
- ```
- ```python
- tracks_mfcc = []
- with open(local_path + "features/tracks-mfcc.csv") as f:
- f.readline()
- for line in f:
- line = line.strip(" \t\n\r").split()
- tracks_mfcc.append([float(field) for field in line[1:]])
- ```
- ```python
- cluster_assignments = tracks_clustering_model.predict(tracks_mfcc)
- ```
- ### Loading Tracks for First User
- ```python
# Load the first user's exploration suggestions from the dump.
user_suggestions = []
with open(local_path + "taste-profile-subset/suggestions-exploration.txt") as f:
    # bug fix: strip the trailing newline before splitting — otherwise the
    # last track id keeps a "\n" suffix and never matches in the membership
    # tests further down
    fields = f.readline().strip(" \n\r").split("\t")
    user_suggestions = fields[1:]
    user = fields[0]
- ```
- ```python
- user_tracks = []
- with open(local_path + "taste-profile-subset/user-track-counts.txt") as f:
- for line in f:
- line = line.strip(" \t\n\r").split("\t")
- if line[0] == user:
- user_tracks.append(line[1])
- ```
- ```python
- user_tracks_mfcc = []
- user_suggestions_mfcc = []
- with open(local_path + "features/tracks-mfcc.csv") as f:
- f.readline()
- for line in f:
- line = line.strip(" \t\n\r").split()
- if line[0] in user_tracks:
- user_tracks_mfcc.append([float(field) for field in line[1:]])
- if line[0] in user_suggestions:
- user_suggestions_mfcc.append([float(field) for field in line[1:]])
- ```
- ### LDA Plot of Tracks
- ```python
- lda_model = LinearDiscriminantAnalysis(n_components=2).fit(tracks_mfcc, cluster_assignments)
- ```
- ```python
- decomposed_tracks_mfcc = lda_model.transform(tracks_mfcc)
- decomposed_user_tracks_mfcc = lda_model.transform(user_tracks_mfcc)
- decomposed_user_suggestions_mfcc = lda_model.transform(user_suggestions_mfcc)
- ```
- ```python
- for i in range(n_clusters):
- plt.scatter(decomposed_tracks_mfcc[cluster_assignments == i, 0], decomposed_tracks_mfcc[cluster_assignments == i, 1], alpha=.8, rasterized=True, s=0.7)
- plt.scatter(decomposed_user_tracks_mfcc[:, 0], decomposed_user_tracks_mfcc[:, 1], alpha=1, s=8, c="blue")
- plt.scatter(decomposed_user_suggestions_mfcc[:, 0], decomposed_user_suggestions_mfcc[:, 1], alpha=1, s=15, c="black")
- plt.gca().set_xlim([-15, 5])
- plt.gca().set_ylim([-4, 4.5])
- plt.title("Tracks MFCC: LDA Plot (After GMM)")
- plt.savefig(local_path + "plots/tracks-mfcc-lda-explored-suggestions.png", dpi=250)
- plt.show()
- ```
Add Comment
Please, Sign In to add comment