Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import seaborn as sns
- import matplotlib.pyplot as plt
- import numpy as np
- import networkx as nx
- import pickle
- from collections import deque
- import multiprocessing as mp
- import editdistance
- import os
- from sklearn.decomposition import PCA, KernelPCA
- from sklearn.manifold import TSNE
- from tqdm import tqdm
# Number of programs drawn into the final sample. The pairwise distance
# matrix holds len(sample)**2 entries, so cost grows quadratically with it.
samplesize = 1000
def wrapper(args):
    """Return the edit distance between the two sequences packed in *args*.

    The pool mapping functions hand each worker call a single argument, so
    the pair travels as one 2-item iterable and is unpacked here.

    Args:
        args: An iterable of exactly two items ``(s1, s2)``.

    Returns:
        The result of ``editdistance.eval(s1, s2)``.
    """
    left, right = args
    return editdistance.eval(left, right)
# Everything down to the write of 'distmatrix.csv' is the cache-miss path:
# it only runs when no distance matrix has been saved by an earlier run.
if not os.path.exists('distmatrix.csv'):
    # ----------------------------------------------------------------------------------------
    # Read the dataset
    # ----------------------------------------------------------------------------------------
    sols = pd.read_csv('solutions.csv', usecols=['QCode', 'SolutionID', 'Status', 'Language'])
    sols = sols.loc[sols.Language == 'C'].drop('Language', axis=1).dropna()  # Only C
    sols = sols.loc[sols.Status.isin(['accepted', 'wrong answer'])]
    # Thrice the sample size we need, capped at what is available:
    # DataFrame.sample raises ValueError when asked for more rows than exist.
    sols = sols.sample(min(samplesize * 3, len(sols)))
    code_parts = [pd.read_csv('code/' + part + '.csv') for part in ('first', 'second', 'third')]
    print('Concatenating solutions')
    c = pd.concat(code_parts)
    del code_parts  # release the per-file frames before the merge
    print('Merging with dataset')
    # Left merge keeps every sampled solution; rows without a match in the
    # code files end up with missing values in the merged-in columns.
    df = sols.merge(c, how='left', on='SolutionID')
    del c, sols
    # info() prints its report itself and returns None, so it is not wrapped
    # in print() (that would emit a stray "None" line).
    df.info()
- # ----------------------------------------------------------------------------------------
- # Create keyword structure
- # ----------------------------------------------------------------------------------------
# C keywords that to_keywords_only() keeps.
_C_KEYWORDS = frozenset((
    'auto', 'break', 'case', 'char', 'const', 'continue', 'default', 'do',
    'double', 'else', 'enum', 'extern', 'float', 'for', 'goto', 'if', 'int',
    'long', 'register', 'return', 'short', 'signed', 'sizeof', 'static',
    'struct', 'switch', 'typedef', 'union', 'unsigned', 'void', 'volatile',
    'while',
))
# Characters kept as one-character tokens (digits, operators, punctuation).
_SPECIAL_CHARS = '`1234567890-=+_)(*&^%$#@!~[]{}\\|";:/?.>,<' + "'"
# Every token that survives filtering; a set gives O(1) membership tests.
_KEPT_TOKENS = _C_KEYWORDS | frozenset(_SPECIAL_CHARS)
# Translation table that surrounds each special character with spaces so it
# splits off as its own token.
_PAD_SPECIALS = str.maketrans({ch: ' {} '.format(ch) for ch in _SPECIAL_CHARS})


def to_keywords_only(code):
    """Reduce C source text to its keywords and special characters.

    Each special character (digit, operator, punctuation) is isolated as its
    own token, the text is split on whitespace, and only C keywords and
    special characters are kept. Identifiers and other words are dropped.

    Args:
        code: The source text. Any non-string value (for example the NaN
            pandas uses for a missing cell) is treated as "no code".

    Returns:
        The kept tokens joined by single spaces, or ``''`` when *code* is
        not a string.
    """
    if not isinstance(code, str):
        return ''
    padded = code.translate(_PAD_SPECIALS)
    # split() with no argument splits on any whitespace, so keywords that
    # follow a tab or a newline are recognised as well.
    return ' '.join(token for token in padded.split() if token in _KEPT_TOKENS)
# Still the cache-miss path: this step needs the merged `df` built while
# loading the dataset and is skipped when 'distmatrix.csv' already exists.
if not os.path.exists('distmatrix.csv'):
    tqdm.pandas(desc='applying_structure', ncols=100)
    df['Structure'] = df.Solutions.progress_apply(to_keywords_only)
    # info() prints its report itself and returns None; no print() wrapper.
    df.info()
    # ----------------------------------------------------------------------------------------
    # Take a sample from the dataset and calculate distance matrix
    # ----------------------------------------------------------------------------------------
    complete_rows = df.dropna()
    # Cap the request: DataFrame.sample raises ValueError when asked for
    # more rows than are available.
    n_take = min(samplesize, len(complete_rows))
    print('Taking {} programs as sample'.format(n_take))
    sample = complete_rows.sample(n_take).copy()
    sample.to_csv('sample.csv', index=False)
    del df, complete_rows  # We no longer need df
    print('Calculating Distance Matrix')
    structures = sample.Structure.values
    pairs = ((s1, s2) for s1 in structures for s2 in structures)
    with mp.Pool() as pool:
        # Ordered imap is required: the results are matched positionally
        # with the (s1, s2) ID pairs built below, and imap_unordered would
        # attach distances to the wrong pairs. chunksize batches the many
        # tiny tasks to cut inter-process overhead.
        work = pool.imap(wrapper, pairs, chunksize=1024)
        distances = list(tqdm(work, ncols=100, total=len(sample) ** 2))
    print('Saving distmatrix to disk')
    solution_ids = sample.SolutionID.values
    # Same nested iteration order as `pairs`, so row i of the frame
    # describes the pair whose distance is distances[i].
    distmatrix = pd.DataFrame({
        's1': [x for x in solution_ids for y in solution_ids],
        's2': [y for x in solution_ids for y in solution_ids],
        'distance': distances,
    })
    # Numeric form of the IDs: drop the first character and parse the rest.
    distmatrix['s1n'] = distmatrix.s1.str[1:].astype(int)
    distmatrix['s2n'] = distmatrix.s2.str[1:].astype(int)
    distmatrix.to_csv('distmatrix.csv', index=False)
# Reload both artefacts from disk so the rest of the script behaves the same
# whether they were just computed or left behind by an earlier run.
sample = pd.read_csv('sample.csv')
distmatrix = pd.read_csv('distmatrix.csv')
print('Done')
# ----------------------------------------------------------------------------------------
# Calculate graph positions on x, y plane
# ----------------------------------------------------------------------------------------
# info() prints its report itself and returns None; wrapping it in print()
# would emit a stray "None" line.
distmatrix.info()
# Now we locate x, y locations for the points on the graph.
print('Making graph')
G = nx.Graph()
# One node per distinct numeric solution ID.
G.add_nodes_from(set(distmatrix['s1n']))
def gen_edges(distmatrix):
    """Yield one weighted edge per row of a pairwise-distance table.

    Columns are read by name, so the result does not depend on the order of
    the columns in *distmatrix*.

    Args:
        distmatrix: A DataFrame with the columns ``'s1n'``, ``'s2n'`` and
            ``'distance'``.

    Yields:
        ``(s1n, s2n, {'weight': distance})`` tuples, in row order, in the
        form accepted by ``Graph.add_edges_from``.
    """
    columns = (distmatrix['s1n'], distmatrix['s2n'], distmatrix['distance'])
    for s1, s2, weight in zip(*columns):
        yield (s1, s2, {'weight': weight})
G.add_edges_from(gen_edges(distmatrix[['s1n', 's2n', 'distance']]))
print('Calculating X, Y positions')
# NOTE(review): the edge weights are raw edit distances; confirm that larger
# weights are meant to act as stronger attraction in the spring layout.
pos = nx.spring_layout(G)  # maps node -> (x, y)
sample['solidno'] = sample.SolutionID.str[1:].astype(int)
sample['xy'] = sample.solidno.map(pos)
# .str[i] indexes into each stored coordinate pair.
sample['x'] = sample.xy.str[0].astype(float)
sample['y'] = sample.xy.str[1].astype(float)
sample = sample.drop('xy', axis=1)
# info() prints its report itself and returns None; no print() wrapper.
sample.info()
# Persist the coordinates, then reload so plotting uses what is on disk.
sample.to_csv('sample.csv', index=False)
sample = pd.read_csv('sample.csv')
# ----------------------------------------------------------------------------------------
# Plot the figure
# ----------------------------------------------------------------------------------------
print('Plotting figure')
size = 7
# x and y are passed by keyword, which lmplot accepts whether or not the
# installed seaborn still allows them positionally.
# NOTE(review): newer seaborn releases name the `size` argument `height`;
# confirm against the installed version before changing it.
sns.lmplot(x='x', y='y', data=sample, fit_reg=False, hue='QCode', size=size, legend=False, col='Status')
plt.savefig("split.png")  # save as png
sns.lmplot(x='x', y='y', data=sample, fit_reg=False, hue='QCode', size=size, legend=False)
plt.savefig("single.png")  # save as png
print('Done')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement