Advertisement
Guest User

Untitled

a guest
Dec 8th, 2016
84
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.39 KB | None | 0 0
  1. import pandas as pd
  2. import seaborn as sns
  3. import matplotlib.pyplot as plt
  4. import numpy as np
  5. import networkx as nx
  6. import pickle
  7. from collections import deque
  8. import multiprocessing as mp
  9. import editdistance
  10. import os
  11. from sklearn.decomposition import PCA, KernelPCA
  12. from sklearn.manifold import TSNE
  13. from tqdm import tqdm
  14.  
  15. samplesize = 1000
  16. def wrapper(args):
  17. s1, s2 = args
  18. return editdistance.eval(s1, s2)
  19.  
  20. if not os.path.exists('distmatrix.csv'):
  21. # ----------------------------------------------------------------------------------------
  22. # Read the dataset
  23. # ----------------------------------------------------------------------------------------
  24. sols = pd.read_csv('solutions.csv', usecols=['QCode', 'SolutionID', 'Status', 'Language'])
  25. sols = sols.loc[sols.Language == 'C'].drop('Language', axis=1).dropna() # Only C
  26. sols = sols.loc[(sols.Status == 'accepted') | (sols.Status == 'wrong answer')]
  27. sols = sols.dropna().sample(samplesize*3) # Thrice the sample size we need
  28. #sols.Status = (sols.Status == 'accepted').astype(int)
  29. f, s, t = [pd.read_csv('code/' + i + '.csv') for i in ['first', 'second', 'third']]
  30. print('Concatenating solutions')
  31. c = pd.concat([f, s, t]); del(f); del(s); del(t)
  32. print('Merging with dataset')
  33. df = sols.merge(c, how='left', on='SolutionID'); del(c); del(sols)
  34. print(df.info())
  35. # ----------------------------------------------------------------------------------------
  36. # Create keyword structure
  37. # ----------------------------------------------------------------------------------------
  38. def to_keywords_only(code):
  39. C_keys = '''auto,break,case,char,const,continue,default,do,double,else,enum,extern,float,
  40. for,goto,if,int,long,register,return,short,signed,sizeof,static,struct,switch,typedef,
  41. union,unsigned,void,volatile,while'''.replace('\\n', '').split(',')
  42. special = '`1234567890-=+_)(*&^%$#@!~[]{}\|";:/?.>,<' + "'"
  43. if isinstance(code, str):
  44. code = code.replace('\n', ' ')
  45. for sp in special:
  46. code = code.replace(sp, ' {} '.format(sp))
  47. useful = ' '.join((word for word in code.split(' ') if word in (C_keys + list(special))))
  48. return useful
  49. else:
  50. return ''
  51. tqdm.pandas(desc='applying_structure', ncols=100)
  52. df['Structure'] = df.Solutions.progress_apply(to_keywords_only)
  53. print(df.info())
  54. # ----------------------------------------------------------------------------------------
  55. # Take a sample from the dataset and calculate distance matrix
  56. # ----------------------------------------------------------------------------------------
  57. print('Taking {} programs as sample'.format(samplesize))
  58. sample = df.dropna().sample(samplesize).dropna().copy()
  59. sample.to_csv('sample.csv', index=False)
  60. del(df) # We no longer need df
  61. print('Calculating Distance Matrix')
  62. distmatrix = deque()
  63. with mp.Pool() as pool:
  64. args = ((s1, s2) for s1 in sample.Structure.values for s2 in sample.Structure.values)
  65. work = pool.imap_unordered(wrapper, args)
  66. for value in tqdm(work, ncols=100, total=len(sample)**2):
  67. distmatrix.append(value)
  68. print('Saving distmatrix to disk')
  69. distmatrix = pd.DataFrame({'s1':[x for x in sample.SolutionID.values for y in sample.SolutionID.values],
  70. 's2':[y for x in sample.SolutionID.values for y in sample.SolutionID.values],
  71. 'distance':distmatrix})
  72. distmatrix['s1n'] = distmatrix.s1.str[1:].astype(int)
  73. distmatrix['s2n'] = distmatrix.s2.str[1:].astype(int)
  74. distmatrix.to_csv('distmatrix.csv', index=False)
  75.  
  76. sample = pd.read_csv('sample.csv')
  77. distmatrix = pd.read_csv('distmatrix.csv')
  78. print('Done')
  79. # ----------------------------------------------------------------------------------------
  80. # Calculate graph positions on x, y plane
  81. # ----------------------------------------------------------------------------------------
  82. print(distmatrix.info())
  83. # Now we locate x, y locations for the points on the graph.
  84. print('Making graph')
  85. G = nx.Graph()
  86. G.add_nodes_from(list(set(distmatrix['s1n'])))
  87. def gen_edges(distmatrix):
  88. for _, val in distmatrix[['s1n', 's2n', 'distance']].iterrows():
  89. weight = val.values[0]
  90. s1 = val.values[1]
  91. s2 = val.values[2]
  92. yield (s1, s2, {'weight': weight})
  93. G.add_edges_from(gen_edges(distmatrix[['s1n','s2n','distance']]))
  94. print('Calculating X, Y positions')
  95. pos = {key: val for key, val in nx.spring_layout(G).items()}
  96. sample['solidno'] = sample.SolutionID.str[1:].astype(int)
  97. sample['xy'] = sample.solidno.map(pos)
  98. sample['x'] = sample.xy.str[0].astype(float)
  99. sample['y'] = sample.xy.str[1].astype(float)
  100. sample = sample.drop('xy', axis=1)
  101. print(sample.info())
  102. sample.to_csv('sample.csv', index=False)
  103. sample = pd.read_csv('sample.csv')
  104. # ----------------------------------------------------------------------------------------
  105. # Plot the figure
  106. # ----------------------------------------------------------------------------------------
  107. print('Plotting figure')
  108. size = 7
  109. sns.lmplot('x', 'y', data=sample, fit_reg=False, hue='QCode', size=size, legend=False, col='Status')
  110. plt.savefig("split.png") # save as png
  111. sns.lmplot('x', 'y', data=sample, fit_reg=False, hue='QCode', size=size, legend=False)
  112. plt.savefig("single.png") # save as png
  113. #plt.plot(df.x, df.y, 'o', alpha=0.5)
  114. print('Done')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement