Advertisement
Guest User

Untitled

a guest
Jun 20th, 2019
122
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 12.66 KB | None | 0 0
  1. from sklearn.feature_extraction import DictVectorizer
  2. from tqdm import tqdm_notebook
  3. from sklearn.metrics import confusion_matrix
  4. from sklearn.metrics import precision_recall_fscore_support
  5. from sklearn.model_selection import KFold, StratifiedKFold
  6. from sklearn import svm
  7. from sklearn.ensemble import RandomForestClassifier
  8. from sklearn.neighbors import KNeighborsClassifier
  9. from sklearn.utils import shuffle
  10. import glob
  11. import os
  12.  
  13. from msbase.utils import load_json
  14. from matplotlib.pyplot import cm
  15.  
  16. import pandas as pd
  17. import numpy as np
  18.  
  19. import json
  20.  
  def load_vectors(vectors_dir: str, labels):
      """Load per-label feature vectors from ``<vectors_dir>/<label>-vectors.json``.

      Each JSON file is read as a sequence of ``(apk, vector)`` pairs.

      Args:
          vectors_dir: Directory holding one ``<label>-vectors.json`` per label.
          labels: Label names; a label's position in this sequence becomes
              the class index stored in ``DY``.

      Returns:
          A 5-tuple ``(DX, DY, DZ, feature_names, DAPKs)``:

          * ``DX`` -- numpy array of feature rows (dict vectors are expanded
            with ``DictVectorizer(sparse=False)``).
          * ``DY`` -- numpy array of class indices, one per row.
          * ``DZ`` -- list of binary flags: 0 for the ``"benign"`` label,
            1 for every other label.
          * ``feature_names`` -- the vectorizer's feature names, or ``None``
            when the vectors are not dicts (or nothing was loaded).
          * ``DAPKs`` -- list of apk identifiers in row order.
      """
      DX = []
      DY = []
      DZ = []
      DAPKs = []
      for i, label in enumerate(labels):
          path = os.path.join(vectors_dir, label + "-vectors.json")
          # Context manager so the file handle is closed deterministically.
          with open(path, "r") as handle:
              vectors = json.load(handle)

          DAPKs += [apk for apk, _ in vectors]
          DX += [v for _, v in vectors]
          DY += [i] * len(vectors)
          DZ += [0 if label == "benign" else 1] * len(vectors)

      # Always bound, so the non-dict path no longer fails at the return.
      feature_names = None
      if DX and isinstance(DX[0], dict):
          vectorizer = DictVectorizer(sparse=False)
          DX = vectorizer.fit_transform(DX)
          feature_names = vectorizer.feature_names_
      else:
          DX = np.array(DX)
      return DX, np.array(DY), DZ, feature_names, DAPKs
  44.  
  def classify_fold(train_X, train_Y, test_X, test_Y,
                    labels, feature_names,
                    n_estimators, max_features, max_depth, report=False):
      """Fit a random forest on one fold and evaluate it on the held-out part.

      Args:
          train_X, train_Y: Training features and targets.
          test_X, test_Y: Held-out features and targets.
          labels: Full set of class labels; the probability matrix is widened
              to ``len(labels)`` columns when the training fold lacks a class.
          feature_names: Index used for the feature-importance table.
          n_estimators, max_features, max_depth: Forwarded to
              ``RandomForestClassifier``.
          report: When true, return the detailed tuple instead of the score.

      Returns:
          If ``report`` is true, the 8-tuple
          ``(None, None, None, None, feature_importances, test_Y, pred_Y,
          pred_proba_Y)``; otherwise ``classifier.score(test_X, test_Y)``.
      """
      classifier = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators,
                                          max_features=max_features, n_jobs=6, random_state=33)
      classifier.fit(train_X, train_Y)
      pred_Y = classifier.predict(test_X)
      pred_proba_Y = classifier.predict_proba(test_X)

      # predict_proba columns follow classifier.classes_, i.e. only the classes
      # seen during fit. When some class is missing from this fold, put each
      # column at the position of its class in `labels`, so a gap in the
      # middle does not shift the later classes.
      label_pos = {label: pos for pos, label in enumerate(labels)}
      seen = list(classifier.classes_)
      if len(seen) < len(labels) and all(c in label_pos for c in seen):
          aligned = np.zeros((pred_proba_Y.shape[0], len(labels)), dtype=pred_proba_Y.dtype)
          aligned[:, [label_pos[c] for c in seen]] = pred_proba_Y
          pred_proba_Y = aligned
      else:
          # Classes cannot be mapped onto `labels`: keep the previous behaviour
          # of appending zero columns up to len(labels).
          for i in range(pred_proba_Y.shape[1], len(labels)):
              pred_proba_Y = np.insert(pred_proba_Y, i, 0, axis=1)

      if report:
          feature_importances = pd.DataFrame(classifier.feature_importances_,
                                             index=feature_names,
                                             columns=['importance']).sort_values('importance', ascending=False)
          # The four leading None slots stand in for precision/recall/fscore/support,
          # which are not computed per fold.
          return (None, None, None, None, feature_importances, test_Y, pred_Y, pred_proba_Y)
      return classifier.score(test_X, test_Y)
  61.  
  def classify(DX, DY, labels, feature_names, DAPKs, n_estimators, max_features, split_ratio, max_depth, report=False):
      """Shuffle the data, train a random forest on the head, evaluate on the tail.

      The first ``int(len(DY) * split_ratio)`` shuffled samples are used for
      training; the remainder is the test set.

      Returns:
          With ``report`` false, the classifier's accuracy score on the test
          set. With ``report`` true, the ``precision_recall_fscore_support``
          tuple extended with ``(feature_importances, test_Y, pred_Y,
          pred_proba_Y, test_APKs)``.
      """
      shuffled_X, shuffled_Y, shuffled_APKs = shuffle(DX, DY, DAPKs)
      forest = RandomForestClassifier(
          max_depth=max_depth,
          n_estimators=n_estimators,
          max_features=max_features,
          n_jobs=6,
      )

      cut = int(len(shuffled_Y) * split_ratio)
      forest.fit(shuffled_X[:cut], shuffled_Y[:cut])

      held_X = shuffled_X[cut:]
      held_Y = shuffled_Y[cut:]
      held_APKs = shuffled_APKs[cut:]
      predicted = forest.predict(held_X)
      predicted_proba = forest.predict_proba(held_X)

      if not report:
          return forest.score(held_X, held_Y)

      importance_table = pd.DataFrame(
          forest.feature_importances_,
          index=feature_names,
          columns=['importance'],
      )
      importance_table = importance_table.sort_values('importance', ascending=False)
      extras = (importance_table, held_Y, predicted, predicted_proba, held_APKs)
      return precision_recall_fscore_support(held_Y, predicted, labels=labels) + extras
  80.  
  def classify_knn(DX, DY, labels, split_ratio=0.7, n_neighbors=3, report=False):
      """Shuffle the data and evaluate a k-nearest-neighbours classifier.

      The first ``int(len(DY) * split_ratio)`` shuffled samples train the
      model; the rest are used for testing.

      Returns:
          ``precision_recall_fscore_support(...)`` for the test split when
          ``report`` is true, otherwise the classifier's accuracy score.
      """
      shuffled_X, shuffled_Y = shuffle(DX, DY)
      knn = KNeighborsClassifier(n_neighbors=n_neighbors)

      cut = int(len(shuffled_Y) * split_ratio)
      knn.fit(shuffled_X[:cut], shuffled_Y[:cut])

      held_X, held_Y = shuffled_X[cut:], shuffled_Y[cut:]
      predicted = knn.predict(held_X)

      if not report:
          return knn.score(held_X, held_Y)
      return precision_recall_fscore_support(held_Y, predicted, labels=labels)
  94.  
  def classify_svm(DX, DY, labels, kernel="rbf", split_ratio=0.7, report=False):
      """Shuffle the data and evaluate a support-vector classifier.

      The first ``int(len(DY) * split_ratio)`` shuffled samples train the
      model; the rest are used for testing.

      Returns:
          When ``report`` is true, the pair
          ``(precision_recall_fscore_support(...), classifier)``; otherwise
          only the classifier's accuracy score on the test split.
      """
      shuffled_X, shuffled_Y = shuffle(DX, DY)
      svc = svm.SVC(kernel=kernel)

      cut = int(len(shuffled_Y) * split_ratio)
      svc.fit(shuffled_X[:cut], shuffled_Y[:cut])

      held_X, held_Y = shuffled_X[cut:], shuffled_Y[cut:]
      predicted = svc.predict(held_X)

      if not report:
          return svc.score(held_X, held_Y)
      metrics = precision_recall_fscore_support(held_Y, predicted, labels=labels)
      return metrics, svc
  108.  
  def matrix(DX, DY, labels):
      """Plot mean random-forest accuracy over a grid of hyper-parameters.

      For every ``(n_estimators, max_features)`` pair from a fixed grid (values
      larger than the number of feature columns are skipped), ``classify`` is
      run 10 times with a 0.7 split ratio and the scores are averaged. One line
      per ``n_estimators`` value is drawn against ``max_features``.

      Args:
          DX: 2-D feature matrix (``DX.shape`` must unpack to two values).
          DY: Targets aligned with the rows of ``DX``.
          labels: Forwarded to ``classify``.
      """
      # Function-scope import: the module only imports `cm` from pyplot, so
      # `plt` was previously an undefined name here.
      import matplotlib.pyplot as plt

      _, n_feats = DX.shape
      grid = [2, 20, 60, 80, 100, 160, 200]
      # `classify` requires DAPKs so it can shuffle them with the data; the
      # values are irrelevant when report=False, so use row indices.
      placeholder_apks = list(range(len(DY)))
      estimate_scores = {}

      for n_estimators in grid:
          if n_estimators > n_feats:
              continue
          estimate_scores[n_estimators] = {}
          for max_features in grid:
              if max_features > n_feats:
                  continue
              scores = [
                  classify(DX, DY, labels=labels, feature_names=None,
                           DAPKs=placeholder_apks, n_estimators=n_estimators,
                           max_features=max_features, split_ratio=0.7,
                           max_depth=None)  # None = unlimited depth, sklearn's default
                  for _ in range(10)
              ]
              estimate_scores[n_estimators][max_features] = np.mean(scores)

      color = cm.rainbow(np.linspace(0, 1, len(estimate_scores)))

      # One colour per n_estimators value, in insertion order.
      for idx, (n_estimators, scores) in enumerate(estimate_scores.items()):
          xs, ys = zip(*scores.items())
          plt.plot(xs, ys, c=color[idx], label=str(n_estimators))
      plt.xlabel("max_features")
      plt.ylabel("accuracy")
      plt.legend()
      plt.show()
  137.  
  def avg_eval(DX, DY, DAPKs, combined_labels, combined_labels_index, feature_names,
               max_features, n_estimators, n_fold, max_depth):
      """Run shuffled K-fold evaluation with ``classify_fold`` and pool the results.

      Args:
          DX, DY: Feature matrix and targets; both are indexed with the integer
              index arrays produced by ``KFold.split``.
          DAPKs: Apk identifiers aligned with the rows of ``DX``.
          combined_labels: Not used by this function.
          combined_labels_index: Passed to ``classify_fold`` as its ``labels``.
          feature_names: Passed to ``classify_fold``.
          max_features, n_estimators, max_depth: Random-forest parameters.
          n_fold: Number of folds.

      Returns:
          ``(None, feature_importances_s, y_true, y_pred, y_pred_proba,
          APKs_test_all)`` where the ``y_*`` arrays are the per-fold results
          concatenated in fold order and ``feature_importances_s`` is the list
          of per-fold importance tables.
      """
      feature_importances_s = []
      y_true_all = []
      y_pred_all = []
      y_pred_proba_all = []
      APKs_test_all = []

      # Build the apk lookup once instead of once per fold.
      apk_column = pd.DataFrame(DAPKs, columns=["APK"])["APK"]

      kf = KFold(n_splits=n_fold, shuffle=True, random_state=36)
      for train_index, test_index in kf.split(DX):
          _, _, _, _, feature_importances, y_true, y_pred, y_pred_proba = \
              classify_fold(DX[train_index], DY[train_index], DX[test_index], DY[test_index],
                            combined_labels_index, feature_names,
                            max_features=max_features, n_estimators=n_estimators,
                            max_depth=max_depth,
                            report=True)
          feature_importances_s.append(feature_importances)
          y_true_all.append(y_true)
          y_pred_all.append(y_pred)
          y_pred_proba_all.append(y_pred_proba)
          APKs_test_all += list(apk_column.loc[test_index])

      return None, feature_importances_s, \
          np.concatenate(y_true_all), np.concatenate(y_pred_all), \
          np.concatenate(y_pred_proba_all), APKs_test_all
  162.  
  163. # NOTE: KMeans isn't very good
  164.  
  165. # kmeans = KMeans(n_clusters=len(combined_labels))
  166.  
  167. # y_pred = kmeans.fit_predict(DX)
  168.  
  169. # mat = confusion_matrix(DY, y_pred).T
  170. # mat
  171.  
  172. # size_array = np.array([n for l, n in label_stat])
  173. # size_array
  174. # mat = (mat / size_array)
  175.  
  176. # sn.heatmap(mat,
  177. # xticklabels=labels,
  178. # yticklabels=range(len(labels)))
  179. # plt.xlabel('true label')
  180. # plt.ylabel('predicted label')
  181.  
  def load_vt_stat(apks):
      """Look up VirusTotal verdicts for ``apks`` in the sample metadata files.

      Metadata is read from ``samples_metadata/*/*.test.json`` two directories
      above the current working directory. The working directory is changed
      for the lookup and always restored afterwards, including on errors.

      Args:
          apks: Apk identifiers to look up; each must be hashable.

      Returns:
          ``(ret, ret_major)``, two lists aligned with ``apks``:
          ``ret[i]`` is 1 when the report has at least one positive scan, and
          ``ret_major[i]`` is 1 when positives exceed
          ``int(len(scans) * 0.5)``; otherwise 0.

      Raises:
          AssertionError: A matching record has no ``virustotal`` report, lacks
              ``positives``/``scans``, or its scan count differs from ``total``.
          KeyError: An apk in ``apks`` was not found in any metadata file.
      """
      # Set built once: membership is tested for every metadata record.
      wanted = set(apks)
      apks_is_malicious = {}
      apks_is_malicious_major = {}

      original_cwd = os.getcwd()
      os.chdir("../..")
      try:
          metadata_paths = glob.glob("samples_metadata/*/*.test.json")
          for metadata_path in metadata_paths:
              for test_data in load_json(metadata_path):
                  apk = test_data["apk"]
                  if apk not in wanted:
                      continue
                  assert "virustotal" in test_data, apk
                  vt_report = test_data["virustotal"]
                  assert "positives" in vt_report and "scans" in vt_report
                  n_scans = len(vt_report["scans"])
                  assert n_scans == vt_report["total"]
                  apks_is_malicious[apk] = vt_report["positives"] >= 1
                  apks_is_malicious_major[apk] = vt_report["positives"] > int(n_scans * 0.5)
      finally:
          # Restore the caller's directory even if a check above fails.
          os.chdir(original_cwd)

      ret = [int(apks_is_malicious[apk]) for apk in apks]
      ret_major = [int(apks_is_malicious_major[apk]) for apk in apks]
      assert len(ret) == len(apks)
      return ret, ret_major
  230. # vt_total += 1
  231. # if vt_total > 0:
  232. # vt_stat[label] = {
  233. # "vt_frac": vt_frac_positives / vt_total,
  234. # "vt_exist": vt_exist_positives / vt_total,
  235. # "vt_major": vt_major_positives / vt_total,
  236. # "vt_support": vt_total,
  237. # }
  238.  
  239.  
  240. # for label, samples in bin_samples.items():
  241. # vt_frac_positives = 0
  242. # vt_exist_positives = 0
  243. # vt_major_positives = 0
  244. # vt_total = 0
  245. # for test_data in samples:
  246. # if "virustotal" in test_data:
  247. # vt_report = test_data["virustotal"]
  248. # if "positives" in vt_report and "scans" in vt_report:
  249. # vt_frac_positives += int(vt_report["positives"]) / len(vt_report["scans"])
  250. # vt_exist_positives += int(vt_report["positives"] > 1)
  251. # vt_major_positives += int(vt_report["positives"] > len(vt_report["scans"]) * 0.5)
  252. # vt_total += 1
  253. # if vt_total > 0:
  254. # vt_stat_bin[label] = {
  255. # "vt_frac": vt_frac_positives / vt_total,
  256. # "vt_exist": vt_exist_positives / vt_total,
  257. # "vt_major": vt_major_positives / vt_total,
  258. # "vt_support": vt_total,
  259. # }
  260.  
  261. # vt_result_df = pd.DataFrame(vt_stat).T
  262. # vt_stat_bin_df = pd.DataFrame(vt_stat_bin).T
  263. # vt_stat_bin_df = vt_stat_bin_df.rename(index={True: "benign", False: "malicious"}).drop(["vt_frac", "vt_major"], axis=1)
  264. # return vt_result_df, vt_stat_bin_df
  265.  
  266. # FIXME: PCA is not good
  267.  
  268. # from sklearn.decomposition import PCA
  269. # from mpl_toolkits.mplot3d import Axes3D
  270.  
  271. # pca = PCA(n_components=3)
  272. # pca_2 = PCA(n_components=2)
  273.  
  274. # components = pca.fit_transform(DX)
  275. # components_2 = pca_2.fit_transform(DX)
  276.  
  277. # result = pd.DataFrame(components, columns=['PCA%i' % i for i in range(3)])
  278. # print(result.shape)
  279.  
  280. # result_2 = pd.DataFrame(components_2, columns=['PCA%i' % i for i in range(2)])
  281. # print(result_2.shape)
  282.  
  283. # def plot(color_map, DY, labels):
  284. # colors = [color_map[y] for y in DY]
  285.  
  286. ## Plot initialisation
  287. # fig = plt.figure(figsize=(8, 6))
  288. # ax = Axes3D(fig)
  289. # ax.scatter(result['PCA0'], result['PCA1'], result['PCA2'], c=colors, cmap="Set2_r", s=60)
  290.  
  291. ## make simple, bare axis lines through space:
  292. # xAxisLine = ((min(result['PCA0']), max(result['PCA0'])), (0, 0), (0,0))
  293. # ax.plot(xAxisLine[0], xAxisLine[1], xAxisLine[2], 'r')
  294. # yAxisLine = ((0, 0), (min(result['PCA1']), max(result['PCA1'])), (0,0))
  295. # ax.plot(yAxisLine[0], yAxisLine[1], yAxisLine[2], 'r')
  296. # zAxisLine = ((0, 0), (0,0), (min(result['PCA2']), max(result['PCA2'])))
  297. # ax.plot(zAxisLine[0], zAxisLine[1], zAxisLine[2], 'r')
  298.  
  299. ## label the axes
  300. # ax.set_xlabel("PC1")
  301. # ax.set_ylabel("PC2")
  302. # ax.set_zlabel("PC3")
  303.  
  304. # markers = [plt.Line2D([0,0],[0,0], color=color, marker='o', linestyle='') for color in color_map]
  305. # plt.legend(markers, labels, numpoints=1)
  306.  
  307. # color_map = cm.rainbow(np.linspace(0, 1, len(combined_labels)))
  308. # plot(color_map, DY, combined_labels)
  309.  
  310. # color_map_2 = cm.rainbow(np.linspace(0, 1, 2))
  311. # plot(color_map_2, DZ, ["benign", "malicous"])
  312.  
  313. # DX_pca = result
  314. # DX_pca.shape
  315.  
  316.  
  317. # results, classifier = classify_svm(result_2, DZ, labels=[0,1], report=True)
  318. # results
  319.  
  320. # plt.figure(1, figsize=(4, 3))
  321. # colors_2 = [color_map_2[y] for y in DZ]
  322. # plt.scatter(result_2['PCA0'], result['PCA1'], c=colors_2, zorder=10, cmap=plt.cm.Paired,
  323. # edgecolors='k')
  324.  
  325. # plt.scatter(classifier.support_vectors_[:, 0], classifier.support_vectors_[:, 1], s=80,
  326. # facecolors='none', zorder=10, edgecolors='k')
  327.  
  328. # plt.axis('tight')
  329. # x_min = -4
  330. # x_max = 4
  331. # y_min = -4
  332. # y_max = 4
  333.  
  334. # XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
  335. # Z = classifier.decision_function(np.c_[XX.ravel(), YY.ravel()])
  336.  
  337. # Put the result into a color plot
  338. # Z = Z.reshape(XX.shape)
  339. # plt.figure(1, figsize=(4, 3))
  340. # plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
  341. # plt.contour(XX, YY, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'],
  342. # levels=[-1, -.5, 0, .5, 1])
  343.  
  344. # plt.xlim(x_min, x_max)
  345. # plt.ylim(y_min, y_max)
  346.  
  347. # plt.xticks(())
  348. # plt.yticks(())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement