Advertisement
Guest User

Untitled

a guest
Nov 14th, 2019
154
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.98 KB | None | 0 0
  1. import pandas as pd
  2. import sklearn
  3. from sklearn.cluster import KMeans
  4. from sklearn import preprocessing
  5.  
  6. le = preprocessing.LabelEncoder()
  7.  
  8.  
  9.  
  10. pd.set_option('display.max_columns', None)
  11. #df1 = pd.read_excel('BVWEK33GZ17021703_MBB_Data Scientist Event.XLSX')
  12. df1 = pd.read_excel('c0.xlsx')
  13. le.fit(df1['vin'].unique())
  14. df1['num_vin'] = le.transform(df1['vin'])
  15. #print(df1.head())
  16.  
  17. #1 if successful, 0 if not
  18. success_codes = ['202', '200', '200 - OK', '202 - Accepted']
  19. df1['status_success'] = df1['responseStatus'].isin(success_codes)
  20. df1['strange_browser'] = df1['userAgent'].isin(['Opera/9.80 Presto/2.10.x',
  21. 'Apache-HttpClient/4.5.2 (Java 1.5 minimum; Java/1.8.0_152)',
  22. 'Apache-HttpClient/4.5.2 (Java 1.5 minimum; Java/1.8.0_162)',
  23. 'Opera/9.80 Presto/2.10.x', 'Java/1.8.0_162'])
  24.  
  25.  
  26.  
  27. traceId_unique = df1['traceId'].unique()
  28. parenttraceId_unique = df1['parentTraceId'].unique()
  29. commonIdSet = set(traceId_unique) & set(parenttraceId_unique)
  30. df1['hasParentId'] = df1['traceId'].isin(commonIdSet)
  31. tIdsWithParent = df1[(df1['hasParentId'] == True)]
  32.  
  33.  
  34. #clustering_columns = [df1['status_success'], df1['strange_browser'], df1['hasParentId'] ]
  35.  
  36.  
  37. #k_means = df1[['status_success','strange_browser','hasParentId', 'processingTime', 'num_vin']]
  38.  
  39.  
  40. # talk about this model! :::: ones: 127346, zeroes: 43972
  41.  
  42. scaler = preprocessing.MinMaxScaler()
  43. scaler.fit(df1['processingTime'])
  44.  
  45. print('i fit')
  46. #df1['scaled_times'] = scaler.fit_transform(df1['processingTime'])
  47. df1['scaled_times'] = (df1.processingTime - df1.processingTime.min() ) / df1.processingTime.max() - df1.processingTime.min()
  48. k_means = df1[['status_success','strange_browser', 'hasParentId', 'num_vin', 'scaled_times']]
  49.  
  50.  
  51. print(k_means.head())
  52.  
  53. kmeans = KMeans(n_clusters=2, random_state=0).fit(k_means)
  54.  
  55. ones = [a for a in kmeans.labels_ if a == 1]
  56. zeroes = [a for a in kmeans.labels_ if a == 0]
  57.  
  58. print(f'ones: {len(ones)}, zeroes: {len(zeroes)}')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement