Advertisement
Guest User

Untitled

a guest
Dec 11th, 2019
109
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.76 KB | None | 0 0
  1. def preproc(train):
  2.  
  3.   lat_list = [55, 56, 59, 60, 51]
  4.   lon_list = [36, 37, 38, 29, 30, 39]
  5.  
  6.   def nooutliers(data):
  7.     newdata = pd.DataFrame()
  8.     newdata_2 = pd.DataFrame()
  9.     for i in lat_list:
  10.       data_2 = data[data['lat'] > i]
  11.       data_3 = data_2[data_2['lat'] < i+1]
  12.       newdata = pd.concat([data_3, newdata])
  13.     for i in lon_list:
  14.       data_2 = newdata[newdata['lon'] > i]
  15.       data_3 = data_2[data_2['lon'] < i+1]
  16.       newdata_2 = pd.concat([data_3, newdata_2])
  17.  
  18.     newdata_2 = newdata_2.assign(Moscow=0)
  19.     newdata_2 = newdata_2.assign(Peter=0)
  20.     newdata_2 = newdata_2.assign(Voron=0)
  21.  
  22.     newdata_2['Moscow'] = newdata_2['lon'].apply(lambda x: 1 if x < 39 and x >= 36 else 0)
  23.     newdata_2['Peter'] = newdata_2['lon'].apply(lambda x: 1 if x < 31 and x >= 29 else 0)
  24.     newdata_2['Voron'] = newdata_2['lon'].apply(lambda x: 1 if x < 40 and x >= 39 else 0)
  25.  
  26.     return newdata_2
  27.  
  28.   train = nooutliers(train)
  29.  
  30.   from sklearn.preprocessing import OneHotEncoder, LabelEncoder
  31.   cat_clmns = ['f_class', 's_class', 't_class']
  32.   #orddict = {'econom':1, 'business':2, 'vip':3}
  33.   encoders = [LabelEncoder().fit(train[c]) for c in cat_clmns]
  34.   cat_dat = np.stack([enc.transform(train[c]) for enc, c in zip(encoders, cat_clmns)]).T
  35.   ohe = OneHotEncoder()
  36.   ohe.fit(cat_dat)
  37.   cat_ohe = ohe.transform(cat_dat).toarray()
  38.   cat_ohe
  39.  
  40.   data = train.copy()
  41.   for c in list(cat_clmns):
  42.     del data[c]
  43.   a = pd.DataFrame(cat_ohe, index = data.index)
  44.   a.columns = ['col0', 'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'col9', 'col10', 'col11']
  45.   data_OH = pd.concat([data, a], axis=1)
  46.   data_OH
  47.   data = data_OH.copy()
  48.   #data['f_class'] = data['f_class'].map(orddict)
  49.  # data['s_class'] = data['s_class'].map(orddict)
  50.   #data['t_class'] = data['t_class'].map(orddict)
  51.   #data['f_class'] = data['f_class'].fillna(1)
  52.   #data['s_class'] = data['s_class'].fillna(2)
  53.   #data['t_class'] = data['t_class'].fillna(3)
  54.    
  55.   from sklearn.preprocessing import StandardScaler
  56.   scaler = StandardScaler()
  57.   for i in ['Moscow', "Peter", 'Voron']:
  58.       b = pd.DataFrame(data[data[i] == 1].lat, index = data.index)
  59.       b['lon'] = data[data[i] == 1].lon
  60.  
  61.       data_scaled = scaler.fit_transform(b)
  62.  
  63.       c = pd.DataFrame(data_scaled, index = data.index)
  64.       c.columns = ['scaled_lat_' + i, 'scaled_lon_' + i]
  65.       c = c.fillna(0)
  66.  
  67.       data = pd.concat([data, c], axis=1)
  68.  
  69.   data['scaled_lat'] = data['scaled_lat_Moscow'] + data['scaled_lat_Peter'] + data['scaled_lat_Voron']
  70.   data['scaled_lon'] = data['scaled_lon_Moscow'] + data['scaled_lon_Peter'] + data['scaled_lon_Voron']
  71.  
  72.   data.drop(columns =["lat", 'lon', 'scaled_lat_Moscow', 'scaled_lat_Peter', 'scaled_lat_Voron', 'scaled_lon_Moscow', 'scaled_lon_Peter', 'scaled_lon_Voron', 'Voron'], inplace = True)
  73.  
  74.   new = data["due"].str.split(" ", n = 1, expand = True)
  75.   data["date"]= new[0]
  76.   data["time"]= new[1]
  77.   data.drop(columns =["due"], inplace = True)
  78.  
  79.   new2 = data["time"].str.split(".", n = 1, expand = True)
  80.   data["time"]= new2[0]
  81.   data['time'] = pd.to_timedelta(data['time'])
  82.  
  83.   data['sec'] = data['time'].dt.total_seconds()
  84.   data['sec'+'_sin']=np.sin((2*np.pi*data['sec'])/max(data['sec']))
  85.   data['sec'+'_cos']=np.cos((2*np.pi*data['sec'])/max(data['sec']))
  86.  
  87.   data=data.drop('time',axis=1)
  88.   data=data.drop('sec',axis=1)
  89.  
  90.   encoders = [LabelEncoder().fit(data['date'])]
  91.   date_cat = np.stack([enc.transform(data['date']) for enc, c in zip(encoders, cat_clmns)]).T
  92.   d = pd.DataFrame(date_cat, index = data.index)
  93.   d.columns = ['date_cat']
  94.   data = pd.concat([data, d], axis=1)
  95.   data['day'] = data['date_cat']%7
  96.   data['week'] = data['date_cat']//7
  97.  
  98.   data = data.drop('date', axis=1)
  99.   data = data.drop('date_cat', axis=1)
  100.  
  101.   return data.copy()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement