Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Lower bounds of the one-degree latitude / longitude bins that are kept;
# rows outside every bin are treated as outliers and dropped.
_LAT_BINS = (55, 56, 59, 60, 51)
_LON_BINS = (36, 37, 38, 29, 30, 39)

# (flag column, inclusive lower longitude, exclusive upper longitude).
_CITY_LON_RANGES = (('Moscow', 36, 39), ('Peter', 29, 31), ('Voron', 39, 40))

_CLASS_COLUMNS = ['f_class', 's_class', 't_class']
_SECONDS_PER_DAY = 24 * 60 * 60


def _rows_in_unit_bins(frame, column, lower_bounds):
    """Return the rows of ``frame`` whose ``column`` lies in some ``[b, b + 1)``.

    The result is grouped bin by bin, last bound first, and rows keep their
    relative order inside each bin.
    """
    parts = [
        frame[(frame[column] >= low) & (frame[column] < low + 1)]
        for low in lower_bounds
    ]
    # A single concat over real slices: no empty seed frame, no repeated
    # re-copying of the rows collected so far.
    return pd.concat(parts[::-1])


def _keep_known_cities(frame):
    """Drop rows outside the known lat/lon bins and add 0/1 city flag columns."""
    kept = _rows_in_unit_bins(frame, 'lat', _LAT_BINS)
    kept = _rows_in_unit_bins(kept, 'lon', _LON_BINS)
    lon = kept['lon']
    for city, low, high in _CITY_LON_RANGES:
        flag = ((lon >= low) & (lon < high)).astype('int64')
        kept = kept.assign(**{city: flag})
    return kept


def _standardize(values):
    """Scale a 1-D float array to zero mean and unit population variance.

    A constant array is only centred (the divisor falls back to 1), so it
    maps to zeros instead of NaN.
    """
    spread = values.std()
    if spread == 0:
        spread = 1.0
    return (values - values.mean()) / spread


def preproc(train):
    """Build the model feature table from the raw ``train`` frame.

    ``train`` must contain the columns ``lat``, ``lon``, ``f_class``,
    ``s_class``, ``t_class`` and ``due`` (a string of the form
    ``"<date> <HH:MM:SS>[.fraction]"``).  Any other column is passed through
    unchanged.  ``train`` itself is not modified.

    Steps:

    1. Keep only rows whose ``lat`` and ``lon`` fall into the configured
       one-degree bins and add ``Moscow`` / ``Peter`` / ``Voron`` 0/1 flags
       derived from ``lon``.
    2. Replace the three class columns by one-hot columns ``col0``, ``col1``,
       ... (categories sorted within each class column, class columns in the
       order ``f_class``, ``s_class``, ``t_class``).
    3. Replace ``lat`` / ``lon`` by ``scaled_lat`` / ``scaled_lon``,
       standardised separately inside each city.  The ``Voron`` flag is
       dropped afterwards; ``Moscow`` and ``Peter`` are kept.
    4. Replace ``due`` by ``sec_sin`` / ``sec_cos`` (time of day on a 24-hour
       circle) and by ``day`` / ``week`` (calendar-day offset from the
       earliest kept date, modulo 7 and integer-divided by 7).

    Returns:
        A new DataFrame with the remaining original columns followed by
        ``Moscow``, ``Peter``, the one-hot columns, ``scaled_lat``,
        ``scaled_lon``, ``sec_sin``, ``sec_cos``, ``day`` and ``week``.
    """
    train = _keep_known_cities(train)

    # One-hot encode the class columns.  Column names are generated from the
    # actual width, so any number of distinct categories is supported.
    onehot = pd.get_dummies(
        train[_CLASS_COLUMNS], columns=_CLASS_COLUMNS, dtype='float64'
    )
    onehot.columns = ['col' + str(k) for k in range(onehot.shape[1])]
    data = pd.concat([train.drop(columns=_CLASS_COLUMNS), onehot], axis=1)

    # Standardise coordinates per city; a city without rows contributes nothing.
    for coord in ('lat', 'lon'):
        raw = data[coord].to_numpy(dtype='float64')
        scaled = np.zeros(len(data))
        for city, _, _ in _CITY_LON_RANGES:
            in_city = (data[city] == 1).to_numpy()
            if in_city.any():
                scaled[in_city] = _standardize(raw[in_city])
        data['scaled_' + coord] = scaled
    data = data.drop(columns=['lat', 'lon', 'Voron'])

    # Split "<date> <time>[.fraction]" and discard the fractional seconds.
    due_parts = data['due'].str.split(' ', n=1, expand=True)
    clock = due_parts[1].str.split('.', n=1, expand=True)[0]
    seconds = pd.to_timedelta(clock).dt.total_seconds()

    # The period is a fixed 24 hours, so the encoding does not depend on
    # which times happen to be present in this particular frame.
    angle = 2 * np.pi * seconds / _SECONDS_PER_DAY
    data['sec_sin'] = np.sin(angle)
    data['sec_cos'] = np.cos(angle)

    # Real calendar distance from the earliest date, so missing dates do not
    # shift the day-of-week / week numbering of later rows.
    dates = pd.to_datetime(due_parts[0])
    day_offset = (dates - dates.min()).dt.days.astype('int64')
    data['day'] = day_offset % 7
    data['week'] = day_offset // 7

    return data.drop(columns=['due'])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement