Untitled

def preproc(train):
    """Turn the raw table into an all-numeric feature frame.

    Processing steps, in order:

    1. Keep only rows whose ``lat`` and ``lon`` lie strictly inside one of
       the hard-coded one-degree cells, then flag the city (``Moscow``,
       ``Peter``, ``Voron``) from the longitude band.
    2. One-hot encode ``f_class``, ``s_class`` and ``t_class`` into columns
       ``col0 .. col<k-1>``, where ``k`` is the total number of distinct
       categories found (features in that order, categories sorted within
       each feature).
    3. Standardise ``lat``/``lon`` separately per city and store the result
       in ``scaled_lat``/``scaled_lon``.
    4. Split ``due`` on its first space into a date part and a time part
       (anything after the first ``.`` of the time part is discarded), and
       derive ``sec_sin``/``sec_cos`` from the time and ``day``/``week`` from
       the date.

    Args:
        train: DataFrame holding at least the columns ``lat``, ``lon``,
            ``f_class``, ``s_class``, ``t_class`` and ``due`` (``due`` is
            assumed to be a string like ``"<date> <HH:MM:SS>[.fraction]"``).
            The input frame is not modified.

    Returns:
        A new DataFrame containing the untouched input columns followed by
        ``Moscow``, ``Peter``, ``col0 ..``, ``scaled_lat``, ``scaled_lon``,
        ``sec_sin``, ``sec_cos``, ``day`` and ``week``. The columns ``lat``,
        ``lon``, ``due`` and the three class columns are removed. Index
        labels are kept, but rows are grouped by coordinate cell rather than
        left in input order.
    """
    # Function-scope imports keep the block self-contained, in line with the
    # sklearn imports that were already local to this function.
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

    lat_list = [55, 56, 59, 60, 51]
    lon_list = [36, 37, 38, 29, 30, 39]
    # City flag -> half-open longitude band [low, high).
    city_lon_bands = {'Moscow': (36, 39), 'Peter': (29, 31), 'Voron': (39, 40)}
    cat_clmns = ['f_class', 's_class', 't_class']

    def cells(frame, column, lower_bounds):
        # One concat over all cells instead of growing a frame seeded with an
        # empty DataFrame. Iterating the bounds in reverse reproduces the row
        # grouping of the former "prepend every cell" loop.
        return pd.concat([
            frame[(frame[column] > low) & (frame[column] < low + 1)]
            for low in reversed(lower_bounds)
        ])

    def nooutliers(data):
        # Latitude filter first, longitude filter second.
        kept = cells(cells(data, 'lat', lat_list), 'lon', lon_list)
        lon = kept['lon']
        flags = {
            city: ((lon >= low) & (lon < high)).astype('int64')
            for city, (low, high) in city_lon_bands.items()
        }
        return kept.assign(**flags)

    train = nooutliers(train)

    # --- class columns -> one-hot -------------------------------------------
    class_codes = np.column_stack(
        [LabelEncoder().fit_transform(train[c]) for c in cat_clmns]
    )
    cat_ohe = OneHotEncoder().fit_transform(class_codes).toarray()
    # Name the columns from the actual encoded width; a fixed list of twelve
    # names breaks as soon as the data holds a different number of categories.
    ohe_frame = pd.DataFrame(
        cat_ohe,
        index=train.index,
        columns=['col' + str(k) for k in range(cat_ohe.shape[1])],
    )
    data = pd.concat([train.drop(columns=cat_clmns), ohe_frame], axis=1)

    # --- per-city standardisation of the coordinates ------------------------
    coords = data[['lat', 'lon']].to_numpy()
    scaled_parts = []
    for city in city_lon_bands:
        in_city = (data[city] == 1).to_numpy()
        # Rows of other cities are masked with NaN, which StandardScaler
        # leaves out of the fit and passes through on transform.
        city_coords = np.where(in_city[:, None], coords, np.nan)
        city_scaled = StandardScaler().fit_transform(city_coords)
        scaled_parts.append(np.where(np.isnan(city_scaled), 0.0, city_scaled))
    # The masked-out entries were zeroed above, so the sum simply picks, per
    # row, the value scaled within that row's own city.
    scaled = np.sum(scaled_parts, axis=0)
    data['scaled_lat'] = scaled[:, 0]
    data['scaled_lon'] = scaled[:, 1]

    # --- time of day ---------------------------------------------------------
    date_time = data['due'].str.split(" ", n=1, expand=True)
    date_part = date_time[0]
    clock = date_time[1].str.split(".", n=1, expand=True)[0]
    sec = pd.to_timedelta(clock).dt.total_seconds()
    # NOTE(review): the period is the largest second value seen in this frame,
    # not a fixed 86400, so the encoding depends on the data passed in.
    # Kept as-is so existing outputs do not shift.
    period = sec.max()
    data['sec_sin'] = np.sin((2 * np.pi * sec) / period)
    data['sec_cos'] = np.cos((2 * np.pi * sec) / period)

    # --- date -> day / week counters -----------------------------------------
    # NOTE(review): date_cat is the rank of the date string among the distinct
    # sorted date strings, so gaps in the observed dates shift day/week.
    date_cat = LabelEncoder().fit_transform(date_part)
    data['day'] = date_cat % 7
    data['week'] = date_cat // 7

    return data.drop(columns=['lat', 'lon', 'Voron', 'due'])