Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import numpy as np
import pandas as pd
from shapely import wkt

from src.geo_labeler import GeoLabeler
def floor4(val):
    """Return ``val`` converted to ``float`` and rounded to 4 decimal places.

    NOTE: despite the name, this does not floor. It uses the built-in
    ``round``, so ``1.23456`` becomes ``1.2346``.

    Args:
        val: Any value accepted by ``float()`` (for example a numeric string).

    Returns:
        The value as a ``float`` rounded to 4 decimal places.

    Raises:
        ValueError: If ``val`` is a string that cannot be parsed as a float.
        TypeError: If ``val`` is of a type ``float()`` does not accept.
    """
    return round(float(val), 4)
# Prepare the raw "intouchs" event log: filter it, thin it out to one event
# per customer/city per time bin, label every event with a hexagon and write
# one CSV per minute shift of the 10-minute binning grid.
source = 'intouchs'
folder = 'world-test'
regions = ['44', '55']

pd.set_option('display.max_columns', 13)
pd.set_option('display.width', 1000)

# Every column is read as str, so city_id is compared against the string
# codes in `regions`; coordinates are converted to floats further below.
intouchs = pd.read_csv('data/' + folder + '/source/' + source + '.csv', index_col=0, dtype=str)
print('original shape:', intouchs.shape)

intouchs = intouchs[intouchs['city_id'].isin(regions)]
print('filtered by regions:', regions, intouchs.shape)

# Reassign instead of inplace=True: the frame is a filtered selection and
# in-place mutation of it triggers pandas' chained-assignment warnings.
intouchs = intouchs.drop_duplicates(subset=['customer_id', 'event_timestamp'])
print('duplicates are dropped:', intouchs.shape)

# Drop events whose coordinates round to exactly (42.3503, -71.0571).
# .copy() gives an independent frame so the column assignments below are safe.
is_bad_square = (intouchs['latitude'].map(floor4) == 42.3503) & (intouchs['longitude'].map(floor4) == -71.0571)
intouchs = intouchs[~is_bad_square].copy()
print('filtered bad square', intouchs.shape)

intouchs['event_timestamp'] = pd.to_datetime(intouchs['event_timestamp'])
# intouchs = intouchs[intouchs['event_timestamp'] <= '2016-05-01']
intouchs['longitude'] = intouchs['longitude'].astype(np.float64)
intouchs['latitude'] = intouchs['latitude'].astype(np.float64)
print(intouchs)

# Pre-filter: keep the first event per customer/city in every 1-minute bin.
# pd.Grouper replaces pd.TimeGrouper, which was removed in pandas 1.0.
grouper = pd.Grouper(key='event_timestamp', freq='1min')
intouchs = intouchs.sort_values('event_timestamp').groupby([grouper, 'customer_id', 'city_id'], as_index=False).first()
print('prefiltered', intouchs.shape)

hexagons = pd.read_csv('data/%s/source/hexagons.csv' % folder, index_col=0)
hexagons['polygon'] = hexagons.apply(lambda row: wkt.loads(row['polygon']), axis=1)
labeler = GeoLabeler(hexagons['polygon'])

# The sort does not depend on the shift, so do it once instead of per iteration.
sorted_intouchs = intouchs.sort_values('event_timestamp')

for i in range(10):
    # 10-minute bins whose edges are shifted by i minutes. `offset` replaces
    # the `base` argument, which was removed in pandas 2.0.
    grouper = pd.Grouper(key='event_timestamp', freq='10min', offset=pd.Timedelta(minutes=i))
    filtered_intouchs = sorted_intouchs.groupby([grouper, 'customer_id', 'city_id'], as_index=False).first()
    print('filtered first with shift', str(i) + 'M', filtered_intouchs.shape)

    # NOTE(review): the return value is unused, so label() presumably writes
    # the 'polygon' column into filtered_intouchs in place - confirm against
    # GeoLabeler.
    labeler.label(filtered_intouchs, lon_column='longitude', lat_column='latitude', label_column='polygon')

    # Look up each label's polygon_group in hexagons; missing labels stay NaN.
    # pd.isna also copes with None / non-float labels, where np.isnan raises.
    filtered_intouchs['polygon_group'] = filtered_intouchs['polygon'].apply(
        lambda x: np.nan if pd.isna(x) else hexagons.loc[x, 'polygon_group'])

    filtered_intouchs.to_csv('data/%s/source/shifted/%s-%sM-filtered-labeled.csv' % (folder, source, i))

# print(intouchs)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement