Advertisement
Guest User

Untitled

a guest
Sep 26th, 2016
63
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.18 KB | None | 0 0
import numpy as np
import pandas as pd
from shapely import wkt

from src.geo_labeler import GeoLabeler

  6. def floor4(val):
  7. return round(float(val), 4)
  8.  
  9. source = 'intouchs'
  10. folder = 'world-test'
  11. regions = ['44', '55']
  12.  
  13. pd.set_option('display.max_columns', 13)
  14. pd.set_option('display.width', 1000)
  15.  
  16. intouchs = pd.read_csv('data/' + folder + '/source/' + source + '.csv', index_col=0, dtype=str)
  17. print('original shape:', intouchs.shape)
  18.  
  19. intouchs = intouchs[intouchs['city_id'].isin(regions)]
  20. print('filtered by regions:', regions, intouchs.shape)
  21.  
  22. intouchs.drop_duplicates(subset=['customer_id', 'event_timestamp'], inplace=True)
  23. print('duplicates are dropped:', intouchs.shape)
  24.  
  25. intouchs = intouchs[(intouchs['latitude'].map(floor4) != 42.3503) | (intouchs['longitude'].map(floor4) != -71.0571)]
  26. print('filtered bad square', intouchs.shape)
  27.  
  28. intouchs['event_timestamp'] = pd.to_datetime(intouchs['event_timestamp'])
  29. # intouchs = intouchs[intouchs['event_timestamp'] <= '2016-05-01']
  30. intouchs['longitude'] = intouchs['longitude'].astype(np.float64)
  31. intouchs['latitude'] = intouchs['latitude'].astype(np.float64)
  32. print(intouchs)
  33.  
  34. grouper = pd.TimeGrouper(key='event_timestamp', freq='1T')
  35. intouchs = intouchs.sort_values('event_timestamp').groupby([grouper, 'customer_id', 'city_id'], as_index=False).first()
  36.  
  37. print('prefiltered', intouchs.shape)
  38.  
  39. hexagons = pd.read_csv('data/%s/source/hexagons.csv' % folder, index_col=0)
  40. hexagons['polygon'] = hexagons.apply(lambda row: wkt.loads(row['polygon']), axis=1)
  41. labeler = GeoLabeler(hexagons['polygon'])
  42.  
  43. for i in range(10):
  44. grouper = pd.TimeGrouper(key='event_timestamp', freq='10T', base=i)
  45. filtered_intouchs = intouchs.sort_values('event_timestamp').groupby([grouper, 'customer_id', 'city_id'], as_index=False).first()
  46. print('filtered first with shift', str(i) + 'M', filtered_intouchs.shape)
  47. labeler.label(filtered_intouchs, lon_column='longitude', lat_column='latitude', label_column='polygon')
  48. filtered_intouchs['polygon_group'] = filtered_intouchs['polygon'].apply(lambda x: np.NaN if np.isnan(x) else hexagons.loc[x, 'polygon_group'])
  49. filtered_intouchs.to_csv('data/%s/source/shifted/%s-%sM-filtered-labeled.csv' % (folder, source, i))
  50.  
  51. # print(intouchs)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement