Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Visit-duration analysis: drop outlier visits and unreliable stations,
# then summarise time_spent per station id and per chain name.
import pandas as pd

data = pd.read_csv('/datasets/visits_eng.csv', sep='\t')

# Flag visits whose time_spent is below 60 or above 1000.
data['too_fast'] = data['time_spent'].lt(60)
data['too_slow'] = data['time_spent'].gt(1000)

# Per-station share of too-fast visits (pivot_table aggregates with the
# mean by default, and the mean of a boolean flag is a fraction).
too_fast_stat = data.pivot_table(index='id', values='too_fast')
good_ids = too_fast_stat[too_fast_stat['too_fast'] < 0.5]

# Keep visits at stations where fewer than half the visits were too fast,
# then keep only visits with time_spent inside the inclusive 60..1000 range.
good_data = data[data['id'].isin(good_ids.index)]
good_data = good_data[good_data['time_spent'].between(60, 1000)]

# Median time_spent per individual station, before and after filtering.
station_stat = data.pivot_table(
    index='id',
    values='time_spent',
    aggfunc='median',
)
good_station_stat = good_data.pivot_table(
    index='id',
    values='time_spent',
    aggfunc='median',
)

# Per-chain statistics.
# NOTE(review): the unfiltered table uses the default aggregation (mean)
# while the filtered one uses the median -- confirm this is intentional.
name_stat = data.pivot_table(index='name', values='time_spent')
good_name_stat = good_data.pivot_table(
    index='name',
    values='time_spent',
    aggfunc='median',
)
name_stat['good_time_spent'] = good_name_stat['time_spent']

# For every remaining station: the first 'name' value seen and the number
# of filtered visits, joined with that station's median time_spent.
id_name = good_data.pivot_table(
    index='id',
    values='name',
    aggfunc=['first', 'count'],
)
id_name.columns = ['name', 'count']
station_stat_full = id_name.join(good_station_stat)

# Per-station means of time_spent and both outlier flags, followed by
# their correlation matrix and a pairwise scatter plot.
station_stat_multi = data.pivot_table(
    index='id',
    values=['time_spent', 'too_fast', 'too_slow'],
)
print(station_stat_multi.corr())
pd.plotting.scatter_matrix(station_stat_multi, figsize=(9, 9))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement