# Untitled

import pandas as pd

# Read the tab-separated visit log.
data = pd.read_csv('/datasets/visits_eng.csv', sep='\t')

# Parse the timestamps and move them back by seven hours (presumably a
# conversion to local time -- confirm the time zone of the source data),
# then keep a copy rounded to the nearest hour.
visit_time = pd.to_datetime(data['date_time'], format='%Y-%m-%dT%H:%M:%S')
data['local_time'] = visit_time - pd.Timedelta(hours=7)
data['date_hour'] = data['local_time'].dt.round('1H')

# Flag implausible visits: shorter than 60 or longer than 1000
# (time_spent units are not shown here -- presumably seconds).
data['too_fast'] = data['time_spent'].lt(60)
data['too_slow'] = data['time_spent'].gt(1000)

# Share of too-fast visits per gas station (pivot_table averages by default);
# a station is retained only when fewer than half of its visits are too fast.
too_fast_stat = data.pivot_table(index='id', values='too_fast')
good_ids = too_fast_stat.query('too_fast < 0.5')

# Keep visits to the retained stations whose duration lies within [60, 1000].
good_data = (
    data
    .query('id in @good_ids.index')
    .query('60 <= time_spent <= 1000')
)

# Median visit duration per gas station: all visits vs. filtered visits.
station_stat = data.pivot_table(
    index='id',
    values='time_spent',
    aggfunc='median',
)
good_station_stat = good_data.pivot_table(
    index='id',
    values='time_spent',
    aggfunc='median',
)

# Per chain: the default aggregation (mean) over all visits, with the median
# over the filtered visits attached alongside as good_time_spent.
name_stat = data.pivot_table(index='name', values='time_spent')
good_name_stat = good_data.pivot_table(
    index='name',
    values='time_spent',
    aggfunc='median',
)
name_stat['good_time_spent'] = good_name_stat['time_spent']

# For every station: its chain name (first one seen) and how many filtered
# visits it has, joined with the station's filtered median duration.
id_name = good_data.pivot_table(
    index='id',
    values='name',
    aggfunc=['first', 'count'],
)
id_name.columns = ['name', 'count']
station_stat_full = id_name.join(good_station_stat)

# Derive each chain's result from the medians of its individual stations
# instead of pooling every visit made across the chain. Only stations with
# more than 30 filtered visits take part.
busy_stations = station_stat_full.query('count > 30')
good_name_stat2 = busy_stations.pivot_table(
    index='name',
    values='time_spent',
    aggfunc=['median', 'count'],
)
# Flatten the two-level header: median of station medians, number of stations.
good_name_stat2.columns = ['median_time', 'stations']

final_stat = name_stat.join(good_name_stat2)

# Chains with more than 10 qualifying stations count as big networks.
big_nets_stat = final_stat.query('stations > 10')

# Label every station with its chain name if that chain is a big network,
# and with 'Others' if it is not.
in_big_net = station_stat_full['name'].isin(big_nets_stat.index)
station_stat_full['group_name'] = station_stat_full['name'].where(
    in_big_net, 'Others'
)

# Per group (big chain or 'Others'): the median of station medians and the
# number of stations, again limited to stations with more than 30 visits.
stations_over_30 = station_stat_full.query('count > 30')
stat_grouped = stations_over_30.pivot_table(
    index='group_name',
    values='time_spent',
    aggfunc=['median', 'count'],
)
# Replace the two-level header produced by the aggfunc list with flat names.
stat_grouped.columns = ['time_spent', 'count']

# Tag each visit with its chain name when the chain is listed in
# big_nets_stat, and with 'Others' otherwise.
# good_data is the result of query(), so item assignment on it may emit
# SettingWithCopyWarning; assign() returns a new frame with the column added
# and sidesteps that.
good_data = good_data.assign(
    group_name=good_data['name'].where(
        good_data['name'].isin(big_nets_stat.index), 'Others'
    )
)
#print(good_data.head())

# Draw one time_spent histogram per group. Iterating a groupby yields
# (group key, sub-frame) pairs, so unpack them and plot the sub-frame --
# plotting good_data itself would repeat the same whole-dataset histogram
# for every group. The group name goes into the title so that the figures
# can be told apart.
for name, group_data in good_data.groupby('group_name'):
    group_data.plot(kind='hist', y='time_spent', title=name, bins=50)