Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import pandas as pd

# --- Load and enrich the raw visit log --------------------------------------

# The input is a tab-separated file with (at least) the columns used below:
# 'date_time', 'time_spent', 'id' and 'name'.
data = pd.read_csv("/datasets/visits.csv", sep="\t")

# Shift every timestamp by +3 hours.
# NOTE(review): presumably converts UTC to a UTC+3 local time - confirm.
data['local_time'] = (
    pd.to_datetime(data['date_time'], yearfirst=True)
    + pd.Timedelta(hours=3)
)
# Timestamp rounded to the nearest whole hour.
# NOTE(review): pandas >= 2.2 deprecates the upper-case 'H' alias in favour
# of 'h'; kept as-is here for compatibility with older pandas versions.
data['date_hour'] = data['local_time'].dt.round('1H')

# Flag visits whose duration falls outside the accepted [60, 1000] range.
data['too_fast'] = data['time_spent'] < 60
data['too_slow'] = data['time_spent'] > 1000

# --- Keep only trustworthy ids and plausible visit durations ----------------

# pivot_table defaults to the mean, so for the boolean 'too_fast' column this
# is the share of too-fast visits per id.
too_fast_stat = data.pivot_table(index='id', values='too_fast')
good_ids = too_fast_stat.query('too_fast < 0.5')

# Explicit copy: 'group_name' is assigned to good_data further down, and
# writing into a frame derived from chained .query() calls would otherwise
# raise SettingWithCopyWarning.
good_data = (
    data
    .query('id in @good_ids.index')
    .query('60 <= time_spent <= 1000')
    .copy()
)

# --- Per-id and per-name statistics -----------------------------------------

# Median visit duration per id, before and after filtering.
station_stat = data.pivot_table(
    index="id", values="time_spent", aggfunc="median"
)
good_station_stat = good_data.pivot_table(
    index="id", values="time_spent", aggfunc="median"
)

# Per name: mean duration over the raw data (default aggfunc) next to the
# median duration over the filtered data.
stat = data.pivot_table(index='name', values='time_spent')
good_stat = good_data.pivot_table(
    index='name', values='time_spent', aggfunc='median'
)
stat['good_time_spent'] = good_stat['time_spent']

# For every id: the first 'name' seen and the number of filtered visits.
id_name = good_data.pivot_table(
    index='id', values='name', aggfunc=['first', 'count']
)
id_name.columns = ['name', 'count']
station_stat_full = id_name.join(good_station_stat)

# Per name, using only ids with more than 30 filtered visits: the median of
# the per-id medians and the number of such ids.
good_stat2 = (
    station_stat_full
    .query('count > 30')
    .pivot_table(index='name', values='time_spent', aggfunc=['median', 'count'])
)
good_stat2.columns = ['median_time', 'stations']
final_stat = stat.join(good_stat2)

# Names that have more than 10 qualifying ids.
big_nets_stat = final_stat.query('stations > 10')

# --- Collapse small names into a single group -------------------------------

# Names absent from big_nets_stat are replaced with 'Другие' (Russian for
# "Others"); the literal is runtime data and must stay unchanged.
station_stat_full['group_name'] = (
    station_stat_full['name']
    .where(station_stat_full['name'].isin(big_nets_stat.index), 'Другие')
)
stat_grouped = (
    station_stat_full
    .query('count > 30')
    .pivot_table(
        index='group_name', values='time_spent', aggfunc=['median', 'count']
    )
)
stat_grouped.columns = ['time_spent', 'count']

good_data['group_name'] = (
    good_data['name']
    .where(good_data['name'].isin(big_nets_stat.index), 'Другие')
)

# --- One titled histogram of visit durations per group ----------------------

# Plot the group's own rows (not the whole good_data frame) so that each
# histogram's title matches the data it shows.
for name, group_data in good_data.groupby('group_name'):
    group_data.plot(kind='hist', y='time_spent', title=name, bins=50)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement