Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python2.7
- #=*- coding:utf-8 -*-
- import pandas as pd
- from datetime import datetime
- def runQ1(df):
- print "\n=============Q1=============="
- tdf = df[df["country_id"] == "BDV"]
- print "the number of the rows with country_id = 'BDV' : %d" % len(tdf)
- print tdf.groupby(["site_id"])["user_id"].nunique().sort_values(ascending=False)
- def runQ2(df):
- print "\n=============Q2=============="
- tdf = df[(df["dt"] >= datetime(2019,2,3,0,0,0)) & (df["dt"] <= datetime(2019,2,4,23,59,59))]
- print "the number of the rows from 2019-02-03 00:00:00 to 2019-02-04 23:59:59 : %d" % len(tdf)
- cdf = tdf.groupby(["user_id", "site_id"])["ts"].count()
- print cdf[cdf >= 10]
- def runQ3(df):
- print "\n=============Q3=============="
- tdf = df.sort_values(by=["dt"]).drop_duplicates(["user_id"], keep="last")
- print "After drop duplicates of user_id, the number of rows : %d" % len(tdf)
- print tdf.groupby(["site_id"])["user_id"].nunique().sort_values(ascending=False)
- def runQ4(df):
- print "\n=============Q4=============="
- first = df.sort_values(by=["dt"]).drop_duplicates(["user_id"], keep="first")[["user_id", "site_id"]]
- last = df.sort_values(by=["dt"]).drop_duplicates(["user_id"], keep="last")[["user_id", "site_id"]]
- first = first.rename(columns= {"site_id" : "first_site_id"})
- last = last.rename(columns= {"site_id" : "last_site_id"})
- join_df = first.set_index("user_id").join(last.set_index("user_id"))
- people_first_and_last_visit_same = join_df[join_df["first_site_id"] == join_df["last_site_id"]]
- print "the number of people whose first/last visit are same : %d" % len(people_first_and_last_visit_same)
- if __name__ == "__main__":
- input_path = "./q3_data.tsv"
- df = pd.read_csv(input_path, sep='\t')
- df["dt"] = df["ts"].apply(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))
- runQ1(df)
- runQ2(df)
- runQ3(df)
- runQ4(df)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement