Untitled

#!/usr/bin/env python2.7
#=*- coding:utf-8 -*-

import pandas as pd
from datetime import datetime

def runQ1(df):
	print "\n=============Q1=============="
	tdf = df[df["country_id"] == "BDV"]
	print "the number of the rows with country_id = 'BDV' : %d" % len(tdf)
	print tdf.groupby(["site_id"])["user_id"].nunique().sort_values(ascending=False)

def runQ2(df):
	print "\n=============Q2=============="
	tdf = df[(df["dt"] >= datetime(2019,2,3,0,0,0)) & (df["dt"] <= datetime(2019,2,4,23,59,59))]
	print "the number of the rows from 2019-02-03 00:00:00 to 2019-02-04 23:59:59 : %d" % len(tdf)

	cdf = tdf.groupby(["user_id", "site_id"])["ts"].count()
	print cdf[cdf >= 10]

def runQ3(df):
	print "\n=============Q3=============="
	tdf = df.sort_values(by=["dt"]).drop_duplicates(["user_id"], keep="last")
	print "After drop duplicates of user_id, the number of rows : %d" % len(tdf)
	print tdf.groupby(["site_id"])["user_id"].nunique().sort_values(ascending=False)

def runQ4(df):
	print "\n=============Q4=============="
	first = df.sort_values(by=["dt"]).drop_duplicates(["user_id"], keep="first")[["user_id", "site_id"]]
	last = df.sort_values(by=["dt"]).drop_duplicates(["user_id"], keep="last")[["user_id", "site_id"]]

	first = first.rename(columns= {"site_id" : "first_site_id"})
	last = last.rename(columns= {"site_id" : "last_site_id"})

	join_df = first.set_index("user_id").join(last.set_index("user_id"))

	people_first_and_last_visit_same = join_df[join_df["first_site_id"] == join_df["last_site_id"]]
	print "the number of people whose first/last visit are same : %d" % len(people_first_and_last_visit_same)

if __name__ == "__main__":
	input_path = "./q3_data.tsv"
	df = pd.read_csv(input_path, sep='\t')
	df["dt"] = df["ts"].apply(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))

	runQ1(df)
	runQ2(df)
	runQ3(df)
	runQ4(df)