Advertisement
Guest User

Untitled

a guest
Jul 18th, 2019
86
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.80 KB | None | 0 0
  1. #!/usr/bin/env python2.7
  2. # -*- coding: utf-8 -*-
  3.  
  4. import pandas as pd
  5. from datetime import datetime
  6.  
  7. def runQ1(df):
  8. print "\n=============Q1=============="
  9. tdf = df[df["country_id"] == "BDV"]
  10. print "the number of the rows with country_id = 'BDV' : %d" % len(tdf)
  11. print tdf.groupby(["site_id"])["user_id"].nunique().sort_values(ascending=False)
  12.  
  13. def runQ2(df):
  14. print "\n=============Q2=============="
  15. tdf = df[(df["dt"] >= datetime(2019,2,3,0,0,0)) & (df["dt"] <= datetime(2019,2,4,23,59,59))]
  16. print "the number of the rows from 2019-02-03 00:00:00 to 2019-02-04 23:59:59 : %d" % len(tdf)
  17.  
  18. cdf = tdf.groupby(["user_id", "site_id"])["ts"].count()
  19. print cdf[cdf >= 10]
  20.  
  21. def runQ3(df):
  22. print "\n=============Q3=============="
  23. tdf = df.sort_values(by=["dt"]).drop_duplicates(["user_id"], keep="last")
  24. print "After drop duplicates of user_id, the number of rows : %d" % len(tdf)
  25. print tdf.groupby(["site_id"])["user_id"].nunique().sort_values(ascending=False)
  26.  
  27. def runQ4(df):
  28. print "\n=============Q4=============="
  29. first = df.sort_values(by=["dt"]).drop_duplicates(["user_id"], keep="first")[["user_id", "site_id"]]
  30. last = df.sort_values(by=["dt"]).drop_duplicates(["user_id"], keep="last")[["user_id", "site_id"]]
  31.  
  32. first = first.rename(columns= {"site_id" : "first_site_id"})
  33. last = last.rename(columns= {"site_id" : "last_site_id"})
  34.  
  35. join_df = first.set_index("user_id").join(last.set_index("user_id"))
  36.  
  37. people_first_and_last_visit_same = join_df[join_df["first_site_id"] == join_df["last_site_id"]]
  38. print "the number of people whose first/last visit are same : %d" % len(people_first_and_last_visit_same)
  39.  
  40. if __name__ == "__main__":
  41. input_path = "./q3_data.tsv"
  42. df = pd.read_csv(input_path, sep='\t')
  43. df["dt"] = df["ts"].apply(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))
  44.  
  45. runQ1(df)
  46. runQ2(df)
  47. runQ3(df)
  48. runQ4(df)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement