Advertisement
Guest User

Untitled

a guest
Oct 22nd, 2019
79
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.92 KB | None | 0 0
  1. import sys
  2. import argparse
  3. import pandas as pd
  4.  
  5.  
  6. def main():
  7. argparser = argparse.ArgumentParser()
  8. argparser.add_argument("-f", help='The path to the summary statistics ftp log csv', required=True)
  9.  
  10. args = argparser.parse_args()
  11.  
  12. df = pd.read_csv(args.f, header=0, names=['resource', 'file_size', 'count'])
  13.  
  14. # file size > 1MB
  15. df = df[df.file_size > 99999]
  16.  
  17. df.resource = df.resource.str.replace('/pub/databases/gwas/summary_statistics/','')
  18. df = df.drop(columns=['file_size'])
  19.  
  20. raw = df[~df.resource.str.contains('harmonised')]
  21. raw.resource = raw.resource.str.replace('/.*','')
  22. raw = raw.groupby('resource').agg('max')
  23.  
  24. f_and_h = df[df.resource.str.contains('harmonised')]
  25.  
  26. formatted = f_and_h[f_and_h.resource.str.contains('\.f\.')]
  27. formatted = formatted.groupby('resource').agg('max')
  28.  
  29. harmonised = f_and_h[f_and_h.resource.str.contains('\.h\.')]
  30. harmonised = harmonised.groupby('resource').agg('max')
  31.  
  32.  
  33. raw_total = raw['count'].sum()
  34. f_total = formatted['count'].sum()
  35. h_total = harmonised['count'].sum()
  36.  
  37. df = df.groupby('resource').agg('max')
  38. top_all = df.sort_values(by=["count"], ascending=False).head(10)
  39. top_raw = raw.sort_values(by=["count"], ascending=False).head(10)
  40. top_f = formatted.sort_values(by=["count"], ascending=False).head(10)
  41. top_h = harmonised.sort_values(by=["count"], ascending=False).head(10)
  42.  
  43. print("Stats:")
  44. print("raw total: {}".format(str(raw_total)))
  45. print("formatted total: {}".format(str(f_total)))
  46. print("harmonised total: {}".format(str(h_total)))
  47. print("complete total: {}".format(str(sum([raw_total, f_total, h_total]))))
  48. print("\n========================\n")
  49.  
  50. print("top downloads:")
  51. print(top_all)
  52.  
  53. print("\ntop raw:")
  54. print(top_raw)
  55.  
  56. print("\ntop formatted:")
  57. print(top_f)
  58.  
  59. print("\ntop harmonised:")
  60. print(top_h)
  61.  
  62. if __name__ == '__main__':
  63. sys.exit(main())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement