Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys
- import argparse
- import pandas as pd
def _max_count_by_resource(frame):
    """Collapse *frame* to one row per ``resource``, keeping the maximum of each column."""
    return frame.groupby('resource').agg('max')


def _top_by_count(frame, limit=10):
    """Return the *limit* rows of *frame* with the highest ``count``, descending."""
    return frame.sort_values(by=["count"], ascending=False).head(limit)


def main():
    """Print download statistics computed from a summary-statistics FTP log CSV.

    The CSV path is taken from the required ``-f`` command-line option. The
    file's first row is treated as a header and discarded; its three columns
    are read as ``resource``, ``file_size`` and ``count``.

    Rows are split into three groups by their ``resource`` value:

    * raw        -- does not contain ``harmonised``; the value is truncated at
                    the first ``/`` so rows are grouped by leading path part.
    * formatted  -- contains ``harmonised`` and ``.f.``.
    * harmonised -- contains ``harmonised`` and ``.h.``.

    Within each group the maximum ``count`` per resource is kept. The
    per-group totals, their sum, and the top ten resources (overall and per
    group) are printed to stdout.

    Returns:
        None, so ``sys.exit(main())`` exits with status 0.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument("-f", help='The path to the summary statistics ftp log csv', required=True)
    args = argparser.parse_args()

    df = pd.read_csv(args.f, header=0, names=['resource', 'file_size', 'count'])

    # Keep only rows whose file_size exceeds the cutoff.
    # NOTE(review): the previous comment here said "> 1MB", but the threshold
    # is 99999 -- confirm the intended cutoff and the unit of file_size.
    # .copy() gives an independent frame so the column assignment below does
    # not write into a slice of the frame returned by read_csv.
    df = df[df.file_size > 99999].copy()

    # The prefix is a literal string, so disable regex interpretation.
    df['resource'] = df['resource'].str.replace(
        '/pub/databases/gwas/summary_statistics/', '', regex=False)
    df = df.drop(columns=['file_size'])

    in_harmonised_dir = df['resource'].str.contains('harmonised')

    raw = df[~in_harmonised_dir].copy()
    # regex=True must be explicit: pandas 2.0 changed the default to False,
    # which would treat '/.*' as a literal and leave every path untouched.
    raw['resource'] = raw['resource'].str.replace('/.*', '', regex=True)
    raw = _max_count_by_resource(raw)

    f_and_h = df[in_harmonised_dir]
    # Raw strings keep the backslashes as regex escapes instead of (invalid)
    # Python string escapes.
    formatted = _max_count_by_resource(
        f_and_h[f_and_h['resource'].str.contains(r'\.f\.')])
    harmonised = _max_count_by_resource(
        f_and_h[f_and_h['resource'].str.contains(r'\.h\.')])

    raw_total = raw['count'].sum()
    f_total = formatted['count'].sum()
    h_total = harmonised['count'].sum()

    # The overall ranking uses the full (un-truncated) resource paths.
    df = _max_count_by_resource(df)
    top_all = _top_by_count(df)
    top_raw = _top_by_count(raw)
    top_f = _top_by_count(formatted)
    top_h = _top_by_count(harmonised)

    print("Stats:")
    print("raw total: {}".format(str(raw_total)))
    print("formatted total: {}".format(str(f_total)))
    print("harmonised total: {}".format(str(h_total)))
    print("complete total: {}".format(str(sum([raw_total, f_total, h_total]))))
    print("\n========================\n")
    print("top downloads:")
    print(top_all)
    print("\ntop raw:")
    print(top_raw)
    print("\ntop formatted:")
    print(top_f)
    print("\ntop harmonised:")
    print(top_h)
# Script entry point: run main() and use its return value as the process
# exit status (raising SystemExit is what sys.exit does).
if __name__ == "__main__":
    raise SystemExit(main())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement