Advertisement
sergioMITM

parsing script for proxy users

Jan 23rd, 2018
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.42 KB | None | 0 0
  1. '''
  2. creates json and csv of top 10 urls visited by user, country
  3. from sergiomitm.com proxy users.
  4. input: csv of freedom house rankings, geocoded ips from squidanalyzer
  5. '''
  6.  
  7. import csv, re, json
  8.  
  9. def main():
  10.     # get the freedom hous3 rankings out of a csv
  11.     with open("fh_rankings.csv", "r") as f:
  12.         reader = csv.reader(f)
  13.         rankings = []
  14.         for row in reader:
  15.             country = row[0]
  16.             ranking = row[1]
  17.             rankings.append([country,ranking])
  18.  
  19.     # this big loop counts users and requests per country and url
  20.     # end with a list like [num_users, num_requests, country, url, freedom_house_rank]
  21.     with open("geoip_users.csv","r") as f:
  22.         reader = csv.reader(f)
  23.         alldata = []
  24.         for row in reader:
  25.             if row[0]=='Users':continue
  26.             ip = row[0]
  27.             try:
  28.                 requests = int(re.search('(^\d+)',row[1]).group(1))
  29.             except:
  30.                 requests = 0
  31.             try:
  32.                 country = row[7]
  33.             except:
  34.                 continue
  35.             url = row[6]
  36.             found = False
  37.             for x in alldata:
  38.                 if x['country'] == country and x['top_url']== url:
  39.                     x['users']+=1
  40.                     x['requests']+=requests
  41.                     found = True
  42.                     break
  43.             if not found:
  44.                 freedom_rank='Unknown'
  45.                 for r in rankings:
  46.                     if r[0]==country: freedom_rank = r[1]
  47.                 d = {}
  48.                 d['country']=country
  49.                 d['requests']=requests
  50.                 d['users']=1
  51.                 d['freedom_rank']=freedom_rank
  52.                 d['top_url']=url
  53.                 alldata.append(d)
  54.  
  55.     # trim to only top 10 urls per country
  56.     sorted_data = sorted(alldata, key=lambda x: x['users'], reverse=True)
  57.     outdata = []
  58.     for s in sorted_data:
  59.         country = s['country']
  60.         count = 0
  61.         for o in outdata:
  62.             if o['country']==country: count +=1
  63.         if count >9: continue
  64.         outdata.append(s)
  65.  
  66.     with open("country_summary.json", "w") as f:
  67.         json.dump(outdata,f)
  68.     with open("country_summary.csv", "w") as f:
  69.         fn = ['users','requests','country','freedom_rank','top_url']
  70.         wr = csv.DictWriter(f, fn)
  71.         wr.writeheader()
  72.         wr.writerows(outdata)
  73.  
# Script entry point: generate the summaries only when run directly,
# not when imported.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement