Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- '''
- creates json and csv of top 10 urls visited by user, country
- from sergiomitm.com proxy users.
- input: csv of freedom house rankings, geocoded ips from squidanalyzer
- '''
- import csv, re, json
def main():
    """Summarize sergiomitm.com proxy usage into per-country top-URL tables.

    Reads:
        fh_rankings.csv -- rows of (country, freedom_house_rank).
        geoip_users.csv -- SquidAnalyzer export; column 0 is the user IP
            (or the literal header "Users"), column 1 starts with the
            request count, column 6 is the top URL, column 7 the
            geocoded country.

    Writes:
        country_summary.json and country_summary.csv -- at most the 10
        most-used (country, url) records per country, each with user
        count, request count, and Freedom House ranking, ordered by
        user count (descending).
    """
    # Map country -> Freedom House ranking. A later duplicate row
    # overwrites an earlier one, matching the original scan in which
    # the last matching row won.
    rankings = {}
    with open("fh_rankings.csv", "r", newline="") as f:
        for row in csv.reader(f):
            rankings[row[0]] = row[1]

    # Aggregate users and requests per (country, url) pair.
    # Dict insertion order preserves first-seen order, and keyed lookup
    # replaces the original O(n^2) list scan.
    leading_digits = re.compile(r"^\d+")  # compiled once, not per row
    records = {}  # (country, url) -> summary record
    with open("geoip_users.csv", "r", newline="") as f:
        for row in csv.reader(f):
            if row[0] == "Users":
                # SquidAnalyzer header row.
                continue
            try:
                country = row[7]
                url = row[6]
            except IndexError:
                # Short row with no geocoded country: skip it entirely
                # (same outcome as the original bare-except + continue).
                continue
            # Request count is the leading digit run of column 1;
            # anything unparseable counts as 0 requests.
            match = leading_digits.search(row[1]) if len(row) > 1 else None
            requests = int(match.group(0)) if match else 0
            key = (country, url)
            if key in records:
                records[key]["users"] += 1
                records[key]["requests"] += requests
            else:
                records[key] = {
                    "country": country,
                    "requests": requests,
                    "users": 1,
                    "freedom_rank": rankings.get(country, "Unknown"),
                    "top_url": url,
                }

    # Keep only the 10 most-used URLs per country. sorted() is stable,
    # so ties keep first-seen order, as before.
    outdata = []
    kept_per_country = {}  # country -> rows already kept
    for rec in sorted(records.values(), key=lambda r: r["users"], reverse=True):
        kept = kept_per_country.get(rec["country"], 0)
        if kept >= 10:
            continue
        kept_per_country[rec["country"]] = kept + 1
        outdata.append(rec)

    with open("country_summary.json", "w") as f:
        json.dump(outdata, f)
    # newline="" stops the csv module from emitting blank lines on Windows.
    with open("country_summary.csv", "w", newline="") as f:
        writer = csv.DictWriter(
            f, ["users", "requests", "country", "freedom_rank", "top_url"]
        )
        writer.writeheader()
        writer.writerows(outdata)
# Run the summary only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement