'''
author: @sergiomitm
description:
    Compares a squid access log against the blacklist categories
    from http://dsi.ut-capitole.fr/blacklists/index_en.php.
    Saves output to 2 files: top_sites.txt (most frequently
    requested sites in the log file) and category_counts.txt
    (number of requests per blacklist category).
    See the usage notes at the end of this file.
'''
from urlparse import urlparse
from collections import Counter
import os

BLACKLIST_DIR = 'blacklists'
ACCESS_LOG = 'access_logs/access.log.1'
def main():
    print "loading blacklists..."
    lists = get_blacklists(BLACKLIST_DIR)
    print "loading access log file..."
    log = load_access_log(ACCESS_LOG)

    # log is a list of (hostname, request_count) tuples
    total_lines = 0
    unique_lines = 0
    for l in log:
        total_lines += l[1]
        unique_lines += 1
    print "total requests = %d" % total_lines
    print "unique sites = %d" % unique_lines
    print

    # tally requests per blacklist category: a hostname counts toward a
    # category if it appears as a substring of any entry in that list
    update_counter = 0
    for l in log:
        hostname = l[0]
        for d in lists:
            if any(hostname in s for s in d['list']):
                d['count'] += int(l[1])
        # provide an update to stdout every so often
        update_counter += 1
        if update_counter % 200 == 0:
            progress = float(update_counter) / float(unique_lines)
            print "progress: %.2f%%" % (100 * progress)
            for d in lists:
                print "%d\t%s" % (d['count'], d['name'])
            print "------------------------"

    # finally, save results
    save_results(lists, log, total_lines)
def save_results(lists, log, total_lines):
    # write every hostname and its request count, most requested first
    with open('top_sites.txt', 'w') as f:
        slog = sorted(log, key=lambda x: x[1], reverse=True)
        for s in slog:
            f.write("%d\t%s\n" % (s[1], s[0]))

    # write the per-category totals to disk and echo them to stdout
    with open('category_counts.txt', 'w') as f:
        slist = sorted(lists, key=lambda x: x['count'], reverse=True)
        print "--------------FINAL COUNT---------------"
        for d in slist:
            counts = "%d\t%.2f%%\t%s" % (d['count'],
                                         100.0 * d['count'] / total_lines,
                                         d['name'])
            f.write(counts + '\n')
            print counts
def get_blacklists(a_dir):
    # each subdirectory of the blacklist directory is one category
    lists = []
    dirs = [name for name in os.listdir(a_dir)
            if os.path.isdir(os.path.join(a_dir, name))]
    for d in dirs:
        b = {}
        b['name'] = d
        b['list'] = load_blacklist(d)
        b['count'] = 0
        lists.append(b)
    return lists
def load_access_log(filename):
    # pull the requested URL out of each squid log line: in the native log
    # format the URL is the 7th whitespace-separated field, but padding
    # spaces can shift it, so try the next few fields until one looks like
    # a URL (at least 8 characters)
    names = []
    with open(filename, 'r') as f:
        for line in f:
            fields = line.split(" ")
            url = ""
            for i in (6, 7, 8, 9):
                if i < len(fields) and len(fields[i]) >= 8:
                    url = fields[i]
                    break
            hostname = urlparse(url).netloc
            if hostname:
                names.append(hostname)
    # collapse duplicates into (hostname, count) pairs, most frequent first
    sites = list(Counter(names).items())
    return sorted(sites, key=lambda x: x[1], reverse=True)
def load_blacklist(category):
    # read the 'domains' file for one blacklist category
    ret = []
    blacklist = os.path.join(BLACKLIST_DIR, category, "domains")
    with open(blacklist, "r") as f:
        for l in f:
            ret.append(l.strip())
    return ret
if __name__ == "__main__":
    main()
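
# Usage notes: a minimal sketch of how this script is meant to be run. The
# download URL, the squid log location, and the script filename below are
# assumptions for illustration, not part of the original script.
#
#   # unpack the Toulouse blacklist archive so each category ends up as
#   # blacklists/<category>/domains (matching BLACKLIST_DIR above)
#   wget http://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz
#   tar xzf blacklists.tar.gz
#
#   # copy the squid access log to where ACCESS_LOG points, then run it
#   mkdir -p access_logs
#   cp /var/log/squid/access.log.1 access_logs/
#   python blacklist_report.py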