sergioMITM

Squid log analysis vs. blacklists

Mar 4th, 2018
'''
author: @sergiomitm
description:

    Compares a squid access log against the blacklist categories
    published at http://dsi.ut-capitole.fr/blacklists/index_en.php
    (see the fetch sketch below for one way to download them).

    Saves output to two files: top_sites.txt (the most frequented
    sites in the log file) and category_counts.txt (the number of
    requests per blacklist category).
'''
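
# A minimal sketch, not part of the original paste, for fetching and
# unpacking the Toulouse blacklists referenced above.  The exact tarball
# URL is an assumption inferred from the index page and may change.
def fetch_blacklists(dest='blacklists.tar.gz'):
    import tarfile
    import urllib.request
    urllib.request.urlretrieve(
        'http://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz',
        dest)
    # the archive unpacks to blacklists/<category>/domains
    with tarfile.open(dest, 'r:gz') as tar:
        tar.extractall('.')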

from urllib.parse import urlparse
from collections import Counter
import os

BLACKLIST_DIR = 'blacklists'
ACCESS_LOG = 'access_logs/access.log.1'

def main():
    print("loading blacklists...")
    lists = get_blacklists(BLACKLIST_DIR)

    print("loading access log file...")
    log = load_access_log(ACCESS_LOG)
    total_lines = sum(count for _, count in log)
    unique_lines = len(log)
    print("total requests = %d" % total_lines)
    print("unique sites = %d" % unique_lines)
    print()

    update_counter = 0
    for hostname, count in log:
        for d in lists:
            # substring test: the hostname is counted against a category if
            # it appears anywhere in one of the category's domain entries
            # (see matches_exact() below for a stricter alternative)
            if any(hostname in s for s in d['list']):
                d['count'] += count

        # provide an update on stdout every so often
        update_counter += 1
        if update_counter % 200 == 0:
            progress = update_counter / unique_lines
            print("progress: %.2f%%" % (100 * progress))
            for d in lists:
                print("%d\t%s" % (d['count'], d['name']))
            print("------------------------")

    # finally, save the results
    save_results(lists, log, total_lines)

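# Sketch, not in the original paste: the substring test in main() also
# flags hostnames that merely contain a blacklisted domain.  Exact,
# set-based matching of the hostname and its parent domains is stricter
# and O(1) per lookup; one would build domain_set = set(d['list']) once
# per category and call this instead of the any(...) scan.
def matches_exact(hostname, domain_set):
    parts = hostname.split('.')
    # check the hostname and every parent domain, so that
    # "a.b.example.com" also matches an entry "example.com"
    return any('.'.join(parts[i:]) in domain_set for i in range(len(parts)))
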
def save_results(lists, log, total_lines):
    with open('top_sites.txt', 'w') as f:
        for hostname, count in sorted(log, key=lambda x: x[1], reverse=True):
            f.write("%d\t%s\n" % (count, hostname))
    with open('category_counts.txt', 'w') as f:
        print("--------------FINAL COUNT---------------")
        for d in sorted(lists, key=lambda x: x['count'], reverse=True):
            counts = "%d\t%.2f%%\t%s" % (d['count'],
                                         100 * d['count'] / total_lines,
                                         d['name'])
            f.write(counts + '\n')
            print(counts)

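# Output format, for reference: top_sites.txt holds one "count<TAB>host"
# line per unique site, and category_counts.txt holds one
# "count<TAB>percent<TAB>category" line per blacklist category.
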
def get_blacklists(a_dir):
    # each subdirectory of a_dir is one blacklist category
    lists = []
    dirs = [name for name in os.listdir(a_dir)
            if os.path.isdir(os.path.join(a_dir, name))]
    for d in dirs:
        lists.append({'name': d, 'list': load_blacklist(d), 'count': 0})
    return lists

def load_access_log(filename):
    names = []
    with open(filename, 'r') as f:
        for line in f:
            # split() collapses the run of padding spaces squid inserts
            # after the timestamp, so the request URL is always field 6
            fields = line.split()
            if len(fields) < 7:
                continue
            # entries without a scheme (e.g. CONNECT host:port) yield an
            # empty netloc and are skipped
            hostname = urlparse(fields[6]).netloc
            if hostname:
                names.append(hostname)
    # count requests per unique hostname, most frequent first
    sites = list(Counter(names).items())
    return sorted(sites, key=lambda x: x[1], reverse=True)

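# For reference (assuming squid's default "native" logformat), a log line
# looks roughly like:
#   1520128800.123    456 10.0.0.5 TCP_MISS/200 2345 GET http://example.com/ - HIER_DIRECT/93.184.216.34 text/html
# i.e. time, duration, client, result/status, bytes, method, URL, ident,
# hierarchy/peer, content type -- the URL is the 7th whitespace field.
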
def load_blacklist(category):
    # each category directory contains a plain-text "domains" file with
    # one domain per line; strip the trailing newlines as we read
    blacklist = os.path.join(BLACKLIST_DIR, category, "domains")
    with open(blacklist, "r") as f:
        return [l.strip() for l in f]

if __name__ == "__main__":
    main()
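
# Usage note: run the script from a directory containing
# blacklists/<category>/domains and access_logs/access.log.1 (both paths
# are configurable via the constants at the top); it writes top_sites.txt
# and category_counts.txt to the current directory.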