Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/bin/python
- '''
- Parses a squid access log to find, for each client IP,
- the ratio of requests for images, CSS and JavaScript.
- Can be used for detection of bots.
- '''
- import pandas as pd
- import numpy as np
- from collections import namedtuple
def main(logfile="/var/log/squid/access.log.1", outfile="table.csv"):
    """Summarise JavaScript/CSS/image request ratios from a squid access log.

    Every line of *logfile* is split on whitespace into ten fields (time,
    elapsed, remotehost, code, bytes, method, url, rfc, peer, type).  Lines
    that do not yield exactly ten fields, or whose method is not one of the
    known HTTP methods, are skipped.  Each remaining request is flagged as
    JavaScript, CSS or image according to its content-type field.

    Two pivot tables (mean of each flag and number of requests) are built:

    * grouped by HTTP method -- printed to standard output;
    * grouped by client IP, with CONNECT requests excluded -- written to
      *outfile* as CSV.

    Args:
        logfile: path of the squid access log to read.
        outfile: path of the CSV file that receives the per-IP table.

    Raises:
        IOError/OSError: if *logfile* cannot be opened or *outfile* cannot
            be written.
    """
    labels = ['ip', 'method', 'js', 'css', 'img']
    # frozenset gives O(1) membership tests inside the per-line loop.
    methods = frozenset(['CONNECT', 'GET', 'HEAD', 'NONE',
                         'OPTIONS', 'POST', 'PUT', 'TRACE'])
    squid_logfile = namedtuple(
        'logfile',
        'time elapsed remotehost code bytes method url rfc peer type')

    data = []
    with open(logfile, 'r') as log:
        for line in log:
            try:
                lf = squid_logfile._make(line.split())
            except TypeError:
                # Wrong number of fields: the line is malformed, usually
                # because of a funky URL.
                continue
            if lf.method not in methods:
                # Another indicator that the line was not split correctly.
                continue

            # Flag the request by content type; at most one flag is set.
            js, css, img = 0, 0, 0
            if 'javascript' in lf.type:
                js = 1
            elif 'css' in lf.type:
                css = 1
            elif 'image' in lf.type:
                img = 1
            data.append([lf.remotehost, lf.method, js, css, img])

    df = pd.DataFrame.from_records(data, columns=labels)
    # Drop the raw rows once they live in the DataFrame; the original author
    # noted that this makes execution about 60 seconds faster.
    data = []

    flag_columns = ['js', 'css', 'img']
    method_table = df.pivot_table(index='method', values=flag_columns,
                                  aggfunc=[np.mean, len])
    # Call form of print works on both Python 2 and Python 3 for one argument.
    print(method_table)

    # CONNECT tunnels carry encrypted data that cannot be classified.
    get_df = df.loc[df['method'] != 'CONNECT']
    ip_table = get_df.pivot_table(index='ip', values=flag_columns,
                                  aggfunc=[np.mean, len])
    # header expects a bool (or a list of aliases); the index is already
    # named 'ip' by pivot_table, so the label appears in the output.
    ip_table.to_csv(outfile, header=True)
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement