Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.model_selection import train_test_split
- import os
- from sklearn.linear_model import LogisticRegression
- import urllib.parse
- import pickle
def loadData(name):
    """Read a query corpus file and return its distinct, normalised entries.

    Each line has its line terminator removed, is URL-decoded with
    ``urllib.parse.unquote`` and is lower-cased. Lines that are empty once
    the terminator is removed are skipped.

    Args:
        name: Path of a UTF-8 encoded text file holding one query per line.

    Returns:
        A set of decoded, lower-cased query strings.
    """
    result = set()
    with open(name, 'r', encoding="utf8") as f:
        for line in f:
            # Strip only the line terminator: other whitespace may be part of
            # the payload, and an encoded newline (%0a) must survive decoding.
            line = line.rstrip('\r\n')
            if not line:
                continue
            result.add(urllib.parse.unquote(line).lower())
    return result
def preProcessing(data):
    """Reduce raw queries to the text fragments used as classifier input.

    For a query that contains ``?`` and does not end with it, everything up
    to and including the first ``?`` is dropped, all whitespace and digits
    are removed, and the remainder is parsed with
    ``urllib.parse.parse_qsl``. The parameter values are concatenated; a
    parameter name is included as well only when it contains ``[``. The
    concatenation is kept when it is at least three characters long.

    Any other query is kept as is, minus a single trailing ``?`` if present.
    Empty strings are skipped.

    Args:
        data: Iterable of query strings.

    Returns:
        A set of processed strings.
    """
    result = set()
    for b in data:
        if not b:
            # Nothing to extract; also avoids indexing into an empty string.
            continue
        if '?' in b and not b.endswith('?'):
            b = b[b.find('?') + 1:]
            b = re.sub(r'\s+', '', b)
            b = re.sub(r'\d+', '', b)
            pieces = []
            for key, value in urllib.parse.parse_qsl(b):
                if '[' in key:
                    pieces.append(key)
                pieces.append(value)
            payload = ''.join(pieces)
            if len(payload) >= 3:
                result.add(payload)
        elif b.endswith('?'):
            result.add(b[:-1])
        else:
            result.add(b)
    return result
class WAF_ML:
    """Query classifier built from character 3-gram TF-IDF features and
    logistic regression.

    On construction the labelled corpora are read from ``malicious.txt``
    (label 1) and ``good.txt`` (label 0) via ``loadData`` and
    ``preProcessing``. ``train`` then fits the vectorizer and either fits a
    new model or loads one previously saved to ``lgsfile``.
    """

    def __init__(self, mode="train"):
        """Load and pre-process both corpora and build the label vector.

        Args:
            mode: ``"train"`` always fits a new model in ``train``; any
                other value makes ``train`` try the saved model first.
        """
        self.mode = mode
        badQueriesforTrain = list(preProcessing(loadData('malicious.txt')))
        validQueriesforTrain = list(preProcessing(loadData('good.txt')))
        self.queries = badQueriesforTrain + validQueriesforTrain
        self.badCount = len(badQueriesforTrain)
        self.validCount = len(validQueriesforTrain)
        # Labels follow the order of self.queries: malicious first, then valid.
        self.y = [1] * self.badCount + [0] * self.validCount

    def _load_model(self):
        """Try to load a saved model from ``lgsfile``; return True on success."""
        try:
            with open('lgsfile', 'rb') as lgsfile:
                # NOTE(security): pickle.load can execute arbitrary code.
                # Only load an 'lgsfile' that this program wrote itself.
                self.lgs = pickle.load(lgsfile)
        except (OSError, EOFError, pickle.UnpicklingError, AttributeError,
                ImportError, IndexError, ValueError):
            # Missing, unreadable or incompatible file: the caller retrains.
            return False
        return True

    def train(self):
        """Fit the TF-IDF vectorizer and obtain the logistic-regression model.

        The vectorizer is always fitted on ``self.queries``. When ``mode`` is
        not ``"train"`` a saved model is loaded from ``lgsfile`` if possible;
        otherwise a new model is fitted and written to ``lgsfile``.

        Raises:
            ZeroDivisionError: If a new model must be fitted and there are no
                malicious samples (``self.badCount`` is 0).
        """
        self.vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3))
        self.X = self.vectorizer.fit_transform(self.queries)
        if self.mode != "train" and self._load_model():
            return
        # Weight the malicious class by twice the valid/malicious ratio.
        self.lgs = LogisticRegression(class_weight={1: 2 * self.validCount / self.badCount, 0: 1.0})
        self.lgs.fit(self.X, self.y)
        with open('lgsfile', 'wb') as lgsfile:
            pickle.dump(self.lgs, lgsfile)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement