Advertisement
Guest User

Untitled

a guest
Jan 18th, 2018
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.51 KB | None | 0 0
  1. import re
  2. from sklearn.feature_extraction.text import TfidfVectorizer
  3. from sklearn.model_selection import train_test_split
  4. import os
  5. from sklearn.linear_model import LogisticRegression
  6. import urllib.parse
  7. import pickle
  8. def loadData(name):
  9. with open(name, 'r', encoding="utf8") as f:
  10. result = []
  11. data = f.readlines()
  12. for d in data:
  13. if (len(d) > 0):
  14. d = str(urllib.parse.unquote(d))
  15. result.append(d.lower())
  16. return set(result)
  17.  
  18. def preProcessing(data):
  19. result = []
  20. for b in data:
  21. if '?' in b and b[-1] != '?':
  22. b = b[b.find('?') + 1:]
  23. b = re.sub('\s+', '', b)
  24. b = re.sub('\d+', '', b)
  25. str = ''
  26. qsparsed = urllib.parse.parse_qsl(b)
  27. for param in qsparsed:
  28. if '[' in param[0]:
  29. str += param[0]
  30. str += param[1]
  31. if len(str) >= 3:
  32. result.append(str)
  33. else:
  34. if b[-1] == "?":
  35. result.append(b[:-1])
  36. else:
  37. result.append(b)
  38. return set(result)
  39.  
  40. class WAF_ML:
  41. def __init__(self,mode="train"):
  42. self.mode = mode
  43. badQueries = loadData('malicious.txt')
  44. validQueries = loadData('good.txt')
  45.  
  46. badQueries = list(badQueries)
  47. validQueries = list(validQueries)
  48.  
  49. badQueriesforTrain = list(preProcessing(badQueries))
  50. validQueriesforTrain = list(preProcessing(validQueries))
  51.  
  52. self.queries = badQueriesforTrain + validQueriesforTrain
  53. self.badCount = len(badQueriesforTrain)
  54. self.validCount = len(validQueriesforTrain)
  55. yBad = [1 for i in range(0, len(badQueriesforTrain))]
  56. yGood = [0 for i in range(0, len(validQueriesforTrain))]
  57. self.y = yBad + yGood
  58. def train(self):
  59. self.vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(3, 3))
  60. self.X = self.vectorizer.fit_transform(self.queries)
  61. try:
  62. if self.mode == "train":
  63. 1/0
  64. lgsfile = open('lgsfile', 'rb')
  65. self.lgs = pickle.load(lgsfile)
  66. lgsfile.close()
  67. except:
  68. self.lgs = LogisticRegression(class_weight={1: 2 * self.validCount / self.badCount, 0: 1.0})
  69. self.lgs.fit(self.X, self.y)
  70. lgsfile = open('lgsfile', 'wb')
  71. pickle.dump(self.lgs, lgsfile)
  72. lgsfile.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement