Guest User

Untitled

a guest
Mar 6th, 2018
286
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.78 KB | None | 0 0
  1. ## DNICE -> lol@auburn.edu
  2.  
  3. import math
  4. import re
  5. from scipy.stats.stats import pearsonr
  6. import pylab
  7.  
  8. class Markov(object):
  9.     def doMarkov(self, text_list,gram_size):
  10.         gram_str_dict = {}
  11.         gram_size += 1
  12.  
  13.         # Read all of the tokens from the database:
  14.         # Create a dictionary of gram strings
  15.         # AAA,AAB,AAC -> {AA:[A,B,C]}
  16.         for token in text_list:
  17.             x = re.compile('.'*gram_size)
  18.             grams = x.findall(token)
  19.            
  20.             for g in grams:
  21.                 if g[:gram_size-1] not in gram_str_dict:
  22.                     gram_str_dict[g[:gram_size-1]] = [g[-1]]
  23.                 else:
  24.                     gram_str_dict[g[:gram_size-1]].append(g[-1])
  25.  
  26.         return gram_str_dict
  27.  
  28.     def calculateSurprisal(self, token, gram_str_dict,gram_size):
  29.         total_prob = 1.0
  30.         gram_size += 1
  31.         x = re.compile('.'*gram_size)
  32.                  
  33.         for g in x.findall(token):
  34.             gram = g[:gram_size-1]
  35.             gram_next_char = g[-1]
  36.             gram_list = gram_str_dict[gram]
  37.             total_next_states = len(gram_list)
  38.             my_states = gram_list.count(gram_next_char)
  39.             gram_prob = float(my_states)/float(total_next_states)
  40.             total_prob *= gram_prob
  41.  
  42.         surprisal = -1*math.log(total_prob,2)
  43.  
  44.         return surprisal
  45.  
  46. if __name__ == '__main__':
  47.     # Create a new markov class
  48.     m = Markov()
  49.  
  50.     flat_db = open('./parsed_db.txt')
  51.     lines = flat_db.readlines()
  52.  
  53.     username_list = []
  54.     password_list = []
  55.     email_list = []
  56.  
  57.     username_surprisals = []
  58.     password_surprisals = []
  59.     email_surprisals = []
  60.     uvp = []
  61.    
  62.     gram_len = 3
  63.     username_re = re.compile('^.*?:::')
  64.     password_re = re.compile(':::.*?:::')
  65.  
  66.     #TODO passwords and emails
  67.     for i in lines:
  68.         username = username_re.findall(i)[0][:-4]
  69.         password = password_re.findall(i)[0][4:-4]
  70.         #email = email_re.findall(i)[0][4:]
  71.        
  72.         if len(username) > gram_len and len(password) > gram_len:
  73.             username_list.append(username)
  74.             password_list.append(password)
  75.  
  76.     username_gram_str_dict = m.doMarkov(username_list,gram_len)
  77.     password_gram_str_dict = m.doMarkov(password_list,gram_len)
  78.     #email_gram_str_dict = m.doMarkov(password_list,gram_len)
  79.  
  80.     for i in username_list:
  81.         surprisal = m.calculateSurprisal(i,username_gram_str_dict,gram_len)
  82.         username_surprisals.append(float(surprisal))
  83.     for i in password_list:
  84.         surprisal = m.calculateSurprisal(i,password_gram_str_dict,gram_len)
  85.         password_surprisals.append(float(surprisal))
  86.  
  87.     print pearsonr(password_surprisals,username_surprisals)
  88.     pylab.scatter(password_surprisals,username_surprisals)
  89.     pylab.show()
Add Comment
Please, Sign In to add comment