SHARE
TWEET

Untitled

a guest Mar 20th, 2019 72 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import glob
  2. from Class import Mail
  3. # import xml.etree.ElementTree as ET
  4.  
  5.  
  6. def read_mails():
  7.     file_paths = glob.glob('spam/*.txt')
  8.     # print(file_paths)
  9.     # print(file_paths[5])
  10.  
  11.     lista_spam = []
  12.     lista_ham = []
  13.  
  14.     for path in file_paths:
  15.         mail = Mail()
  16.         mail.load_mail(path)
  17.         if path.find('ham') != -1:
  18.             lista_ham.append(mail)
  19.         else:
  20.             lista_spam.append(mail)
  21.  
  22.     return lista_spam, lista_ham
  23.  
  24.  
  25. def classify(lista_maili):
  26.     spam = {}
  27.     for mail in lista_maili:
  28.         mail.show_mail()
  29.         slowa = mail.tresc.split(' ') + mail.tytul.split(' ')
  30.  
  31.         slowa = [i.split(',', 1)[0] for i in slowa]
  32.         slowa = [i.split('\n', 1)[0] for i in slowa]
  33.         slowa = [i.split('\r', 1)[0] for i in slowa]
  34.         slowa = [i.split('\t', 1)[0] for i in slowa]
  35.         slowa = [i.split('?', 1)[0] for i in slowa]
  36.         del slowa[slowa == '\\t']
  37.         slowa[-1] = slowa[-1].strip()
  38.         slowa = list(filter(lambda a: a != '', slowa))
  39.  
  40.         # print(slowa)
  41.         for slowo in slowa:
  42.             if slowo.lower() in spam:
  43.                 spam[slowo.lower()] += 1
  44.             else:
  45.                 spam[slowo.lower()] = 1
  46.     print(spam)
  47.     return spam
  48.  
  49.  
  50. def count_probability(spam, ham):
  51.     spam_count = 0
  52.     ham_count = 0
  53.     p_spam = {}
  54.     p_ham = {}
  55.     p = 2
  56.     k = 2
  57.     pl_spam = {}
  58.     pl_ham = {}
  59.     for _, value in spam.items():
  60.         spam_count = spam_count + value
  61.     for _, value in ham.items():
  62.         ham_count = ham_count + value
  63.     for key, value in spam.items():
  64.         p_spam[key] = value/spam_count
  65.         pl_spam[key] = value/(spam_count + k*p)
  66.     for key, value in ham.items():
  67.         p_ham[key] = value / ham_count
  68.         pl_ham[key] = value / (ham_count + k*p)
  69.  
  70.     print(" ")
  71.     print("Prawdopodobienstwo warunkowe slow w spamie:")
  72.     print(p_spam)
  73.     print("Po wygladzeniu:")
  74.     print(pl_spam)
  75.  
  76.     print(" ")
  77.     print("Prawdopodobienstwo warunkowe slow w hamie:")
  78.     print(p_ham)
  79.     print("Po wygladzeniu:")
  80.     print(pl_ham)
  81.     print("")
  82.  
  83.  
# Disabled draft of a manual check: it would load 'example.txt' into a Mail
# and run classify() on it. It is wrapped in a bare string literal, so it is
# evaluated as a string and never executed.
"""
# TODO
def test():
    mail = Mail()
    mail.load_mail('example.txt')
    mail = [mail]
    example = classify(mail)
"""
  92.  
  93.  
# Disabled, unfinished draft of an XML dictionary loader, wrapped in a bare
# string literal so it never runs. It relies on the ``ET`` import that is
# commented out at the top of the file, and its last statement (``child.``)
# is incomplete, so it would not parse if re-enabled as-is.
"""
def xml():
    tree = ET.parse('dict.xml')
    root = tree.getroot()
    children = root.getchildren()
    print(root)
    print(children)
    xml_dict = {}
    for child in children:
        xml_dict[child.getText()] = child.
"""
  105.  
  106.  
  107. def main():
  108.     lista_spam, lista_ham = read_mails()
  109.     spam = classify(lista_spam)
  110.     ham = classify(lista_ham)
  111.     count_probability(spam, ham)
  112.     # test()
  113.     # xml()
  114.  
  115.  
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Not a member of Pastebin yet?
Sign Up, it unlocks many cool features!
 
Top