Advertisement
Guest User

Untitled

a guest
Mar 20th, 2019
147
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.70 KB | None | 0 0
  1. import glob
  2. from Class import Mail
  3. # import xml.etree.ElementTree as ET
  4.  
  5.  
  6. def read_mails():
  7. file_paths = glob.glob('spam/*.txt')
  8. # print(file_paths)
  9. # print(file_paths[5])
  10.  
  11. lista_spam = []
  12. lista_ham = []
  13.  
  14. for path in file_paths:
  15. mail = Mail()
  16. mail.load_mail(path)
  17. if path.find('ham') != -1:
  18. lista_ham.append(mail)
  19. else:
  20. lista_spam.append(mail)
  21.  
  22. return lista_spam, lista_ham
  23.  
  24.  
  25. def classify(lista_maili):
  26. spam = {}
  27. for mail in lista_maili:
  28. mail.show_mail()
  29. slowa = mail.tresc.split(' ') + mail.tytul.split(' ')
  30.  
  31. slowa = [i.split(',', 1)[0] for i in slowa]
  32. slowa = [i.split('\n', 1)[0] for i in slowa]
  33. slowa = [i.split('\r', 1)[0] for i in slowa]
  34. slowa = [i.split('\t', 1)[0] for i in slowa]
  35. slowa = [i.split('?', 1)[0] for i in slowa]
  36. del slowa[slowa == '\\t']
  37. slowa[-1] = slowa[-1].strip()
  38. slowa = list(filter(lambda a: a != '', slowa))
  39.  
  40. # print(slowa)
  41. for slowo in slowa:
  42. if slowo.lower() in spam:
  43. spam[slowo.lower()] += 1
  44. else:
  45. spam[slowo.lower()] = 1
  46. print(spam)
  47. return spam
  48.  
  49.  
  50. def count_probability(spam, ham):
  51. spam_count = 0
  52. ham_count = 0
  53. p_spam = {}
  54. p_ham = {}
  55. p = 2
  56. k = 2
  57. pl_spam = {}
  58. pl_ham = {}
  59. for _, value in spam.items():
  60. spam_count = spam_count + value
  61. for _, value in ham.items():
  62. ham_count = ham_count + value
  63. for key, value in spam.items():
  64. p_spam[key] = value/spam_count
  65. pl_spam[key] = value/(spam_count + k*p)
  66. for key, value in ham.items():
  67. p_ham[key] = value / ham_count
  68. pl_ham[key] = value / (ham_count + k*p)
  69.  
  70. print(" ")
  71. print("Prawdopodobienstwo warunkowe slow w spamie:")
  72. print(p_spam)
  73. print("Po wygladzeniu:")
  74. print(pl_spam)
  75.  
  76. print(" ")
  77. print("Prawdopodobienstwo warunkowe slow w hamie:")
  78. print(p_ham)
  79. print("Po wygladzeniu:")
  80. print(pl_ham)
  81. print("")
  82.  
  83.  
# Disabled draft of a single-mail smoke test. It is kept inside a bare string
# literal so it never executes; the matching call in main() is commented out.
"""
# TODO
def test():
    mail = Mail()
    mail.load_mail('example.txt')
    mail = [mail]
    example = classify(mail)
"""
  92.  
  93.  
# Disabled, unfinished draft of an XML dictionary loader. It is kept inside a
# bare string literal so it never executes. The final assignment is incomplete
# (it stops at "child."), so this text would not parse if re-enabled as is.
# NOTE(review): Element.getchildren() was removed from xml.etree.ElementTree
# in Python 3.9; iterate over the element directly if this is ever revived.
"""
def xml():
    tree = ET.parse('dict.xml')
    root = tree.getroot()
    children = root.getchildren()
    print(root)
    print(children)
    xml_dict = {}
    for child in children:
        xml_dict[child.getText()] = child.
"""
  105.  
  106.  
  107. def main():
  108. lista_spam, lista_ham = read_mails()
  109. spam = classify(lista_spam)
  110. ham = classify(lista_ham)
  111. count_probability(spam, ham)
  112. # test()
  113. # xml()
  114.  
  115.  
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement