Advertisement
Guest User

Untitled

a guest
Mar 27th, 2017
49
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.24 KB | None | 0 0
  1. import os
  2. import re
  3. import xml.etree.ElementTree as ET
  4.  
  5.  
  class Mail:
      """One e-mail message together with its spam/ham counters.

      Attributes:
          addres: The value passed as ``v`` (the attribute spelling is kept
              as-is so existing users of the class keep working).
          tresc: The message text; defaults to an empty string.
          spam: Counter, initialised to 0.
          ham: Counter, initialised to 0.
      """

      def __init__(self, v, tresc=''):
          """Store the address ``v`` and text ``tresc``; zero both counters."""
          self.addres = v
          self.tresc = tresc
          self.spam = 0
          self.ham = 0
  12.  
  13.  
  SPAM = {}    # keyword text -> 'probabilty' attribute value, for type="spam" entries
  HAM = {}     # keyword text -> 'probabilty' attribute value, for type="ham" entries
  maile = []   # one Mail object per processed .txt file

  # Pattern used to pull the first e-mail-looking token out of a message.
  _EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")


  def _count_keyword(keyword, lowered_text):
      """Return the number of non-overlapping occurrences of *keyword*.

      *lowered_text* must already be lower-cased; the keyword is lower-cased
      here so the comparison is case-insensitive. A missing or empty keyword
      counts as 0 instead of raising or matching everywhere.

      The previous pattern ``'^(' + k + ')*'`` was anchored at the start of
      the text and allowed an empty match, so ``re.findall`` always returned
      exactly one element regardless of the text.
      NOTE(review): keywords are treated as literal text, not as regular
      expressions -- confirm against the contents of dict.xml.
      """
      if not keyword:
          return 0
      return lowered_text.count(keyword.lower())


  # Load the keyword dictionary: each child element's text is a keyword and
  # its 'type' attribute selects which table it goes into.
  doc = ET.parse('spam/dict.xml')
  root = doc.getroot()
  for child in root:
      t = child.text
      # NOTE(review): attribute name is spelled 'probabilty' here; confirm it
      # matches the spelling actually used in dict.xml.
      p = child.attrib['probabilty']
      kind = child.attrib['type']
      if kind == 'spam':
          SPAM[t] = p
      elif kind == 'ham':
          HAM[t] = p

  print(SPAM)
  print(HAM)

  # Walk the corpus and process every .txt file.
  for dirname, dirnames, filenames in os.walk('spam/'):
      for filename in filenames:
          # Check the extension, not a substring, so e.g. 'a.txt.bak' is skipped.
          if not filename.endswith('.txt'):
              continue

          with open(os.path.join(dirname, filename)) as f:
              data = f.read()

          # A message without any address used to crash on `.group(0)`;
          # record it with an empty address instead.
          found = _EMAIL_RE.search(data)
          r = found.group(0) if found else ''
          maile.append(Mail(r, data))

          # NOTE(review): the paste lost its indentation; the counting loops
          # are assumed to run once per message because they read `data`.
          lowered = data.lower()  # lower-case once, reuse for every keyword

          for k in SPAM:
              count = _count_keyword(k, lowered)
              print(count)

          for k in HAM:
              count = _count_keyword(k, lowered)
              print(count)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement