Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os
- import re
- import xml.etree.ElementTree as ET
class Mail:
    """A single e-mail message together with its spam/ham tallies.

    Attributes:
        addres: the address passed to the constructor (attribute spelling
            kept as in the original so existing users keep working).
        tresc: the raw text of the message.
        spam: spam tally, initialised to 0.
        ham: ham tally, initialised to 0.
    """

    def __init__(self, v: str, tresc: str = '') -> None:
        """Create a message for address *v* with optional body *tresc*."""
        self.addres = v
        self.tresc = tresc
        self.spam = 0
        self.ham = 0

    def __repr__(self) -> str:
        # Only the address and the tallies: the body can be arbitrarily long.
        return (
            f"{type(self).__name__}(addres={self.addres!r}, "
            f"spam={self.spam!r}, ham={self.ham!r})"
        )
# Word -> probability tables filled from spam/dict.xml, one per category.
# Probabilities are stored exactly as read from the XML (strings).
SPAM = {}
HAM = {}
# One Mail object per .txt file found under spam/.
maile = []

# Compiled once instead of on every file; the pattern itself is unchanged.
_EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")

doc = ET.parse('spam/dict.xml')
root = doc.getroot()
for child in root:
    t = child.text
    # NOTE(review): 'probabilty' is kept exactly as the original spelled it;
    # presumably it matches the attribute name used in dict.xml -- confirm.
    p = child.attrib['probabilty']
    if not t:
        # An element without text gives nothing to search for (and used to
        # raise TypeError later when concatenated into a pattern).
        continue
    if child.attrib['type'] == 'spam':
        SPAM[t] = p
    elif child.attrib['type'] == 'ham':
        HAM[t] = p
print(SPAM)
print(HAM)

for dirname, _dirnames, filenames in os.walk('spam/'):
    for filename in filenames:
        # endswith() rather than a substring test, so that names such as
        # 'notes.txt.bak' are not treated as messages.
        if not filename.endswith('.txt'):
            continue
        with open(os.path.join(dirname, filename)) as f:
            data = f.read()

        # A file without any address is still recorded (with an empty
        # address) instead of crashing on `None.group(0)`.
        found = _EMAIL_RE.search(data)
        r = found.group(0) if found else ''
        maile.append(Mail(r, data))

        # Lower-case the body once per file, not once per dictionary word.
        text = data.lower()

        # Count real occurrences of each dictionary word. The previous
        # pattern '^(word)*' was anchored at the start of the text and
        # always produced exactly one (possibly empty) match, so the
        # printed count was 1 no matter what the message contained.
        for k in SPAM:
            count = text.count(k.lower())
            print(count)
        for k in HAM:
            count = text.count(k.lower())
            print(count)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement