Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import glob
- from Class import Mail
- # import xml.etree.ElementTree as ET
def read_mails(pattern='spam/*.txt'):
    """Load the mail files matching *pattern* and split them into spam and ham.

    A file is treated as ham when its path contains the substring ``'ham'``;
    every other file is treated as spam. The whole path is inspected, so a
    pattern whose directory part contains ``'ham'`` marks every file as ham.

    Args:
        pattern: glob pattern selecting the mail files. Defaults to the
            previously hard-coded ``'spam/*.txt'``.

    Returns:
        A ``(lista_spam, lista_ham)`` tuple of lists of loaded ``Mail``
        objects. Both lists are empty when nothing matches the pattern.
    """
    lista_spam = []
    lista_ham = []
    for path in glob.glob(pattern):
        mail = Mail()
        mail.load_mail(path)
        # Pick the destination list once instead of branching on append.
        target = lista_ham if 'ham' in path else lista_spam
        target.append(mail)
    return lista_spam, lista_ham
def classify(lista_maili):
    """Count how often each lower-cased word occurs across a list of mails.

    For every mail, ``show_mail()`` is called and the ``tresc`` and ``tytul``
    strings are tokenized. Any whitespace, ``,`` and ``?`` separate words;
    tokens equal to the two-character literal backslash-t are discarded.

    Args:
        lista_maili: iterable of mail objects exposing ``show_mail()``,
            ``tresc`` and ``tytul``.

    Returns:
        dict mapping each lower-cased word to its number of occurrences.
        The dict is also printed before it is returned.
    """
    spam = {}
    for mail in lista_maili:
        mail.show_mail()
        text = mail.tresc + ' ' + mail.tytul
        # Turn punctuation separators into spaces instead of truncating the
        # token at them, so words after a comma/newline are still counted.
        for separator in (',', '?'):
            text = text.replace(separator, ' ')
        # str.split() without arguments splits on any whitespace run and never
        # yields empty strings, so no separate empty-token filter is needed.
        for slowo in text.split():
            # Skip literal "\t" tokens only; the earlier
            # ``del slowa[slowa == '\\t']`` always evaluated to
            # ``del slowa[0]`` and dropped the first word of every mail.
            if slowo == '\\t':
                continue
            key = slowo.lower()
            spam[key] = spam.get(key, 0) + 1
    print(spam)
    return spam
def _scaled_frequencies(counts, extra_denominator=0):
    """Return ``{word: count / (sum of counts + extra_denominator)}``."""
    denominator = sum(counts.values()) + extra_denominator
    return {word: count / denominator for word, count in counts.items()}


def count_probability(spam, ham, *, k=2, p=2):
    """Print per-word relative frequencies for spam and ham word counts.

    For each of the two count dicts, two tables are printed: the plain
    ``count / total`` ratio and a "smoothed" ratio ``count / (total + k * p)``.

    Args:
        spam: dict mapping word -> occurrence count in spam mails.
        ham: dict mapping word -> occurrence count in ham mails.
        k: factor of the term added to the smoothed denominator (default 2).
        p: second factor of that term (default 2).

    Returns:
        None; the results are only printed.
    """
    # NOTE(review): the smoothed value adds k*p to the denominator but nothing
    # to the numerator. Textbook additive smoothing is
    # (count + k) / (total + k * number_of_outcomes) -- confirm the intended
    # formula before relying on these numbers.
    smoothing_term = k * p

    p_spam = _scaled_frequencies(spam)
    pl_spam = _scaled_frequencies(spam, smoothing_term)
    p_ham = _scaled_frequencies(ham)
    pl_ham = _scaled_frequencies(ham, smoothing_term)

    print(" ")
    print("Prawdopodobienstwo warunkowe slow w spamie:")
    print(p_spam)
    print("Po wygladzeniu:")
    print(pl_spam)
    print(" ")
    print("Prawdopodobienstwo warunkowe slow w hamie:")
    print(p_ham)
    print("Po wygladzeniu:")
    print(pl_ham)
    print("")
- """
- # TODO
- def test():
- mail = Mail()
- mail.load_mail('example.txt')
- mail = [mail]
- example = classify(mail)
- """
- """
- def xml():
- tree = ET.parse('dict.xml')
- root = tree.getroot()
- children = root.getchildren()
- print(root)
- print(children)
- xml_dict = {}
- for child in children:
- xml_dict[child.getText()] = child.
- """
def main():
    """Load the mails, count words per class and print the probabilities."""
    spam_mails, ham_mails = read_mails()
    # Spam is classified before ham, which fixes the order of printed output.
    spam_counts = classify(spam_mails)
    ham_counts = classify(ham_mails)
    count_probability(spam_counts, ham_counts)
    # test() and xml() stay disabled: both exist only inside string literals.


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement