spam_classifier

from typing import TypeVar, List, Tuple, Dict, Iterable, NamedTuple, Set
from collections import defaultdict, Counter
import re
import random
import math

# import the Porter stemmer
from nltk.stem import PorterStemmer


X = TypeVar('X')  # generic type to represent a data point

def split_data(data: List[X], prob: float) -> Tuple[List[X], List[X]]:
    """Split data into fractions [prob, 1 - prob]"""
    data = data[:]                    # Make a shallow copy
    random.shuffle(data)              # because shuffle modifies the list.
    cut = int(len(data) * prob)       # Use prob to find a cutoff
    return data[:cut], data[cut:]     # and split the shuffled list there.

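# For example, a 75/25 split of ten items always yields sizes 7 and 3,
# though which items land where depends on the shuffle:
train, test = split_data(list(range(10)), 0.75)
assert len(train) == 7 and len(test) == 3
assert sorted(train + test) == list(range(10))
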
def tokenize(text: str) -> Set[str]:
    text = text.lower()                         # Convert to lowercase,
    all_words = re.findall("[a-z0-9']+", text)  # extract the words, and
    # 1st change: added the Porter stemmer,
    # which reduces each word to its stem.
    stemmer = PorterStemmer()
    all_words = [stemmer.stem(word) for word in all_words]

    return set(all_words)                       # remove duplicates.

# Stemming changes the expected tokens: "science" becomes "scienc".
assert tokenize("Data Science is science") == {"data", "scienc", "is"}

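# A further illustration (token values assume NLTK's PorterStemmer):
# inflected variants collapse onto a single stem.
assert tokenize("rule rules ruled") == {"rule"}
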
class Message(NamedTuple):
    text: str
    is_spam: bool

class NaiveBayesClassifier:
    def __init__(self, k: float = 0.5) -> None:
        self.k = k  # smoothing factor

        self.tokens: Set[str] = set()
        self.token_spam_counts: Dict[str, int] = defaultdict(int)
        self.token_ham_counts: Dict[str, int] = defaultdict(int)
        self.spam_messages = self.ham_messages = 0

    def train(self, messages: Iterable[Message]) -> None:
        for message in messages:
            # Increment message counts
            if message.is_spam:
                self.spam_messages += 1
            else:
                self.ham_messages += 1

            # Increment word counts
            for token in tokenize(message.text):
                self.tokens.add(token)
                if message.is_spam:
                    self.token_spam_counts[token] += 1
                else:
                    self.token_ham_counts[token] += 1

    def probabilities(self, token: str) -> Tuple[float, float]:
        """returns P(token | spam) and P(token | not spam)"""
        spam = self.token_spam_counts[token]
        ham = self.token_ham_counts[token]

        p_token_spam = (spam + self.k) / (self.spam_messages + 2 * self.k)
        p_token_ham = (ham + self.k) / (self.ham_messages + 2 * self.k)

        return p_token_spam, p_token_ham
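
    # Worked example of the smoothing above, with k = 0.5: a token seen in
    # 1 of 1 spam messages gets P(token | spam) = (1 + 0.5) / (1 + 2 * 0.5)
    # = 0.75 rather than 1.0, so no single token can force the posterior
    # to exactly 0 or 1.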
    # 3rd change: added a min_count parameter; tokens whose total training
    # count is min_count or less are left out of the vocabulary when scoring.
    def predict(self, text: str, min_count: int = 0) -> float:
        text_tokens = tokenize(text)
        log_prob_if_spam = log_prob_if_ham = 0.0

        # Iterate through each word in our vocabulary.
        for token in self.tokens:
            if self.token_spam_counts[token] + self.token_ham_counts[token] > min_count:
                prob_if_spam, prob_if_ham = self.probabilities(token)

                # If *token* appears in the message,
                # add the log probability of seeing it;
                if token in text_tokens:
                    log_prob_if_spam += math.log(prob_if_spam)
                    log_prob_if_ham += math.log(prob_if_ham)

                # otherwise add the log probability of _not_ seeing it,
                # which is log(1 - probability of seeing it).
                else:
                    log_prob_if_spam += math.log(1.0 - prob_if_spam)
                    log_prob_if_ham += math.log(1.0 - prob_if_ham)

        prob_if_spam = math.exp(log_prob_if_spam)
        prob_if_ham = math.exp(log_prob_if_ham)
        if (prob_if_spam + prob_if_ham) > 0:
            return prob_if_spam / (prob_if_spam + prob_if_ham)
        else:
            return 0.0

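# Why predict accumulates log probabilities: multiplying many small
# probabilities underflows to 0.0, while summing their logs stays finite.
# (Illustrative only; not part of the classifier.)
tiny = [0.01] * 200
product = 1.0
for p in tiny:
    product *= p                               # 0.01 ** 200 underflows to 0.0
assert product == 0.0
assert sum(math.log(p) for p in tiny) < 0      # about -921, perfectly usable
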
###############################################################################
# Model tests
#

messages = [Message("spam rules", is_spam=True),
            Message("ham rules", is_spam=False),
            Message("hello ham", is_spam=False)]

model = NaiveBayesClassifier(k=0.5)
model.train(messages)

# With stemming, "rules" is stored as "rule":
assert model.tokens == {"spam", "ham", "rule", "hello"}
assert model.spam_messages == 1
assert model.ham_messages == 2
assert model.token_spam_counts == {"spam": 1, "rule": 1}
assert model.token_ham_counts == {"ham": 2, "rule": 1, "hello": 1}

text = "hello spam"

probs_if_spam = [
    (1 + 0.5) / (1 + 2 * 0.5),      # "spam"  (present)
    1 - (0 + 0.5) / (1 + 2 * 0.5),  # "ham"   (not present)
    1 - (1 + 0.5) / (1 + 2 * 0.5),  # "rule"  (not present)
    (0 + 0.5) / (1 + 2 * 0.5)       # "hello" (present)
]

probs_if_ham = [
    (0 + 0.5) / (2 + 2 * 0.5),      # "spam"  (present)
    1 - (2 + 0.5) / (2 + 2 * 0.5),  # "ham"   (not present)
    1 - (1 + 0.5) / (2 + 2 * 0.5),  # "rule"  (not present)
    (1 + 0.5) / (2 + 2 * 0.5),      # "hello" (present)
]

p_if_spam = math.exp(sum(math.log(p) for p in probs_if_spam))
p_if_ham = math.exp(sum(math.log(p) for p in probs_if_ham))

# Should be about 0.83. min_count=0 keeps every token, which is what
# the hand computation above assumes.
assert math.isclose(model.predict(text, 0), p_if_spam / (p_if_spam + p_if_ham))
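
# With min_count above every token's total count, no token qualifies, both
# log sums stay at 0.0, and predict returns an uninformative 0.5:
assert model.predict(text, 10) == 0.5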

###############################################################################
# Example with real messages
#

#def main():
import glob

# modify the path to wherever you've put the files
path = 'emails/*/*'

data: List[Message] = []

# glob.glob returns every filename that matches the wildcarded path
for filename in glob.glob(path):
    is_spam = "ham" not in filename

    # There are some garbage characters in the emails; errors='ignore'
    # skips them instead of raising an exception.
    with open(filename, errors='ignore') as email_file:
        trigger = False
        txt = ''
        # 2nd change: the loop now reads the email body, which starts
        # after the first blank line.
        for line in email_file:
            if line == '\n':
                trigger = True
                continue
            if trigger:
                txt += ' ' + line
        data.append(Message(txt, is_spam))

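# A sketch of an alternative using the standard-library email package in
# place of the manual blank-line scan. It assumes single-part, plain-text
# messages; multipart emails would need msg.walk(). Not used below.
import email

def read_body(filename: str) -> str:
    with open(filename, errors='ignore') as email_file:
        msg = email.message_from_file(email_file)
        return msg.get_payload()
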
random.seed(0)      # just so you get the same answers as me
train_messages, test_messages = split_data(data, 0.75)

model = NaiveBayesClassifier()
model.train(train_messages)

predictions = [(message, model.predict(message.text, 10))
               for message in test_messages]

# Assume that spam_probability > 0.5 corresponds to spam prediction
# and count the combinations of (actual is_spam, predicted is_spam)
confusion_matrix = Counter((message.is_spam, spam_probability > 0.5)
                           for message, spam_probability in predictions)

print(confusion_matrix)

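# The (actual, predicted) counts give the standard metrics directly.
# tp = spam correctly flagged, fp = ham wrongly flagged, and so on.
tp = confusion_matrix[(True, True)]
fp = confusion_matrix[(False, True)]
fn = confusion_matrix[(True, False)]
tn = confusion_matrix[(False, False)]

precision = tp / (tp + fp) if tp + fp else 0.0  # flagged mail that is spam
recall = tp / (tp + fn) if tp + fn else 0.0     # spam that gets flagged
total = tp + fp + fn + tn
accuracy = (tp + tn) / total if total else 0.0

print("precision", precision)
print("recall", recall)
print("accuracy", accuracy)
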
def p_spam_given_token(token: str, model: NaiveBayesClassifier) -> float:
    prob_if_spam, prob_if_ham = model.probabilities(token)

    return prob_if_spam / (prob_if_spam + prob_if_ham)

words = sorted(model.tokens, key=lambda t: p_spam_given_token(t, model))

print("spammiest_words", words[-10:])
print("hammiest_words", words[:10])

#if __name__ == "__main__": main()