Advertisement
Guest User

Untitled

a guest
Jan 19th, 2020
85
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.15 KB | None | 0 0
  1. import sys
  2. import os
  3. import email
  4. import base64
  5. import binascii
  6. import re
  7. import random
  8. import pickle
  9. import traceback
  10.  
  11. from w3lib.html import remove_tags
  12. from collections import Counter
  13. from googletrans import Translator
  14.  
  15. import nltk
  16. from nltk import word_tokenize, WordNetLemmatizer
  17. from nltk.corpus import stopwords
  18. from nltk import NaiveBayesClassifier, classify
  19.  
  20. APP_NAME = "AntiSpamFilter"
  21. APP_VERSION = "v1"
  22. MY_NAME = "Glodeanu Irina-Elena"
  23. ALIAS = "Avion"
  24.  
  25. nltk.download('wordnet')
  26. nltk.download('punkt')
  27. nltk.download('stopwords')
  28. stoplist = stopwords.words('english')
  29.  
  30. def doInfo(output_file):
  31. text = APP_NAME + "\n" + MY_NAME + "\n" + ALIAS + "\n" + APP_VERSION
  32. writeInFile(output_file, text)
  33.  
  34. def writeInFile(output_file, textToWrite):
  35.  
  36. f = open(output_file, "w+")
  37. f.write(textToWrite)
  38. f.close()
  39.  
  40. def extractEnglishWords(sentence):
  41. lemmatizer = WordNetLemmatizer()
  42. return [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(sentence) if not word in stoplist and word.isalpha()] # remove stopwords and not alphabetic strings
  43.  
  44. def translateToEnglish(textToTranslate):
  45.  
  46. translator = Translator()
  47. languageDetected = translator.detect(textToTranslate[:100]).lang
  48.  
  49. if languageDetected != "en":
  50. print("found language: " + languageDetected + "\n")
  51.  
  52. result = ""
  53. n = 100
  54. for part in [textToTranslate[i:i+n] for i in range(0, len(textToTranslate), n)]:
  55. print(part + "\n")
  56. result += translator.translate(text = part, src = languageDetected, dest = "en").text
  57.  
  58. return result
  59. else:
  60. return textToTranslate
  61.  
  62. def extractText(emailText):
  63. msg = email.message_from_string(emailText)
  64.  
  65. emailBody = re.sub(r"http\S+", "", msg.get_payload()) # remove hyperlinks
  66.  
  67. if not msg.get('Subject'):
  68. subject = ''
  69. else:
  70. subject = msg.get('Subject')
  71.  
  72. rawEmail = emailText
  73.  
  74. if "<html" in emailBody.lower():
  75. rawEmail = subject + '\n' + processHtmlMail(emailBody) # add back subject of the email
  76. elif is64Encoded(emailBody):
  77. rawEmail = subject + '\n' + process64EncodedMail(emailBody) # add back subject of the email
  78.  
  79. return translateToEnglish(rawEmail)
  80.  
  81. def is64Encoded(emailBody):
  82.  
  83. if ' ' in emailBody.strip():
  84. return False
  85.  
  86. try:
  87. base64.b64decode(emailBody)
  88. return True
  89. except binascii.Error:
  90. return False
  91. except ValueError:
  92. return False
  93.  
  94. def processHtmlMail(emailText):
  95.  
  96. return remove_tags(emailText)
  97.  
  98. def process64EncodedMail(emailText):
  99. return base64.b64decode(emailText).decode('latin-1').replace("\r\n"," ").replace("\n", " ")
  100.  
  101. def prepareEmail(email):
  102. return {word:True for word in extractEnglishWords(extractText(email))}
  103.  
  104. def doCleanliness(email_folder, output_file, classifier):
  105. directory = os.fsencode(email_folder)
  106.  
  107. result = ""
  108. for file in os.listdir(directory):
  109.  
  110. f = open(os.path.join(directory, file), "r", encoding="latin1")
  111. emailToTest = prepareEmail(f.read())
  112. f.close()
  113.  
  114. if classifier.classify(emailToTest) == "cln":
  115. result = result + file.decode("utf-8") + "|cln\n"
  116. else:
  117. result = result + file.decode("utf-8") + "|inf\n"
  118.  
  119. writeInFile(output_file, result.strip())
  120.  
  121. def readEmails(path):
  122. emails = []
  123. fileList = os.listdir(path)
  124.  
  125. for file in fileList:
  126. f = open(path + file, "r", encoding="latin1")
  127. emails.append(f.read())
  128. f.close()
  129. return emails
  130.  
  131. def getTrainingData():
  132.  
  133. cleanEmails = readEmails("/Users/dan.nastasa/Projects/antiSpamMalw/Lot1_/Lot1/Clean/")
  134. spamEmails = readEmails("/Users/dan.nastasa/Projects/antiSpamMalw/Lot1_/Lot1/Spam/")
  135.  
  136. allEmails = [(email, 'cln') for email in cleanEmails]
  137. allEmails += [(email, 'inf') for email in spamEmails]
  138. random.shuffle(allEmails)
  139.  
  140. return [(prepareEmail(email), result) for (email, result) in allEmails]
  141.  
  142. def trainClassifier():
  143. trainingData = getTrainingData()
  144. print("Extracted features")
  145.  
  146. classifier = NaiveBayesClassifier.train(trainingData)
  147. print("Finished training")
  148.  
  149. return classifier
  150.  
  151. def getClassifier():
  152. fn = os.path.join(os.path.dirname(__file__), 'classifier-l1-l2.pickle')
  153.  
  154. if os.path.exists(fn):
  155. print("Classifier already exists")
  156.  
  157. f = open(fn, "rb")
  158. classifier = pickle.load(f)
  159. f.close()
  160. else:
  161. print("Classifier does not exist. Training from scratch.")
  162. classifier = trainClassifier()
  163.  
  164. f = open(fn, "wb")
  165. pickle.dump(classifier, f)
  166. f.close()
  167.  
  168. return classifier
  169.  
  170. if __name__ == '__main__':
  171. try:
  172. if str(sys.argv[1]) == "-info" and len(sys.argv) == 3:
  173. print(str(sys.argv[1]) + str(sys.argv[2]))
  174. doInfo(sys.argv[2])
  175. elif str(sys.argv[1]) == "-scan" and len(sys.argv) == 4:
  176.  
  177. classifier = getClassifier()
  178. doCleanliness(sys.argv[2], sys.argv[3], classifier)
  179. else:
  180. print("wrong command")
  181. except:
  182. print(traceback.print_exc())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement