Guest User

Untitled

a guest
Jan 20th, 2018
313
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.38 KB | None | 0 0
  1. # -*- coding:utf8 -*-
  2. """
  3. Author : Myth
  4. Date : 18/1/11
  5. Email : email4myth at gmail.com
  6. """
  7.  
  8. import email
  9. import email.header
  10. import hashlib
  11.  
  12. import chardet
  13.  
  14.  
  def choose_alternative_part(subparts):
      """Pick the preferred part from a list of alternative message parts.

      The first part whose content subtype is ``'html'`` wins; when no part
      is HTML, the first part of the list is returned.

      :param subparts: non-empty sequence of objects exposing
          ``get_content_subtype()``.
      :return: the chosen element of ``subparts``.
      :raises IndexError: if ``subparts`` is empty.
      """
      for candidate in subparts:
          if candidate.get_content_subtype() == 'html':
              return candidate
      # No HTML alternative available: fall back to the first part.
      return subparts[0]
  17.  
  18.  
  def walk(mail):
      """Yield the non-multipart parts of ``mail``, depth first.

      A part that is not multipart is yielded as-is.  For a multipart
      container every child is walked recursively, except when the container
      is ``multipart/alternative``: then only the single child selected by
      ``choose_alternative_part`` is walked.

      :param mail: a message object exposing ``is_multipart()``,
          ``get_payload()`` and ``get_content_type()``.
      :return: generator of leaf message parts.
      """
      if not mail.is_multipart():
          yield mail
          return

      children = mail.get_payload()
      ctype = mail.get_content_type()
      if ctype and ctype.lower() == 'multipart/alternative':
          # Alternatives carry the same content in several formats; keep one.
          children = [choose_alternative_part(children)]

      for child in children:
          for leaf in walk(child):
              yield leaf
  33.  
  34.  
  def decode_header(header):
      """Decode an encoded mail header value into a single string.

      Double quotes are stripped from ``header`` first.  Each
      ``(value, charset)`` pair returned by ``email.header.decode_header`` is
      decoded with ``decode_str`` and the pieces are concatenated, with one
      space inserted before a piece whenever the text accumulated so far is
      non-empty.

      :param header: raw header value (must support ``replace``).
      :return: the decoded, concatenated header text.
      """
      cleaned = header.replace('"', '')
      result = ''
      for raw, charset in email.header.decode_header(cleaned):
          piece = decode_str(raw, charset)
          # Only separate with a space once something has been accumulated.
          result = (result + ' ' + piece) if result else (result + piece)
      return result
  46.  
  47.  
  def is_attachment(msg):
      """Return ``True`` when ``msg.get_filename()`` yields a truthy value.

      :param msg: a message part exposing ``get_filename()``.
      :return: ``True`` if the part has a filename, ``False`` otherwise.
      """
      filename = msg.get_filename()
      return True if filename else False
  50.  
  51.  
  def split_letter_and_attach(msgs):
      """Partition message parts into body parts and attachments.

      A part for which ``is_attachment`` is true goes to the attachment list.
      Any other part is kept as a letter only if its content main type is
      ``text`` (compared case-insensitively); everything else is dropped.

      :param msgs: iterable of message parts.
      :return: tuple ``(letters, attachments)`` of two lists, each preserving
          the input order.
      """
      letters, attachments = [], []

      for part in msgs:
          if is_attachment(part):
              bucket = attachments
          elif part.get_content_maintype().lower() == 'text':
              # For non-attachments, only text-type content is kept.
              bucket = letters
          else:
              continue
          bucket.append(part)

      return letters, attachments
  62.  
  63.  
  def decode_str(s, charset, errors='ignore'):
      """Decode a byte string to text, guessing the charset when needed.

      Falsy input (``None``, empty string) and input that is already text is
      returned unchanged.  When ``charset`` is falsy the encoding is guessed
      with ``chardet.detect``; if that yields nothing, UTF-8 is used.  A
      charset label that Python does not recognise no longer raises: the
      value is decoded as UTF-8 instead, in keeping with the best-effort
      ``errors='ignore'`` default.

      :param s: byte string (or text / falsy value) to decode.
      :param charset: declared charset name, or a falsy value to auto-detect.
      :param errors: codec error handler passed to ``decode``.
      :return: decoded text, or ``s`` itself when nothing needs decoding.
      """
      if not s:
          return s

      # ``unicode`` only exists on Python 2; on Python 3 the text type is str.
      try:
          text_type = unicode  # noqa: F821
      except NameError:
          text_type = str

      if isinstance(s, text_type):
          return s

      if not charset:
          # detect() returns e.g. {'confidence': 0.99, 'encoding': 'GB2312'}
          guess = chardet.detect(s)
          if guess:
              charset = guess.get('encoding')

      if not charset:
          charset = 'utf-8'

      try:
          return s.decode(charset, errors=errors)
      except LookupError:
          # Unknown/bogus charset label (declared or detected): fall back to
          # UTF-8 rather than failing the whole parse.
          return s.decode('utf-8', errors=errors)
  81.  
  82.  
  def merge_letters(letters):
      """Concatenate the decoded payloads of the given body parts.

      Each part's transfer-decoded payload is converted to text with
      ``decode_str`` using the part's declared content charset, and followed
      by a ``'\\r\\n\\r\\n'`` separator.  Parts whose payload is empty or
      missing are skipped entirely (no separator is added for them).

      :param letters: iterable of message parts exposing ``get_payload`` and
          ``get_content_charset``.
      :return: the merged text; an empty unicode string when nothing was kept.
      """
      chunks = []

      for part in letters:
          raw = part.get_payload(decode=True)
          if not raw:
              continue

          text = decode_str(raw, part.get_content_charset())
          chunks.append(text)
          chunks.append('\r\n\r\n')

      return u''.join(chunks)
  98.  
  99.  
  def content_md5(content):
      """Return the hexadecimal MD5 digest of ``content``.

      :param content: byte string to hash.
      :return: the digest as a hexadecimal string.
      """
      digest = hashlib.md5()
      digest.update(content)
      return digest.hexdigest()
  102.  
  103.  
  def save_attachment(attachment):
      """Build an info dict describing one attachment part.

      Nothing is written anywhere; the function only collects data from the
      part.

      :param attachment: message part exposing ``get_payload`` and
          ``get_filename``.
      :return: dict with keys ``'filename'`` (the filename passed through
          ``decode_header``), ``'md5'`` (``content_md5`` of the payload) and
          ``'content'`` (the transfer-decoded payload).
      """
      payload = attachment.get_payload(decode=True)
      info = {
          'filename': decode_header(attachment.get_filename()),
          'md5': content_md5(payload),
          'content': payload,
      }
      return info
  113.  
  114.  
  def parse_raw_mail(raw_mail, truncated):
      """Parse a raw mail string into its text content and attachments.

      The message is parsed with ``email.message_from_string`` (after
      stripping leading whitespace), flattened with ``walk``, split into body
      parts and attachments, and the body parts are merged into one text.

      :param raw_mail: the raw message source as a string.
      :param truncated: truthy when the raw source was cut off; in that case
          a trailing attachment part is discarded because it is presumably
          incomplete.
      :return: dict with keys ``'content'`` (merged body text) and
          ``'attachments'`` (list of dicts produced by ``save_attachment``).
          Both are empty when parsing fails or no parts are found.
      """
      try:
          mail = email.message_from_string(raw_mail.lstrip())
      except Exception as e:
          # Best-effort parsing: report the problem and return an empty result.
          # Parenthesised single-argument print behaves the same on Python 2
          # and is valid syntax on Python 3.
          print("error happened when get message from str: %s" % e)
          return {'content': '', 'attachments': []}

      msgs = list(walk(mail))
      if not msgs:
          return {'content': '', 'attachments': []}

      # Ignore the last attachment when the mail was truncated.
      if truncated and is_attachment(msgs[-1]):
          msgs.pop()

      letters, attachments = split_letter_and_attach(msgs)

      return {
          'content': merge_letters(letters),
          'attachments': [save_attachment(attach) for attach in attachments],
      }
Add Comment
Please, Sign In to add comment