Advertisement
Guest User

Untitled

a guest
Sep 24th, 2017
77
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.15 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3.  
  4. import email
  5. import html2text
  6. html2text.UNICODE_SNOB = 1 # No reason to replace unicode characters with ascii lookalikes there
  7. import re
  8.  
  9. try:
  10. from IPython import embed
  11. except:
  12. embed = False
  13.  
  14. import mailbox
  15. maildir = mailbox.Maildir('/home/andres/gmail-archive')
  16. userid = 'andres.erbsen'
  17.  
  18. words = re.compile(ur'[\wöäüõšž]+',re.UNICODE+re.IGNORECASE)
  19. madr = re.compile(ur"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""",re.IGNORECASE+re.UNICODE)
  20. SEP = u'_'
  21.  
  22. def getmessagetext_plain(message):
  23. """ Returns all plaintexts content in a message"""
  24. if message.get_content_type() == 'text/plain':
  25. encoding = message.get_content_charset()
  26. text = message.get_payload(decode=True)
  27. if encoding:
  28. text = text.decode(encoding,errors='ignore')
  29. else:
  30. # Let's just try to decode it, the chances are this will
  31. # work and even a text without unicode characters is better
  32. # than no text at all
  33. text = text.decode('unicode-escape',errors='ignore')
  34. return text + '\n'
  35. elif message.is_multipart():
  36. # Parts are message too, so they can consist of parts again. They do.
  37. return ''.join(getmessagetext_plain(part) for part in message.get_payload()).strip('\n')
  38. else:
  39. return ''
  40.  
  41. def getmessagetext_html(message):
  42. if message.get_content_type() == 'text/html':
  43. encoding = message.get_content_charset()
  44. text = message.get_payload(decode=True)
  45. if encoding:
  46. text = text.decode(encoding,errors='ignore')
  47. else:
  48. text = text.decode('unicode-escape',errors='ignore')
  49. try:
  50. return html2text.html2text(text) + '\n'
  51. except: # Some html is just invalid...
  52. return ''
  53. elif message.is_multipart():
  54. return ''.join(getmessagetext_html(part) for part in message.get_payload()).strip('\n')
  55. else:
  56. return ''
  57.  
  58. def getmessagetext(message):
  59. """ Extracts text content from email. Parses HTML using html2text if
  60. no plaintext content is found."""
  61. text = getmessagetext_plain(message)
  62. if text:
  63. return text
  64. return getmessagetext_html(message)
  65.  
  66. def getheaders(message,header):
  67. ret = []
  68. for text, encoding in email.Header.decode_header(message[header]):
  69. if encoding:
  70. text = text.decode(encoding)
  71. else:
  72. text = text.decode('unicode-escape')
  73. ret.append(text)
  74. return ret
  75.  
  76. def messageinfo(message):
  77. ret = getmessagetext(message) + '\n\n'
  78. #~ for word in words.findall(getmessagetext(message)):
  79. #~ if sum(c.isalpha() for c in word) <= (len(word)/3*2+1):
  80. #~ continue
  81. #~ yield word
  82. for header in ['subject']: # Headers, that are also content
  83. for instance in getheaders(message,header):
  84. if instance != 'None':
  85. for word in words.findall(instance):
  86. ret += header + SEP + word +' '
  87. ret += word +' '
  88. for header in ['to','cc','bcc','from','sender']:
  89. ret = ret.rstrip() + '\n'
  90. for instance in getheaders(message,header):
  91. if instance != 'None':
  92. for mailaddr in madr.findall(instance):
  93. ret += header + SEP + mailaddr +' '
  94. return ret
  95.  
  96. messages_as_text = []
  97. repliedmessageids = set()
  98.  
  99. for message in maildir:
  100. if userid in message.getheader('from'):
  101. repliedmessageids.update( message.getheaders('in-reply-to') )
  102. repliedmessageids.update( message.getheaders('references') )
  103. else:
  104. message.fp.seek(0)
  105. message = email.message_from_file(message.fp)
  106. messages_as_text.append( messageinfo(message) )
  107.  
  108. isreplied = [ message.getheader('message-id') in repliedmessageids for message in maildir if userid not in message.getheader('from') ]
  109.  
  110.  
  111.  
  112. from scikits.learn.pipeline import Pipeline
  113. from scikits.learn.feature_extraction.text import CountVectorizer
  114. from scikits.learn.feature_extraction.text import TfidfTransformer
  115. from scikits.learn.svm.sparse import LinearSVC
  116.  
  117. text_clf = Pipeline([
  118. ('vect', CountVectorizer()),
  119. ('tfidf', TfidfTransformer(use_tf=False, use_idf=False)),
  120. ('clf', LinearSVC()),
  121. ])
  122.  
  123. tt = int( len(messages_as_text)*(2/3.) )
  124. train_mails = messages_as_text[:tt]
  125. test_mails = messages_as_text[tt:]
  126. train_target = isreplied[:tt]
  127. test_target = isreplied[tt:]
  128.  
  129. text_clf.fit(train_mails,train_target)
  130. predicted = text_clf.predict(test_mails)
  131.  
  132.  
  133. from scikits.learn import metrics
  134. import numpy
  135. print metrics.classification_report(numpy.array(test_target),predicted)
  136. print 'Total f1 score:', metrics.f1_score(numpy.array(test_target),predicted)
  137.  
  138. if embed: embed()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement