SHARE
TWEET

Untitled

a guest Sep 24th, 2017 31 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3.  
  4. import email
  5. import html2text
  6. html2text.UNICODE_SNOB = 1 # No reason to replace unicode characters with ascii lookalikes there
  7. import re
  8.  
  9. try:
  10.     from IPython import embed
  11. except:
  12.     embed = False
  13.  
  14. import mailbox
# Mailbox that is scanned for training data.
maildir = mailbox.Maildir('/home/andres/gmail-archive')
# Substring looked for in the From header to recognise the user's own messages.
userid = 'andres.erbsen'

# Matches runs of word characters plus the explicitly listed accented letters.
words = re.compile(ur'[\wöäüõšž]+',re.UNICODE+re.IGNORECASE)
# Matches e-mail addresses (local part, '@', then a domain name or a bracketed
# address literal). NOTE(review): looks like the widely circulated
# RFC 5322-style address pattern -- confirm before editing any character class.
madr = re.compile(ur"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""",re.IGNORECASE+re.UNICODE)
# Joins a header name to a word or address when building feature tokens
# (e.g. header + SEP + word).
SEP = u'_'
  21.  
  22. def getmessagetext_plain(message):
  23.     """ Returns all plaintexts content in a message"""
  24.     if message.get_content_type() == 'text/plain':
  25.         encoding = message.get_content_charset()
  26.         text = message.get_payload(decode=True)
  27.         if encoding:
  28.             text = text.decode(encoding,errors='ignore')
  29.         else:
  30.             # Let's just try to decode it, the chances are this will
  31.             # work and even a text without unicode characters is better
  32.             # than no text at all
  33.             text = text.decode('unicode-escape',errors='ignore')
  34.         return text + '\n'
  35.     elif message.is_multipart():
  36.         # Parts are message too, so they can consist of parts again. They do.
  37.         return ''.join(getmessagetext_plain(part) for part in message.get_payload()).strip('\n')
  38.     else:
  39.         return ''
  40.  
  41. def getmessagetext_html(message):
  42.     if message.get_content_type() == 'text/html':
  43.         encoding = message.get_content_charset()
  44.         text = message.get_payload(decode=True)
  45.         if encoding:
  46.             text = text.decode(encoding,errors='ignore')
  47.         else:
  48.             text = text.decode('unicode-escape',errors='ignore')
  49.         try:
  50.             return html2text.html2text(text) + '\n'
  51.         except: # Some html is just invalid...
  52.             return ''
  53.     elif message.is_multipart():
  54.         return ''.join(getmessagetext_html(part) for part in message.get_payload()).strip('\n')
  55.     else:
  56.         return ''
  57.  
  58. def getmessagetext(message):
  59.     """ Extracts text content from email. Parses HTML using html2text if
  60.     no plaintext content is found."""
  61.     text = getmessagetext_plain(message)
  62.     if text:
  63.         return text
  64.     return getmessagetext_html(message)
  65.  
  66. def getheaders(message,header):
  67.     ret = []
  68.     for text, encoding in email.Header.decode_header(message[header]):
  69.         if encoding:
  70.             text = text.decode(encoding)
  71.         else:
  72.             text = text.decode('unicode-escape')
  73.         ret.append(text)
  74.     return ret
  75.  
  76. def messageinfo(message):
  77.     ret = getmessagetext(message) + '\n\n'
  78.     #~ for word in words.findall(getmessagetext(message)):
  79.         #~ if sum(c.isalpha() for c in word) <= (len(word)/3*2+1):
  80.             #~ continue
  81.         #~ yield word
  82.     for header in ['subject']: # Headers, that are also content
  83.         for instance in getheaders(message,header):
  84.             if instance != 'None':
  85.                 for word in words.findall(instance):
  86.                     ret += header + SEP + word +' '
  87.                     ret += word +' '
  88.     for header in ['to','cc','bcc','from','sender']:
  89.         ret = ret.rstrip() + '\n'
  90.         for instance in getheaders(message,header):
  91.             if instance != 'None':
  92.                 for mailaddr in madr.findall(instance):
  93.                     ret += header + SEP + mailaddr +' '
  94.     return ret
  95.                    
  96. messages_as_text = []
  97. repliedmessageids = set()
  98.  
  99. for message in maildir:
  100.     if userid in message.getheader('from'):
  101.         repliedmessageids.update( message.getheaders('in-reply-to') )
  102.         repliedmessageids.update( message.getheaders('references') )
  103.     else:
  104.         message.fp.seek(0)
  105.         message = email.message_from_file(message.fp)
  106.         messages_as_text.append( messageinfo(message) )
  107.  
  108. isreplied = [ message.getheader('message-id') in repliedmessageids for message in maildir if userid not in message.getheader('from') ]
  109.  
  110.  
  111.  
  112. from scikits.learn.pipeline import Pipeline
  113. from scikits.learn.feature_extraction.text import CountVectorizer
  114. from scikits.learn.feature_extraction.text import TfidfTransformer
  115. from scikits.learn.svm.sparse import LinearSVC
  116.  
# Classification pipeline: token counts from the raw text, then a
# TfidfTransformer constructed with both use_tf and use_idf switched off,
# then a linear SVM.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_tf=False, use_idf=False)),
    ('clf', LinearSVC()),
])

# Unshuffled split: the first two thirds of the messages (in mailbox iteration
# order) are used for training, the remainder for testing.
tt = int( len(messages_as_text)*(2/3.) )
train_mails = messages_as_text[:tt]
test_mails = messages_as_text[tt:]
train_target = isreplied[:tt]
test_target = isreplied[tt:]

# Train on the first part and predict "was replied to" for the held-out part.
text_clf.fit(train_mails,train_target)
predicted = text_clf.predict(test_mails)
  131.  
  132.  
  133. from scikits.learn import metrics
  134. import numpy
# Print the classification report and the f1 score of the predictions against
# the held-out labels.
print metrics.classification_report(numpy.array(test_target),predicted)
print 'Total f1 score:', metrics.f1_score(numpy.array(test_target),predicted)

# Open an interactive IPython shell if IPython could be imported (embed is
# False otherwise).
if embed: embed()
RAW Paste Data
Top