# Untitled

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import email
import html2text
html2text.UNICODE_SNOB = 1 # No reason to replace unicode characters with ascii lookalikes there
import re

# IPython is optional. When it is importable, `embed` is the function that
# opens an interactive shell; otherwise `embed` is False so the final
# `if embed:` check skips the shell.
try:
    from IPython import embed
except ImportError:
    # Only a missing/incomplete IPython is expected here; anything else
    # (e.g. KeyboardInterrupt) should not be silently swallowed.
    embed = False

import mailbox
# Hard-coded location of the mail archive that the script reads.
maildir = mailbox.Maildir('/home/andres/gmail-archive')
# Substring looked for in the From header to recognise the user's own mail.
userid = 'andres.erbsen'

# Tokenizer for header text: runs of word characters plus a few explicitly
# listed non-ASCII letters, matched case-insensitively.
words = re.compile(ur'[\wöäüõšž]+',re.UNICODE+re.IGNORECASE)
# E-mail address matcher: a dot-atom or quoted-string local part, '@', then a
# domain name or a bracketed address literal.
# NOTE(review): inside the bracketed-literal branch the class
# `\x21-\x5a\x53-\x7f` overlaps itself and admits '[', '\' and ']';
# possibly a typo for a range that skips those characters -- verify before
# relying on that branch.
madr = re.compile(ur"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""",re.IGNORECASE+re.UNICODE)
# Separator placed between a header name and a token in generated features.
SEP = u'_'

def getmessagetext_plain(message):
    """Return the text of every text/plain part of a message.

    message -- an email message object; multipart containers are walked
               recursively, since parts can themselves consist of parts.

    Each text/plain part contributes its decoded payload followed by a
    newline. For a multipart container the parts are concatenated and
    leading/trailing newlines are stripped. Returns '' when the message
    holds no plain-text content.
    """
    if message.get_content_type() == 'text/plain':
        charset = message.get_content_charset()
        raw = message.get_payload(decode=True)
        try:
            # Undecodable bytes are dropped rather than aborting the run.
            # Without a declared charset fall back to latin-1: it maps every
            # byte to a character and, unlike 'unicode-escape', leaves
            # backslash sequences in the body (e.g. "C:\new") untouched.
            text = raw.decode(charset or 'latin-1', 'ignore')
        except LookupError:
            # The declared charset is not one Python knows about.
            text = raw.decode('latin-1')
        return text + '\n'
    elif message.is_multipart():
        return ''.join(getmessagetext_plain(part)
                       for part in message.get_payload()).strip('\n')
    else:
        return ''

def getmessagetext_html(message):
    """Return the text of every text/html part of a message, as plain text.

    message -- an email message object; multipart containers are walked
               recursively.

    Each text/html part is decoded, converted with html2text and followed
    by a newline. A part that html2text cannot convert contributes ''. For
    a multipart container the parts are concatenated and leading/trailing
    newlines are stripped. Returns '' when there is no HTML content.
    """
    if message.get_content_type() == 'text/html':
        charset = message.get_content_charset()
        raw = message.get_payload(decode=True)
        try:
            # Undecodable bytes are dropped rather than aborting the run.
            # Without a declared charset fall back to latin-1: it maps every
            # byte to a character and, unlike 'unicode-escape', does not
            # reinterpret backslash sequences found in the markup.
            markup = raw.decode(charset or 'latin-1', 'ignore')
        except LookupError:
            # The declared charset is not one Python knows about.
            markup = raw.decode('latin-1')
        try:
            return html2text.html2text(markup) + '\n'
        except Exception:
            # Best effort: some HTML is just invalid, so skip that part.
            # `Exception` (not a bare except) lets Ctrl-C still interrupt.
            return ''
    elif message.is_multipart():
        return ''.join(getmessagetext_html(part)
                       for part in message.get_payload()).strip('\n')
    else:
        return ''

def getmessagetext(message):
    """Return the textual content of an email.

    The plain-text parts are preferred; only when they yield nothing is the
    HTML content (converted by getmessagetext_html) used instead.
    """
    return getmessagetext_plain(message) or getmessagetext_html(message)

def getheaders(message, header):
    """Return the decoded text chunks of one header of a message.

    message -- an email message object supporting ``message[name]`` lookup
    header  -- header name, e.g. 'subject' or 'to'

    The header value is split by email.header.decode_header into
    (text, charset) chunks and every chunk is returned as text, in order.
    Returns an empty list when the message has no such header (instead of
    a list holding the text 'None').
    """
    # Imported locally: a plain `import email` does not promise that the
    # `email.header` submodule is loaded.
    from email.header import decode_header

    raw = message[header]
    if raw is None:
        return []

    ret = []
    for text, encoding in decode_header(raw):
        # Chunks may already be text (decode_header can hand back an
        # undecoded header as-is); only byte strings need decoding.
        if isinstance(text, bytes):
            try:
                # Drop undecodable bytes instead of aborting. With no
                # declared charset use latin-1, which accepts every byte and
                # leaves backslashes alone (unlike 'unicode-escape').
                text = text.decode(encoding or 'latin-1', 'ignore')
            except LookupError:
                # The declared charset is not one Python knows about.
                text = text.decode('latin-1')
        ret.append(text)
    return ret

def messageinfo(message):
    """Flatten a message into one text document for the vectorizer.

    The document starts with the message text (see getmessagetext) and a
    blank line. Every word of the subject follows twice: once prefixed with
    the header name and SEP, once bare. Then, for each of the address
    headers in turn, trailing whitespace is trimmed, a new line is started
    and every e-mail address found in that header is appended, prefixed
    with the header name and SEP.
    """
    document = getmessagetext(message) + '\n\n'

    # Headers that are also content.
    for name in ('subject',):
        for value in getheaders(message, name):
            # 'None' is presumably what getheaders yields for an absent
            # header -- verify against getheaders.
            if value == 'None':
                continue
            document += ''.join(
                name + SEP + token + ' ' + token + ' '
                for token in words.findall(value))

    # Headers that only contribute the addresses they mention.
    for name in ('to', 'cc', 'bcc', 'from', 'sender'):
        document = document.rstrip() + '\n'
        for value in getheaders(message, name):
            if value == 'None':
                continue
            document += ''.join(
                name + SEP + address + ' '
                for address in madr.findall(value))

    return document

# Walk the archive once and build the data set:
#   messages_as_text   -- one text document per message NOT sent by `userid`
#   repliedmessageids  -- message ids referenced by mail `userid` sent
#   receivedmessageids -- Message-ID of each entry of messages_as_text
#   isreplied          -- label per entry: was it referenced by a sent mail?
messages_as_text = []
repliedmessageids = set()
receivedmessageids = []

for message in maildir:
    # A message without a From header is treated as not sent by the user
    # (`userid in None` would raise TypeError).
    sender = message.getheader('from') or ''
    if userid in sender:
        for name in ('in-reply-to', 'references'):
            for value in message.getheaders(name):
                # One header value can list several whitespace-separated
                # ids (References usually does); store each id separately
                # so it can match a single Message-ID.
                repliedmessageids.update(value.split())
    else:
        # Remember the id now, in the same order as messages_as_text, so
        # the archive does not have to be iterated a second time.
        receivedmessageids.append(message.getheader('message-id'))
        # Re-parse the same file with the `email` package, which provides
        # the MIME-aware API that messageinfo relies on.
        message.fp.seek(0)
        message = email.message_from_file(message.fp)
        messages_as_text.append(messageinfo(message))

# Labels can only be computed once every sent message has been seen.
isreplied = [msgid in repliedmessageids for msgid in receivedmessageids]


from scikits.learn.pipeline import Pipeline
from scikits.learn.feature_extraction.text import CountVectorizer
from scikits.learn.feature_extraction.text import TfidfTransformer
from scikits.learn.svm.sparse import LinearSVC

# Classifier: token counts -> TfidfTransformer (constructed with both
# use_tf and use_idf switched off) -> linear SVM.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_tf=False, use_idf=False)),
    ('clf', LinearSVC()),
]
text_clf = Pipeline(pipeline_steps)

# The first two thirds of the messages (in archive order) are used for
# training, the remainder for testing.
tt = int(len(messages_as_text) * (2 / 3.))
train_mails, test_mails = messages_as_text[:tt], messages_as_text[tt:]
train_target, test_target = isreplied[:tt], isreplied[tt:]

text_clf.fit(train_mails, train_target)
predicted = text_clf.predict(test_mails)


from scikits.learn import metrics
import numpy
print metrics.classification_report(numpy.array(test_target),predicted)
print 'Total f1 score:', metrics.f1_score(numpy.array(test_target),predicted)

if embed: embed()