Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- import email
- import html2text
- html2text.UNICODE_SNOB = 1 # No reason to replace unicode characters with ascii lookalikes there
- import re
- try:
- from IPython import embed
- except:
- embed = False
- import mailbox
- maildir = mailbox.Maildir('/home/andres/gmail-archive')
- userid = 'andres.erbsen'
- words = re.compile(ur'[\wöäüõšž]+',re.UNICODE+re.IGNORECASE)
- madr = re.compile(ur"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""",re.IGNORECASE+re.UNICODE)
- SEP = u'_'
def getmessagetext_plain(message):
    """Return all text/plain content found in an email message.

    A ``text/plain`` leaf yields its decoded body followed by a newline.
    A multipart message yields the concatenation of the results for all of
    its parts (recursively), with leading/trailing newlines stripped.
    Anything else yields the empty string.

    The body is decoded with the charset declared on the part.  When no
    charset is declared, or the declared one is unknown to Python, the
    bytes are decoded as UTF-8 and, if that fails, as Latin-1 (which can
    decode any byte sequence), so a mislabelled part never aborts the run.
    """
    if message.get_content_type() == 'text/plain':
        raw = message.get_payload(decode=True)
        if raw is None:
            # No decodable payload on this part: nothing to contribute.
            return ''
        charset = message.get_content_charset()
        text = None
        if charset:
            try:
                text = raw.decode(charset, 'ignore')
            except LookupError:
                # Unknown or misspelled charset name; use the fallback below.
                text = None
        if text is None:
            # Plain byte decoding only: no escape-sequence interpretation,
            # so literal backslashes in the mail stay as they are.
            try:
                text = raw.decode('utf-8')
            except UnicodeDecodeError:
                text = raw.decode('latin-1')
        return text + '\n'
    if message.is_multipart():
        # Parts are messages too, so they can consist of parts again.
        pieces = [getmessagetext_plain(part) for part in message.get_payload()]
        return ''.join(pieces).strip('\n')
    return ''
def getmessagetext_html(message):
    """Return all text/html content of an email message, rendered as text.

    A ``text/html`` leaf is decoded and converted with ``html2text``; the
    result is followed by a newline.  A multipart message yields the
    concatenation of the results for all of its parts (recursively), with
    leading/trailing newlines stripped.  Anything else, and any HTML that
    ``html2text`` cannot process, yields the empty string.

    The body is decoded with the charset declared on the part.  When no
    charset is declared, or the declared one is unknown to Python, the
    bytes are decoded as UTF-8 and, if that fails, as Latin-1.
    """
    if message.get_content_type() == 'text/html':
        raw = message.get_payload(decode=True)
        if raw is None:
            # No decodable payload on this part: nothing to contribute.
            return ''
        charset = message.get_content_charset()
        markup = None
        if charset:
            try:
                markup = raw.decode(charset, 'ignore')
            except LookupError:
                # Unknown or misspelled charset name; use the fallback below.
                markup = None
        if markup is None:
            try:
                markup = raw.decode('utf-8')
            except UnicodeDecodeError:
                markup = raw.decode('latin-1')
        try:
            return html2text.html2text(markup) + '\n'
        except Exception:
            # Some html is just invalid...  Skipping it is deliberate, but
            # KeyboardInterrupt/SystemExit must still propagate.
            return ''
    if message.is_multipart():
        pieces = [getmessagetext_html(part) for part in message.get_payload()]
        return ''.join(pieces).strip('\n')
    return ''
def getmessagetext(message):
    """Extract the text content of an email.

    The plaintext parts are preferred; the HTML parts (converted to text
    by getmessagetext_html) are used only when the plaintext extraction
    comes back empty.
    """
    return getmessagetext_plain(message) or getmessagetext_html(message)
def getheaders(message, header):
    """Return the decoded text chunks of every ``header`` field of ``message``.

    Each occurrence of the header is split into its RFC 2047 chunks, and
    every chunk is returned as decoded text, in order.  A header that is
    absent from the message yields an empty list.

    Chunks are decoded with their declared charset.  When none is declared,
    or decoding with the declared one fails (unknown charset name or
    malformed bytes), the chunk is decoded as UTF-8 and, if that fails, as
    Latin-1, so one odd header never aborts the run.
    """
    # Local import keeps this function self-contained.
    from email.header import decode_header

    ret = []
    for value in message.get_all(header, []):
        for text, encoding in decode_header(value):
            if isinstance(text, bytes):
                decoded = None
                if encoding:
                    try:
                        decoded = text.decode(encoding)
                    except (LookupError, UnicodeDecodeError):
                        decoded = None
                if decoded is None:
                    # Plain byte decoding only: literal backslashes in a
                    # header must not be interpreted as escape sequences.
                    try:
                        decoded = text.decode('utf-8')
                    except UnicodeDecodeError:
                        decoded = text.decode('latin-1')
                text = decoded
            ret.append(text)
    return ret
def messageinfo(message):
    """Render a message as one text document for the vectorizer.

    The document starts with the message text.  Every word of the subject
    is then added twice: once prefixed with the header name and SEP, once
    bare.  Finally, for each address header a new line is started and
    every mail address found in it is added, prefixed with the header name
    and SEP.
    """
    content_headers = ['subject']  # Headers, that are also content
    address_headers = ['to', 'cc', 'bcc', 'from', 'sender']

    doc = getmessagetext(message) + '\n\n'

    for name in content_headers:
        for value in getheaders(message, name):
            if value == 'None':
                continue
            for token in words.findall(value):
                doc += name + SEP + token + ' ' + token + ' '

    for name in address_headers:
        # Drop trailing whitespace left by the previous section before
        # starting this header's line.
        doc = doc.rstrip() + '\n'
        for value in getheaders(message, name):
            if value == 'None':
                continue
            doc += ''.join(
                name + SEP + address + ' ' for address in madr.findall(value)
            )

    return doc
# Message ids are written as <id@host>; a References header usually carries
# several of them in a single value, so they have to be picked out one by one.
_MSGID = re.compile(r'<[^<>\s]+>')


def _message_ids(header_values):
    """Return every ``<...>`` message-id token found in raw header values."""
    found = []
    for value in header_values:
        found.extend(_MSGID.findall(value))
    return found


messages_as_text = []
incoming_ids = []  # message-id (or None) of each entry of messages_as_text
repliedmessageids = set()
for message in maildir:
    # Default to '' so a mail without a From header does not raise TypeError.
    if userid in message.getheader('from', ''):
        # Mail written by the user: remember which messages it answers.
        repliedmessageids.update(_message_ids(message.getheaders('in-reply-to')))
        repliedmessageids.update(_message_ids(message.getheaders('references')))
    else:
        # Record the id now, in the same pass, so that labels and documents
        # are guaranteed to line up without iterating the maildir again.
        own_ids = _message_ids([message.getheader('message-id', '')])
        incoming_ids.append(own_ids[0] if own_ids else None)
        # Re-parse the mail from the start of its file with the email package.
        message.fp.seek(0)
        message = email.message_from_file(message.fp)
        messages_as_text.append(messageinfo(message))

# Computed after the loop: a reply may appear in the maildir before or after
# the mail it answers.
isreplied = [mid in repliedmessageids for mid in incoming_ids]

from scikits.learn.pipeline import Pipeline
from scikits.learn.feature_extraction.text import CountVectorizer
from scikits.learn.feature_extraction.text import TfidfTransformer
from scikits.learn.svm.sparse import LinearSVC

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_tf=False, use_idf=False)),
    ('clf', LinearSVC()),
])

# First two thirds of the mails train the classifier, the rest evaluate it.
tt = int(len(messages_as_text) * (2 / 3.))
train_mails = messages_as_text[:tt]
test_mails = messages_as_text[tt:]
train_target = isreplied[:tt]
test_target = isreplied[tt:]

text_clf.fit(train_mails, train_target)
predicted = text_clf.predict(test_mails)

from scikits.learn import metrics
import numpy

expected = numpy.array(test_target)
print(metrics.classification_report(expected, predicted))
print('Total f1 score: %s' % (metrics.f1_score(expected, predicted),))

# Drop into an interactive shell when IPython is available.
if embed:
    embed()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement