Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import email
import getpass
import imaplib
import sys
from xml.sax.saxutils import escape
class NGramSummer(object):
    """Running total of token counts accumulated from several count dicts.

    One instance is tagged with the person the counts belong to; each call to
    add_ngrams() merges another {token: count} mapping into the total.
    """

    def __init__(self, from_person):
        """Start an empty tally.

        from_person -- identifier of who the counts are from; stored as-is.
        """
        self.from_person = from_person
        self.ngrams = dict()

    def add_ngrams(self, new_ngrams):
        """Add every count in new_ngrams ({token: count}) to the running total."""
        for word, count in new_ngrams.items():
            # Tokens not seen before start from zero.
            self.ngrams[word] = self.ngrams.get(word, 0) + count

    def get_ngrams(self):
        """Return the accumulated {token: count} dict (the live object, not a copy)."""
        return self.ngrams
# NGramCounter builds a dictionary relating each token of a text to the number
# of times that token occurs in the text (as integers).
class NGramCounter(object):
    """Count how often each space-separated token occurs in a text.

    Despite the name, only single tokens are counted; there is no n-gram
    order parameter.
    """

    def __init__(self, text):
        """Store the text to analyse; nothing is counted until parse() runs."""
        self.text = text
        self.ngrams = dict()

    def tokenize(self):
        """Return the text's tokens, split on single spaces.

        Empty strings produced by leading, trailing or repeated spaces (or by
        an empty text) are dropped so they are never counted as a token.
        """
        return [token for token in self.text.split(" ") if token]

    def parse(self):
        """Tokenize the text and add one to the count of every token found.

        Counts accumulate in self.ngrams, so calling parse() again adds the
        same counts a second time.
        """
        for word in self.tokenize():
            # First sighting starts from zero, later ones increment.
            self.ngrams[word] = self.ngrams.get(word, 0) + 1

    def get_ngrams(self):
        """Return the {token: count} dict (the live object, not a copy)."""
        return self.ngrams
# Open the IMAP-over-SSL session that the rest of the script uses, log in,
# and select the mailbox to search.
# NOTE(review): the account name and password are hard-coded in source;
# consider prompting for the password (getpass is already imported) or
# reading it from configuration instead.
M = imaplib.IMAP4_SSL('imap.gmail.com')
M.login("willimite@gmail.com", "passingWORD")
# Alternative mailbox, currently disabled:
# M.select("[Gmail]/Sent Mail")
M.select("[Gmail]/All Mail")
def get_first_text_part(msg):
    """Return the payload of the first text/* part of an email message.

    msg -- an email.message.Message (or compatible) object.

    The message tree is walked depth-first in document order, so a text part
    nested inside another multipart container (for example multipart/mixed
    wrapping multipart/alternative) is found as well. For a non-multipart
    message the message itself is the only candidate.

    Returns whatever get_payload() gives for that part (no transfer decoding
    is requested), or None when the message has no text part.
    """
    for part in msg.walk():
        if part.get_content_maintype() == 'text':
            return part.get_payload()
    return None
# Template for one exported message; filled in with str.format() using the
# keyword fields sender, to, date, subject, body and ngrams.
xml_template = "<email><from>{sender}</from><to>{to}</to><date>{date}</date><subject>{subject}</subject><body>{body}</body><ngrams>{ngrams}</ngrams></email>"
# Example of filling the template by hand:
# my_message = xml_template.format(sender="sushionthego@gmail.com", to="willimite@gmail.com", date="4/13/2012", subject="Did you drink all the milk or did you throw it out b/c Avery is moving in?", body="see subject", ngrams="somengramsgohere")
# Sender addresses to export; the addresses in the trailing comment are currently disabled.
theperson = ["alinjen@bellsouth.net", "ajarnp@gmail.com", "alinjen@me.com", "dlbergman@gmail.com", "donjen@bellsouth.net", "hljen@bellsouth.net", "trixietree@hotmail.com", "wendycantdrive@hotmail.com", "avery@averymax.com", "chris.laniosz@gmail.com", "frannie.hall@gmail.com"] #"ian.oliver@flawlessfuture.com", "jasonaston@gmail.com", "seatubers@gmail.com", "seatubers@gmail.com", "idralcar@hotmail.com", "matthew.d.rader@gmail.com", "rader@matthewrader.com", "mslyssa@gmail.com", "guerrajmichael@gmail.com", "pamela@pamelareed.com", "studio@reedandrader.com", "ryan@letsneverdie.net", "4stepan@gmail.com", "cmae.oliver@gmail.com", "conniemae.olive@gmail.com", "larissa_bemis@yahoo.com", "larissarbemis@gmail.com", "lbemis@apple.com", "sushionthego@gmail.com", "acm466@nyu.edu", "lia.martinez@nyu.edu", "lia@potiondesign.com", "kiwi@smirkyplop.com", "roisin.stack@gmail.com", "ohannamarie@gmail.com", "bryan.baxter@gmail.com", "davidestici@hotmail.com", "genny.hoffman@gmail.com", "sheenamcneal@gmail.com", "daniel.shiffman@gmail.com", "daniel.shiffman@nyu.edu", "dan.osullivan@nyu.edu", "edward.gordon@nyu.edu", "marianne.petit@nyu.edu", "midori.yasuda@nyu.edu", "nh19@nyu.edu", "rob.ryan@nyu.edu", "EFarnon@wsgc.com", "lori@scoreatthetop.com"]
# Main export: for every address in theperson, search the selected mailbox
# for messages from that sender, write one <email> record per message that
# has a text body to "<address>.xml", and print that sender's combined
# token counts.
#
# print(x) with a single argument is used throughout because it behaves the
# same as the statement form on Python 2 and also parses on Python 3.


def _xml_text(value):
    """Return str(value) with &, < and > escaped for use as XML element text."""
    return escape(str(value))


try:
    for person in theperson:
        # search() returns (status, [space-separated message numbers]).
        status, found = M.search(None, 'FROM', person)
        # Sums the token counts of every message from this sender.
        summer = NGramSummer(person)
        # The with-block closes the output file even if a fetch or parse fails.
        with open(person + ".xml", 'w') as new:
            print(person)
            for num in found[0].split():  # every matching message
                fetch_status, msg_data = M.fetch(num, '(RFC822)')  # full raw message
                # NOTE(review): message_from_string expects str; on Python 3
                # the fetched payload would presumably be bytes - confirm
                # before porting.
                msg = email.message_from_string(msg_data[0][1])
                _from = msg['from']
                _to = msg['to']
                _subject = msg['subject']
                _date = msg['date']
                _body = get_first_text_part(msg)
                # NOTE(review): the source lost its indentation; everything
                # that uses _feed is kept under this check so messages
                # without a text body are skipped - confirm intent.
                if _body:
                    # Strip leading/trailing '>' and collapse all whitespace
                    # runs to single spaces before counting.
                    ngrams = NGramCounter(" ".join(_body.strip(">").split()))
                    ngrams.parse()
                    _feed = ngrams.get_ngrams()
                    print(_feed)
                    summer.add_ngrams(_feed)
                    # Headers and bodies routinely contain <, > and &
                    # (e.g. "Name <addr>"), so every field is escaped to keep
                    # the record well-formed. A missing header still renders
                    # as the text "None", as before.
                    my_message = xml_template.format(
                        sender=_xml_text(_from),
                        to=_xml_text(_to),
                        date=_xml_text(_date),
                        subject=_xml_text(_subject),
                        ngrams=_xml_text(_feed),
                        body=_xml_text(_body),
                    )
                    new.write(my_message)
                    print(my_message)
                # print 'Content-Type:',msg.get_content_type()
            # last_message = xml_template.format(sender="_from", to=_to, date=_date, subject=_subject, body=_body, ngrams=_summer.get_ngrams())
            # new.write('------summmary----------')
            # new.write(str(summer.get_ngrams()))
            # print '------summmary----------'
            print(summer.get_ngrams())
finally:
    # Always release the mailbox and the connection, even after an error.
    M.close()
    M.logout()
Add Comment
Please, Sign In to add comment