Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import pandas as pd
- import matplotlib.pyplot as plt
- import matplotlib.dates as dates
- import matplotlib.gridspec as gridspec
- from datetime import timedelta, datetime, date
- import GmailAccount # my package
# Log in to the mailbox.
# NOTE(review): `password` is not defined anywhere in the visible source, and
# `import GmailAccount` binds a module name that is then called like a class --
# confirm how the package is meant to be imported and where the password
# comes from before running this.
gmail = GmailAccount(username='you@gmail.com', password=password)
gmail.login()


def _imap_date(days_ago):
    """Return the date `days_ago` days before today, formatted as DD-Mon-YYYY."""
    return (date.today() - timedelta(days_ago)).strftime("%d-%b-%Y")


# Search window, expressed as "days before today".
daysback = 6000  # ~10yrs...make this whatever ya like
notsince = 0  # since now.
since = _imap_date(daysback)
before = _imap_date(notsince)

# Search criteria (sent-date window) and the header fields to fetch per message.
SEARCH = '(SENTSINCE {si} SENTBEFORE {bf})'.format(si=since, bf=before)
ALL_HEADERS = '(BODY.PEEK[HEADER.FIELDS (DATE TO CC FROM SUBJECT)])'

# Search and fetch emails!
_query = {
    'search_query': SEARCH,
    'fetch_query': ALL_HEADERS,
    'folder': '"[Gmail]/All Mail"',
}
received = gmail.load_parse_query(**_query)
def scrub_email(headers):
    """Return the headers as a dict keyed by lower-cased header name.

    IMAP sometimes returns fields with varying capitalization, so every
    header name is lower-cased to give consistent keys.

    Args:
        headers: iterable of ``(name, value)`` pairs.

    Returns:
        dict mapping ``name.lower()`` to ``value``. If the same name occurs
        more than once (in any capitalization), the last value wins.
    """
    return {title.lower(): value for title, value in headers}
# One row per fetched message, one column per (lower-cased) header name.
header_rows = [scrub_email(message._headers) for message in received]
df = pd.DataFrame(header_rows)
# Parse date strings remaining naive across multiple timezones
def try_parse_date(d):
    """Parse a date string into a timezone-naive, local wall-clock timestamp.

    Args:
        d: the raw date value (normally the ``Date`` header string).

    Returns:
        A timezone-naive ``pd.Timestamp`` holding the local wall-clock time,
        or ``np.nan`` when the value is missing or cannot be parsed, so that
        pandas can drop the row later.
    """
    try:
        ts = pd.Timestamp(d)
        # Missing values (None / NaN / empty) parse to NaT; treat as failure.
        if ts is pd.NaT:
            return np.nan
        # IMAP is very much not perfect...some emails have no timezone
        # in their date string, so assume UTC for those.
        if ts.tz is None:
            ts = ts.tz_localize('UTC')
        # The author moved from east coast to west coast in 2010, so assume
        # Eastern time before that date and Pacific time after it.
        if ts < pd.Timestamp('2010-09-01', tz='US/Eastern'):
            ts = ts.tz_convert('US/Eastern')
        else:
            ts = ts.tz_convert('US/Pacific')
        # Drop the timezone but keep the local wall-clock time.
        # (Timestamp.to_datetime() is gone from current pandas; calling it
        # inside a bare `except` silently turned every date into NaN.)
        return ts.tz_localize(None)
    except (ValueError, TypeError, OverflowError):
        # If we fail, return NaN so pandas can remove this email later.
        return np.nan
# Convert every raw date header; unparseable ones come back as NaN...
df['timestamp'] = df.date.map(try_parse_date)
# ...and those rows are dropped here.
df = df.dropna(subset=['timestamp'])
# Hour of day (0-23) for each message.
df['hour'] = df.timestamp.map(lambda stamp: stamp.hour)

# Bucket size: 'M' (month) looks nice; 'W' (week) or 'D' (day) also work.
freq = 'M'
# Index the frame by the period each message falls in, keeping the
# original timestamp column around as well.
df = df.set_index('timestamp', drop=False)
df.index = df.index.to_period(freq)

# Every period between the first and last message, including empty ones.
mindate, maxdate = df.timestamp.min(), df.timestamp.max()
pr = pd.period_range(mindate, maxdate, freq=freq)

# Heatmap table: one row per period, one column per hour of the day,
# initialised to zero.
hm = pd.DataFrame(np.zeros((len(pr), 24)), index=pr)
# Fill the heatmap: for each period, count how many emails fall in each
# hour of the day.
for period in pr:
    # Periods with no email at all keep their row of zeros.
    if period in df.index:
        # `.loc` replaces the removed `.ix` indexer. The list selector
        # `[[period]]` always yields a DataFrame, even for a single email.
        # The value_counts() result is indexed by hour, so the assignment
        # aligns it with the 0-23 columns; hours with no email become NaN.
        hm.loc[period] = df.loc[[period]].hour.value_counts()
# Replace the NaNs left for hours that had no email with zeros.
hm.fillna(0, inplace=True)
### Set up figure
fig = plt.figure(figsize=(12, 8))
# 2x2 grid: tall, wide cell for the heatmap, a narrow column for the colorbar,
# and a short bottom row used further down.
gs = gridspec.GridSpec(2, 2, height_ratios=[4, 1], width_ratios=[20, 1])
gs.update(wspace=0.05)

### Plot our heatmap
ax = plt.subplot(gs[0])
# pcolor takes cell *edges*, so each axis needs one more coordinate than
# there are cells. Add the closing fencepost on x (start of the period after
# the last one), mirroring the fencepost added on y below. Without it the
# last period is dropped or, on recent matplotlib, the call fails.
x = dates.date2num([p.start_time for p in pr] + [(pr[-1] + 1).start_time])
t = [datetime(2000, 1, 1, h, 0, 0) for h in range(24)]
t.append(datetime(2000, 1, 2, 0, 0, 0))  # add last fencepost
y = dates.date2num(t)
cm = plt.get_cmap('Oranges')
# hm is (periods x hours); transpose so hours run along y.
# `.values` replaces the removed DataFrame.as_matrix().
heat = plt.pcolor(x, y, hm.transpose().values, cmap=cm)

### Now format our axes to be human-readable
ax.xaxis.set_major_formatter(dates.DateFormatter('%b %Y'))
ax.yaxis.set_major_formatter(dates.DateFormatter('%H:%M'))
# Ticks and limits use the same numeric date values the data was plotted with.
ax.set_yticks(y[::2])
ax.set_xticks(x[::12])
ax.set_xlim([x[0], x[-1]])
ax.set_ylim([y[0], y[-1]])
ax.tick_params(axis='x', pad=14, length=10, direction='inout')

### Color bar in the narrow top-right cell, tied explicitly to the heatmap
plt.colorbar(heat, cax=plt.subplot(gs[1]))
### Bottom panel: total number of emails per period
ax2 = plt.subplot(gs[2])
total_email = df.groupby(level=0).hour.count()
# Convert the period index to matplotlib date numbers explicitly, the same
# way the heatmap's x coordinates are built. This avoids the deprecated
# plot_date() and does not depend on pandas' unit converters being registered.
period_starts = dates.date2num([p.start_time for p in total_email.index])
ax2.plot(period_starts, total_email.values, '-', linewidth=1.5, color=cm(0.999))
ax2.fill_between(period_starts, 0, total_email.values, color=cm(0.5))
# Ticks on top, unlabeled.
ax2.xaxis.tick_top()
ax2.set_xticks(period_starts[::12])
ax2.xaxis.set_ticklabels([])
# Keep this panel horizontally lined up with the heatmap above it.
ax2.set_xlim(ax.get_xlim())
ax2.tick_params(axis='x', pad=14, length=10, direction='inout')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement