Advertisement
Guest User

Untitled

a guest
Jan 10th, 2016
99
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.51 KB | None | 0 0
  1. import numpy as np
  2. import pandas as pd
  3. import matplotlib.pyplot as plt
  4. import matplotlib.dates as dates
  5. import matplotlib.gridspec as gridspec
  6. from datetime import timedelta, datetime, date
  7.  
  8. import GmailAccount # my package
  9.  
  10. gmail = GmailAccount(username='you@gmail.com', password=password)
  11. gmail.login()
  12.  
  13. daysback = 6000 # ~10yrs...make this whatever ya like
  14. notsince = 0 # since now.
  15. since = (date.today() - timedelta(daysback)).strftime("%d-%b-%Y")
  16. before = (date.today() - timedelta(notsince)).strftime("%d-%b-%Y")
  17.  
  18. SEARCH = '(SENTSINCE {si} SENTBEFORE {bf})'.format(si=since, bf=before)
  19. ALL_HEADERS = '(BODY.PEEK[HEADER.FIELDS (DATE TO CC FROM SUBJECT)])'
  20.  
  21. # Search and fetch emails!
  22. received = gmail.load_parse_query(search_query=SEARCH,
  23. fetch_query=ALL_HEADERS,
  24. folder='"[Gmail]/All Mail"')
  25.  
  26.  
  27. def scrub_email(headers):
  28. # IMAP sometimes returns fields with varying capitalization. Lowercase each header name.
  29. return dict([(title.lower(), value) for title, value in headers])
  30.  
  31. df = pd.DataFrame([scrub_email(email._headers) for email in received])
  32.  
  33. # Parse date strings remaining naive across multiple timezones
  34. def try_parse_date(d):
  35. try:
  36. ts = pd.Timestamp(d)
  37. # IMAP is very much not perfect...some of my emails have no timezone
  38. # in their date string. ¯\_(ツ)_/¯
  39. if ts.tz is None:
  40. ts = ts.tz_localize('UTC')
  41. # I moved from east coast to west coast in 2010, so automatically assume EST/PST
  42. # before/after that date.
  43. if ts < pd.Timestamp('2010-09-01', tz='US/Eastern'):
  44. ts = ts.tz_convert('US/Eastern')
  45. else:
  46. ts = ts.tz_convert('US/Pacific')
  47. # Here's the magic to use timezone-naive timestamps
  48. return pd.Timestamp(ts.to_datetime().replace(tzinfo=None))
  49.  
  50. except:
  51. # If we fail, return NaN so pandas can remove this email later.
  52. return np.nan
  53.  
  54. df['timestamp'] = df.date.map(try_parse_date)
  55. # Remove any emails that Timestamp was unable to parse
  56. df = df.dropna(subset=['timestamp'])
  57.  
  58. df['hour'] = df.timestamp.map(lambda x: x.hour)
  59. freq = 'M' # could also be 'W' (week) or 'D' (day), but month looks nice.
  60. df = df.set_index('timestamp', drop=False)
  61. df.index = df.index.to_period(freq)
  62.  
  63.  
  64. mindate = df.timestamp.min()
  65. maxdate = df.timestamp.max()
  66. pr = pd.period_range(mindate, maxdate, freq=freq)
  67. # Initialize a new HeatMap dataframe where the indicies are actually Periods of time
  68. # Size the frame anticipating the correct number of rows (periods) and columns (hours in a day)
  69. hm = pd.DataFrame(np.zeros([len(pr), 24]) , index=pr)
  70.  
  71. for period in pr:
  72. # HERE'S where the magic happens...with pandas, when you structure your data correctly,
  73. # it can be so terse that you almost aren't sure the program does what it says it does...
  74. # For this period (month), find relevant emails and count how many emails were received in
  75. # each hour of the day. Takes more words to explain than to code.
  76. if period in df.index:
  77. hm.ix[period] = df.ix[[period]].hour.value_counts()
  78.  
  79. # If for some weird reason there was ever an hour period where you had no email,
  80. # fill those NaNs with zeros.
  81. hm.fillna(0, inplace=True)
  82.  
  83.  
  84. ### Set up figure
  85. fig = plt.figure(figsize=(12,8))
  86. # This will be useful laterz
  87. gs = gridspec.GridSpec(2, 2, height_ratios=[4,1], width_ratios=[20,1],)
  88. gs.update(wspace=0.05)
  89.  
  90. ### Plot our heatmap
  91. ax = plt.subplot(gs[0])
  92. x = dates.date2num([p.start_time for p in pr])
  93. t = [datetime(2000, 1, 1, h, 0, 0) for h in range(24)]
  94. t.append(datetime(2000, 1, 2, 0, 0, 0)) # add last fencepost
  95. y = dates.date2num(t)
  96. cm = plt.get_cmap('Oranges')
  97. plt.pcolor(x, y, hm.transpose().as_matrix(), cmap=cm)
  98.  
  99. ### Now format our axes to be human-readable
  100. ax.xaxis.set_major_formatter(dates.DateFormatter('%b %Y'))
  101. ax.yaxis.set_major_formatter(dates.DateFormatter('%H:%M'))
  102. ax.set_yticks(t[::2])
  103. ax.set_xticks(x[::12])
  104. ax.set_xlim([x[0], x[-1]])
  105. ax.set_ylim([t[0], t[-1]])
  106. ax.tick_params(axis='x', pad=14, length=10, direction='inout')
  107.  
  108. ### pcolor makes it sooo easy to add a color bar!
  109. plt.colorbar(cax=plt.subplot(gs[1]))
  110.  
  111. ax2 = plt.subplot(gs[2])
  112. total_email = df.groupby(level=0).hour.count()
  113. plt.plot_date(total_email.index, total_email, '-', linewidth=1.5, color=cm(0.999))
  114. ax2.fill_between(total_email.index, 0, total_email, color=cm(0.5))
  115.  
  116. ax2.xaxis.tick_top()
  117. out = ax2.set_xticks(total_email.index[::12])
  118. out = ax2.xaxis.set_ticklabels([])
  119. ax2.tick_params(axis='x', pad=14, length=10, direction='inout')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement