Guest User

Untitled

a guest
Apr 19th, 2018
78
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.28 KB | None | 0 0
  1. from __future__ import division
  2.  
  3. import matplotlib.pyplot as plt
  4. from matplotlib.ticker import AutoLocator
  5. import numpy as np
  6. import operator
  7. import pandas as pd
  8. import pprint
  9. import re
  10. import seaborn as sns
  11. from textblob import TextBlob
  12. from wordcloud import WordCloud
  13.  
# config
# Palette used as the default `pal` argument of the graph_* functions.
DEFAULT_PAL = 'GnBu_d'
# Directory holding the diary entries. open_entry() concatenates
# '<YYYY-MM-DD>.wiki' straight onto this string, so the trailing slash matters.
DIARY_PATH = '/home/karthik/Dropbox/vimwiki/diary/'
  17.  
  18. def open_entry(date):
  19. text = None
  20. date_str = date.strftime('%Y-%m-%d')
  21. with open(DIARY_PATH + date_str + '.wiki') as f:
  22. text = f.read()
  23.  
  24. return text
  25.  
  26. # fall = aug 24 - dec 16
  27. # winter = dec 17 - jan 16
  28. # spring = jan 17 - may 17
  29. # summer = may 18 - aug 23
  30. def date_to_semester(date):
  31. m = date.month
  32. d = date.day
  33.  
  34. if m == 8:
  35. if d > 24:
  36. return 'fall'
  37. else:
  38. return 'summer'
  39.  
  40. elif m == 12:
  41. if d > 16:
  42. return 'winter'
  43. else:
  44. return 'fall'
  45.  
  46. elif m == 1:
  47. if d > 16:
  48. return 'spring'
  49. else:
  50. return 'winter'
  51.  
  52. elif m == 5:
  53. if d > 17:
  54. return 'summer'
  55. else:
  56. return 'spring'
  57.  
  58. elif m > 8 and m < 12:
  59. return 'fall'
  60.  
  61. elif m > 1 and m < 5:
  62. return 'spring'
  63.  
  64. else:
  65. return 'summer'
  66.  
  67.  
  68. def get_entry_df():
  69. def entry_to_date(entry):
  70. # need this to remove '(x conflicted copy)' stuff
  71. filename = entry.split()[0]
  72. # remove .wiki extension
  73. return filename.split('.')[0]
  74.  
  75. entries = []
  76. with open('entry-list') as f:
  77. entries = map(entry_to_date, f.readlines())
  78.  
  79. items = {'date': entries}
  80.  
  81. df = pd.DataFrame.from_dict(items)
  82.  
  83. df['date'] = df['date'].astype('datetime64[D]')
  84. df['year'] = df['date'].dt.year
  85. df['month'] = df['date'].dt.month
  86. df['sem'] = df['date'].map(date_to_semester)
  87.  
  88. return df
  89.  
  90. def add_length_to_df(df):
  91. def get_entry_length(date):
  92. return len(open_entry(date).split())
  93.  
  94. df['length'] = df['date'].map(get_entry_length)
  95. return df
  96.  
  97. def text_blob_sentiment(text):
  98. try:
  99. blobject = TextBlob(text)
  100. return blobject.sentiment.polarity
  101. except Exception as e:
  102. print e
  103. return 0
  104.  
  105. def manual_sentiment(text):
  106. """ i tagged each entry manually with something like '#sent x' where x is an int [1, 10] """
  107. regex = re.compile(r'#sent (\d+)', re.MULTILINE)
  108. sum_scores = 0
  109. scores_count = 0
  110.  
  111. for match in regex.finditer(text):
  112. score = match.group(1)
  113. sum_scores += float(score)
  114. scores_count += 1
  115.  
  116. if sum_scores == 0:
  117. return 0
  118.  
  119. # scale from [0, 10] to [-1, 1]
  120. return (sum_scores / scores_count) / 5 - 1
  121.  
  122. # 2015-12-12.wiki shouldn't be a -.5
  123. # 2014-11-23.wiki shouldn't be a .8 lmao, not even close to that happy
  124. # more hangups, TextBlob('i got hella fucked up last night') is -.3 sentiment when it should be 1
  125. # 'last night was lit' - polarity 0... there goes half my positive ratings
  126. def add_sentiments(df, sentiment_func, save=False):
  127. # sentiment_func should take a string and return a polarity score (i'm gonna use -1 to 1)
  128. sents = {}
  129. def get_entry_sentiment(date):
  130. text = open_entry(date)
  131. sents[date] = sentiment_func(text)
  132. return sents[date]
  133.  
  134. df['sen'] = df['date'].map(get_entry_sentiment)
  135.  
  136. # sort semesters by sentiment and print to a file
  137. sents = sorted(sents.items(), key=operator.itemgetter(1))
  138. if save:
  139. with open('sents.txt', 'wt') as out:
  140. pprint.pprint(sents, stream=out)
  141.  
  142. return df
  143.  
  144. # makes a new figure and removes the top and right axes for aesthetics
  145. def graph_decorator(f):
  146. def wrapper(*args, **kwargs):
  147. if plt.get_fignums():
  148. plt.figure()
  149.  
  150. f(*args, **kwargs)
  151.  
  152. sns.despine()
  153.  
  154. return wrapper
  155.  
  156. @graph_decorator
  157. def graph_months(df, pal=DEFAULT_PAL):
  158. """ bar chart of # of entries for each month """
  159.  
  160. months = df.groupby([df['year'], df['month']]).size().reset_index(name='count')
  161. # for nice x-axis labels
  162. months['name'] = months['month'].map(str) + '-' + months['year'].map(str)
  163.  
  164. ax = sns.barplot(x='name', y='count', hue='year', data=months, palette=pal)
  165. plt.xticks(rotation=75)
  166. plt.yticks(np.arange(min(months['count']), max(months['count'])+1, 1))
  167.  
  168. ax.xaxis.set_ticks_position('none')
  169. ax.set(ylabel='# of entries')
  170.  
  171. # for some reason the bars are really skinny so expand them a bit manually
  172. for patch in ax.patches:
  173. patch.set_width(patch.get_width() * 4)
  174.  
  175. @graph_decorator
  176. def graph_sems(df, normalized=True, pal=DEFAULT_PAL):
  177. """ bar chart of # of entries or entry rate for each semester """
  178.  
  179. def normalize_sem_counts(entry):
  180. # entry is [sem, count]
  181. # turns counts into entries/day
  182. sem = entry[0]
  183. count = entry[1]
  184. print sem, count
  185. days = {'fall': 114, 'winter': 30, 'spring': 121, 'summer': 97}
  186. return float(count) / float(days[sem])
  187.  
  188. sems = df.groupby([df['year'], df['sem']], sort=False).size().reset_index(name='count')
  189.  
  190. response = 'count'
  191. if normalized:
  192. sems['norm_count'] = sems[['sem', 'count']].apply(normalize_sem_counts, axis=1)
  193. response = 'norm_count'
  194.  
  195. sems['name'] = sems['sem'].map(str) + ' ' + sems['year'].map(str)
  196.  
  197. ax = sns.barplot(x='name', y=response, data=sems, palette=pal)
  198.  
  199. plt.xticks(rotation=75)
  200. ax.xaxis.set_ticks_position('none')
  201.  
  202. if normalized:
  203. ax.set(ylabel='entries per day')
  204. else:
  205. ax.set(ylabel='# of entries')
  206.  
  207.  
  208. @graph_decorator
  209. def graph_sens(df, pal=DEFAULT_PAL):
  210. """ bar chart of sentiments for each semester """
  211. sems = df.groupby([df['year'], df['sem']], sort=False)['sen'].mean().reset_index(name='sen')
  212. sems['name'] = sems['sem'].map(str) + ' ' + sems['year'].map(str)
  213.  
  214. ax = sns.barplot(x='name', y='sen', data=sems, palette=pal)
  215.  
  216. # QUICK BONUS GRAPH: dates vs sentiment
  217. plt.figure()
  218. df[['date','sen']].set_index('date').plot()
  219.  
  220. # code to print best/worst semesters, could move to another function
  221. # print '== SORTED SEMESTERS BY SENTIMENT =='
  222. # print sems.sort_values(by=['sen'])
  223.  
  224. @graph_decorator
  225. def graph_sen_v_entry(df, pal=DEFAULT_PAL):
  226. """ graphs average sentiment for a semester vs. the number of entries for that semester
  227. to see if maybe i write more when i feel a certain way (turns out i dont) """
  228.  
  229. sems = df.groupby([df['year'], df['sem']], sort=False) \
  230. .agg({'date': 'size', 'sen': 'mean'}) \
  231. .rename(columns={'date':'count','sen':'mean_sen'}) \
  232. .reset_index()
  233. print sems
  234. sems['name'] = sems['sem'].map(str) + ' ' + sems['year'].map(str)
  235.  
  236. print sems
  237.  
  238. ax = sns.regplot(x='count', y='mean_sen', data=sems)
  239.  
  240. @graph_decorator
  241. def graph_length(df, pal=DEFAULT_PAL):
  242. """ bar chart of average entry length per semester """
  243.  
  244. df = add_length_to_df(df)
  245.  
  246. sems = df.groupby([df['year'], df['sem']], sort=False)['length'].mean().reset_index(name='length')
  247. sems['name'] = sems['sem'].map(str) + ' ' + sems['year'].map(str)
  248.  
  249. ax = sns.barplot(x='name', y='length', data=sems, palette=pal)
  250. ax.set(ylabel='length (words)')
  251.  
  252. @graph_decorator
  253. def graph_sentiment_wordcloud(df, sen_func=manual_sentiment, pos=True, pal=DEFAULT_PAL):
  254. """ displays a wordcloud of aggregated text from either the most positive or negative entries """
  255.  
  256. from wordcloud import WordCloud
  257.  
  258. df = add_sentiments(df, sen_func)
  259. strong = None
  260.  
  261. if pos:
  262. strong = df[df.sen > .5]
  263. else:
  264. strong = df[(df.sen < -.5) & (df.date != np.datetime64('2015-03-01'))]
  265.  
  266. entry_texts = []
  267. def append_entry_text(date):
  268. entry_texts.append(open_entry(date))
  269.  
  270. strong['date'].map(append_entry_text)
  271.  
  272. wordcloud = WordCloud(width=1000, height=500).generate(' '.join(entry_texts))
  273. plt.imshow(wordcloud, interpolation='bilinear')
  274.  
  275. def graph_stuff(df):
  276. sns.set_style('darkgrid', {'font.family': 'ubuntu'})
  277. #graph_months(df)
  278.  
  279. pal = sns.cubehelix_palette(2)
  280. pal = sns.color_palette("Reds_r", 20)
  281. graph_sems(df, pal=pal)
  282. # graph_sems(df, False)
  283.  
  284. df = add_sentiments(df, manual_sentiment)
  285. graph_sens(df, pal='husl')
  286.  
  287. df = add_sentiments(df, text_blob_sentiment)
  288. graph_sens(df, pal='husl')
  289.  
  290. graph_sen_v_entry(df)
  291. graph_length(df)
  292. graph_sentiment_wordcloud(df, pos=False)
  293. plt.show()
  294.  
  295. if __name__ == "__main__":
  296. graph_stuff(get_entry_df())
Add Comment
Please, Sign In to add comment