Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from __future__ import division
- import matplotlib.pyplot as plt
- from matplotlib.ticker import AutoLocator
- import numpy as np
- import operator
- import pandas as pd
- import pprint
- import re
- import seaborn as sns
- from textblob import TextBlob
- from wordcloud import WordCloud
- # config: values shared by the loaders and graph functions below
- DEFAULT_PAL = 'GnBu_d'  # palette passed to seaborn when a graph function gets no explicit `pal`
- DIARY_PATH = '/home/karthik/Dropbox/vimwiki/diary/'  # directory open_entry() reads '<YYYY-MM-DD>.wiki' files from; must end with '/'
def open_entry(date):
    """Return the raw text of the diary entry written on ``date``.

    The entry is read from ``DIARY_PATH`` and must be named
    ``<YYYY-MM-DD>.wiki``. ``date`` only needs a ``strftime`` method.
    """
    entry_path = '%s%s.wiki' % (DIARY_PATH, date.strftime('%Y-%m-%d'))
    with open(entry_path) as entry_file:
        return entry_file.read()
# Semester calendar (both end days are inclusive):
#   fall   = aug 24 - dec 16
#   winter = dec 17 - jan 16
#   spring = jan 17 - may 17
#   summer = may 18 - aug 23
#
# Months in which the semester changes, mapped to
# (first day of the later semester, earlier semester, later semester).
_SEMESTER_BOUNDARIES = {
    1: (17, 'winter', 'spring'),
    5: (18, 'spring', 'summer'),
    8: (24, 'summer', 'fall'),
    12: (17, 'fall', 'winter'),
}


def date_to_semester(date):
    """Return the semester name that ``date`` falls in.

    Args:
        date: any object with integer ``month`` and ``day`` attributes
            (e.g. ``datetime.date`` or a pandas ``Timestamp``).

    Returns:
        One of ``'fall'``, ``'winter'``, ``'spring'`` or ``'summer'``,
        following the calendar in the comment above.
    """
    month, day = date.month, date.day

    if month in _SEMESTER_BOUNDARIES:
        first_day, earlier, later = _SEMESTER_BOUNDARIES[month]
        # `>=` keeps the documented start day (e.g. aug 24) in the later
        # semester; the old `d > 24` check pushed aug 24 into summer.
        return later if day >= first_day else earlier

    # Months that lie entirely inside one semester.
    if 1 < month < 5:
        return 'spring'
    if 8 < month < 12:
        return 'fall'
    return 'summer'
def get_entry_df():
    """Build the table of diary entries listed in the ``entry-list`` file.

    ``entry-list`` is read from the current working directory. Each
    non-blank line must start with a ``YYYY-MM-DD.wiki`` filename; anything
    after the first whitespace is ignored.

    Returns:
        A DataFrame with one row per listed entry and the columns ``date``
        (parsed datetime), ``year``, ``month`` and ``sem`` (the semester
        name from ``date_to_semester``).
    """
    def entry_to_date(entry):
        # keep only the first token to remove '(x conflicted copy)' stuff
        filename = entry.split()[0]
        # remove .wiki extension
        return filename.split('.')[0]

    with open('entry-list') as f:
        # Build a real list (a lazy map object cannot be used as a column)
        # and skip blank lines, which have no filename token to extract.
        entries = [entry_to_date(line) for line in f if line.strip()]

    # Parse the date strings explicitly instead of casting the string
    # column with astype('datetime64[D]').
    df = pd.DataFrame({'date': pd.to_datetime(entries)})
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['sem'] = df['date'].map(date_to_semester)
    return df
def add_length_to_df(df):
    """Add a ``length`` column with each entry's word count and return ``df``.

    The count is the number of whitespace-separated tokens in the entry
    text loaded by ``open_entry``. ``df`` is modified in place.
    """
    df['length'] = df['date'].map(lambda day: len(open_entry(day).split()))
    return df
def text_blob_sentiment(text):
    """Return TextBlob's polarity score for ``text``.

    Best effort: if TextBlob raises, the exception is printed and 0 is
    returned instead.
    """
    try:
        polarity = TextBlob(text).sentiment.polarity
    except Exception as err:
        print(err)
        return 0
    return polarity
# Matches the manual '#sent x' tags, capturing the digits of x.
_SENT_TAG_RE = re.compile(r'#sent (\d+)', re.MULTILINE)


def manual_sentiment(text):
    """Return the mean manual sentiment tag of ``text`` scaled to [-1, 1].

    i tagged each entry manually with something like '#sent x' where x is
    an int [1, 10]. An entry may carry several tags; they are averaged.

    Args:
        text: the entry text to scan for '#sent x' tags.

    Returns:
        0 when ``text`` contains no tag, otherwise ``mean / 5 - 1`` as a
        float, so a mean of 0 maps to -1, 5 to 0 and 10 to 1.
    """
    scores = [float(match.group(1)) for match in _SENT_TAG_RE.finditer(text)]
    # Guard on the number of tags, not on their sum: an entry tagged
    # '#sent 0' has tags and must score -1 rather than the "no tags" 0.
    if not scores:
        return 0
    # scale from [0, 10] to [-1, 1]
    return (sum(scores) / len(scores)) / 5 - 1
- # 2015-12-12.wiki shouldn't be a -.5
- # 2014-11-23.wiki shouldn't be a .8 lmao, not even close to that happy
- # more hangups, TextBlob('i got hella fucked up last night') is -.3 sentiment when it should be 1
- # 'last night was lit' - polarity 0... there goes half my positive ratings
def add_sentiments(df, sentiment_func, save=False):
    """Add a ``sen`` column holding each entry's sentiment and return ``df``.

    ``sentiment_func`` takes the entry text (a string) and returns a
    polarity score (this file uses -1 to 1). ``df`` is modified in place.

    When ``save`` is true, the (date, score) pairs are also pretty-printed
    to 'sents.txt', ordered from lowest to highest score.
    """
    scores_by_date = {}

    def score_entry(date):
        polarity = sentiment_func(open_entry(date))
        scores_by_date[date] = polarity
        return polarity

    df['sen'] = df['date'].map(score_entry)

    if save:
        # rank entries by sentiment and print to a file
        ranked = sorted(scores_by_date.items(), key=lambda pair: pair[1])
        with open('sents.txt', 'wt') as out:
            pprint.pprint(ranked, stream=out)
    return df
def graph_decorator(f):
    """Wrap a graph function with the figure setup/cleanup every graph shares.

    The wrapper opens a new pyplot figure when at least one figure already
    exists (presumably so the first graph does not leave an empty figure
    behind), calls ``f`` and then removes the top and right axes with
    ``sns.despine()`` for aesthetics.

    Args:
        f: the graph-drawing function to wrap.

    Returns:
        The wrapper, which keeps ``f``'s name and docstring and passes
        ``f``'s return value through.
    """
    import functools

    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        if plt.get_fignums():
            plt.figure()
        result = f(*args, **kwargs)
        sns.despine()
        return result

    return wrapper
@graph_decorator
def graph_months(df, pal=DEFAULT_PAL):
    """Draw a bar chart of the number of entries written in each month."""
    per_month = (
        df.groupby([df['year'], df['month']])
        .size()
        .reset_index(name='count')
    )
    # '<month>-<year>' strings make nice x-axis labels
    per_month['name'] = per_month['month'].map(str) + '-' + per_month['year'].map(str)

    ax = sns.barplot(x='name', y='count', hue='year', data=per_month, palette=pal)
    plt.xticks(rotation=75)

    # one y tick per whole entry count between the smallest and largest bar
    fewest = min(per_month['count'])
    most = max(per_month['count'])
    plt.yticks(np.arange(fewest, most + 1, 1))

    ax.xaxis.set_ticks_position('none')
    ax.set(ylabel='# of entries')

    # for some reason the bars are really skinny so expand them a bit manually
    for bar in ax.patches:
        bar.set_width(bar.get_width() * 4)
@graph_decorator
def graph_sems(df, normalized=True, pal=DEFAULT_PAL):
    """Draw a bar chart of entries per semester.

    With ``normalized`` true the bars show entries per day (count divided
    by the semester's length in days); otherwise they show raw counts.
    """
    semester_days = {'fall': 114, 'winter': 30, 'spring': 121, 'summer': 97}

    def entries_per_day(row):
        # row carries one semester's 'sem' name and entry 'count'
        sem = row['sem']
        count = row['count']
        print('%s %s' % (sem, count))
        return float(count) / float(semester_days[sem])

    per_sem = (
        df.groupby([df['year'], df['sem']], sort=False)
        .size()
        .reset_index(name='count')
    )
    if normalized:
        per_sem['norm_count'] = per_sem[['sem', 'count']].apply(entries_per_day, axis=1)
    response = 'norm_count' if normalized else 'count'

    per_sem['name'] = per_sem['sem'].map(str) + ' ' + per_sem['year'].map(str)
    ax = sns.barplot(x='name', y=response, data=per_sem, palette=pal)
    plt.xticks(rotation=75)
    ax.xaxis.set_ticks_position('none')
    ax.set(ylabel='entries per day' if normalized else '# of entries')
@graph_decorator
def graph_sens(df, pal=DEFAULT_PAL):
    """Draw a bar chart of the mean sentiment for each semester.

    Expects ``df`` to already have the ``sen`` column. Also draws a bonus
    line plot of sentiment against date.
    """
    by_semester = df.groupby([df['year'], df['sem']], sort=False)
    mean_sen = by_semester['sen'].mean().reset_index(name='sen')
    mean_sen['name'] = mean_sen['sem'].map(str) + ' ' + mean_sen['year'].map(str)
    sns.barplot(x='name', y='sen', data=mean_sen, palette=pal)

    # QUICK BONUS GRAPH: dates vs sentiment
    plt.figure()
    sen_by_date = df[['date', 'sen']].set_index('date')
    sen_by_date.plot()
    # code to print best/worst semesters, could move to another function
    # print '== SORTED SEMESTERS BY SENTIMENT =='
    # print sems.sort_values(by=['sen'])
@graph_decorator
def graph_sen_v_entry(df, pal=DEFAULT_PAL):
    """Plot each semester's mean sentiment against its number of entries.

    Meant to show whether i write more when i feel a certain way (turns
    out i dont). Expects ``df`` to already have the ``sen`` column; the
    per-semester table is printed before and after the ``name`` column is
    added.
    """
    by_semester = df.groupby([df['year'], df['sem']], sort=False)
    summary = by_semester.agg({'date': 'size', 'sen': 'mean'})
    summary = summary.rename(columns={'date': 'count', 'sen': 'mean_sen'})
    summary = summary.reset_index()
    print(summary)

    summary['name'] = summary['sem'].map(str) + ' ' + summary['year'].map(str)
    print(summary)

    sns.regplot(x='count', y='mean_sen', data=summary)
@graph_decorator
def graph_length(df, pal=DEFAULT_PAL):
    """Draw a bar chart of the average entry length (in words) per semester.

    Adds the ``length`` column to ``df`` via ``add_length_to_df`` first.
    """
    df = add_length_to_df(df)
    by_semester = df.groupby([df['year'], df['sem']], sort=False)
    mean_length = by_semester['length'].mean().reset_index(name='length')
    mean_length['name'] = mean_length['sem'].map(str) + ' ' + mean_length['year'].map(str)

    ax = sns.barplot(x='name', y='length', data=mean_length, palette=pal)
    ax.set(ylabel='length (words)')
@graph_decorator
def graph_sentiment_wordcloud(df, sen_func=manual_sentiment, pos=True, pal=DEFAULT_PAL):
    """Show a wordcloud of the text of the strongest-sentiment entries.

    Entries are scored with ``sen_func`` (which overwrites ``df``'s ``sen``
    column). With ``pos`` true the cloud uses entries scoring above .5;
    otherwise entries scoring below -.5, leaving out 2015-03-01.
    """
    from wordcloud import WordCloud

    df = add_sentiments(df, sen_func)
    if pos:
        selected = df[df.sen > .5]
    else:
        is_negative = df.sen < -.5
        not_excluded = df.date != np.datetime64('2015-03-01')
        selected = df[is_negative & not_excluded]

    # one big string made of every selected entry's text
    entry_texts = [open_entry(day) for day in selected['date']]
    cloud = WordCloud(width=1000, height=500).generate(' '.join(entry_texts))
    plt.imshow(cloud, interpolation='bilinear')
def graph_stuff(df):
    """Draw every enabled graph for the entry table ``df`` and show them.

    ``df`` is the table from ``get_entry_df``. The ``sen`` column is
    filled twice, first with the manual tags and then with TextBlob, so
    the semester sentiment chart is drawn once per scoring method. The
    later graphs use the TextBlob scores, except the wordcloud, which
    re-scores with its own default.
    """
    sns.set_style('darkgrid', {'font.family': 'ubuntu'})
    #graph_months(df)
    # (the cubehelix palette that used to be built here was immediately
    # overwritten and never used, so only the palette in use is created)
    pal = sns.color_palette("Reds_r", 20)
    graph_sems(df, pal=pal)
    # graph_sems(df, False)
    df = add_sentiments(df, manual_sentiment)
    graph_sens(df, pal='husl')
    df = add_sentiments(df, text_blob_sentiment)
    graph_sens(df, pal='husl')
    graph_sen_v_entry(df)
    graph_length(df)
    graph_sentiment_wordcloud(df, pos=False)
    plt.show()
if __name__ == "__main__":
    # Run as a script: load the entry table, then draw all the graphs.
    entry_df = get_entry_df()
    graph_stuff(entry_df)
Add Comment
Please, Sign In to add comment