Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- import sys
- import os
- import argparse
- import re
- from pprint import pprint as pp
- import pytz
- import dateutil
- import matplotlib.pyplot as plt
- import numpy as np
- import pandas as pd
- import scipy.io as sc_io
- import tqdm
# Folder that receives every image written by the plotting functions.
IMG_FOLDER = './img'
# Default user name for the per-user plots.
ZOE = '@SpatulaFish#8544'

# exist_ok=True replaces the exists()/mkdir() pair: there is no window between
# the check and the creation in which another process can create the folder
# and make mkdir() fail.
os.makedirs(IMG_FOLDER, exist_ok=True)

# Position in this list is the weekday number used elsewhere in the file
# (0 = Monday ... 6 = Sunday).
days_of_week = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]
def getpath(filename):
    """Return the location of `filename` inside the image folder (IMG_FOLDER)."""
    target = os.path.join(IMG_FOLDER, filename)
    return target
def main():
    """Run the whole analysis: load the logs, then write every plot.

    The global plots (message counts, activity over time) are produced once;
    the time-of-day and time-of-week plots are produced for each distinct
    user.  A TypeError raised while plotting a user is printed together with
    the user name and that user is skipped.
    """
    options = get_args()
    frame = get_data(options.directory)

    plot_message_counts(frame)
    plot_over_time(frame)

    usernames = frame['user'].unique()
    for username in tqdm.tqdm(usernames):
        try:
            plot_time_of_day(frame, name=username)
            plot_time_of_week(frame, name=username)
        except TypeError as err:
            print(f'{username}: {err}')
def get_data(directory, utc_offset='07:00:00'):
    """Parse every log file under `directory` into a DataFrame of messages.

    A message line looks like::

        2016-03-16 21:10:32 #general @SpatulaFish#8544: test

    Args:
        directory: Root folder; it is walked recursively and every file found
            is scanned for message lines.
        utc_offset: Anything accepted by ``pd.Timedelta``; it is subtracted
            from every parsed timestamp.
            NOTE(review): presumably converts UTC log times to a fixed UTC-7
            local time -- confirm; a fixed offset ignores daylight saving.

    Returns:
        DataFrame with one row per matched message and the columns
        ``timestamp`` (shifted datetime), ``channel``, ``user``, ``text``,
        ``time_of_day`` (Timedelta since midnight of the shifted timestamp)
        and ``day_of_week`` (0 = Monday).

    Side effects:
        Pretty-prints the sorted list of distinct user names.
    """
    num = '[0-9]'
    # Raw string: the original '\-' in a normal string literal is an invalid
    # escape sequence.  A trailing '-' inside a character class is literal.
    channel = r'#[a-zA-Z-]+'
    # Non-greedy so the user name stops at the FIRST '#dddd: '.  A greedy
    # '.+' swallows part of the text when a message itself contains
    # something like '@other#1234: '.
    user = f'@.+?#{num}{{4}}'
    date = f'{num}{{4}}-{num}{{2}}-{num}{{2}}'
    time = f'{num}{{2}}:{num}{{2}}:{num}{{2}}'
    message_re = re.compile(f'({date} {time}) ({channel}) ({user}): (.*)')

    messages = []
    for root, _dirs, files in os.walk(directory):
        for filename in files:
            path = os.path.join(root, filename)
            # Explicit encoding so the result does not depend on the platform
            # default; undecodable bytes are replaced instead of aborting the
            # whole run on one stray file.
            with open(path, 'r', encoding='utf-8', errors='replace') as fileobj:
                messages.extend(message_re.findall(fileobj.read()))

    df = pd.DataFrame(messages,
                      columns=['timestamp', 'channel', 'user', 'text'])
    df['timestamp'] = pd.to_datetime(df['timestamp']) - pd.Timedelta(utc_offset)
    # Vectorised equivalents of the per-row lambdas: time since midnight and
    # weekday number of the shifted timestamp.
    df['time_of_day'] = df['timestamp'] - df['timestamp'].dt.normalize()
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    pp(sorted(df['user'].unique()))
    return df
def get_user_counts(df):
    """Count messages per user.

    Args:
        df: Message frame with at least the columns ``user`` and ``channel``.

    Returns:
        DataFrame with the columns ``username`` and ``messagecount`` (number
        of non-null ``channel`` entries for that user), sorted ascending by
        ``messagecount``.
    """
    per_user = df[['user', 'channel']].groupby('user').count().reset_index()
    per_user = per_user.rename(
        columns={'user': 'username', 'channel': 'messagecount'}
    )
    return per_user.sort_values('messagecount')
def plot_message_counts(df):
    """Write horizontal bar charts of the message count per user.

    Only users with more than 10 messages are shown.  Two images are saved
    through ``getpath``: ``messagecount.png`` (raw counts) and
    ``log(messagecount).png`` (base-10 logarithm of the counts).

    Args:
        df: Message frame as accepted by ``get_user_counts``.
    """
    user_counts = get_user_counts(df)
    # .copy() makes the filtered rows an independent frame, so adding a column
    # below does not write into a slice of another frame
    # (SettingWithCopyWarning).
    user_counts = user_counts[user_counts.messagecount > 10].copy()
    if user_counts.empty:
        # pandas refuses to plot an empty frame; there is nothing to draw.
        print('No user has more than 10 messages; skipping message count plots')
        return
    user_counts['log(messagecount)'] = np.log10(user_counts.messagecount.values)

    for column in ['messagecount', 'log(messagecount)']:
        ax = user_counts.plot(x='username',
                              y=column,
                              kind='barh',
                              figsize=(8, 8),
                              title=f'{column} per user')
        fig = ax.get_figure()
        fig.tight_layout()
        fig.savefig(getpath(f'{column}.png'))
        # Release the figure once it is on disk so figures do not accumulate.
        plt.close(fig)
def plot_over_time(df, min_messages=10000):
    """Plot weekly activity of the most frequent users.

    Three images are saved through ``getpath``:

    * ``overtime.png``  -- messages per week, one line per frequent user;
    * ``scatter.png``   -- a users x users grid (see ``_plot_pairwise``);
    * ``frequency.png`` -- magnitude of a 2048-point FFT of each user's
      weekly counts.

    Args:
        df: Message frame with the columns ``user``, ``timestamp`` and
            ``channel``.
        min_messages: A user is "frequent" when their total message count is
            strictly greater than this value.
    """
    weekly = (
        df[['user', 'timestamp', 'channel']]
        .groupby([pd.Grouper(key='timestamp', freq='1W'), 'user'])
        .count()
        .reset_index()
    )
    weekly.columns = ['timestamp', 'username', 'count']

    user_counts = get_user_counts(df)
    most_frequent = sorted(
        user_counts[user_counts.messagecount > min_messages]['username'].values
    )
    if not most_frequent:
        # A 0 x 0 subplot grid cannot be created, and there is nothing to draw.
        print(f'No user has more than {min_messages} messages; '
              'skipping over-time plots')
        return

    _plot_timeline(weekly, most_frequent)
    _plot_pairwise(_weekly_matrix(weekly, most_frequent), most_frequent)
    _plot_spectra(weekly, most_frequent)


def _plot_timeline(weekly, users):
    """Save 'overtime.png': weekly message count as one line per user."""
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
    for user in users:
        weekly[weekly.username == user].plot(
            x='timestamp',
            y='count',
            kind='line',
            ax=ax
        )
    fig.tight_layout()
    ax.set_ylabel('Messages per Week')
    ax.set_title('Most Frequent Users Posting Timeline')
    # Every line is labelled 'count' by pandas; relabel them with user names.
    ax.legend(users, loc=0)
    fig.savefig(getpath('overtime.png'))
    plt.close(fig)


def _weekly_matrix(weekly, users):
    """Return a weeks x users table of weekly counts, 0 where a user has none."""
    matrix = weekly.pivot(index='timestamp', columns='username', values='count')
    return matrix.reindex(columns=users).fillna(0)


def _plot_pairwise(matrix, users):
    """Save 'scatter.png': a users x users grid of pairwise plots.

    On and below the diagonal each cell shows the full cross-correlation of
    the two users' weekly counts, each divided by its standard deviation; the
    result is divided by its length and shifted down by 1.  Above the diagonal
    each cell shows the two raw weekly series.
    NOTE(review): the series are not mean-centred before correlating --
    confirm this normalisation is the intended one.
    """
    size = len(users)
    # squeeze=False keeps axarr two-dimensional even for a single user, so
    # axarr[i, j] is always valid.
    fig, axarr = plt.subplots(size, size, figsize=(16, 16), squeeze=False)
    for i, user1 in enumerate(users):
        for j, user2 in enumerate(users):
            ax = axarr[i, j]
            if i >= j:
                corr = np.correlate(
                    matrix[user1].values / matrix[user1].std(),
                    matrix[user2].values / matrix[user2].std(),
                    mode='full'
                )
                corr /= len(corr)
                corr -= 1
                ax.plot(np.arange(len(corr)), corr, label=f'{user1}\n{user2}')
                # Horizontal reference line at zero.
                ax.plot(np.linspace(0, len(corr), 10), np.zeros(10),
                        'k-', alpha=0.5)
                ax.set_ylim((-1, 1))
                ax.set_xlim((0, len(corr)))
                ax.tick_params(
                    axis='both',
                    which='both',
                    direction='in'
                )
            else:
                user1data = matrix[user1].values
                user2data = matrix[user2].values
                ax.plot(np.arange(len(user1data)), user1data, label=user1)
                ax.plot(np.arange(len(user2data)), user2data, label=user2)
            ax.legend(loc=0)
    fig.tight_layout()
    fig.savefig(getpath('scatter.png'))
    plt.close(fig)


def _plot_spectra(weekly, users, fft_n=2048):
    """Save 'frequency.png': FFT magnitude of each user's weekly counts."""
    fig, ax = plt.subplots(figsize=(8, 8))
    for user in users:
        counts = weekly.loc[weekly.username == user, 'count'].values
        # fft pads with zeros (or truncates) to exactly fft_n points.
        spectrum = np.abs(np.fft.fft(counts, fft_n))
        # NOTE(review): the x values are FFT bin indices although the axis is
        # labelled 'Normalized Frequency' -- confirm whether they should be
        # divided by fft_n.
        ax.plot(range(len(spectrum)), spectrum / fft_n, label=user)
    ax.set_xlabel('Normalized Frequency')
    ax.set_ylabel('FFT Values')
    ax.legend(loc=0)
    fig.savefig(getpath('frequency.png'))
    plt.close(fig)
def plot_time_of_day(df, name=ZOE):
    """Save a bar chart of one user's message counts per 30-minute slot.

    The image is written through ``getpath`` as ``<name>_tod.png``.

    Args:
        df: Message frame with the columns ``user``, ``time_of_day``
            (Timedelta since midnight) and ``channel``.
        name: User whose messages are plotted.
    """
    # '30min' is the same frequency as the alias '30T', which recent pandas
    # versions deprecate.
    tod = (
        df[['user', 'time_of_day', 'channel']]
        .groupby(['user', pd.Grouper(key='time_of_day', freq='30min')])
        .count()
        .reset_index()
    )
    tod.columns = ['user', 'tod', 'count']
    tod = tod[tod.user == name].sort_values('tod')

    ax = tod.plot(
        x='tod',
        y='count',
        kind='bar'
    )
    ax.set_title(f'{name}\'s Day')
    fig = ax.get_figure()
    fig.tight_layout()
    fig.savefig(getpath(f'{name}_tod.png'))
    # This function runs once per user; close the figure so they do not
    # pile up in memory.
    plt.close(fig)
def plot_time_of_week(df, name=ZOE):
    """Save seven stacked bar charts (Monday..Sunday) of one user's activity.

    Each chart shows the message count per 30-minute slot for that weekday;
    all seven share the same y range.  The image is written through
    ``getpath`` as ``<name>_tow.png``.

    Args:
        df: Message frame with the columns ``user``, ``time_of_day``,
            ``day_of_week`` (0 = Monday) and ``channel``.
        name: User whose messages are plotted.
    """
    # '30min' is the same frequency as the alias '30T', which recent pandas
    # versions deprecate.
    tod = (
        df[['user', 'time_of_day', 'day_of_week', 'channel']]
        .groupby(['user', 'day_of_week',
                  pd.Grouper(key='time_of_day', freq='30min')])
        .count()
        .reset_index()
    )
    tod.columns = ['user', 'dow', 'tod', 'count']
    tod = tod[tod.user == name].sort_values(['dow', 'tod'])
    maxval = tod['count'].values.max()

    fig, axarr = plt.subplots(7, 1, figsize=(8, 8))
    for i, day_name in enumerate(days_of_week):
        ax = axarr[i]
        day = tod[tod.dow == i]
        # pandas raises TypeError when asked to plot an empty frame.  A
        # weekday without messages is left as an empty panel so the rest of
        # the week is still drawn.
        if not day.empty:
            day.plot(
                x='tod',
                y='count',
                kind='bar',
                ax=ax,
                legend=False
            )
        ax.xaxis.set_visible(False)
        ax.set_title(f'{day_name}')
        ax.set_ylim([0, maxval])
    fig.suptitle(f'{name}\'s Week')
    fig.tight_layout()
    fig.savefig(getpath(f'{name}_tow.png'))
    # This function runs once per user; close the figure so they do not
    # pile up in memory.
    plt.close(fig)
def get_args():
    """Parse the command line.

    Returns:
        Namespace with ``directory``: the folder holding the log files, set
        with ``-d`` / ``--directory`` and defaulting to ``./cortex``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-d', '--directory',
        default='./cortex',
        type=str,
        help='Directory with logfiles',
    )
    return cli.parse_args()
# Script entry point: run main() and use its return value as the exit status.
if __name__ == '__main__':
    raise SystemExit(main())
Add Comment
Please, Sign In to add comment