Guest User

Untitled

a guest
Dec 13th, 2017
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.13 KB | None | 0 0
  1. #!/usr/bin/env python3
  2.  
  3. import sys
  4. import os
  5. import argparse
  6.  
  7. import re
  8. from pprint import pprint as pp
  9. import pytz
  10. import dateutil
  11.  
  12. import matplotlib.pyplot as plt
  13. import numpy as np
  14. import pandas as pd
  15.  
  16. import scipy.io as sc_io
  17.  
  18. import tqdm
  19.  
  20.  
  21. IMG_FOLDER = './img'
  22. ZOE = '@SpatulaFish#8544'
  23.  
  24. if not os.path.exists(IMG_FOLDER):
  25. os.mkdir(IMG_FOLDER)
  26.  
  27. days_of_week = [
  28. 'Monday',
  29. 'Tuesday',
  30. 'Wednesday',
  31. 'Thursday',
  32. 'Friday',
  33. 'Saturday',
  34. 'Sunday'
  35. ]
  36.  
  37.  
  38. def getpath(filename):
  39. return os.path.join(IMG_FOLDER, filename)
  40.  
  41.  
  42. def main():
  43. args = get_args()
  44.  
  45. df = get_data(args.directory)
  46.  
  47. plot_message_counts(df)
  48. plot_over_time(df)
  49.  
  50. for user in tqdm.tqdm(df['user'].unique()):
  51. try:
  52. plot_time_of_day(df, name=user)
  53. plot_time_of_week(df, name=user)
  54. except TypeError as e:
  55. print(f'{user}: {e}')
  56. pass
  57.  
  58.  
  59. def get_data(directory):
  60. # 2016-03-16 21:10:32 #general @SpatulaFish#8544: test
  61. num = '[0-9]'
  62. channel = '#[a-zA-Z\-]+'
  63. user = f'@.+#{num}{{4}}'
  64. date = f'{num}{{4}}-{num}{{2}}-{num}{{2}}'
  65. time = f'{num}{{2}}:{num}{{2}}:{num}{{2}}'
  66. message = f'({date} {time}) ({channel}) ({user}): (.*)'
  67. message_re = re.compile(message)
  68.  
  69. messages = []
  70. for root, dirs, files in os.walk(directory):
  71. for filename in files:
  72. with open(os.path.join(root, filename), 'r') as fileobj:
  73. for message in re.findall(message_re, fileobj.read()):
  74. messages.append(message)
  75.  
  76. df = pd.DataFrame(messages,
  77. columns=['timestamp', 'channel', 'user', 'text'])
  78. df['timestamp'] = pd.to_datetime(df['timestamp'])
  79. df['timestamp'] = df['timestamp'] - pd.Timedelta('07:00:00')
  80. df['time_of_day'] = df['timestamp'].apply(lambda x: x - pd.Timestamp(x.date()))
  81. df['day_of_week'] = df.timestamp.apply(lambda x: x.dayofweek)
  82.  
  83. pp(sorted(df.user.unique()))
  84.  
  85. return df
  86.  
  87.  
  88. def get_user_counts(df):
  89. user_counts = df[['user', 'channel']].groupby('user').count()
  90. user_counts = user_counts.reset_index()
  91. user_counts.columns = ['username', 'messagecount']
  92. user_counts = user_counts.sort_values('messagecount')
  93. return user_counts
  94.  
  95.  
  96. def plot_message_counts(df):
  97. # plot user message counts
  98. user_counts = get_user_counts(df)
  99.  
  100. user_counts = user_counts[user_counts.messagecount > 10]
  101.  
  102. user_counts['log(messagecount)'] = np.log10(user_counts.messagecount.values)
  103.  
  104. for xaxis in ['messagecount', 'log(messagecount)']:
  105. user_counts.plot(x='username',
  106. y=xaxis,
  107. kind='barh',
  108. figsize=(8, 8),
  109. title=f'{xaxis} per user')
  110. plt.tight_layout()
  111. plt.savefig(getpath(f'{xaxis}.png'))
  112.  
  113.  
  114. def plot_over_time(df):
  115. # plot over time
  116. # Restructure for Michelle
  117. # overtime = df[['user', 'timestamp', 'channel']]\
  118. # .groupby([pd.Grouper(key='timestamp', freq='60min'), 'user'])\
  119. # .count()
  120. # overtime = overtime.reset_index()
  121. # overtime.columns = ['timestamp', 'username', 'count']
  122.  
  123. # users = sorted(overtime['username'].unique())
  124. # weeks = sorted(overtime['timestamp'].unique())
  125. # user_weeks = pd.DataFrame({'username': users,
  126. # **{week: np.zeros(len(users))
  127. # for week in weeks}})
  128.  
  129. # for user, group in tqdm.tqdm(overtime.groupby('username')):
  130. # row = []
  131. # for week in weeks:
  132. # weekdata = group[group['timestamp'] == pd.Timestamp(week)]
  133. # if len(weekdata) == 0:
  134. # row.append(0)
  135. # else:
  136. # row.append(weekdata['count'].values[0])
  137. # user_weeks.loc[user_weeks['username'] == user] = ([user] + row)
  138.  
  139. # sc_io.savemat('./overtime.mat', {'data': user_weeks.values})
  140.  
  141. # for week in weeks:
  142. # user_weeks[week] = overtime[overtime['timestamp'] == pd.Timestamp(week)].sort_values('username')[['count']]
  143.  
  144. # sc_io.savemat(
  145. # 'overtime.mat',
  146. # {
  147. # 'timestamp': overtime['timestamp'].values,
  148. # 'username': overtime['username'].values,
  149. # 'count': overtime['count'].values
  150. # }
  151. # )
  152.  
  153. overtime = df[['user', 'timestamp', 'channel']]\
  154. .groupby([pd.Grouper(key='timestamp', freq='1W'), 'user'])\
  155. .count()
  156. overtime = overtime.reset_index()
  157. overtime.columns = ['timestamp', 'username', 'count']
  158.  
  159. fig = plt.figure(figsize=(8, 8))
  160. ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
  161.  
  162. user_counts = get_user_counts(df)
  163. most_frequent = sorted(user_counts[user_counts.messagecount > 10000]['username'].values)
  164. freqs = []
  165. fft_N = 2048
  166. for user in most_frequent:
  167. user_df = overtime[overtime.username == user]
  168. user_df.plot(
  169. x='timestamp',
  170. y='count',
  171. kind='line',
  172. ax=ax
  173. )
  174. freq_data = np.abs(np.fft.fft(user_df['count'].values, fft_N))
  175. freqs.append(freq_data)
  176.  
  177. plt.tight_layout()
  178. plt.ylabel('Messages per Week')
  179. plt.title('Most Frequent Users Posting Timeline')
  180. plt.legend(most_frequent, loc=0)
  181. plt.savefig(getpath('overtime.png'))
  182.  
  183. weeks = sorted(overtime['timestamp'].unique())
  184. scatter_df = pd.DataFrame(
  185. {
  186. user: np.zeros(len(weeks))
  187. for user in most_frequent
  188. }
  189. )
  190.  
  191. for i, week in enumerate(weeks):
  192. row = []
  193. for user in most_frequent:
  194. weekval = overtime[(overtime.username == user) &
  195. (overtime.timestamp == pd.Timestamp(week))]['count']
  196. if len(weekval) == 0:
  197. row.append(0)
  198. else:
  199. row.append(weekval.values[0])
  200. scatter_df.iloc[i] = row
  201.  
  202. fig, axarr = plt.subplots(len(most_frequent), len(most_frequent), figsize=(16, 16))
  203. for i, user1 in enumerate(most_frequent):
  204. for j, user2 in enumerate(most_frequent):
  205. if i >= j:
  206. corr = np.correlate(
  207. scatter_df[user1].values / scatter_df[user1].std(),
  208. scatter_df[user2].values / scatter_df[user2].std(),
  209. mode='full'
  210. )
  211. corr /= len(corr)
  212. corr -= 1
  213. axarr[i, j].plot(np.arange(len(corr)), corr, label=f'{user1}\n{user2}')
  214. axarr[i, j].plot(np.linspace(0, len(corr), 10), np.zeros(10), 'k-', alpha=0.5)
  215. axarr[i, j].set_ylim((-1, 1))
  216. axarr[i, j].set_xlim((0, len(corr)))
  217. axarr[i, j].tick_params(
  218. axis='both',
  219. which='both',
  220. direction='in'
  221. )
  222. axarr[i, j].legend(loc=0)
  223. else:
  224. user1data = scatter_df[user1].values
  225. user2data = scatter_df[user2].values
  226. axarr[i, j].plot(np.arange(len(user1data)),
  227. user1data,
  228. label=user1)
  229. axarr[i, j].plot(np.arange(len(user2data)),
  230. user2data,
  231. label=user2)
  232. axarr[i, j].legend(loc=0)
  233. plt.tight_layout()
  234. plt.savefig(getpath('scatter.png'))
  235.  
  236. #frequency domain
  237. plt.figure(figsize=(8, 8))
  238. for name, freq in zip(most_frequent, freqs):
  239. plt.plot(range(len(freq)), freq / fft_N, label=name)
  240. plt.xlabel('Normalized Frequency')
  241. plt.ylabel('FFT Values')
  242. plt.legend(loc=0)
  243. plt.savefig(getpath('frequency.png'))
  244.  
  245.  
  246. def plot_time_of_day(df, name=ZOE):
  247. # by time of day
  248. tod = df[['user', 'time_of_day', 'channel']]\
  249. .groupby(['user', pd.Grouper(key='time_of_day', freq='30T')])\
  250. .count()
  251. tod = tod.reset_index()
  252. tod.columns = ['user', 'tod', 'count']
  253. tod = tod[tod.user == name].sort_values('tod')
  254.  
  255. tod.plot(
  256. x='tod',
  257. y='count',
  258. kind='bar'
  259. )
  260. plt.title(f'{name}\'s Day')
  261. plt.tight_layout()
  262. plt.savefig(getpath(f'{name}_tod.png'))
  263.  
  264.  
  265. def plot_time_of_week(df, name=ZOE):
  266. tod = df[['user', 'time_of_day', 'day_of_week', 'channel']]\
  267. .groupby(['user', 'day_of_week', pd.Grouper(key='time_of_day', freq='30T')])\
  268. .count()
  269. tod = tod.reset_index()
  270. tod.columns = ['user', 'dow', 'tod', 'count']
  271. tod = tod[tod.user == name].sort_values(['dow', 'tod'])
  272.  
  273. maxval = tod['count'].values.max()
  274.  
  275. fig, axarr = plt.subplots(7, 1, figsize=(8, 8))
  276. for i in range(7):
  277. tod[tod.dow == i].plot(
  278. x='tod',
  279. y='count',
  280. kind='bar',
  281. ax=axarr[i]
  282. )
  283. axarr[i].xaxis.set_visible(False)
  284. axarr[i].set_title(f'{days_of_week[i]}')
  285. legend = axarr[i].legend()
  286. legend.remove()
  287. axarr[i].set_ylim([0, maxval])
  288. plt.suptitle(f'{name}\'s Week')
  289. plt.tight_layout()
  290. plt.savefig(getpath(f'{name}_tow.png'))
  291.  
  292.  
  293. def get_args():
  294. parser = argparse.ArgumentParser()
  295. parser.add_argument('-d', '--directory', type=str, default='./cortex',
  296. help='Directory with logfiles')
  297. args = parser.parse_args()
  298. return args
  299.  
  300.  
  301. if __name__ == '__main__':
  302. sys.exit(main())
Add Comment
Please, Sign In to add comment