Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- import sqlite3
- import pandas as pd
- import numpy as np
- import scipy.stats
- from sklearn.cluster import OPTICS
- import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and newer releases no longer accept the old name; fall back to the legacy
# name so older installations keep working.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available
              else "seaborn")

# Load every comment, ordered by submission and then by position within the
# submission.  The connection is only needed for this one query, so it is
# closed as soon as the frame has been read (even if the query fails).
db = sqlite3.connect('data/age.sqlite')
try:
    df = pd.read_sql_query(
        "select comments.* "
        "from comments join submissions on "
        "comments.submission_id = submissions.submission_id "
        "order by submissions.integer_id, comments.position",
        db)
finally:
    db.close()

# Restrict the data to the two users whose exchange is being analysed.
counters = ['TheNitromeFan', 'mistyskye14']
relevant_comments = df.query("username in @counters")
# Characters that can appear in a bare count: digits, commas, spaces and
# newlines.  This is the same character set that normalise_body strips from
# the front of a comment.  Built once rather than on every call.
_COUNT_ONLY_CHARS = str.maketrans("", "", "0123456789, \n")


def conversation_filter(body):
    """Return True when *body* contains anything besides a bare count.

    Digits, commas, spaces and newlines are deleted from the text; the
    comment counts as conversation only if something is left over.  Newlines
    are included so that a count followed by blank lines is not mistaken for
    conversation (normalise_body would reduce such a comment to an empty
    string).

    Args:
        body: The comment text as a ``str``.

    Returns:
        ``True`` if any other character remains, ``False`` otherwise
        (including for an empty string).
    """
    return bool(body.translate(_COUNT_ONLY_CHARS))
# Keep only the comments that contain something besides the count itself.
mask = relevant_comments['body'].apply(conversation_filter)
filtered = relevant_comments.loc[mask]

# Keep a row only when its author differs from the previous row's author,
# i.e. the first comment of each run by the same user.
mask = filtered['username'] != filtered['username'].shift()
# NOTE(review): the first two remaining rows are skipped; the reason is not
# visible in this file.
final = filtered.loc[mask].reset_index().iloc[2:].copy()
# NOTE(review): this column is timezone-naive while density['date'] below is
# built with utc=True -- confirm the mix is intended.
final['date'] = pd.to_datetime(final['timestamp'], unit='s')

# Gaussian kernel density estimate of the comment timestamps (seconds),
# sampled on a grid aligned to multiples of six hours.
timestamps = final['timestamp']
kernel = scipy.stats.gaussian_kde(timestamps, bw_method=0.01)
interval_width = 6 * 3600
start = timestamps.min() - timestamps.min() % interval_width
# Extend the grid to the first sample at or after the last timestamp.  The
# previous end point, start + (n_intervals + 1) * interval_width, could fall
# short of timestamps.max() and cut the curve off at the right-hand edge.
time = np.arange(start, timestamps.max() + interval_width, interval_width)
density = kernel.evaluate(time)
density = pd.DataFrame.from_records([time, density]).T
density.columns = ['timestamp', 'density']
density['date'] = pd.to_datetime(density['timestamp'], unit='s', utc=True)
density.set_index('date', inplace=True)

# Cluster the comments along the time axis; OPTICS wants a 2-D array, hence
# the reshape to a single column.
clust = OPTICS(min_samples=10, xi=0.05)
clust.fit(final['timestamp'].to_numpy().reshape(-1, 1))
final['cluster_label'] = clust.labels_
# Constant y value so the sampled comments are drawn as ticks along y = 0.
final['position'] = 0

# Density curve scaled so that its maximum is 1.
ax = (density['density'] / density['density'].max()).plot()
# sns.rugplot(data=final, x='date', hue='cluster_label')
# Overlay a random 1% sample of the comments, coloured by cluster label.
final.sample(frac=0.01).plot.scatter(x='date', y='position',
                                     ax=ax,
                                     marker='|',
                                     c='cluster_label', cmap='magma')
ax.set_ylabel('Counting frequency (arbitrary units)')
ax.set_xlabel('')
# Remove the figure's last axes -- presumably the colour bar that the scatter
# call adds for `c=`.
fig = plt.gcf()
cbar = fig.get_axes()[-1]
cbar.remove()
ax.set_title("Misty & TNF's counts in By Your Age")
plt.savefig('plots/age_kde.png', bbox_inches='tight')
def normalise_body(body):
    """Drop the leading count from a comment and tighten its blank lines.

    Any run of digits, commas, spaces and newlines at the very start of
    *body* is removed.  After that, each pair of consecutive newlines is
    replaced by a single newline in one left-to-right pass.

    Args:
        body: The comment text as a ``str``.

    Returns:
        The cleaned-up text; an empty string if nothing but the count
        was present.
    """
    without_count = body.lstrip("0123456789, \n")
    return without_count.replace("\n\n", "\n")
# Strip the leading count from every comment before writing the transcript.
final['new_body'] = final['body'].apply(normalise_body)

# Write one entry per comment: a header line with the author, the time and a
# "(submission_id/_/comment_id)" reference, then the text and a blank line.
# The encoding is fixed to UTF-8 because comment text may contain characters
# the platform's default encoding cannot represent.
with open('tnf-misty-conversation.txt', 'w', encoding='utf-8') as f:
    for row in final.itertuples():
        print(row.username,
              row.date.strftime('%Y-%m-%d %H:%M'),
              f'({row.submission_id}/_/{row.comment_id})',
              end=":\n", file=f)
        print(row.new_body, end="\n\n", file=f)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement