Advertisement
cobibh

Extracting conversation from your age thread

Jan 8th, 2022
94
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.88 KB | None | 0 0
  1. import re
  2. import sqlite3
  3.  
  4. import pandas as pd
  5. import numpy as np
  6. import scipy.stats
  7. from sklearn.cluster import OPTICS
  8.  
  9. import matplotlib.pyplot as plt
  10.  
  11. plt.style.use("seaborn")
  12.  
  13. db = sqlite3.connect('data/age.sqlite')
  14.  
  15. df = pd.read_sql_query("select comments.* "
  16.                        "from comments join submissions on "
  17.                        "comments.submission_id = submissions.submission_id "
  18.                        "order by submissions.integer_id, comments.position",
  19.                        db)
  20.  
  21. counters = ['TheNitromeFan', 'mistyskye14']
  22. relevant_comments = df.query("username in @counters")
  23.  
  24.  
  25. def conversation_filter(body):
  26.     translation = str.maketrans("", "", "0123456789, ")
  27.     return bool(body.translate(translation))
  28.  
  29.  
  30. mask = relevant_comments['body'].apply(conversation_filter)
  31. filtered = relevant_comments.loc[mask]
  32.  
  33. mask = filtered['username'] != filtered['username'].shift()
  34. final = filtered.loc[mask].reset_index().iloc[2:].copy()
  35. final['date'] = pd.to_datetime(final['timestamp'], unit='s')
  36.  
  37. timestamps = final['timestamp']
  38. kernel = scipy.stats.gaussian_kde(timestamps, bw_method=0.01)
  39. interval_width = 6 * 3600
  40. n_intervals = (timestamps.max() - timestamps.min()) // interval_width
  41. start = timestamps.min() - timestamps.min() % interval_width
  42. time = np.arange(start,
  43.                  start + (n_intervals + 1) * interval_width,
  44.                  interval_width)
  45. density = kernel.evaluate(time)
  46. density = pd.DataFrame.from_records([time, density]).T
  47. density.columns = ['timestamp', 'density']
  48. density['date'] = pd.to_datetime(density['timestamp'], unit='s', utc=True)
  49. density.set_index('date', inplace=True)
  50.  
  51. clust = OPTICS(min_samples=10, xi=0.05)
  52. fit = clust.fit(final['timestamp'].to_numpy().reshape(-1, 1))
  53.  
  54. final['cluster_label'] = clust.labels_
  55. final['position'] = 0
  56.  
  57. ax = (density['density'] / density['density'].max()).plot()
  58. # sns.rugplot(data=final, x='date', hue='cluster_label')
  59. final.sample(frac=0.01).plot.scatter(x='date', y='position',
  60.                                      ax=ax,
  61.                                      marker='|',
  62.                                      c='cluster_label', cmap='magma')
  63. ax.set_ylabel('Counting frequency (arbitrary units)')
  64. ax.set_xlabel('')
  65. fig = plt.gcf()
  66. cbar = fig.get_axes()[-1]
  67. cbar.remove()
  68. ax.set_title("Misty & TNF's counts in By Your Age")
  69. plt.savefig('plots/age_kde.png', bbox_inches='tight')
  70.  
  71.  
  72. def normalise_body(body):
  73.     regex = re.compile(r"^[0-9, \n]*")
  74.     return re.sub(regex, "", body).replace("\n\n", "\n")
  75.  
  76.  
  77. final['new_body'] = final['body'].apply(normalise_body)
  78.  
  79. with open('tnf-misty-conversation.txt', 'w') as f:
  80.     for row in final.itertuples():
  81.         print(row.username,
  82.               row.date.strftime('%Y-%m-%d %H:%M'),
  83.               f'({row.submission_id}/_/{row.comment_id})',
  84.               end=":\n", file=f)
  85.         print(row.new_body, end="\n\n", file=f)
  86.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement