Advertisement
Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
- import requests
- from html2text import html2text
- from nltk.probability import FreqDist
- from nltk.corpus import brown
- from collections import Counter
- from pickle import dump, load
- from wordcloud import WordCloud
- from nltk.tokenize import TweetTokenizer
- import matplotlib.pyplot as plt
board = 'sci'
# See https://github.com/4chan/4chan-API for more info
try:
    # Load cached word counts for this board from a previous run.
    # NOTE(review): unpickling is only acceptable because this file is
    # written locally by this script below; never load untrusted pickles.
    with open(board, 'rb') as file:
        counter = load(file)
except (FileNotFoundError, EOFError):
    # Narrowed from a bare `except:` — only a missing or truncated cache
    # should trigger a rebuild; other errors must propagate.
    # Collect every live thread ID plus the archived ones.
    threads = [
        thread['no']
        for page in requests.get('https://a.4cdn.org/{}/threads.json'.format(board), timeout=30).json()
        for thread in page['threads']
    ] + requests.get('https://a.4cdn.org/{}/archive.json'.format(board), timeout=30).json()
    # Count words in every thread and every post.
    counter = Counter()
    tokenizer = TweetTokenizer()
    for index, thread in enumerate(threads):
        print('Thread {} ({} / {})'.format(thread, index, len(threads)))
        # `timeout=` keeps a stalled connection from hanging the whole scrape.
        for post in requests.get('https://a.4cdn.org/{}/thread/{}.json'.format(board, thread), timeout=30).json()['posts']:
            # A post may carry a subject ('sub') and/or a comment ('com');
            # both are HTML, so strip markup before tokenizing.
            for field in ('sub', 'com'):
                if field in post:
                    text = html2text(post[field]).lower()
                    counter.update(token for token in tokenizer.tokenize(text) if token.isalpha())
    # Store word counts for this board in a file for later runs.
    with open(board, 'wb') as file:
        dump(counter, file)
# Down-weight tokens that are frequent in general English (Brown corpus)
# so the cloud emphasizes vocabulary distinctive to this board.
brown_freq = FreqDist(w.lower() for w in brown.words())
for token in counter:
    counter[token] /= 1 + brown_freq[token]
# Build the word cloud from the adjusted frequencies and display it.
cloud = WordCloud(scale=8, max_words=2000).fit_words(counter)
plt.imshow(cloud, interpolation='bilinear')
plt.axis('off')
plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement