Advertisement
Guest User

4chan word clouds (v2)

a guest
Mar 1st, 2018
326
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.62 KB | None | 0 0
  1. import requests
  2. from html2text import html2text
  3. from nltk.probability import FreqDist
  4. from nltk.corpus import brown
  5. from collections import Counter
  6. from pickle import dump, load
  7. from wordcloud import WordCloud
  8. from nltk.tokenize import TweetTokenizer
  9. import matplotlib.pyplot as plt
  10. import os
  11.  
  12. # See https://github.com/4chan/4chan-API for more info
  13.  
  14. tokenizer = TweetTokenizer()
  15.  
  16. for board in requests.get('https://a.4cdn.org/boards.json').json()['boards']:
  17.   try:
  18.     print('Board: {} ({})'.format(board['board'], board['title']))
  19.  
  20.     try:
  21.       # Load word counts for this board from a file, if available
  22.       with open('{}.counts'.format(board['board']), 'rb') as file:
  23.         counter = load(file)
  24.  
  25.     except:
  26.       # Create list of thread IDs
  27.       threads = [
  28.         thread['no']
  29.         for page in requests.get('https://a.4cdn.org/{}/threads.json'.format(board['board'])).json()
  30.         for thread in page['threads']
  31.       ] + requests.get('https://a.4cdn.org/{}/archive.json'.format(board['board'])).json()
  32.      
  33.       threads = threads[:200] # Limit to last 200 threads at most (faster)
  34.  
  35.       # Count words in every thread and every post
  36.       counter = Counter()
  37.       for index, thread in enumerate(threads):
  38.         print('Thread {} ({} / {})'.format(thread, index, len(threads)))
  39.         for post in requests.get('https://a.4cdn.org/{}/thread/{}.json'.format(board['board'], thread)).json()['posts']:
  40.           if 'sub' in post:
  41.             text = html2text(post['sub']).lower()
  42.             counter.update(token for token in tokenizer.tokenize(text) if token.isalpha())
  43.           if 'com' in post:
  44.             text = html2text(post['com']).lower()
  45.             counter.update(token for token in tokenizer.tokenize(text) if token.isalpha())
  46.      
  47.       # Store word counts for this board in a file
  48.       with open('{}.counts'.format(board['board']), 'wb') as file:
  49.         dump(counter, file)
  50.  
  51.     # Create word cloud and save it if it doesn't exist
  52.     if not os.path.isfile('{}.png'.format(board['board'])):
  53.  
  54.       # Reference frequencies of English words
  55.       freq = FreqDist(word.lower() for word in brown.words())
  56.       for word in counter:
  57.         counter[word] /= 1 + freq[word]
  58.  
  59.       del counter[board['board']] # Remove board name
  60.       for word in tokenizer.tokenize(board['title'].lower()):
  61.         del counter[word] # Remove other keywords
  62.  
  63.       cloud = WordCloud(scale=10, max_words=2000).fit_words(counter)
  64.       plt.imsave('{}.png'.format(board['board']), cloud)
  65.  
  66.   except Exception as e:
  67.     print('ERROR: {}'.format(e))
  68.     continue # Error occurred, skip this board for now
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement