Advertisement
Guest User

4chan word clouds

a guest
Feb 27th, 2018
377
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.83 KB | None | 0 0
  1. import requests
  2. from html2text import html2text
  3. from nltk.probability import FreqDist
  4. from nltk.corpus import brown
  5. from collections import Counter
  6. from pickle import dump, load
  7. from wordcloud import WordCloud
  8. from nltk.tokenize import TweetTokenizer
  9. import matplotlib.pyplot as plt
  10.  
  # Short name of the board to analyse, as it appears in 4chan URLs.
  board = 'sci'
  # See https://github.com/4chan/4chan-API for more info

  # Imported here because only this section of the script needs them.
  import time
  from pickle import UnpicklingError

  _API_URL = 'https://a.4cdn.org/{}/{}.json'
  _REQUEST_TIMEOUT = 30  # seconds; without it a stalled connection hangs forever
  _REQUEST_DELAY = 1     # seconds; the API rules allow at most one request per second


  def _fetch_json(path):
      """Return the decoded JSON document at *path* on the current board.

      Sleeps before each request to respect the API rate limit and raises
      ``requests.HTTPError`` on a non-2xx response instead of trying to parse
      an error page as JSON.
      """
      time.sleep(_REQUEST_DELAY)
      response = requests.get(_API_URL.format(board, path), timeout=_REQUEST_TIMEOUT)
      response.raise_for_status()
      return response.json()


  try:
      # Load word counts for this board from a file, if available.
      # NOTE: unpickling can execute arbitrary code; only load cache files
      # that this script wrote itself.
      with open(board, 'rb') as file:
          counter = load(file)
  except (OSError, EOFError, UnpicklingError):
      # Missing, unreadable or truncated cache: rebuild it below.
      counter = None

  if counter is None:
      # Create list of thread IDs: live threads first, then archived ones.
      threads = [
          thread['no']
          for page in _fetch_json('threads')
          for thread in page['threads']
      ]
      try:
          threads += _fetch_json('archive')
      except requests.HTTPError as error:
          # Presumably the board has no archive; carry on with live threads only.
          print('No archive for /{}/: {}'.format(board, error))

      # Count words in every thread and every post
      counter = Counter()
      tokenizer = TweetTokenizer()
      for index, thread in enumerate(threads, start=1):
          print('Thread {} ({} / {})'.format(thread, index, len(threads)))
          try:
              posts = _fetch_json('thread/{}'.format(thread))['posts']
          except (requests.RequestException, ValueError, KeyError) as error:
              # A thread may vanish between listing and fetching; skip it
              # rather than throwing away everything counted so far.
              print('  skipped: {}'.format(error))
              continue
          for post in posts:
              # 'sub' is the subject line, 'com' the comment body (both HTML).
              for field in ('sub', 'com'):
                  if field in post:
                      text = html2text(post[field]).lower()
                      counter.update(
                          token for token in tokenizer.tokenize(text) if token.isalpha()
                      )

      # Store word counts for this board in a file
      with open(board, 'wb') as file:
          dump(counter, file)
  43.  
  # Build a lower-cased word-frequency table from the Brown corpus, then divide
  # every board count by (1 + reference count) so that everyday English words
  # are damped. Words absent from the corpus keep their raw count.
  freq = FreqDist(map(str.lower, brown.words()))
  for word, count in list(counter.items()):
      counter[word] = count / (1 + freq[word])
  48.  
  # Render the weighted words as a word cloud and display it without axes.
  cloud = WordCloud(max_words=2000, scale=8)
  cloud.fit_words(counter)

  plt.imshow(cloud, interpolation='bilinear')
  plt.axis('off')
  plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement