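# Build a word cloud for every 4chan board: scrape recent and archived threads,
# count word frequencies, normalize against everyday English (Brown corpus),
# and save one PNG per board.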
import requests
from html2text import html2text
from nltk.probability import FreqDist
from nltk.corpus import brown
from collections import Counter
from pickle import dump, load, UnpicklingError
from wordcloud import WordCloud
from nltk.tokenize import TweetTokenizer
import matplotlib.pyplot as plt
import os
import time
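
# Added (not in the original paste): fetch the Brown corpus on first run,
# since FreqDist(brown.words()) below raises a LookupError without it.
import nltk
try:
    nltk.data.find('corpora/brown')
except LookupError:
    nltk.download('brown')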
# See https://github.com/4chan/4chan-API for more info
tokenizer = TweetTokenizer()
for board in requests.get('https://a.4cdn.org/boards.json').json()['boards']:
    try:
        print('Board: {} ({})'.format(board['board'], board['title']))
        try:
            # Load cached word counts for this board, if available
            with open('{}.counts'.format(board['board']), 'rb') as file:
                counter = load(file)
        except (OSError, EOFError, UnpicklingError):
            # No usable cache: collect thread IDs from the live pages and,
            # where present, the archive
            threads = [
                thread['no']
                for page in requests.get('https://a.4cdn.org/{}/threads.json'.format(board['board'])).json()
                for thread in page['threads']
            ]
            # Some boards (e.g. /b/) have no archive, so archive.json returns a 404
            archive = requests.get('https://a.4cdn.org/{}/archive.json'.format(board['board']))
            if archive.ok:
                threads += archive.json()
            threads = threads[:200]  # Limit to 200 threads at most (faster)
            # Count words in every thread and every post
            counter = Counter()
            for index, thread in enumerate(threads):
                print('Thread {} ({} / {})'.format(thread, index + 1, len(threads)))
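                time.sleep(1)  # Added: the API rules ask for at most one request per second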
                for post in requests.get('https://a.4cdn.org/{}/thread/{}.json'.format(board['board'], thread)).json()['posts']:
                    # 'sub' is the post subject, 'com' the comment body; both are HTML
                    for field in ('sub', 'com'):
                        if field in post:
                            text = html2text(post[field]).lower()
                            counter.update(token for token in tokenizer.tokenize(text) if token.isalpha())
            # Store word counts for this board in a file
            with open('{}.counts'.format(board['board']), 'wb') as file:
                dump(counter, file)
        # Create the word cloud and save it if it doesn't exist yet
        if not os.path.isfile('{}.png'.format(board['board'])):
            # Downweight everyday English: divide each count by the word's
            # background frequency in the Brown corpus (plus one to avoid
            # division by zero), so board-specific vocabulary stands out
            freq = FreqDist(word.lower() for word in brown.words())
            for word in counter:
                counter[word] /= 1 + freq[word]
            del counter[board['board']]  # Remove the board name itself
            for word in tokenizer.tokenize(board['title'].lower()):
                del counter[word]  # Remove other keywords from the board title
            # fit_words() accepts any word -> weight mapping; the WordCloud
            # converts to an image array, which plt.imsave() can write directly
            cloud = WordCloud(scale=10, max_words=2000).fit_words(counter)
            plt.imsave('{}.png'.format(board['board']), cloud)
    except Exception as e:
        print('ERROR: {}'.format(e))
        continue  # Error occurred, skip this board for now
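
# A note on dependencies (not part of the original paste): the script expects
# the requests, html2text, nltk, wordcloud, and matplotlib packages, e.g.:
#   pip install requests html2text nltk wordcloud matplotlib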