Advertisement
Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
- import requests
- from html2text import html2text
- from nltk.probability import FreqDist
- from nltk.corpus import brown
- from collections import Counter
- from pickle import dump, load
- from wordcloud import WordCloud
- from nltk.tokenize import TweetTokenizer
- import matplotlib.pyplot as plt
board = 'sci'
# See https://github.com/4chan/4chan-API for more info
try:
    # Load cached word counts for this board from a previous run.
    # NOTE(review): unpickling is only acceptable because this file is
    # written locally by this script below; never load untrusted pickles.
    with open(board, 'rb') as file:
        counter = load(file)
except (FileNotFoundError, EOFError):
    # Narrowed from a bare `except:` — only a missing or truncated cache
    # should trigger a rebuild; other errors must propagate.
    # Collect every live thread ID plus the archived ones.
    threads = [
        thread['no']
        for page in requests.get('https://a.4cdn.org/{}/threads.json'.format(board), timeout=30).json()
        for thread in page['threads']
    ] + requests.get('https://a.4cdn.org/{}/archive.json'.format(board), timeout=30).json()
    # Count words in every thread and every post.
    counter = Counter()
    tokenizer = TweetTokenizer()
    for index, thread in enumerate(threads):
        print('Thread {} ({} / {})'.format(thread, index, len(threads)))
        # `timeout=` keeps a stalled connection from hanging the whole scrape.
        for post in requests.get('https://a.4cdn.org/{}/thread/{}.json'.format(board, thread), timeout=30).json()['posts']:
            # A post may carry a subject ('sub') and/or a comment ('com');
            # both are HTML, so strip markup before tokenizing.
            for field in ('sub', 'com'):
                if field in post:
                    text = html2text(post[field]).lower()
                    counter.update(token for token in tokenizer.tokenize(text) if token.isalpha())
    # Store word counts for this board in a file for later runs.
    with open(board, 'wb') as file:
        dump(counter, file)
# Down-weight tokens that are frequent in general English (Brown corpus)
# so the cloud emphasizes vocabulary distinctive to this board.
brown_freq = FreqDist(w.lower() for w in brown.words())
for token in counter:
    counter[token] /= 1 + brown_freq[token]
# Build the word cloud from the adjusted frequencies and display it.
cloud = WordCloud(scale=8, max_words=2000).fit_words(counter)
plt.imshow(cloud, interpolation='bilinear')
plt.axis('off')
plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement