Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Use this by calling `python3 4chan_parser.py <Thread URL> <Some output filename/filepath>`, e.g.:
# python3 4chan_parser.py https://boards.4channel.org/vt/thread/12765676 12765676.txt
import html
import re
import sys

import requests
# --- Fetch the thread's JSON from the read-only API --------------------------
if len(sys.argv) < 3:
    # Unpacking sys.argv[1:3] with fewer than two arguments would otherwise
    # fail with an opaque "not enough values to unpack" ValueError.
    sys.exit(f'Usage: python3 {sys.argv[0]} <Thread URL> <output file>')
URL, OUT_FILENAME = sys.argv[1:3]
parsed_url = requests.utils.urlparse(URL)
# Thread URLs may carry a trailing slash or a title slug after the thread
# number; the JSON endpoint only accepts /<board>/thread/<number>.
thread_path = re.match(r'/\w+/thread/\d+', parsed_url.path)
if thread_path is None:
    sys.exit(f'Not a thread URL: {URL}')
API_URL = f'https://a.4cdn.org{thread_path.group(0)}.json'
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(API_URL, timeout=30)
# Fail with the HTTP error (e.g. 404 for a pruned thread) instead of a
# confusing JSON decode error further down.
response.raise_for_status()
json_posts = response.json()['posts']
def parse_post(post):
    """Convert one post's HTML comment into plain text.

    Args:
        post: A mapping for a single post; only its ``'com'`` entry is read.
            Assumes ``'com'`` holds the HTML comment body as served by the
            thread JSON -- TODO confirm against the API documentation.

    Returns:
        The comment as plain text, or ``""`` when the post has no comment.
    """
    com = post.get('com')
    if not com:
        return ""

    # Each substitution rewrites every match with *its own* captured text.
    # (Substituting inside a loop over re.findall() results rewrote all
    # matches with the first match's text.)
    tag_rewrites = (
        # Greentext and dead quote links: keep the inner text only.
        (r'<span class="(?:quote|deadlink)"[^>]*>(.*?)</span>', r'\1'),
        # Anchors (quote links): keep the link text, e.g. ">>12345".
        (r'<a\b[^>]*>(.*?)</a>', r'\1'),
        (r'<br>', "\n"),
        (r'<wbr>', ""),
        # Spoiler tags are dropped, their content is kept.
        (r'</?s>', ""),
    )
    for pattern, replacement in tag_rewrites:
        com = re.sub(pattern, replacement, com)

    # Decode entities (&gt;, &#039;, &quot;, ...) last, so text that merely
    # *looks* like a tag once decoded is never treated as markup above.
    return html.unescape(com)
# --- Write the plain-text dump ------------------------------------------------
# One entry per post: the plain-text comment followed by an 80-dash rule.
separator = '-' * 80
parsed_posts = [f'{parse_post(post)}\n{separator}\n' for post in json_posts]
# Explicit UTF-8: the platform's default encoding may be unable to represent
# non-ASCII characters in posts, which would raise UnicodeEncodeError.
with open(OUT_FILENAME, 'w', encoding='utf-8') as f:
    f.writelines(parsed_posts)
Advertisement
Add Comment
Please, Sign In to add comment