Guest User

Untitled

a guest
Nov 10th, 2021
76
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.32 KB | None | 0 0
  1. # Use this by calling `python3 4chan_parser.py <Thread URL> <Some output filename/filepath>`, e.g.:
  2. # python3 4chan_parser.py https://boards.4channel.org/vt/thread/12765676 12765676.txt
  3.  
  4. import re
  5. import requests
  6. import sys
  7.  
  8.  
  9. URL, OUT_FILENAME = sys.argv[1:3]
  10. parsed_url = requests.utils.urlparse(URL)
  11. API_URL = f'https://a.4cdn.org{parsed_url.path}.json'
  12.  
  13. response = requests.get(API_URL)
  14. json_posts = response.json()['posts']
  15.  
  16. def parse_post(post):
  17.     com = post.get('com')
  18.     if not com:
  19.         return ""
  20.     for quote in re.findall(r'\<span class=\"(?:quote|deadlink)\".*?&gt;(.*?)\<\/span\>', com):
  21.         com = re.sub(r'\<span class=\"(?:quote|deadlink)\".*?\<\/span\>', f'>{quote}', com)
  22.     for post_number in re.findall(r';&gt;(\d*)\<\/a\>', com):
  23.         com = re.sub(r'\<a.*?\<\/a\>', f'>>{post_number}', com)
  24.  
  25.     # TODO: Is there some sort of HTML element converter out there?
  26.     sub_patterns = (
  27.         (r'\<br\>', "\n"),
  28.         (r'\<wbr\>', ""),
  29.         (r'&#039;', "'"),
  30.         (r'&quot;', "\""),
  31.         (r'\<\/?s\>', "")
  32.     )
  33.     for pattern, replacement in sub_patterns:
  34.         com = re.sub(pattern, replacement, com)
  35.     return f"{com}"
  36.    
  37. parsed_posts = [parse_post(post) + '\n' + '-'*80 + '\n' for post in json_posts]
  38.  
  39. with open(OUT_FILENAME, 'w') as f:
  40.     f.writelines(parsed_posts)
Advertisement
Add Comment
Please, Sign In to add comment