Advertisement
arch239

Untitled

Dec 10th, 2023 (edited)
613
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.38 KB | None | 0 0
  1. from pprint import PrettyPrinter
  2. from urllib.parse import quote
  3.  
  4. import re
  5. import requests
  6. import xmltodict
  7.  
  8. def search_questions(query='', limit=1):
  9.     query = quote(query)
  10.     url = 'https://db.chgk.info/xml/search/questions/{}/Q/limit{}'.format(query, limit)
  11.     response = xmltodict.parse(requests.get(url).text)
  12.  
  13.     if 'question' not in response['search']:
  14.         return
  15.  
  16.     entities = response['search']['question']
  17.     if not isinstance(entities, list):
  18.         entities = [entities]
  19.  
  20.     for entity in entities:
  21.         for field in ['Comments', 'Question']:
  22.             data = entity[field]
  23.             if data is None:
  24.                 continue
  25.  
  26.             regex = r'\s*\(pic: [^)]+\)\s*'
  27.             matches = re.findall(regex, data)
  28.             if not matches:
  29.                 continue
  30.  
  31.             images = []
  32.             image_base_url = 'https://db.chgk.info/images/db/'
  33.             regex_group = r'\s*\(pic: ([^)]+)\)\s*'
  34.             for match in matches:
  35.                 url = re.match(regex_group, match)[1]
  36.                 if not url.startswith('http'):
  37.                     url = image_base_url + url
  38.                 images.append(url)
  39.  
  40.             entity[field + 'Images'] = images
  41.             entity[field] = re.sub(regex, '', data).strip()
  42.    
  43.     return entities
  44.  
  45. entities = search_questions('pic', 1)
  46. PrettyPrinter(width=100).pprint(entities)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement