Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from pprint import PrettyPrinter
- from urllib.parse import quote
- import re
- import requests
- import xmltodict
# Matches an inline picture marker such as "(pic: name.jpg)" together with any
# whitespace around it; group 1 captures the file name or URL after "pic: ".
_PIC_PATTERN = re.compile(r'\s*\(pic: ([^)]+)\)\s*')
_IMAGE_BASE_URL = 'https://db.chgk.info/images/db/'
_FIELDS_WITH_IMAGES = ('Comments', 'Question')


def _absolute_image_url(reference):
    """Return *reference* as-is if it starts with 'http', else prefix the image base URL."""
    if reference.startswith('http'):
        return reference
    return _IMAGE_BASE_URL + reference


def search_questions(query='', limit=1, timeout=10):
    """Search db.chgk.info for questions and pull picture markers out of the text.

    For the 'Comments' and 'Question' fields of every result, each
    "(pic: ...)" marker is removed from the text (along with the whitespace
    around it) and the referenced images are stored as a list of URLs under
    '<field>Images' (e.g. 'QuestionImages'). Fields without markers are left
    untouched and get no '<field>Images' key.

    Args:
        query: Search string; it is percent-encoded before being placed in
            the URL path.
        limit: Value substituted after 'limit' in the request URL.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the caller indefinitely.

    Returns:
        A list of result mappings as produced by ``xmltodict`` (a single
        result is wrapped in a list), or ``None`` when the response contains
        no 'question' entry.
    """
    # safe='' also escapes '/', which would otherwise be read as a path
    # separator and change the meaning of the request URL.
    encoded_query = quote(query, safe='')
    url = 'https://db.chgk.info/xml/search/questions/{}/Q/limit{}'.format(encoded_query, limit)
    response = xmltodict.parse(requests.get(url, timeout=timeout).text)

    search = response['search']
    # NOTE(review): an empty <search/> element presumably parses to None
    # rather than a dict -- guard so that case is reported as "no results".
    if not isinstance(search, dict) or 'question' not in search:
        return None

    entities = search['question']
    if not isinstance(entities, list):
        entities = [entities]

    for entity in entities:
        for field in _FIELDS_WITH_IMAGES:
            data = entity.get(field)
            # Skip missing/empty fields and anything that is not plain text.
            if not isinstance(data, str):
                continue
            references = _PIC_PATTERN.findall(data)
            if not references:
                continue
            entity[field + 'Images'] = [_absolute_image_url(ref) for ref in references]
            entity[field] = _PIC_PATTERN.sub('', data).strip()
    return entities
# Demo run: fetch one question matching "pic" and pretty-print the result.
entities = search_questions(query='pic', limit=1)
PrettyPrinter(width=100).pprint(entities)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement