Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import itertools
import re
from io import StringIO
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import requests
def get_content(
    url: str,
    cookies: Optional[Dict[str, str]] = None,
    headers: Optional[Dict[str, str]] = None,
    timeout: Optional[float] = 30.0,
) -> str:
    """Fetch *url* with an HTTP GET request and return the body as text.

    Args:
        url: Address to request.
        cookies: Optional cookies sent with the request.
        headers: Optional HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Pass
            ``None`` to wait indefinitely.

    Returns:
        The decoded response body. The HTTP status code is not checked, so
        the body of an error page is returned as-is.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
    """
    # Without an explicit timeout a single stalled connection would block
    # the whole scrape forever.
    response = requests.get(url, cookies=cookies, headers=headers, timeout=timeout)
    return response.text
def get_regions_data(resp: str) -> List[Tuple[str, str]]:
    """Extract result-page links and their link texts from an HTML page.

    Args:
        resp: HTML text to search.

    Returns:
        A list of ``(url, link_text)`` tuples, one per anchor whose ``href``
        starts with the izbirkom ``action=show`` address, in document order.
        URLs are returned exactly as they appear in the markup (for example
        with ``&amp;`` still escaped). The list is empty when nothing matches.
    """
    # Raw string: "\?" in a normal string literal is an invalid escape
    # sequence. Dots in the host name are escaped so they match literally.
    return re.findall(
        r'href="(http://www\.vybory\.izbirkom\.ru/region/region/izbirkom\?action=show.*?)">(.*?)<',
        resp,
    )
def to_frame(response: str) -> pd.DataFrame:
    """Convert one results page (HTML) into a DataFrame with one row per column of the results table.

    The page layout is addressed purely by table position, so this only
    works for pages with the expected structure:

    * table 0, cell (0, 0): a breadcrumb of exactly three parts separated
      by ``' > '``; the middle part is used as the region name;
    * table 3, cell (1, 1): the sub-region name;
    * table 6, column 1 (first row skipped): labels for the result rows;
    * last table: first row holds the column labels (presumably precinct,
      "УИК", identifiers -- confirm against a live page), remaining rows
      hold the values.

    Args:
        response: HTML text of the page.

    Returns:
        A DataFrame with a ``'УИК'`` column, one integer column per result
        row label, and constant ``SUB_REGION`` / ``REG`` columns.

    Raises:
        ValueError, IndexError, KeyError: When the page does not have the
            expected structure or contains non-numeric values.
    """
    # Wrap the text in a file-like object: passing literal HTML to
    # read_html is deprecated in recent pandas versions.
    tables = pd.read_html(StringIO(response))

    # Only the middle breadcrumb part is needed; unpacking still enforces
    # that the breadcrumb has exactly three parts.
    _, region, _ = tables[0].iat[0, 0].split(' > ')
    sub_region = tables[3].iat[1, 1]

    raw = tables[-1]
    row_labels = tables[6].iloc[1:, 1]

    return (
        raw.iloc[1:]
        .set_axis(row_labels, axis=0)
        .set_axis(raw.iloc[0], axis=1)
        # The question row is dropped before the numeric conversion below.
        .drop(index='Вы одобряете изменения в Конституцию Российской Федерации?')
        # The last column is discarded.
        .iloc[:, :-1]
        .T
        .assign(
            **{
                # Keep only the text before the first space in these cells.
                'ДА': lambda x: x['ДА'].str.split(' ').str[0],
                'НЕТ': lambda x: x['НЕТ'].str.split(' ').str[0],
            }
        )
        # Builtin int: the ``np.int`` alias was removed from NumPy.
        .astype(int)
        .assign(SUB_REGION=sub_region, REG=region)
        # The former column labels carry the axis name 0 (taken from row 0
        # of the last table), so reset_index yields a column named 0.
        .reset_index()
        .rename({0: 'УИК'}, axis=1)
    )
# Obtain session cookies from the site first, then paste their values below.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'}
cookies = {
    '__utma': '252441553.597083863.1593662989.1593662989.1593667178.2',
    '__utmc': '252441553',
    '__utmz': '252441553.1593662989.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
    'izbirkomSession': '47956e9f-0bed-403e-9409-22824cb90d13',
    'JSESSIONID': 'e8b50acdcf0776605c6de30b5199',
    '__utmb': '252441553.65.10.1593667178',
}


def get_content_partial(url):
    """Fetch *url* with the module-level ``headers`` and ``cookies``."""
    return get_content(url, headers=headers, cookies=cookies)


# "&region=0" is spelled out explicitly: the HTML-rendered copy of this URL
# had "&reg" collapsed into the "®" character, which corrupts the query.
start_url = 'http://www.vybory.izbirkom.ru/region/region/izbirkom?action=show&root=1&tvd=100100163596969&vrn=100100163596966&region=0&global=1&sub_region=0&prver=0&pronetvd=null&vibid=100100163596969&type=465'
full_content = get_content_partial(start_url)

# Frames are collected and concatenated once at the end; concatenating
# inside the loop copies the accumulated data on every iteration.
frames = []
for region_url, region_name in get_regions_data(full_content):
    print(f'Обработка региона: {region_name}')
    # Links are scraped from raw HTML, so "&" arrives escaped as "&amp;".
    # html.unescape is deliberately not used: it would also turn "&region"
    # into "®ion".
    region_content = get_content_partial(region_url.replace('&amp;', '&'))
    region_data = get_regions_data(region_content)
    print(f'Всего {len(region_data)} районов')
    for subregion_url, subregion_name in region_data:
        subregion_content = get_content_partial(subregion_url.replace('&amp;', '&'))
        try:
            frame = to_frame(subregion_content)
        except Exception as ex:
            # Best effort: report the page that could not be parsed and move
            # on. Skipping here prevents appending an undefined or stale
            # frame left over from a previous iteration.
            print(f'{region_name}: {subregion_name}', ex)
            continue
        frames.append(frame)

# pd.concat raises on an empty list, so fall back to an empty frame.
result = pd.concat(frames, ignore_index=True, axis=0) if frames else pd.DataFrame()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement