# Untitled

import json
import os
import time
from random import choice

import requests
from bs4 import BeautifulSoup

##########################
# Directory (relative to the working directory) into which req() writes the
# raw HTML of every page it downloads.
FOLDER_NAME_HTML = 'html_kadastres'

##########################

# Browser-like request headers sent with every HTTP request made by this script.
# NOTE(review): 'br' is advertised in accept-encoding; requests can only decode
# Brotli responses when a Brotli package is installed — confirm, or drop 'br'.
SEARCH_HEAD = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'ru,en;q=0.9',
    'cache-control': 'no-cache',
    'dnt': '1',
    'pragma': 'no-cache',
    # Was 'https:/kadbase.ru' (single slash), which is not a valid URL.
    'referer': 'https://kadbase.ru/',
    'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
}
def getProxy():
    """Build a ``proxies`` mapping for ``requests`` from a randomly picked host.

    Returns:
        dict: ``'http'`` and ``'https'`` keys, each mapped to a proxy URL of
        the same scheme on port 8000 with the ``login:pass`` credentials.
    """
    # Pool of proxy hosts; one of them is drawn at random on every call.
    candidate_hosts = [
        '192.168.0.1',
    ]
    host = choice(candidate_hosts)
    # NOTE(review): the 'https' entry uses an https:// proxy URL, i.e. the
    # connection to the proxy itself would be TLS — confirm this is intended.
    return {
        scheme: f'{scheme}://login:pass@{host}:8000'
        for scheme in ('http', 'https')
    }
def getKadArr():
    """Download the cadastre index page and turn it into a nested list.

    Every ``<a>`` inside ``<ul class="ul_list_open">`` becomes one top-level
    entry.  Its ``number`` attribute is used as the ``id`` of the ``<div>``
    that holds the nested links; each of those links becomes an item of the
    entry's ``'in'`` list, except links whose ``href`` contains
    ``01:00-uslovnyj-kadastrovyj-rajon``, which are skipped.

    The result is also written to ``kadastres.json`` in the working directory.

    Returns:
        list[dict]: entries with the keys ``'href'``, ``'number'``, ``'text'``
        and ``'in'`` (a list of dicts with ``'href'``, ``'number'``, ``'text'``).

    Raises:
        SystemExit: if the page cannot be downloaded or the expected
            ``<ul class="ul_list_open">`` element is not present.
    """
    try:
        page = requests.get(
            'https://kadbase.ru/cadastre/',
            headers=SEARCH_HEAD,
            proxies=getProxy(),
            timeout=30,  # without a timeout a stalled connection hangs forever
        )
    except requests.RequestException as e:
        print(str(e))
        # Non-zero status so that callers/shells can tell the run failed.
        raise SystemExit(1) from e

    soup = BeautifulSoup(page.text, 'html.parser')
    list_kadastres = soup.find('ul', {'class': 'ul_list_open'})
    if list_kadastres is None:
        print('Index list <ul class="ul_list_open"> not found in the response')
        raise SystemExit(1)

    answer = []
    for row in list_kadastres.find_all('a'):
        print(row)
        tab_lists = soup.find('div', {'id': str(row['number'])})
        # An entry without a matching <div> simply has no nested links.
        nested_links = tab_lists.find_all('a') if tab_lists is not None else []
        ins = []
        for link in nested_links:
            print(link)
            if '01:00-uslovnyj-kadastrovyj-rajon' in link['href']:
                continue
            ins.append({
                'href': link['href'],
                'number': link['number'],
                'text': link.text,
            })
        answer.append({
            'href': row['href'],
            'number': row['number'],
            'text': row.text,
            'in': ins,
        })

    with open('kadastres.json', 'w', encoding='utf-8') as f:
        json.dump(answer, f, ensure_ascii=False, indent=4)
    return answer


def req(URL, page_index, is_start=1, index=0):
    """Fetch one listing page, save its HTML and extract the matching entries.

    The page is requested through a proxy from ``getProxy()``.  If the
    response contains the "IP blocked" notice, the request is repeated (each
    attempt asks ``getProxy()`` again) until more than five blocked responses
    have been seen.  The HTML is written to ``FOLDER_NAME_HTML`` on a
    best-effort basis: a failure to save is printed and does not stop parsing.

    Args:
        URL: Full URL of the page to fetch.
        page_index: Not used by this function; kept so existing callers that
            pass it keep working.
        is_start: Pass ``0`` to additionally parse the page count from the
            ``total_records`` element; with any other value (the default)
            the second return value is ``None``.
        index: Number of blocked responses already seen for this URL.

    Returns:
        tuple: ``(texts, limit)`` where ``texts`` is the text of every
        ``<div class="kad_num" attrth="Кадастровый квартал">`` on the page and
        ``limit`` is ``page_count + 1`` when ``is_start == 0`` (``page_count``
        falls back to 1 when it cannot be parsed), otherwise ``None``.

    Raises:
        SystemExit: if the request fails or the IP stays blocked.
    """
    blocked_count = index
    while True:
        try:
            page = requests.get(
                URL,
                headers=SEARCH_HEAD,
                proxies=getProxy(),
                timeout=30,  # without a timeout a stalled connection hangs forever
            )
        except requests.RequestException as e:
            print("ERR", URL, e)
            raise SystemExit(1) from e
        if 'Ваш IP-адрес заблокирован.' not in page.text:
            break
        print('Ваш IP-адрес заблокирован.')
        blocked_count += 1
        if blocked_count > 5:
            raise SystemExit(1)

    # Flatten the URL into a file name: '/', ':' and '?' are replaced because
    # they are path separators or not allowed in file names on some systems.
    page_name = (
        URL.replace('https://kadbase.ru/cadastre/', '')
        .replace('/', '_')
        .replace(':', '#')
        .replace('?', '&')
    )
    htmlfilename = os.path.join(FOLDER_NAME_HTML, page_name + '.html')
    try:
        os.makedirs(FOLDER_NAME_HTML, exist_ok=True)
        # Explicit UTF-8: the pages contain Cyrillic text, which the platform
        # default encoding may be unable to represent.
        with open(htmlfilename, 'w', encoding='utf-8') as f:
            f.write(page.text)
    except (OSError, UnicodeError) as e:
        # Saving the raw HTML is best-effort; keep going with the parsing.
        print(str(e))

    soup = BeautifulSoup(page.text, 'html.parser')
    list_kadastres_array = [
        el.text
        for el in soup.find_all(
            'div', {'class': 'kad_num', 'attrth': 'Кадастровый квартал'}
        )
    ]
    if is_start != 0:
        return list_kadastres_array, None

    try:
        total_records = int(
            soup.find('div', {'class': 'total_records'})
            .text.split('(страниц ')[1]
            .split(')')[0]
        )
    except (AttributeError, IndexError, ValueError):
        # Element missing, marker text missing, or count not a number.
        total_records = 1
    return list_kadastres_array, total_records + 1

def parserPage(URL):
    """Collect the entries from every page of one paginated listing.

    The bare ``URL`` is fetched first with ``is_start=0`` so that ``req()``
    also reports the page limit; the remaining pages are then fetched as
    ``URL?page=<i>`` and their entries appended in order.

    Args:
        URL: URL of the listing without a ``page`` query parameter.

    Returns:
        list[str]: entry texts from all fetched pages, in fetch order.
    """
    list_kadastres_array, total_records = req(URL, 0, 0)
    # NOTE(review): req() returns page_count + 1, so this requests
    # ?page=1 .. ?page=<page_count>.  If ?page=1 is the same page as the bare
    # URL, the first page is fetched twice — confirm against the site.
    for i in range(1, total_records):
        tmp, _ = req(URL + '?page=' + str(i), i)
        list_kadastres_array.extend(tmp)
        # Pause between consecutive requests (presumably to stay under the
        # site's rate limit — confirm).
        time.sleep(2)
    return list_kadastres_array

# Script driver: download the index (getKadArr() also rewrites kadastres.json),
# then walk every nested entry and fetch all pages of its listing.
a = getKadArr()
# Disabled alternative for re-runs: load the index saved by an earlier run
# instead of downloading it again.
# f = open('kadastres.json', 'r', encoding='utf-8')
# a = json.loads(f.read())
# f.close()
for el in a:
    for e in el['in']:
        # The collected list is currently discarded; only the HTML files
        # written by req() persist.
        list_kadastres = parserPage('https://kadbase.ru'+e['href'])
        # Disabled: attach the list to its entry and re-save the JSON after
        # each entry.
        # e['list'] = list_kadastres
        # with open('kadastres.json', 'w', encoding='utf-8') as f:
        #     json.dump(a, f, ensure_ascii=False, indent=4)