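"""Scrape ucheba.ru for Russian universities, their admission-subject
groups, and the study directions with minimum passing scores, then write
the results to an Excel workbook with three sheets: Universities, Items,
and Directions."""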
from bs4 import BeautifulSoup as BS
from openpyxl import Workbook
from itertools import zip_longest
import asyncio
import aiohttp
def add_university_data(ws_universities, id_num, name, link, city):
    ws_universities.append([id_num, name, link, city])


def add_item_data(ws_items, id_num, name, link):
    ws_items.append([id_num, name, link])


def add_directions_data(ws_directions, id_num, directions, num):
    ws_directions.append([id_num, directions, num])
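# Rows are accumulated as single-entry dicts keyed by a composite ID, e.g.
#   universities_data: {1: [name, link, city]}
#   items_data:        {'1_0': [subjects, link]}
#   direction_data:    {'1_0_0': [direction, passing_score]}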
universities_data = []
items_data = []
direction_data = []
async def get_page_data(session, page):
    headers = {
        "accept": 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    }
    print(f'Page {page + 1} / 250')
    url = f'https://www.ucheba.ru/for-abiturients/vuz/rossiya?s={page}0'
    async with session.get(url=url, headers=headers) as response:
        response_text = await response.text()
    # Main listing page; response_text is already a str
    src = response_text
    soup = BS(src, "lxml")
    all_university_hrefs = soup.find_all('a', 'js_webstat')
    count_university = 1
    for link in all_university_hrefs:
        count_items = 0
        count_direction = 0
        link_text = link.text
        link_href = 'https://www.ucheba.ru' + link.get('href')
        # fetch the university page once (aiohttp requests must be awaited)
        async with session.get(url=link_href, headers=headers) as response:
            src = await response.text()
        soup = BS(src, 'lxml')
        university_city = soup.find(
            'ul', 'params-list').find_all('li')[0].text.strip()
        universities_data.append(
            {
                count_university: [link_text, link_href, university_city]
            }
        )
        receipt_group = soup.find('div', class_='ege-groups-list')
        try:
            for ul in receipt_group.find_all('div', class_='ege-groups-list__item'):
                items = []
                for li in ul.find('ul', class_='ege-groups-list__subjects subjects-list').find_all('li'):
                    items.append(li.text.replace('\n', '').replace(' ', '-'))
                items_name = '_'.join(items)
                items_href = 'https://www.ucheba.ru' + ul.find('div', class_='ege-groups-list__info').find(
                    'a', class_='ege-groups-list__programs-link').get('href')
                items_data.append(
                    {
                        f'{count_university}_{count_items}': [items_name, items_href]
                    }
                )
                async with session.get(url=items_href, headers=headers) as response:
                    src = await response.text()
                soup = BS(src, 'lxml')
                if not soup.find('div', class_='paginator mt-25'):
                    all_directions = [i.text for i in soup.find_all(
                        'a', class_='js_webstat')]
                    all_passing_score = []
                    for i in soup.find_all('div', class_='search-results-options'):
                        # keep the token when it is a numeric score or a '—'
                        # placeholder (.isdigit() assumed; the source's
                        # isinstance() call was incomplete)
                        if i.text.split()[2].isdigit() or i.text.split()[2] == '—':
                            all_passing_score.append(i.text.split()[2])
                        else:
                            all_passing_score.append(i.text.split()[3])
                    for d, p in zip_longest(all_directions, all_passing_score, fillvalue=' '):
                        direction_data.append(
                            {
                                f'{count_university}_{count_items}_{count_direction}': [d, p]
                            }
                        )
                        count_direction += 1
                else:
                    paginator_links = soup.find(
                        'div', class_='paginator mt-25').find_all('a')
                    for i in paginator_links:
                        # swap the last path segment for the paginator href
                        # (reconstructed from a garbled expression; assumes
                        # paginator hrefs replace everything after the final '/')
                        items_href = items_href[:items_href.rfind('/')] + i.get('href')
                        async with session.get(url=items_href, headers=headers) as response:
                            src = await response.text()
                        soup = BS(src, 'lxml')
                        all_directions = [j.text for j in soup.find_all(
                            'a', class_='js_webstat')]
                        all_passing_score = []
                        for j in soup.find_all('div', class_='search-results-options'):
                            if j.text.split()[2].isdigit() or j.text.split()[2] == '—':
                                all_passing_score.append(j.text.split()[2])
                            else:
                                all_passing_score.append(j.text.split()[3])
                        for d, p in zip_longest(all_directions, all_passing_score, fillvalue=' '):
                            direction_data.append(
                                {
                                    f'{count_university}_{count_items}_{count_direction}': [d, p]
                                }
                            )
                            count_direction += 1
                count_items += 1
        except Exception as e:
            print(f'Error: {e}')
            continue
        count_university += 1
    print(f'Processed page {page}')
# Without authorization
async def gather_data():
    headers = {
        "accept": 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    }
    async with aiohttp.ClientSession() as session:
        url = 'https://www.ucheba.ru/for-abiturients/vuz/rossiya'
        response = await session.get(url=url, headers=headers)
        soup = BS(await response.text(), 'lxml')
        # the paginator label is a string; cast before doing arithmetic
        pages_count = int(soup.find('div', 'paginator mt-25').find('ul').find_all('li')[-1].text)
        tasks = []
        for page in range(0, pages_count + 1):
            task = asyncio.create_task(get_page_data(session, page))
            tasks.append(task)
        await asyncio.gather(*tasks)
'''# Authorization was added, but it does not work
async def gather_data():
    auth_data = {
        "password": "Vi31128282",
        "username": "brykovvita173@gmail.com",
        "rememberMe": "true"
    }
    auth_url = 'https://api.ucheba.ru/v1/auth'
    headers = {
        "accept": 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    }
    async with aiohttp.ClientSession() as session:
        # authorize first
        await session.post(auth_url, data=auth_data, headers=headers)
        # get the initial page
        url = 'https://www.ucheba.ru/for-abiturients/vuz/rossiya'
        response = await session.get(url=url, headers=headers)
        soup = BS(await response.text(), 'lxml')
        pages_count = soup.find('div', 'paginator mt-25').find('ul').find_all('li')[-1].text
        tasks = []
        for page in range(0, int(pages_count) + 1):
            task = asyncio.create_task(get_page_data(session, page, headers))
            tasks.append(task)
        await asyncio.gather(*tasks)


async def get_page_data(session, page, headers):
    url = f'https://www.ucheba.ru/for-abiturients/vuz/rossiya?page={page}'
    response = await session.get(url=url, headers=headers)
'''
def main():
    asyncio.run(gather_data())
    wb = Workbook()
    ws_universities = wb.active
    ws_universities.title = 'Universities'
    ws_universities.append(
        ['ID', 'University name', 'Link to the university page', 'University city'])
    ws_items = wb.create_sheet('Items')
    ws_items.append(['ID', 'Admission subjects',
                     'Link to all directions for the subjects'])
    ws_directions = wb.create_sheet('Directions')
    ws_directions.append(['ID', 'Directions', 'Minimum score'])
    for data_un in universities_data:
        for university_num, items in data_un.items():
            add_university_data(ws_universities, university_num, items[0], items[1], items[2])
    for data_un in items_data:
        for university_num, items in data_un.items():
            add_item_data(ws_items, university_num.split('_')[0], items[0], items[1])
    for data_un in direction_data:
        for university_num, items in data_un.items():
            add_directions_data(ws_directions, university_num.split('_')[0], items[0], items[1])
    wb.save(
        '/home/kukuruzka-vitya/CODE/za_python/parsing/pars_university_first/university.xlsx')


if __name__ == '__main__':
    main()
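# Assumed third-party dependencies (inferred from the imports above):
#   pip install aiohttp beautifulsoup4 lxml openpyxl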