Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- import requests
- from bs4 import BeautifulSoup
- from urllib.parse import urljoin
# Root of the CompraNet site; relative hrefs scraped from its pages are
# resolved against this value.
base = 'https://compranet.hacienda.gob.mx'

# Entry page whose dropdown menu is searched for the "Vigentes" listing link.
link = base + '/web/login.html'

# URL template for a single detail page; "{}" receives the numeric id that is
# extracted from a listing row's goToDetail('<id>') onclick handler.
vigen_detail_page = base + '/esop/toolkit/opportunity/current/{}/detail.si'
def grab_first_link_from_dropdown(s, link, timeout=30):
    """Return the absolute URL of the "Vigentes" entry in the dropdown menu.

    Args:
        s: ``requests.Session`` (or compatible object) used to issue the GET.
        link: URL of the page that contains the ``ul.dropdown-menu`` element.
        timeout: Seconds passed to ``s.get`` so a stalled server cannot hang
            the caller indefinitely.

    Returns:
        The entry's ``href`` resolved against the module-level ``base`` URL.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If no menu anchor whose text contains "Vigentes" is
            found, or that anchor carries no ``href``.
    """
    response = s.get(link, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Match on the anchor text in Python rather than through the ``:contains()``
    # pseudo-class, which Soup Sieve has deprecated.
    anchor = next(
        (
            candidate
            for candidate in soup.select("ul.dropdown-menu > li > a")
            if "Vigentes" in candidate.get_text()
        ),
        None,
    )
    href = anchor.get("href") if anchor is not None else None
    if not href:
        # urljoin() would silently return ``base`` for an empty href, so fail
        # loudly instead of handing the caller a wrong URL.
        raise ValueError(
            'no "Vigentes" menu link found on {!r}'.format(link)
        )
    return urljoin(base, href)
# Captures the numeric id passed to the goToDetail('<id>', ...) onclick handler.
_DETAIL_ID_RE = re.compile(r"goToDetail\(\'(\d+?)\'")


def fetch_detail_page_link(s, cat_link, timeout=30):
    """Yield the detail-page URL for each row of a listing page.

    This is a generator: the listing page is requested when iteration starts,
    not when the function is called.

    Args:
        s: ``requests.Session`` (or compatible object) used to issue the GET.
        cat_link: URL of the listing page holding ``table.list-table``.
        timeout: Seconds passed to ``s.get`` so a stalled server cannot hang
            the caller indefinitely.

    Yields:
        ``vigen_detail_page`` formatted with the id found in each row's
        ``a.detailLink`` onclick attribute.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    res = s.get(cat_link, timeout=timeout)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    for row in soup.select("table.list-table > tbody.list-tbody > tr"):
        anchor = row.select_one("a.detailLink")
        if anchor is None:
            # Row without a detail link: skip it instead of aborting the
            # whole listing.
            continue
        # ``onclick`` may be absent, hence the fallback to an empty string.
        match = _DETAIL_ID_RE.search(anchor.get("onclick") or "")
        if match is None:
            continue
        yield vigen_detail_page.format(match.group(1))
def _form_answer(soup, label):
    """Return the stripped text of the answer that follows the *label* question.

    Looks at every ``.form_answer`` element directly preceded by a
    ``.form_question`` sibling and returns the first one whose question text
    contains *label*; returns ``""`` when there is none.
    """
    # The text test is done in Python so the deprecated ``:contains()``
    # pseudo-class is not needed.
    for answer in soup.select(".form_question + .form_answer"):
        question = answer.find_previous_sibling(True)
        if question is not None and label in question.get_text():
            return answer.get_text(strip=True)
    return ""


def get_content(s, inner_link, timeout=30):
    """Fetch a detail page and extract its file code and description.

    Args:
        s: ``requests.Session`` (or compatible object) used to issue the GET.
        inner_link: URL of the detail page.
        timeout: Seconds passed to ``s.get`` so a stalled server cannot hang
            the caller indefinitely.

    Returns:
        A ``(expediente, descripcion)`` tuple of strings taken from the
        "Código del Expediente" and "Descripción del Expediente" fields.
        A field that is not present on the page is returned as ``""``; the
        HTTP status is deliberately not checked, so an error page simply
        yields two empty strings.
    """
    res = s.get(inner_link, timeout=timeout)
    soup = BeautifulSoup(res.text, "html.parser")
    expediente = _form_answer(soup, 'Código del Expediente')
    descripcion = _form_answer(soup, 'Descripción del Expediente')
    return expediente, descripcion
def main():
    """Print the (expediente, descripcion) tuple of every listed detail page.

    Opens one HTTP session with a browser-like User-Agent, resolves the
    "Vigentes" listing from the entry page, then fetches and prints each
    detail page's content in listing order.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
    with requests.Session() as session:
        session.headers.update({'User-Agent': user_agent})
        listing_url = grab_first_link_from_dropdown(session, link)
        for detail_url in fetch_detail_page_link(session, listing_url):
            print(get_content(session, detail_url))


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement