Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os, re, time, random
- import pandas as pd
- import requests
- from bs4 import BeautifulSoup
# --- Remote site -----------------------------------------------------------
# Root of the site and the endpoint that the search form is posted to.
BASE = "http://www.isofmap.bg"
SEARCH_URL = f"{BASE}/search"

# --- Local files -----------------------------------------------------------
# Working folder that holds the input workbook and every generated file.
BASE_DIR = r"C:\Working_desktop\GATE_stuff\tst_isofmap"
INPUT_XLSX = os.path.join(BASE_DIR, "isofmap.xlsx")  # columns: kv, upi
OUTPUT_CSV = os.path.join(BASE_DIR, "isofmap_results.csv")
DEBUG_HTML = os.path.join(BASE_DIR, "last_response.html")

# --- HTTP ------------------------------------------------------------------
# Headers attached to every request issued by this script.
HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Referer": f"{BASE}/",
    "Origin": BASE,
    "X-Requested-With": "XMLHttpRequest",
}

# --- Token discovery -------------------------------------------------------
# Tried in order against the landing page; capture group 1 is the token.
TOKEN_PATTERNS = [
    # <input name="token" value="..."> (attribute names matched case-insensitively)
    re.compile(
        r'name=["\']token["\']\s+value=["\']([^"\']+)["\']',
        re.IGNORECASE,
    ),
    # token: "..." / token = '...' as it would appear in inline script or JSON
    re.compile(
        r'\btoken["\']?\s*[:=]\s*["\']([^"\']+)["\']'
    ),
]
def fetch_token(session: requests.Session) -> str:
    """Download the landing page and return the form token found in it.

    The page text is first scanned with each regex in ``TOKEN_PATTERNS`` (in
    order); if none matches, the HTML is parsed and the ``value`` attribute of
    the ``<input name="token">`` element is used instead.

    Args:
        session: Session used for the GET request to ``BASE``.

    Returns:
        The token string.

    Raises:
        requests.HTTPError: The landing page answered with an error status.
        RuntimeError: No token could be located by either method.
    """
    response = session.get(BASE, headers=HEADERS, timeout=30)
    response.raise_for_status()
    page = response.text

    # Regex pass: lazily evaluate the patterns and stop at the first hit.
    hit = next(
        filter(None, (pattern.search(page) for pattern in TOKEN_PATTERNS)),
        None,
    )
    if hit is not None:
        return hit.group(1)

    # DOM pass: look for the input element itself.
    field = BeautifulSoup(page, "html.parser").find("input", {"name": "token"})
    value = field.get("value") if field else None
    if value:
        return value

    raise RuntimeError("Не намерих token в началната страница.")
# Shape of the flattened page text that strategy 3 looks for:
#   "... Намерени резултати: 1 Основна информация Допълнителна информация <ID> <MAIN>  <EXTRA>"
_RESULT_TEXT_RE = re.compile(
    r"Намерени\s+резултати:\s*1\s*Основна\s+информация\s*Допълнителна\s+информация\s*(\d+)\s*(.+?)\s{2,}(.+?)\s*(?:Намерени\s+резултати:|$)",
    flags=re.S,
)


def _parse_html(markup: str):
    """Parse *markup* with lxml, falling back to the built-in parser if lxml is missing."""
    # Local import: the file's top-level import only brings in BeautifulSoup.
    from bs4 import FeatureNotFound

    try:
        return BeautifulSoup(markup, "lxml")
    except FeatureNotFound:
        # lxml is an optional extra; html.parser ships with Python.
        return BeautifulSoup(markup, "html.parser")


def _found(label: str, main_info: str, extra_info: str) -> dict:
    """Build the result dict returned for a successfully extracted row."""
    return {
        "found": 1,
        "upi_id_or_label": label,
        "main_info": main_info,
        "extra_info": extra_info,
    }


def _extract_by_data_title(soup):
    """Strategy 1: read the <td> cells identified by their data-title attribute."""
    main_cell = soup.find("td", attrs={"data-title": "Основна информация"})
    extra_cell = soup.find("td", attrs={"data-title": "Допълнителна информация"})
    if not (main_cell and extra_cell):
        return None

    # Also try to pick up the first cell of the same row (ID / label).
    label = ""
    row = main_cell.find_parent("tr")
    if row:
        cells = row.find_all("td")
        if cells:
            label = cells[0].get_text(strip=True)

    return _found(
        label,
        main_cell.get_text(strip=True),
        extra_cell.get_text(strip=True),
    )


def _extract_from_upi_table(soup):
    """Strategy 2: take the first 3+-cell row of the table following the "УПИ" heading."""
    anchor = soup.find(lambda tag: tag.get_text(strip=True) == "УПИ")
    if not anchor:
        # Sometimes the heading is only reachable as a bare text node.
        text_node = soup.find(
            string=lambda s: isinstance(s, str) and s.strip() == "УПИ"
        )
        if text_node:
            anchor = text_node.parent
    if not anchor:
        return None

    table = anchor.find_next("table")
    if not table:
        return None

    for row in table.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) >= 3:
            return _found(
                cells[0].get_text(strip=True),
                cells[1].get_text(strip=True),
                cells[2].get_text(strip=True),
            )
    return None


def _extract_by_regex(soup):
    """Strategy 3: regex over the flattened page text that follows "УПИ"."""
    full_text = " ".join(soup.stripped_strings)
    # Keep only what comes after the first "УПИ", when it is present at all.
    tail = full_text.split("УПИ", 1)[1] if "УПИ" in full_text else full_text

    # NOTE(review): the pattern separates MAIN from EXTRA with \s{2,}, but the
    # text above is joined with single spaces, so this can only match when a
    # text node itself contains a run of whitespace. Confirm against real pages.
    m = _RESULT_TEXT_RE.search(tail)
    if not m:
        return None
    return _found(
        m.group(1).strip(),
        re.sub(r"\s+", " ", m.group(2)).strip(),
        re.sub(r"\s+", " ", m.group(3)).strip(),
    )


def query(session: requests.Session, token: str, kv: str, upi: str) -> dict:
    """Run one "searchByRegulation" search and extract the result row.

    The form is posted to ``SEARCH_URL`` with *kv* as ``quarter`` and *upi* as
    ``parcel``. The raw response is written to ``DEBUG_HTML`` (overwritten on
    every call) and then three extraction strategies are tried in order:
    data-title cells, the table after the "УПИ" heading, and a regex over the
    flattened page text.

    Args:
        session: Session used for the POST request.
        token: Form token, as returned by ``fetch_token``.
        kv: Value sent in the ``quarter`` field.
        upi: Value sent in the ``parcel`` field.

    Returns:
        ``{"found": 1, "upi_id_or_label", "main_info", "extra_info"}`` when a
        row was extracted, otherwise
        ``{"found": 0, "reason": "no_row", "html_len": <response length>}``.

    Raises:
        requests.HTTPError: The server answered with an error status.
        OSError: The debug HTML file could not be written.
    """
    data = {
        "token": token,
        "searchType": "searchByRegulation",
        "admRegion": "",
        "place": "",
        "quarter": kv,
        "parcel": upi,
        "regDestine": "",
        "regPurpose": "",
    }
    r = session.post(SEARCH_URL, data=data, headers=HEADERS, timeout=30)
    r.raise_for_status()

    # Keep the HTML for debugging.
    with open(DEBUG_HTML, "w", encoding="utf-8") as f:
        f.write(r.text)

    soup = _parse_html(r.text)
    for strategy in (
        _extract_by_data_title,
        _extract_from_upi_table,
        _extract_by_regex,
    ):
        result = strategy(soup)
        if result is not None:
            return result

    # html_len lets the caller tell an empty/short reply from a real "no match" page.
    return {"found": 0, "reason": "no_row", "html_len": len(r.text)}
def _cell_to_text(value) -> str:
    """Render one spreadsheet cell as the text to submit in the search form.

    Blank cells (NaN/None) become ``""`` and whole-number floats lose their
    ``.0`` suffix, so a numeric column that pandas stored as float still
    yields ``"12"`` rather than ``"12.0"``.
    """
    if pd.isna(value):
        return ""
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value).strip()


def main():
    """Look up every (kv, upi) pair from the input workbook and save the results.

    Reads ``INPUT_XLSX``, runs ``query`` for each row within a single HTTP
    session, and writes all result dicts (plus the ``kv``/``upi`` that produced
    them) to ``OUTPUT_CSV``.

    Raises:
        FileNotFoundError: ``INPUT_XLSX`` does not exist.
        ValueError: The workbook lacks a ``kv`` or ``upi`` column.
    """
    if not os.path.exists(INPUT_XLSX):
        raise FileNotFoundError(f"Липсва входният Excel: {INPUT_XLSX}")
    df = pd.read_excel(INPUT_XLSX)
    if not {"kv", "upi"}.issubset(df.columns):
        raise ValueError("Excel-ът трябва да има колони 'kv' и 'upi'.")

    out = []
    with requests.Session() as s:
        token = fetch_token(s)
        # Iterate the two columns directly: DataFrame.iterrows() coerces each
        # row to a single dtype and can turn integer cells into floats.
        for raw_kv, raw_upi in zip(df["kv"], df["upi"]):
            kv = _cell_to_text(raw_kv)
            upi = _cell_to_text(raw_upi)

            if not kv or not upi:
                # A blank cell used to be submitted as the literal text "nan";
                # record the row as unusable instead of sending that request.
                out.append(
                    {"found": 0, "reason": "missing_input", "kv": kv, "upi": upi}
                )
                continue

            res = query(s, token, kv, upi)
            # If the token turns out to be the problem, try one refresh.
            if res.get("found") == 0 and res.get("html_len", 0) < 100:
                token = fetch_token(s)
                res = query(s, token, kv, upi)
            res.update({"kv": kv, "upi": upi})
            out.append(res)
            # Randomised pause between requests.
            time.sleep(0.5 + random.random() * 0.6)

    # utf-8-sig writes a BOM at the start of the CSV.
    pd.DataFrame(out).to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
    print("Готово ->", OUTPUT_CSV, "\n(ако има проблем, виж HTML в)", DEBUG_HTML)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment