Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from bs4 import BeautifulSoup as soup
- from urllib.request import urlopen as req
- from urllib.parse import urljoin
- import re
# Crawl configuration: seed base URLs, start pages, URL fragments to avoid,
# and the surnames whose page mentions we want to print.
urls = ["https://www.helios-gesundheit.de"]
domain_list = ["https://www.helios-gesundheit.de/kliniken/schwerin/"]
prohibited = ["info", "news"]        # URL fragments we do not want to follow
text_keywords = ["Ritz", "Ristig"]   # names to look for in page text
# Example of a page the crawler is expected to reach (kept for reference).
desired = "https://www.helios-gesundheit.de/kliniken/schwerin/unser-angebot/unsere-fachbereiche-klinikum/allgemein-und-viszeralchirurgie/team-allgemein-und-viszeralchirurgie/"

# Normalise each start page to "<base> + <path>": stripping the base from the
# full URL and re-prefixing it keeps entries consistent even when a
# domain_list entry already contains the base URL.
url_list = [base + full.replace(base, "") for base, full in zip(urls, domain_list)]
print(url_list)
def prohibitedChecker(prohibited_list, string):
    """Return True if any entry of *prohibited_list* is a substring of *string*.

    Bug fix: the original returned False as soon as the FIRST element did not
    match (an ``else: return False`` inside the loop, followed by an
    unreachable ``break``), so every element after the first was ignored.
    ``any`` inspects the whole list and short-circuits on the first hit.
    """
    return any(item in string for item in prohibited_list)
def parseHTML(url):
    """Fetch *url* and return the page as a parsed BeautifulSoup document.

    Bug fix: the original called ``close()`` manually, leaking the connection
    if ``read()`` raised.  ``urlopen``'s response is a context manager, so the
    ``with`` block guarantees the socket is closed on every path.
    """
    with req(url) as response:
        htmlPage = response.read()
    return soup(htmlPage, "html.parser")
searched_word = "Helios"

# Compile the search pattern once instead of rebuilding it for every page.
searched_pattern = re.compile('.*{0}.*'.format(searched_word))

for url in url_list:
    start_page = parseHTML(url)
    for href in start_page.find_all("a", href=True):
        crawled_url = urljoin(url, href.get("href"))
        print(crawled_url)
        # Skip links (relative fragments, mailto:, etc.) that did not resolve
        # to an absolute web URL we want to fetch.
        if "www" not in crawled_url:
            continue
        # Use a separate name for the sub-page so it does not shadow the
        # start page's parsed document.
        sub_page = parseHTML(crawled_url)
        results = sub_page.body.find_all(string=searched_pattern, recursive=True)
        for single_result in results:
            # NOTE(review): despite its name, prohibitedChecker acts here as a
            # POSITIVE filter — only text nodes mentioning one of
            # text_keywords are printed.
            if not prohibitedChecker(text_keywords, single_result.string):
                continue
            print(single_result.string)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement