Advertisement
Guest User

Untitled

a guest
Jun 19th, 2019
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.67 KB | None | 0 0
import re
from urllib.error import URLError
from urllib.parse import urljoin
from urllib.request import urlopen as req

from bs4 import BeautifulSoup as soup
  5.  
  6. urls = ["https://www.helios-gesundheit.de"]
  7. domain_list = ["https://www.helios-gesundheit.de/kliniken/schwerin/"]
  8. prohibited = ["info", "news"]
  9. text_keywords = ["Ritz", "Ristig"]
  10. url_list = []
  11.  
  12. desired = "https://www.helios-gesundheit.de/kliniken/schwerin/unser-angebot/unsere-fachbereiche-klinikum/allgemein-und-viszeralchirurgie/team-allgemein-und-viszeralchirurgie/"
  13.  
  14. for x in range(len(domain_list)):
  15. url_list.append(urls[x]+domain_list[x].replace(urls[x], ""))
  16.  
  17. print(url_list)
  18.  
  19. def prohibitedChecker(prohibited_list, string):
  20. for x in prohibited_list:
  21. if x in string:
  22. return True
  23. else:
  24. return False
  25. break
  26.  
  27. def parseHTML(url):
  28. requestHTML = req(url)
  29. htmlPage = requestHTML.read()
  30. requestHTML.close()
  31. parsedHTML = soup(htmlPage, "html.parser")
  32. return parsedHTML
  33.  
  34. searched_word = "Helios"
  35.  
  36. for url in url_list:
  37. parsedHTML = parseHTML(url)
  38. href_crawler = parsedHTML.find_all("a", href=True)
  39. for href in href_crawler:
  40. crawled_url = urljoin(url,href.get("href"))
  41. print(crawled_url)
  42. if "www" not in crawled_url:
  43. continue
  44. parsedHTML = parseHTML(crawled_url)
  45. results = parsedHTML.body.find_all(string=re.compile('.*{0}.*'.format(searched_word)), recursive=True)
  46. for single_result in results:
  47. keyword_text_check = prohibitedChecker(text_keywords, single_result.string)
  48. if keyword_text_check != True:
  49. continue
  50. print(single_result.string)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement