
Untitled

a guest
Jun 19th, 2019
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as req
from urllib.parse import urljoin
import re

# Start page and the section of the site to crawl.
urls = ["https://www.helios-gesundheit.de"]
domain_list = ["https://www.helios-gesundheit.de/kliniken/schwerin/"]
# Links whose URL contains any of these substrings are skipped.
prohibited = ["news", "ponitz"]
# Only text nodes that contain one of these keywords are printed.
text_keywords = ["Ritz", "Ristig"]

# Build the list of entry URLs (base URL plus the path of each domain_list entry).
url_list = []
for x in range(len(domain_list)):
    url_list.append(urls[x] + domain_list[x].replace(urls[x], ""))

print(url_list)


def prohibitedChecker(prohibited_list, string):
    # Return True if any entry of prohibited_list occurs in string.
    for x in prohibited_list:
        if x in string:
            return True
    return False


def parseHTML(url):
    # Fetch a URL and return its parsed HTML tree.
    requestHTML = req(url)
    htmlPage = requestHTML.read()
    requestHTML.close()
    return soup(htmlPage, "html.parser")


# Text nodes must mention this word to be considered at all.
searched_word = "Helios"

for url in url_list:
    parsedHTML = parseHTML(url)
    # Collect every link on the entry page.
    href_crawler = parsedHTML.find_all("a", href=True)
    for href in href_crawler:
        crawled_url = urljoin(url, href.get("href"))
        print(crawled_url)
        # Only follow links whose resolved URL contains "www"
        # (filters mailto:, tel: and similar targets).
        if "www" not in crawled_url:
            continue
        # Skip prohibited URLs before fetching them.
        if prohibitedChecker(prohibited, crawled_url):
            continue
        crawledHTML = parseHTML(crawled_url)
        # All text nodes on the linked page that mention the searched word ...
        results = crawledHTML.body.find_all(string=re.compile(".*{0}.*".format(searched_word)), recursive=True)
        for single_result in results:
            # ... of which only those also containing one of the text keywords are printed.
            if prohibitedChecker(text_keywords, single_result.string):
                print(single_result.string)
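
The two helpers can also be tried on their own; a minimal sketch, assuming the definitions above have been loaded, with the URLs used purely as examples from the same site:

# Minimal sketch (assumes the definitions above are loaded; example URLs only).
print(prohibitedChecker(prohibited, "https://www.helios-gesundheit.de/news/"))       # True: "news" occurs in the URL
print(prohibitedChecker(prohibited, "https://www.helios-gesundheit.de/kliniken/"))   # False: no prohibited substring
page = parseHTML("https://www.helios-gesundheit.de/kliniken/schwerin/")
print(page.title.string if page.title else "no <title> found")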