Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- from bs4 import BeautifulSoup
# Module-level crawl state, shared with crawler() via `global`.
pagecount = 1  # 1-based counter of pages printed so far; compared against maxpagecount
visited = []   # full URLs already crawled, to avoid revisiting / infinite recursion
def crawler(page):
    """Recursively crawl Wikipedia "See also" links, printing each page's entries.

    Args:
        page: site-relative path such as "/wiki/Online_chat".

    Side effects:
        Prints a numbered report per page, increments the module-global
        ``pagecount`` and appends to ``visited``. Recursion stops once
        ``pagecount`` exceeds the module-global ``maxpagecount`` or a page
        was already visited.

    Raises:
        IndexError: if the *initial* page has no "See also" heading
        (failures on recursively visited pages are skipped instead).
    """
    global pagecount  # only pagecount is rebound; visited is mutated in place

    url = "https://en.wikipedia.org" + page
    if url in visited:
        return
    visited.append(url)

    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Locate the "See also" heading; IndexError here is intentional for the
    # start page and swallowed per-link for recursive calls (see loop below).
    seealso = soup.select("h2 > #See_also")[0]
    # NOTE: renamed from `list`, which shadowed the builtin.
    see_also_list = seealso.parent.find_next_sibling("ul")

    pages = [str(a.get('href')) for a in see_also_list.findAll('a')]

    # A page is a "dead end" when its <ul> contains any non-article link;
    # its real entries then live in a following <div> instead.
    # BUG FIX: original left `deadend` undefined when `pages` was empty.
    deadend = any("/wiki/" not in href for href in pages)

    print("Site number: " + str(pagecount))
    pagecount += 1
    print("URL: " + str(url))
    print("See also:")
    if deadend:
        # BUG FIX: original referenced undefined name `see_also` here,
        # raising NameError on every dead-end page.
        div = seealso.parent.find_next_sibling("div")
        for li in div.findAll("li"):
            print(">", li.get_text())
        for a in div.findAll('a'):
            pages.append(str(a.get('href')))
        print("---------------------------")
    else:
        for li in see_also_list.findAll("li"):
            print(">", li.get_text())
        print("---------------------------")

    # Recurse into each collected link, best-effort: a page without a
    # "See also" section (IndexError/AttributeError) or a failed request is
    # skipped instead of aborting the remaining links (the original's bare
    # `except: pass` around the whole loop dropped every link after the
    # first failure and hid genuine bugs).
    for link in pages:
        if pagecount > maxpagecount:
            break
        try:
            crawler(link)
        except Exception:
            continue
# --- script entry -----------------------------------------------------------
# Ask the user for a page budget, then crawl outward from a fixed start page.
start = "/wiki/Online_chat"
print("How many sites do you want to see?")
maxpagecount = int(input())  # global page budget read by crawler()
crawler(start)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement