Advertisement
Guest User

Untitled

a guest
Dec 10th, 2018
69
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.66 KB | None | 0 0
import requests
from bs4 import BeautifulSoup

  4. pagecount = 1
  5. visited = []
  6.  
  7. def crawler(page):
  8.         global pagecount
  9.         global maxpagecount
  10.         global visited
  11.         pages = []
  12.         url = "https://en.wikipedia.org" + page
  13.         if url in visited:
  14.             return
  15.         visited.append(str(url))
  16.         websitecode = requests.get(url)
  17.         text = websitecode.text
  18.         soup = BeautifulSoup(text, "html.parser")
  19.         seealso = soup.select("h2 > #See_also")[0]
  20.  
  21.         list = seealso.parent.find_next_sibling("ul")
  22.  
  23.         for a in list.findAll('a'):
  24.             pages.append(str(a.get('href')))
  25.         for a in pages:
  26.             if "/wiki/" not in a:
  27.                 deadend = True
  28.                 break
  29.             else:
  30.                 deadend = False
  31.  
  32.         print("Site number: " + str(pagecount))
  33.         pagecount += 1
  34.         print("URL: " + str(url))
  35.         print("See also:")
  36.  
  37.         if deadend:
  38.             div = see_also.parent.find_next_sibling("div")
  39.             for li in div.findAll("li"):
  40.                 print(">", li.get_text())
  41.             for a in div.findAll('a'):
  42.                 pages.append(str(a.get('href')))
  43.             print("---------------------------")
  44.         else:
  45.             for li in list.findAll("li"):
  46.                 print(">", li.get_text())
  47.             print("---------------------------")
  48.  
  49.         try:
  50.             for link in pages:
  51.                 if pagecount <= maxpagecount:
  52.                     crawler(link)
  53.         except:
  54.             pass
  55. start = "/wiki/Online_chat"
  56. print("How many sites do you want to see?")
  57. maxpagecount = int(input())
  58. crawler(start)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement