Advertisement
Guest User

Untitled

a guest
Dec 19th, 2018
105
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.71 KB | None | 0 0
  1. from bs4 import BeautifulSoup
  2. import requests
  3.  
  4.  
  5. start_url = "https://ru.wikipedia.org"
  6.  
  7.  
  8. def get_soup(url):
  9.     print(f'Parse {url} page.')
  10.     response = requests.get(url)
  11.     soup = BeautifulSoup(response.text, 'lxml')
  12.     return soup
  13.  
  14.  
  15. def find_links(soup):
  16.     links = []
  17.     for a in soup.find_all('a'):
  18.         link = a.get('href', None)
  19.         if link:
  20.             if not link.startswith('https'):
  21.                 link = start_url + link
  22.             elif link.startswith('//'):
  23.                 link = 'https' + link
  24.             links.append(link)
  25.     return links
  26.  
  27.  
  28. def find_images(soup):
  29.     images = []
  30.     for img in soup.find_all('img'):
  31.         src = img.get('src', None)
  32.         if src:
  33.             link = src
  34.             if link.endswith('.png'):
  35.                 if link.startswith('//'):
  36.                     link = 'https:' + link
  37.                 elif link.startswith('/'):
  38.                     link = start_url + link
  39.                 images.append(link)
  40.     return images
  41.  
  42.  
  43. def writer(links):
  44.     with open('links.txt', 'w') as file:
  45.         file.write('\n'.join(links))
  46.  
  47.  
  48. def main():
  49.     seen = set()
  50.     soup = get_soup(start_url)
  51.     imgs = find_images(soup)
  52.     for url in find_links(soup):
  53.         if url not in seen:
  54.             seen.add(url)
  55.             try:
  56.                 soup = get_soup(url)
  57.                 for img in find_images(soup):
  58.                     if img not in seen:
  59.                         seen.add(img)
  60.                         imgs.append(img)
  61.             except requests.exceptions.ConnectionError:
  62.                 print(f'Error {url}')
  63.         else:
  64.             print(f'Pass {url}')
  65.     writer(imgs)
  66.  
  67.  
  68. if __name__ == '__main__':
  69.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement