Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Third-party dependencies: bs4 for HTML parsing, requests for HTTP.
import requests
from bs4 import BeautifulSoup

# Root of the site being crawled; relative links are resolved against it.
start_url = "https://ru.wikipedia.org"
def get_soup(url):
    """Fetch *url* over HTTP and return its body parsed as a BeautifulSoup tree."""
    print(f'Parse {url} page.')
    page = requests.get(url)
    return BeautifulSoup(page.text, 'lxml')
def find_links(soup, base_url=None):
    """Return the href of every <a> tag in *soup* as an absolute URL.

    Protocol-relative links (``//host/path``) get an ``https:`` scheme;
    site-relative links (``/path``) are prefixed with *base_url*
    (defaults to the module-level ``start_url``).  Anything else
    (already-absolute URLs, ``mailto:``, fragments) is kept as-is.

    Bug fixes vs. the original: the ``//`` branch was unreachable
    (``//…`` also fails ``startswith('https')``, so such links became
    ``start_url + '//…'``), and even when intended it concatenated
    ``'https' + link`` without the colon, producing ``https//…``.
    """
    if base_url is None:
        base_url = start_url  # module-level crawl root
    links = []
    for a in soup.find_all('a'):
        href = a.get('href', None)
        if not href:
            continue  # skip anchors without an href
        # Order matters: '//' must be tested before the single-'/' case.
        if href.startswith('//'):
            href = 'https:' + href
        elif href.startswith('/'):
            href = base_url + href
        links.append(href)
    return links
def find_images(soup):
    """Collect the absolute URLs of all PNG images referenced in *soup*.

    Only ``src`` values ending in ``.png`` are kept.  Protocol-relative
    sources get an ``https:`` scheme; site-relative ones are resolved
    against the module-level ``start_url``.
    """
    collected = []
    for tag in soup.find_all('img'):
        url = tag.get('src', None)
        if not url:
            continue  # <img> without a usable src
        if not url.endswith('.png'):
            continue  # only PNG images are wanted
        if url.startswith('//'):
            url = 'https:' + url
        elif url.startswith('/'):
            url = start_url + url
        collected.append(url)
    return collected
def writer(links, filename='links.txt'):
    """Write *links* to *filename*, one URL per line (no trailing newline).

    UTF-8 is forced explicitly: the crawl targets ru.wikipedia.org, so
    collected URLs may contain non-ASCII characters, and relying on the
    platform default encoding (e.g. cp1251 on Windows) would raise
    UnicodeEncodeError.  *filename* defaults to the original hard-coded
    'links.txt', so existing callers are unaffected.
    """
    with open(filename, 'w', encoding='utf-8') as file:
        file.write('\n'.join(links))
def main():
    """Crawl the start page, visit each outgoing link once, and save the
    URLs of every PNG image found along the way to links.txt."""
    seen = set()  # de-duplicates both visited pages and collected images
    start_soup = get_soup(start_url)
    collected = find_images(start_soup)
    for url in find_links(start_soup):
        if url in seen:
            print(f'Pass {url}')
            continue
        seen.add(url)
        try:
            page_soup = get_soup(url)
            for img in find_images(page_soup):
                if img not in seen:
                    seen.add(img)
                    collected.append(img)
        except requests.exceptions.ConnectionError:
            # Best-effort crawl: report the unreachable page and move on.
            print(f'Error {url}')
    writer(collected)


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement