Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- from bs4 import BeautifulSoup
- import re
- from urllib.request import urlretrieve
- import os
- import sys
def download_images(articles):
    """Download every directly linked imgur image from each PTT article.

    For each article a sub-directory named after the sanitized article
    title is created under the module-level ``directory_name``, and every
    imgur jpg/png/gif link found in the article body is saved into it.

    Parameters
    ----------
    articles : iterable
        bs4 anchor tags exposing ``.text`` (title) and ``["href"]``.
    """
    # Map Windows-illegal filename characters to spaces, built once.
    _sanitize = str.maketrans('\\/:*?"<>|', " " * 9)
    # Compiled once and hoisted: the original re-ran re.search twice per image.
    filename_re = re.compile(r"http[s]?://[i.]*imgur.com/(\w+\.(?:jpg|png|gif))")
    for article in articles:
        print(article.text, article["href"])
        # Sanitize once and reuse for BOTH mkdir and the save path.  The
        # original created the directory under the sanitized title but saved
        # files under the raw title, which fails whenever the title contains
        # characters like '/' or '?'.
        article_dir = os.path.join(directory_name, article.text.translate(_sanitize))
        if not os.path.isdir(article_dir):
            os.mkdir(article_dir)
        res = requests.get("https://www.ptt.cc" + article["href"])
        images = reg_imgur_file.findall(res.text)
        print(images)
        for image in set(images):
            # Keep only direct image links; album/page links are skipped.
            match = filename_re.search(image)
            if match:
                image_id = match.group(1)
                print(image_id)
                urlretrieve(image, os.path.join(article_dir, image_id))
def crawler(pages=3):
    """Walk the newest *pages* index pages of PTT Beauty and download images.

    Creates the module-level output directory if needed, then for each index
    page collects the article links and hands them to ``download_images``.

    Parameters
    ----------
    pages : int, default 3
        Number of index pages to crawl, starting from the newest.
    """
    if not os.path.isdir(directory_name):
        os.mkdir(directory_name)
    url = "https://www.ptt.cc/bbs/Beauty/index.html"
    for _ in range(pages):  # original used 'round', shadowing the builtin
        # The Beauty board is age-gated: without the over18 cookie PTT serves
        # the confirmation page and soup.select finds no articles.
        res = requests.get(url, cookies={"over18": "1"})
        soup = BeautifulSoup(res.text, "html.parser")
        articles = soup.select("div.title a")
        paging = soup.select("div.btn-group-paging a")
        # paging[1] is the "previous page" (older articles) button.
        url = "https://www.ptt.cc" + paging[1]["href"]
        download_images(articles)
# Module-level configuration shared by crawler() and download_images().
directory_name = "PTT_Beauty"
# Matches any imgur link (album or direct); download_images() narrows the
# hits to direct jpg/png/gif links.  Raw string avoids the invalid-escape
# warning that the original non-raw '\w' pattern triggers on modern Python.
reg_imgur_file = re.compile(r"http[s]?://[i.]*imgur.com/\w+")

if __name__ == "__main__":
    # Guarded so importing this module no longer kicks off a full crawl.
    crawler()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement