Advertisement
Guest User

Untitled

a guest
Aug 21st, 2017
65
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.75 KB | None | 0 0
  1. import requests
  2. from bs4 import BeautifulSoup
  3. import re
  4. from urllib.request import urlretrieve
  5. import os
  6. import sys
  7.  
  8. def download_images(articles):
  9.         for article in articles:
  10.             print(article.text, article["href"])
  11.             if not os.path.isdir(os.path.join(directory_name, article.text)):
  12.                 os.mkdir(os.path.join(directory_name, article.text.replace('\\', ' ').replace('/', ' ').replace(':', ' ').replace('*', ' ').replace('?', ' ').replace('"', ' ').replace('<', ' ').replace('>', ' ').replace('|', ' ' )))
  13.             res = requests.get("https://www.ptt.cc" + article["href"])
  14.             images = reg_imgur_file.findall(res.text)
  15.             print(images)
  16.        
  17.             for image in set(images):
  18.                 result = re.search("http[s]?://[i.]*imgur.com/(\w+\.(?:jpg|png|gif))",image)
  19.                 if result:
  20.                     ID = re.search("http[s]?://[i.]*imgur.com/(\w+\.(?:jpg|png|gif))",image).group(1)
  21.                     print(ID)
  22.                     urlretrieve(image, os.path.join(directory_name, article.text, ID))
  23.  
  24. def crawler(pages=3):   #pages=3. if we call fxn with crawler() without input, default = 3
  25.     if not os.path.isdir(directory_name):
  26.         os.mkdir(directory_name)
  27.    
  28.     url = "https://www.ptt.cc/bbs/Beauty/index.html"
  29.  
  30.     for round in range(pages):
  31.         res = requests.get(url)
  32.  
  33.         soup = BeautifulSoup(res.text, "html.parser")
  34.  
  35.         articles = soup.select("div.title a")
  36.  
  37.         paging = soup.select("div.btn-group-paging a")
  38.  
  39.         next_url = "https://www.ptt.cc" + paging[1]["href"]
  40.  
  41.         url = next_url
  42.  
  43.         download_images(articles)
  44.  
  45.  
  46. directory_name = "PTT_Beauty"
  47.  
  48. reg_imgur_file = re.compile('http[s]?://[i.]*imgur.com/\w+')
  49. crawler()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement