networkcat

Untitled

Mar 20th, 2019
73
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. # Scrape Alfie Comic
  2. # March 2019
  3.  
  4. import shutil
  5. import urllib.request
  6. from pprint import pprint
  7.  
  8. import requests
  9. from bs4 import BeautifulSoup
  10. from fake_useragent import UserAgent
  11.  
# URL of the first comic page; main() starts crawling from here.
STARTURL = "https://buttsmithy.com/archives/comic/p1"
# Shared fake_useragent generator; main() passes ua.chrome to dl_randua() as
# the User-agent header value for each image download.
ua = UserAgent()
  14.  
  15.  
  16. def dl_randua(fu, fn, uagent):
  17.     r = requests.get(fu, stream=True, headers={"User-agent": uagent})
  18.     with open(fn, "wb") as f:
  19.         r.raw.decode_content = True
  20.         shutil.copyfileobj(r.raw, f)
  21.  
  22.  
  23. def nexturl(soup):
  24.     for a in soup.findAll("a", {"class": "comic-nav-base comic-nav-next"}):
  25.         alist = a.get_attribute_list("href")
  26.         pprint(alist)
  27.         src = alist[0]
  28.         if src is not None:
  29.             break
  30.     return src
  31.  
  32.  
  33. def main():
  34.     url = STARTURL
  35.     counter = 0
  36.     while url:
  37.         response = requests.get(url)
  38.         soup = BeautifulSoup(response.text, "html.parser")
  39.         for i in soup.findAll("img"):
  40.             src = i.get_attribute_list("src")[0]
  41.             if (src is not None) and ("imgur" not in src) and ("rss" not in src):
  42.                 f_ext = src.split(".")[-1]
  43.                 dl_randua(src, str(counter)+"."+f_ext, ua.chrome)
  44.                 break
  45.         url = nexturl(soup)
  46.         counter += 1
  47.  
  48.  
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
RAW Paste Data

Adblocker detected! Please consider disabling it...

We've detected AdBlock Plus or some other adblocking software preventing Pastebin.com from fully loading.

We don't have any obnoxious sound or popup ads; we actively block these annoying types of ads!

Please add Pastebin.com to your ad blocker whitelist or disable your adblocking software.

×