Advertisement
konchin_shih

light novel picture scraper

Feb 19th, 2021
120
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.72 KB | None | 0 0
  1. from urllib.request import urlopen, Request
  2. from urllib.error import HTTPError, URLError
  3. from bs4 import BeautifulSoup
  4. from os import mkdir
  5.  
  6.  
  7. AID = "1508"
  8. print("AID =", AID)
  9.  
  10.  
  11. def buildpath(path):
  12.     curpath = './'
  13.     while path.find('/') != -1:
  14.         curpath += path[:path.find('/') + 1]
  15.         path = path[path.find('/') + 1:]
  16.         try:
  17.             mkdir(curpath)
  18.         finally:
  19.             continue
  20.  
  21.  
  22. articleUrl = 'https://www.wenku8.net/modules/article/reader.php?aid=' + AID
  23.  
  24.  
  25. def cmp(e):
  26.     return int(e)
  27.  
  28.  
  29. def getCID():
  30.     req = Request(articleUrl, headers={'User-Agent': 'Mozilla/5.0'})
  31.     data = urlopen(req)
  32.     bs = BeautifulSoup(data, 'html.parser')
  33.     CID = []
  34.     for i in bs.find_all('a'):
  35.         if i.text == "插图":
  36.             CID.append(i['href'][i['href'].rfind('=') + 1:])
  37.     CID.sort(key=cmp)
  38.     print("CID =", CID)
  39.     return CID
  40.  
  41.  
  42. pictureUrlBase = "http://picture.wenku8.com/pictures/1/" + AID + "/";
  43.  
  44.  
  45. def getImage(cid, pid):
  46.     url = pictureUrlBase + cid + '/' + pid + '.jpg'
  47.     try:
  48.         req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
  49.         data = urlopen(req)
  50.     except HTTPError as e:
  51.         return False
  52.     except URLError as e:
  53.         return False
  54.     else:
  55.         print(url, " ... success")
  56.         url = url[26:]
  57.         buildpath(url)
  58.         file = open(url, 'wb')
  59.         file.write(data.read())
  60.         file.close()
  61.         return True
  62.  
  63.  
  64. cur = 1
  65. for cid in getCID():
  66.     print("now scraping:", cid)
  67.     while getImage(cid, str(cur)) == False:
  68.         cur += 10
  69.  
  70.     forward = cur
  71.     while getImage(cid, str(forward)) == True:
  72.         forward -= 1
  73.     while getImage(cid, str(cur)) == True:
  74.         cur += 1
  75.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement