Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
from os import makedirs, mkdir
from urllib.error import HTTPError, URLError
from urllib.parse import urlsplit
from urllib.request import urlopen, Request

from bs4 import BeautifulSoup
# Article id; it is appended to both the reader-page URL and the picture URL.
AID = "1508"
print("AID =", AID)
def buildpath(path):
    """Create, below the current directory, the directories named in *path*.

    Everything up to and including the last ``/`` of *path* is treated as a
    chain of directories; whatever follows it (the file name) is ignored.
    A *path* without any ``/`` creates nothing.

    Directories that already exist are left alone.  Any other failure
    (for example a permission error) is raised to the caller instead of
    being silently discarded.
    """
    directory = path[:path.rfind('/') + 1]
    if directory:
        # The './' prefix keeps the result relative to the working directory
        # even when *path* itself starts with a slash.
        makedirs('./' + directory, exist_ok=True)
# Reader page of the article; getCID() downloads it and scans its links.
articleUrl = 'https://www.wenku8.net/modules/article/reader.php?aid=' + AID
def cmp(e):
    """Return *e* converted to ``int``.

    Used as a sort key so that id strings are ordered numerically
    ("9" before "10") rather than lexicographically.
    """
    return int(e)
def getCID():
    """Return the ids of all illustration links on the article's reader page.

    Downloads ``articleUrl``, looks at every ``<a>`` element whose text is
    exactly "插图" (the label used for illustration entries) and takes the
    part of its ``href`` after the last ``=`` as the id.  Anchors without an
    ``href`` are skipped.

    Returns:
        The ids as a list of strings, sorted by numeric value.  The list is
        also printed.
    """
    req = Request(articleUrl, headers={'User-Agent': 'Mozilla/5.0'})
    # Read the page inside a context manager so the connection is released.
    with urlopen(req) as response:
        page = response.read()
    bs = BeautifulSoup(page, 'html.parser')

    CID = []
    for link in bs.find_all('a'):
        href = link.get('href')
        if href and link.text == "插图":
            CID.append(href[href.rfind('=') + 1:])
    CID.sort(key=cmp)
    print("CID =", CID)
    return CID
# Common prefix of every picture URL of this article; getImage() appends
# "<cid>/<pid>.jpg" to it.
pictureUrlBase = "http://picture.wenku8.com/pictures/1/" + AID + "/"
def getImage(cid, pid):
    """Download picture *pid* of chapter *cid* and save it locally.

    The picture is fetched from ``pictureUrlBase + cid + '/' + pid + '.jpg'``
    and written to a relative file whose path mirrors the URL's path
    component (``pictures/1/<AID>/<cid>/<pid>.jpg`` for the current base
    URL); missing directories are created via ``buildpath``.

    Args:
        cid: Chapter id, as a string.
        pid: Picture id, as a string.

    Returns:
        True if the picture was downloaded and written, False if the request
        failed with an ``HTTPError`` or ``URLError``.
    """
    url = pictureUrlBase + cid + '/' + pid + '.jpg'
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        response = urlopen(req)
    except URLError:
        # HTTPError is a subclass of URLError, so one handler covers both.
        return False

    with response:
        print(url, " ... success")
        # Derive the local path from the URL itself instead of slicing off a
        # fixed number of characters, so it survives a scheme/host change.
        local_path = urlsplit(url).path.lstrip('/')
        buildpath(local_path)
        with open(local_path, 'wb') as out:
            out.write(response.read())
    return True
# `cur` is intentionally kept across chapters: each chapter's search starts
# where the previous one stopped (presumably picture ids keep increasing from
# one chapter to the next -- confirm against the site).
cur = 1
for cid in getCID():
    print("now scraping:", cid)
    # Coarse probe: jump ahead in steps of 10 until one picture of this
    # chapter is found (getImage saves it as a side effect).
    # NOTE(review): this loop never ends if no picture is ever found, e.g.
    # for a chapter without pictures or while the network is down.
    while not getImage(cid, str(cur)):
        cur += 10
    # Picture `cur` is already saved, so walk backwards starting just below
    # it to collect the pictures the coarse probe jumped over.
    forward = cur - 1
    while getImage(cid, str(forward)):
        forward -= 1
    # Then walk upwards from just above `cur` until the first missing id.
    cur += 1
    while getImage(cid, str(cur)):
        cur += 1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement