Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Standard library.
import glob
import json
import os

# Third-party: HTTP client and HTML parser.
import requests as rq
import bs4 as bs

# One shared Session so keep-alive connections are reused across all
# requests to the same host.
client = rq.Session()
def out(*parts):
    """Print the given string fragments separated by single spaces."""
    print(*parts)
def download_file(url, filename):
    """Stream the resource at *url* to *filename* on disk.

    Downloads in 1 KiB chunks so large images never sit fully in memory.
    Returns *filename* for convenience.

    Raises:
        requests.HTTPError: for 4xx/5xx responses, so an HTML error page
            is never silently saved with an image extension (the original
            wrote whatever body the server returned).
    """
    r = rq.get(url, stream=True)
    r.raise_for_status()  # fail loudly instead of saving an error body
    with open(filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
        # no per-chunk flush: closing the file flushes once at the end
    return filename
def xkcd():
    """Download every xkcd comic into an ``xkcd/`` folder.

    Fetches the latest comic number from the JSON API, then walks comic
    ids 1..latest, skipping any id whose file already exists on disk.
    """
    comic = "xkcd"
    if not os.path.exists(comic):
        os.makedirs(comic)
    latest = json.loads(client.get("http://xkcd.com/info.0.json").text)
    newest = latest['num']
    out(comic, "- Found %d Comics." % (newest))
    for num in range(1, int(newest) + 1):
        if num == 404:
            continue  # very funny randall! :|
        filename = str(num)
        # Skip anything already saved under any extension.
        if glob.glob(os.path.join(comic, filename) + ".*"):
            continue
        out(comic, "- Downloading ", filename)
        meta = json.loads(client.get("http://xkcd.com/%d/info.0.json" % num).text)
        imglink = meta['img']
        ext = "." + imglink.split(".")[-1]
        download_file(imglink, os.path.join(comic, filename + ext))
def poorly_drawn_lines():
    """Download the Poorly Drawn Lines archive into its own folder."""
    comic = "Poorly Drawn Lines"
    if not os.path.exists(comic):
        os.makedirs(comic)
    archive = bs.BeautifulSoup(client.get('http://poorlydrawnlines.com/archive/').text)
    # Map comic title -> page URL, keeping only archive links to comics.
    comic_links = {}
    for anchor in archive.findAll('a'):
        if '/comic/' in anchor.get('href'):
            comic_links[anchor.text] = anchor.get('href')
    out(comic, "- Found %d Comics." % (len(comic_links)))
    for title in sorted(comic_links):
        url = comic_links[title]
        filename = url.split('/')[-2]
        # Skip anything already saved under any extension.
        if glob.glob(os.path.join(comic, filename) + ".*"):
            continue
        out(comic, "- Downloading ", filename)
        page = bs.BeautifulSoup(client.get(url).text)
        post = page.find('div', {'id': 'post'})
        if not post:
            continue
        imgtag = post.find('img')
        if not imgtag:
            continue
        imglink = imgtag.get('src')
        ext = "." + imglink.split(".")[-1]
        download_file(imglink, os.path.join(comic, filename + ext))
def clean(s):
    """Sanitize *s* for use as a filename.

    Drops curly quotes and forward slashes, then keeps only alphanumeric
    and whitespace characters.
    """
    for ch in ('’', '“', '”', '/'):
        s = s.replace(ch, '')
    return ''.join(c for c in s if c.isalnum() or c.isspace())
def doghousediaries():
    """Download The Doghouse Diaries archive into its own folder."""
    comic = "Doghouse Diaries"
    if not os.path.exists(comic):
        os.makedirs(comic)
    archive = bs.BeautifulSoup(client.get('http://thedoghousediaries.com/archive').text)
    # Map comic title -> page URL; archive entries carry rel="bookmark".
    comic_links = {}
    for anchor in archive.findAll('a', {'rel': 'bookmark'}):
        comic_links[anchor.text] = anchor.get('href')
    out(comic, "- Found %d Comics." % (len(comic_links)))
    for title in sorted(comic_links):
        url = comic_links[title]
        # Titles double as filenames here, so strip unsafe characters.
        filename = clean(title)
        if glob.glob(os.path.join(comic, filename) + ".*"):
            continue
        out(comic, "- Downloading ", filename)
        page = bs.BeautifulSoup(client.get(url).text)
        post = page.find('div', {'class': 'object'})
        if not post:
            continue
        imgtag = post.find('img')
        if not imgtag:
            continue
        imglink = imgtag.get('src')
        ext = "." + imglink.split(".")[-1]
        download_file(imglink, os.path.join(comic, filename + ext))
def loading_artist():
    """Download the Loading Artist archive into its own folder."""
    comic = "Loading Artist"
    if not os.path.exists(comic):
        os.makedirs(comic)
    archive = bs.BeautifulSoup(client.get('http://www.loadingartist.com/archives/').text)
    # Map comic title -> page URL, keeping only links to comic pages.
    comic_links = {}
    for anchor in archive.findAll('a'):
        if '/comic/' in anchor.get('href'):
            comic_links[anchor.text] = anchor.get('href')
    out(comic, "- Found %d Comics." % (len(comic_links)))
    for title in sorted(comic_links):
        url = comic_links[title]
        filename = url.split('/')[-2]
        if glob.glob(os.path.join(comic, filename) + ".*"):
            continue
        out(comic, "- Downloading ", filename, url)
        page = bs.BeautifulSoup(client.get(url).text)
        post = page.find('div', {'id': 'comic'})
        if not post:
            continue
        imgtag = post.find('img')
        if not imgtag:
            continue
        imglink = imgtag.get('src')
        ext = "." + imglink.split(".")[-1]
        download_file(imglink, os.path.join(comic, filename + ext))
# Entry point: run each scraper in turn. Guarded so importing this module
# (e.g. to reuse download_file/clean) no longer kicks off every download.
if __name__ == "__main__":
    poorly_drawn_lines()
    xkcd()
    doghousediaries()
    loading_artist()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement