Advertisement
Guest User

Comics Web Scraper

a guest
Aug 1st, 2014
191
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.56 KB | None | 0 0
  1. import requests as rq,bs4 as bs
  2. import os,glob,json
  3.  
  4.  
# Shared HTTP session: reuses TCP connections across all the scrapers below.
client = rq.Session()
  6.  
  7. def out(*s):
  8.     print(' '.join(s))
  9.  
  10. def download_file(url,filename):
  11.     r = rq.get(url, stream=True)
  12.     with open( filename , 'wb') as f:
  13.         for chunk in r.iter_content(chunk_size=1024):
  14.             if chunk: # filter out keep-alive new chunks
  15.                 f.write(chunk)
  16.                 f.flush()
  17.     return filename
  18.    
  19.    
  20. def xkcd():
  21.     COMIC = "xkcd"
  22.     if not os.path.exists(COMIC):
  23.         os.makedirs(COMIC)
  24.     current = client.get("http://xkcd.com/info.0.json")
  25.     current_comic =  json.loads(current.text)
  26.  
  27.     code = current_comic['num']
  28.     out( COMIC,"- Found %d Comics." % (code) )
  29.  
  30.     for i in range(1, int(code) + 1):
  31.         if i == 404: continue # very funny randall! :|
  32.         filename = str(i)
  33.         if not  glob.glob(os.path.join(COMIC, filename) + ".*" ):
  34.             out( COMIC, "- Downloading ", filename)
  35.             imglink = json.loads(client.get( "http://xkcd.com/%d/info.0.json" % i ).text )['img']
  36.             ext = "." + imglink.split(".")[-1]
  37.             download_file(imglink, os.path.join(COMIC, filename + ext) )
  38.        
  39.    
  40.  
  41.  
  42. def poorly_drawn_lines():
  43.     COMIC = "Poorly Drawn Lines"
  44.     if not os.path.exists(COMIC):
  45.         os.makedirs(COMIC)
  46.  
  47.     response = client.get('http://poorlydrawnlines.com/archive/')
  48.     html = bs.BeautifulSoup(response.text)
  49.     comic_links = dict( map(lambda x: (x.text,x.get('href')) , filter(lambda x : '/comic/' in x.get('href'),html.findAll('a'))) )
  50.     out( COMIC,"- Found %d Comics." % (len(comic_links)) )
  51.    
  52.     for comic in sorted(comic_links) :
  53.        
  54.         url = comic_links[comic]
  55.         filename = url.split('/')[-2]
  56.         if not glob.glob(os.path.join(COMIC, filename) + ".*" ):
  57.             out( COMIC,"- Downloading ", filename )
  58.             html = bs.BeautifulSoup(client.get(url).text)
  59.             post = html.find('div', {'id' : 'post'})
  60.             if post :
  61.                 imgtag = post.find('img')
  62.                 if imgtag:
  63.                     imglink = imgtag.get('src')
  64.                     ext = "." + imglink.split(".")[-1]
  65.                     download_file(imglink, os.path.join(COMIC, filename + ext) )
  66.  
  67. def clean(s):
  68.     codes = ['’','“','”','/']
  69.  
  70.     for code in codes:
  71.         s = s.replace(code,'')
  72.     s = ''.join(filter(lambda x : x.isalnum() or x.isspace(),s))
  73.     return s
  74.  
  75. def doghousediaries():
  76.     COMIC = "Doghouse Diaries"
  77.     if not os.path.exists(COMIC):
  78.         os.makedirs(COMIC)
  79.  
  80.     response = client.get('http://thedoghousediaries.com/archive')
  81.     html = bs.BeautifulSoup(response.text)
  82.  
  83.     comic_links = dict( map(lambda x: (x.text,x.get('href')) , html.findAll('a',{'rel':'bookmark'}) ))
  84.     out( COMIC,"- Found %d Comics." % (len(comic_links)) )
  85.    
  86.     for comic in sorted(comic_links) :        
  87.         url = comic_links[comic]
  88.         filename = clean(comic)
  89.         if not glob.glob(os.path.join(COMIC, filename) + ".*" ):
  90.             out( COMIC,"- Downloading ", filename )
  91.             html = bs.BeautifulSoup(client.get(url).text)
  92.             post = html.find('div', {'class' : 'object'})
  93.             if post :
  94.                 imgtag = post.find('img')
  95.                 if imgtag:
  96.                     imglink = imgtag.get('src')
  97.                     ext = "." + imglink.split(".")[-1]
  98.                     download_file(imglink, os.path.join(COMIC, filename + ext) )
  99.  
  100. def loading_artist():
  101.     COMIC = "Loading Artist"
  102.     if not os.path.exists(COMIC):
  103.         os.makedirs(COMIC)
  104.  
  105.     response = client.get('http://www.loadingartist.com/archives/')
  106.     html = bs.BeautifulSoup(response.text)
  107.     comic_links = dict( map(lambda x: (x.text,x.get('href')) , filter(lambda x : '/comic/' in x.get('href'),html.findAll('a'))) )
  108.     out( COMIC,"- Found %d Comics." % (len(comic_links)) )
  109.    
  110.     for comic in sorted(comic_links) :
  111.         url = comic_links[comic]
  112.         filename = url.split('/')[-2]
  113.  
  114.         if not glob.glob(os.path.join(COMIC, filename) + ".*" ):
  115.             out( COMIC,"- Downloading ", filename, url )
  116.             html = bs.BeautifulSoup(client.get(url).text)
  117.             post = html.find('div', {'id' : 'comic'})
  118.             if post :
  119.                 imgtag = post.find('img')
  120.                 if imgtag:
  121.                     imglink = imgtag.get('src')
  122.                     ext = "." + imglink.split(".")[-1]
  123.                     download_file(imglink, os.path.join(COMIC, filename + ext) )
  124.  
  125.  
  126.        
  127.  
  128.  
  129. poorly_drawn_lines()
  130. xkcd()
  131. doghousediaries()
  132. loading_artist()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement