Advertisement
Guest User

Comics Web Scraper

a guest
Aug 1st, 2014
191
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.56 KB | None | 0 0
  1. import requests as rq,bs4 as bs
  2. import os,glob,json
  3.  
  4.  
# Shared HTTP session: reuses TCP connections across all the scrapers below.
client = rq.Session()
  6.  
  7. def out(*s):
  8.     print(' '.join(s))
  9.  
  10. def download_file(url,filename):
  11.     r = rq.get(url, stream=True)
  12.     with open( filename , 'wb') as f:
  13.         for chunk in r.iter_content(chunk_size=1024):
  14.             if chunk: # filter out keep-alive new chunks
  15.                 f.write(chunk)
  16.                 f.flush()
  17.     return filename
  18.    
  19.    
  20. def xkcd():
  21.     COMIC = "xkcd"
  22.     if not os.path.exists(COMIC):
  23.         os.makedirs(COMIC)
  24.     current = client.get("http://xkcd.com/info.0.json")
  25.     current_comic =  json.loads(current.text)
  26.  
  27.     code = current_comic['num']
  28.     out( COMIC,"- Found %d Comics." % (code) )
  29.  
  30.     for i in range(1, int(code) + 1):
  31.         if i == 404: continue # very funny randall! :|
  32.         filename = str(i)
  33.         if not  glob.glob(os.path.join(COMIC, filename) + ".*" ):
  34.             out( COMIC, "- Downloading ", filename)
  35.             imglink = json.loads(client.get( "http://xkcd.com/%d/info.0.json" % i ).text )['img']
  36.             ext = "." + imglink.split(".")[-1]
  37.             download_file(imglink, os.path.join(COMIC, filename + ext) )
  38.        
  39.    
  40.  
  41.  
  42. def poorly_drawn_lines():
  43.     COMIC = "Poorly Drawn Lines"
  44.     if not os.path.exists(COMIC):
  45.         os.makedirs(COMIC)
  46.  
  47.     response = client.get('http://poorlydrawnlines.com/archive/')
  48.     html = bs.BeautifulSoup(response.text)
  49.     comic_links = dict( map(lambda x: (x.text,x.get('href')) , filter(lambda x : '/comic/' in x.get('href'),html.findAll('a'))) )
  50.     out( COMIC,"- Found %d Comics." % (len(comic_links)) )
  51.    
  52.     for comic in sorted(comic_links) :
  53.        
  54.         url = comic_links[comic]
  55.         filename = url.split('/')[-2]
  56.         if not glob.glob(os.path.join(COMIC, filename) + ".*" ):
  57.             out( COMIC,"- Downloading ", filename )
  58.             html = bs.BeautifulSoup(client.get(url).text)
  59.             post = html.find('div', {'id' : 'post'})
  60.             if post :
  61.                 imgtag = post.find('img')
  62.                 if imgtag:
  63.                     imglink = imgtag.get('src')
  64.                     ext = "." + imglink.split(".")[-1]
  65.                     download_file(imglink, os.path.join(COMIC, filename + ext) )
  66.  
  67. def clean(s):
  68.     codes = ['’','“','”','/']
  69.  
  70.     for code in codes:
  71.         s = s.replace(code,'')
  72.     s = ''.join(filter(lambda x : x.isalnum() or x.isspace(),s))
  73.     return s
  74.  
  75. def doghousediaries():
  76.     COMIC = "Doghouse Diaries"
  77.     if not os.path.exists(COMIC):
  78.         os.makedirs(COMIC)
  79.  
  80.     response = client.get('http://thedoghousediaries.com/archive')
  81.     html = bs.BeautifulSoup(response.text)
  82.  
  83.     comic_links = dict( map(lambda x: (x.text,x.get('href')) , html.findAll('a',{'rel':'bookmark'}) ))
  84.     out( COMIC,"- Found %d Comics." % (len(comic_links)) )
  85.    
  86.     for comic in sorted(comic_links) :        
  87.         url = comic_links[comic]
  88.         filename = clean(comic)
  89.         if not glob.glob(os.path.join(COMIC, filename) + ".*" ):
  90.             out( COMIC,"- Downloading ", filename )
  91.             html = bs.BeautifulSoup(client.get(url).text)
  92.             post = html.find('div', {'class' : 'object'})
  93.             if post :
  94.                 imgtag = post.find('img')
  95.                 if imgtag:
  96.                     imglink = imgtag.get('src')
  97.                     ext = "." + imglink.split(".")[-1]
  98.                     download_file(imglink, os.path.join(COMIC, filename + ext) )
  99.  
  100. def loading_artist():
  101.     COMIC = "Loading Artist"
  102.     if not os.path.exists(COMIC):
  103.         os.makedirs(COMIC)
  104.  
  105.     response = client.get('http://www.loadingartist.com/archives/')
  106.     html = bs.BeautifulSoup(response.text)
  107.     comic_links = dict( map(lambda x: (x.text,x.get('href')) , filter(lambda x : '/comic/' in x.get('href'),html.findAll('a'))) )
  108.     out( COMIC,"- Found %d Comics." % (len(comic_links)) )
  109.    
  110.     for comic in sorted(comic_links) :
  111.         url = comic_links[comic]
  112.         filename = url.split('/')[-2]
  113.  
  114.         if not glob.glob(os.path.join(COMIC, filename) + ".*" ):
  115.             out( COMIC,"- Downloading ", filename, url )
  116.             html = bs.BeautifulSoup(client.get(url).text)
  117.             post = html.find('div', {'id' : 'comic'})
  118.             if post :
  119.                 imgtag = post.find('img')
  120.                 if imgtag:
  121.                     imglink = imgtag.get('src')
  122.                     ext = "." + imglink.split(".")[-1]
  123.                     download_file(imglink, os.path.join(COMIC, filename + ext) )
  124.  
  125.  
  126.        
  127.  
  128.  
  129. poorly_drawn_lines()
  130. xkcd()
  131. doghousediaries()
  132. loading_artist()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement