Advertisement
Guest User

2ch pic parser v. 0.0.1

a guest
Jul 20th, 2015
266
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.70 KB | None | 0 0
  1. import re
  2. import urllib2
  3. from sys import argv
  4.  
  5. def error(n, e = ''):
  6.     print {
  7.         1: 'Invalid arguments or arguments is not set. Format: python picgrab.py [url] [pack]',
  8.         2: 'HTTP Error Occured: ' + e,
  9.         3: 'URL Error Occured: ' + e
  10.     }[n]
  11.     raise SystemExit
  12.  
  13. def run(url):
  14.     print('Fetching page ' + url + '...')
  15.     html = fetch(url)
  16.  
  17.     print('Retrieving image paths..')
  18.     paths = parse(html)
  19.  
  20. def fetch(url):
  21.     request = urllib2.Request(url)
  22.     request.add_header('Referer', 'http://lurkmore.to')
  23.     request.add_header('User-Agent', 'Mozilla/5.0 Firefox/39.0')
  24.  
  25.     try:
  26.         response = urllib2.urlopen(request)
  27.     except urllib2.HTTPError, e:
  28.         error(2, e.code)
  29.     except urllib2.URLError, e:
  30.         error(3, e.args)
  31.    
  32.     return response.read()
  33.  
  34. def parse(html):
  35.     img_paths = re.findall(r"/[a-z]+/src/[0-9]+/[0-9]+.jpg", html)
  36.     print(str(len(img_paths)) + ' images found. Removing duplicates...')
  37.  
  38.     img_paths = list(set(img_paths))
  39.     print(str(len(img_paths)) + ' images left after removing duplicated. Downloading started.')
  40.     load_pics(img_paths)
  41.  
  42. def load_pics(paths):
  43.     i = 1
  44.     packname = argv[2]
  45.     os.mkdir(packname)
  46.     for p in paths:
  47.         print('Fetching image ' + str(i) + ' of ' + str(len(paths)) )
  48.         f = open(packname + '/' + str(i) + '.jpg', 'wb')
  49.         f.write(urllib2.urlopen('https://2ch.hk' + p).read())
  50.         f.close()
  51.         i+=1
  52.  
  53. def init():
  54.     regexp = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
  55.     if (len(argv) == 3 and re.match(regexp, argv[1])):
  56.         run(argv[1])
  57.     else:
  58.         error(1)
  59.  
  60.  
  61. __name__ == "__main__" and init()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement