imcrazytwkr

Dumb e621 parser

Feb 10th, 2018
413
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.27 KB | None | 0 0
  1. import http.client
  2. import json
  3. import os
  4.  
  5. from random import randint
  6. from time import sleep
  7.  
  8. def preppost(post):
  9.     post['src_id'] = post['id']
  10.     post['timestamp'] = post['created_at']['s']
  11.     post['tags'] = post['tags'].split(' ')
  12.     post['description'] = post['description'].strip()
  13.  
  14.     # Sweet Luna, why do I even have to write what comes next?? >__<
  15.     if('sources' in post): post['sources'] = list(map(str.strip, post['sources']))
  16.     elif('source' in post):
  17.         post['sources'] = []
  18.         if(type(post['source']) is str and post['source'].strip != ''): post['sources'].append(post['source'])
  19.  
  20.     # Cleaning up some unnesessary trash
  21.     del(
  22.         post['author'],
  23.         post['change'],
  24.         post['children'],
  25.         post['creator_id'],
  26.         post['fav_count'],
  27.         post['has_children'],
  28.         post['has_comments'],
  29.         post['has_notes'],
  30.         post['parent_id'],
  31.         post['preview_height'],
  32.         post['preview_width'],
  33.         post['preview_url'],
  34.         post['sample_height'],
  35.         post['sample_width'],
  36.         post['sample_url'],
  37.         post['id'],
  38.         post['created_at'],
  39.         post['source']
  40.     )
  41.  
  42.     return post
  43.  
  44. def getposts(page):
  45.     connection.request('GET', '/post/index.json?limit=50&page='+str(page), headers={
  46.         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
  47.     })
  48.  
  49.     # Because sometimes shit happens, duh
  50.     result = None
  51.     sleeptime = 0
  52.     while(result is None):
  53.         try: result = json.loads(connection.getresponse().read().decode('utf-8'))
  54.         except:
  55.             # Yes, we increase waiting time on every error because reasons
  56.             sleeptime += 60
  57.             print("Error, waiting %s seconds", (str(sleeptime)))
  58.             sleep(sleeptime)
  59.  
  60.     return result
  61.  
  62. def writejson(data, filename):
  63.     if(os.path.isfile(filename)): os.remove(filename)
  64.     outfile = open(filename, 'w')
  65.     outfile.write(json.dumps(data, indent=4, sort_keys=True))
  66.     outfile.close()
  67.  
  68. # Proxy because reasons
  69. connection = http.client.HTTPSConnection('ftp.steamvape.ru:8080')
  70. connection.set_tunnel('e621.net')
  71.  
  72. # Since we don't know the exact quantity of pages, we have to use while
  73. stop = False
  74. #page = 1
  75. page = 1838
  76. while(stop == False):
  77.     posts = list(map(preppost, getposts(page)))
  78.     writejson(posts, 'pages/page_'+str(page)+'.json')
  79.     print('Successfully dumped page', str(page))
  80.     page += 1
  81.     if(len(posts) < 50): stop = True
  82.     else: sleep(randint(5,20))
Advertisement
Add Comment
Please, Sign In to add comment