Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import http.client
- import json
- import os
- from random import randint
- from time import sleep
def preppost(post):
    """Normalise one raw post record in place and return it.

    The record is a dict decoded from the site's JSON API. This function:

    * copies ``id`` to ``src_id`` and ``created_at['s']`` to ``timestamp``;
    * splits the whitespace-separated ``tags`` string into a list;
    * strips surrounding whitespace from ``description``;
    * guarantees a ``sources`` list, built either from an existing
      ``sources`` list (each entry stripped) or from a single non-blank
      ``source`` string;
    * drops the fields that are not wanted in the dump.

    Raises ``KeyError`` if ``id``, ``created_at``, ``tags`` or
    ``description`` is missing. Fields that are only removed may be absent.
    """
    post['src_id'] = post['id']
    post['timestamp'] = post['created_at']['s']
    # split() without an argument ignores repeated spaces and yields an
    # empty list (not ['']) for a post that has no tags.
    post['tags'] = post['tags'].split()
    post['description'] = post['description'].strip()

    # The API exposes either a 'sources' list or a single 'source' string;
    # both are folded into one 'sources' list.
    if 'sources' in post:
        post['sources'] = [src.strip() for src in post['sources']]
    else:
        single = post.get('source')
        # The method has to be *called*: comparing the bound method
        # `str.strip` against '' is always true and let blank sources in.
        cleaned = single.strip() if isinstance(single, str) else ''
        post['sources'] = [cleaned] if cleaned else []

    # Remove fields that are not needed in the dump. pop() with a default
    # tolerates records that lack some of them (e.g. no 'source' key when
    # the record already carried 'sources').
    unwanted = (
        'author',
        'change',
        'children',
        'creator_id',
        'fav_count',
        'has_children',
        'has_comments',
        'has_notes',
        'parent_id',
        'preview_height',
        'preview_width',
        'preview_url',
        'sample_height',
        'sample_width',
        'sample_url',
        'id',
        'created_at',
        'source',
    )
    for key in unwanted:
        post.pop(key, None)
    return post
def getposts(page):
    """Fetch one page of the post index and return the decoded JSON.

    Requests ``/post/index.json`` with ``limit=50`` for the given page
    number over the module-level ``connection``. On a transport error or
    an undecodable body the call is retried indefinitely, waiting 60
    seconds longer after every failure.
    """
    path = '/post/index.json?limit=50&page=' + str(page)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
    }
    sleeptime = 0
    while True:
        try:
            # The request must be re-sent on every attempt: a failed
            # attempt leaves no pending response to read, so retrying only
            # getresponse() could never succeed.
            connection.request('GET', path, headers=headers)
            body = connection.getresponse().read()
            return json.loads(body.decode('utf-8'))
        except (http.client.HTTPException, OSError, ValueError):
            # ValueError covers both JSON and UTF-8 decoding failures.
            # Catching specific types keeps Ctrl-C (KeyboardInterrupt)
            # able to stop the script.
            # Closing resets the connection state; http.client reconnects
            # automatically on the next request().
            connection.close()
            # Back off a little longer after each consecutive failure.
            sleeptime += 60
            print("Error, waiting %s seconds" % sleeptime)
            sleep(sleeptime)
def writejson(data, filename):
    """Write ``data`` to ``filename`` as indented, key-sorted JSON.

    An existing file is overwritten. The parent directory is created when
    it does not exist yet. Raises ``TypeError``/``ValueError`` if ``data``
    cannot be serialised and ``OSError`` if the file cannot be written.
    """
    # Serialise before touching the file so that a serialisation error
    # leaves any previous dump intact.
    text = json.dumps(data, indent=4, sort_keys=True)
    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)
    # Mode 'w' truncates an existing file, so no explicit removal is
    # needed; the context manager closes the handle even if write() fails.
    with open(filename, 'w') as outfile:
        outfile.write(text)
# All traffic goes to the proxy host, which tunnels it on to e621.net.
connection = http.client.HTTPSConnection('ftp.steamvape.ru:8080')
connection.set_tunnel('e621.net')

# The total number of pages is not known up front, so keep fetching until a
# page comes back with fewer than the 50 posts requested per page.
# Resume point of the crawl; use 1 to dump everything from the first page.
page = 1838
stop = False
while not stop:
    posts = [preppost(raw) for raw in getposts(page)]
    writejson(posts, 'pages/page_' + str(page) + '.json')
    print('Successfully dumped page', str(page))
    page += 1
    # A short page is the last one.
    stop = len(posts) < 50
    if not stop:
        # Random pause between requests.
        sleep(randint(5, 20))
Advertisement
Add Comment
Please, Sign In to add comment