Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # E-Hentai Favorites Extractor 0.1 for Python
- # 2019-07-27
# ========================== FILL IN THESE VALUES ==================================
# Replace the placeholder values below with your own before running this script.
# Every value is a string, so keep it wrapped in single or double quotes, exactly
# like the pre-filled dummy values:
__cfduid = "0000000000000000000000000000000000000000000000"
ipb_member_id = "0000000"
ipb_pass_hash = "000000000000000000000000000000000"
ipb_session_id = "000000000000000000000000000000000"

# Where to find these values:
# They are COOKIES sent with the web request of your favourites page in E-Hentai.
# You can read them in the developer console of a desktop web browser (usually
# opened with the F12 key).
#   - Finding the active cookies in Chrome:
#     https://developer.chrome.com/devtools/docs/resources#cookies
#   - The same, but in Firefox:
#     https://developer.mozilla.org/en-US/docs/Tools/Network_Monitor#UI_overview
# Whatever browser you use, copy the cookie values that were used for the
# "https://e-hentai.org/favorites.php" page. You don't need all of the cookies,
# only the ones whose names match the variables above.

# Absolute path of the output file. Use forward slashes:
FAVORITES_PATH = 'D:/E-Hentai Favorites 2019-07-27.json'
# =================================================================================
import datetime
import json
import re
from time import sleep, time

from requests import Session
# One shared HTTP session, so every request sends the same User-Agent and the
# cookies configured at the top of this file.
session = Session()
session.headers.update(
    {'User-Agent': 'Mozilla/5.0 (compatible; python-favorites-extractor/0.1;)'}
)
session.cookies.update(
    {
        '__cfduid': __cfduid,
        'ipb_session_id': ipb_session_id,
        'ipb_member_id': ipb_member_id,
        'ipb_pass_hash': ipb_pass_hash,
    }
)

# Endpoints used by the extractor.
FAVORITES_URL = 'https://e-hentai.org/favorites.php'
GALLERY_URL = 'https://e-hentai.org/g/%s/%s/'  # Filled in with (gallery ID, gallery token).
API_URL = 'https://api.e-hentai.org/api.php'

# Category name -> {gallery ID -> data collected for that gallery}.
allMyFavorites = {}
# Gallery ID -> the category dictionary (a value of allMyFavorites) that holds it.
idToCategory = {}
def listChunks(l, n):
    """Return a generator over successive n-sized chunks (slices) of l.

    Based on https://stackoverflow.com/a/312464.

    l: a sliceable sequence (list, tuple, string...).
    n: maximum chunk size; the last chunk may be shorter.

    Raises ValueError right away if n is smaller than 1. A negative step
    would otherwise make range() produce nothing, silently dropping every
    item of l.
    """
    if n < 1:
        raise ValueError('chunk size must be at least 1, got %r' % (n,))
    return (l[start:start + n] for start in range(0, len(l), n))
def waitDelay(start, seconds):
    """Sleep until at least `seconds` seconds have passed since `start`.

    start: a time() timestamp taken when the timed operation began.
    seconds: minimum total duration, counted from `start`.

    Returns immediately when that much time has already gone by.
    """
    # Sleep for what is still missing, not for what has already elapsed;
    # otherwise a fast request would be followed by an equally short pause.
    remaining = seconds - (time() - start)
    if remaining > 0:
        sleep(remaining)
def log(*messages):
    """Print the given messages on one line, prefixed with 'X-EXTRACTOR:'."""
    print('X-EXTRACTOR:', *messages)
def main():
    """Back up all favorited galleries to a JSON file.

    Steps:
      1) Fetch the main favorites page and read the total number of
         galleries and pages from it.
      2) Walk every favorites page, collecting each gallery's ID, token,
         category name and favorited time, then ask the API for the
         metadata of the galleries found on that page.
      3) Write everything that was collected to FAVORITES_PATH.

    A favorites page that fails to load stops the scraping early; whatever
    was collected up to that point is still written to the output file.

    Raises:
        Exception: if the main favorites page cannot be fetched, or if the
            total number of favorites cannot be found in it.
    """
    # 1) Load the main favorites page.
    log('Starting ---')
    r = session.get(FAVORITES_URL, timeout=10)
    if not r.ok:
        raise Exception('Failed to GET the main favorites page')
    pageHTML = r.text

    # Total number of galleries.
    totalMatch = re.search(r'Showing ([0-9,]+) results', pageHTML)
    if totalMatch is None:
        raise Exception(
            'Could not find the total number of favorites in the page '
            '(are the cookie values correct?)'
        )
    totalGalleries = totalMatch.group(1).replace(',', '')
    log('Total favorited galleries: ' + totalGalleries)

    # Total number of pages. Page links are zero-indexed, hence the "1 +".
    # When there are no page links at all, there is only the main page.
    highestPageIndex = max(
        (int(match.group(1)) for match in re.finditer(r'favorites\.php\?page=(\d+)', pageHTML)),
        default=0,
    )
    totalPages = 1 + highestPageIndex
    log('Total favorite pages: ', str(totalPages), '\n')

    # 2) Begin (gently) scraping each favorites page.
    START_PAGE = 1  # Begin at the first page (aka "main page").

    # Pattern to find some gallery data (see https://ehwiki.org/wiki/API#Gallery_Tokens).
    # Groups: gallery ID, gallery token, category name, favorited time.
    idTokenCatPattern = re.compile(r'onclick="popUp.*?gid=(.*?)&t=(.*?)&act=addfav.*?title="(.*?)".*?glfav">(.*?)<')

    for currentPage in range(START_PAGE - 1, totalPages):
        log('Fetching page %i / %i...' % ((currentPage + 1), totalPages))
        startTime = time()

        # The main page ("favorites.php") is also the first page (page zero, as
        # pages are zero-indexed), so it only needs fetching for later pages.
        if currentPage > 0:
            r = session.get(FAVORITES_URL + '?page=' + str(currentPage), timeout=10)
            if not r.ok:
                log('Failed to get favorites for page %i' % (currentPage + 1))
                log('>', r.text)
                break
            pageHTML = r.text
            waitDelay(startTime, 3.0)  # Limit seems to be 2.5 seconds, so we use 3 seconds to be safe.

        # Read all gallery IDs, tokens and their categories (eg favorites 1, 2, 3) in this favorites page.
        # Only the text from the gallery table marker up to the next '</table' is searched.
        startIndex = pageHTML.find('class="itg gltm"')
        tableHTML = pageHTML[startIndex : pageHTML.find('</table', startIndex)]

        galleriesToQuery = []
        for match in idTokenCatPattern.finditer(tableHTML):
            galleryId, token, categoryName, favoritedTime = match.groups()
            favCategory = allMyFavorites.setdefault(categoryName, {})
            if galleryId in favCategory:
                log('Duplicate gallery found and ignored')
                continue
            favCategory[galleryId] = {'favorited': favoritedTime, 'url': GALLERY_URL % (galleryId, token)}
            idToCategory[galleryId] = favCategory
            galleriesToQuery.append((galleryId, token))

        # Use the E-H API to get the metadata of the galleries in this page.
        log('Galleries in this page:', str(len(galleriesToQuery)))
        totalObtained = 0
        for requestIndex, chunk in enumerate(listChunks(galleriesToQuery, 25), 1):
            r = session.post(API_URL, json={'method': 'gdata', 'namespace': 1, 'gidlist': chunk}, timeout=10)
            if r.ok:
                # A response without 'gmetadata' counts as "nothing obtained"
                # (reported by the warning below) instead of aborting the run.
                metaItems = r.json().get('gmetadata', [])
                totalObtained += len(metaItems)
                for galleryMeta in metaItems:
                    galleryId = str(galleryMeta['gid'])
                    idToCategory[galleryId][galleryId].update(galleryMeta)
                if requestIndex % 4 == 0:
                    log('4 requests done, delaying for 5 seconds...')
                    sleep(5.0)
            else:
                log('Chunk request failed (HTTP %i)' % r.status_code)
                log(r.text, '\n')
                log('Delaying for 5 seconds...')
                sleep(5.0)

        log('Obtained metadata for: %i' % totalObtained, '\n')
        if totalObtained != len(galleriesToQuery):
            log('WARNING!')
            log('Could not find metadata for all of page %i\'s galleries' % (currentPage + 1), '\n')

        # Force at least a 1-second delay, to avoid hammering the source.
        # If you try to mess with the delays you might be IP-banned.
        # It should take less than 5 minutes for a full backup anyway.
        sleep(1.0)

    # Finally, write your favorites to the favorites file.
    output = {
        'Information': {
            'About': (
                'This is a JSON backup of your favorited galleries in '
                'your E-Hentai account, for archival purposes.'
            ),
            'Total Galleries': int(totalGalleries),
            'Total Categories': len(allMyFavorites),
            'Items Per Category': {key: len(value) for key, value in allMyFavorites.items()},
            'Archive Date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        },
        'E-Hentai Favorites': allMyFavorites,
    }
    with open(FAVORITES_PATH, 'w', encoding='utf-8') as f:
        log('Generating JSON file, please wait...')
        json.dump(output, f, indent=4, ensure_ascii=False)
    log('Created file:', FAVORITES_PATH)
    log('FINISHED ---')
# Entry point: start the extraction only when this file is run as a script.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement