# E-Hentai Favorites Extractor

# E-Hentai Favorites Extractor 0.1 for Python
# 2019-07-27

# ========================== FILL IN THESE VALUES ==================================

# Before running this script, replace the values below with your own.
# All the values are of string type, so they should be wrapped in either single or
# double quotes. Just follow the example of the pre-filled, dummy values below:

# Cookie values copied from your browser (dummy placeholders by default).
# They are loaded into the HTTP session's cookie jar further down in this file.
__cfduid = "0000000000000000000000000000000000000000000000"
ipb_member_id = "0000000"
ipb_pass_hash = "000000000000000000000000000000000"
ipb_session_id = "000000000000000000000000000000000"

# These values must be taken from the COOKIES sent with the web request for your
# favorites page on E-Hentai. You can see them by using the developer console
# of your web browser.
# The developer console is only available in desktop browsers (usually the F12 key).

# How to find the active cookies in Chrome:
# https://developer.chrome.com/devtools/docs/resources#cookies

# The same, but in Firefox:
# https://developer.mozilla.org/en-US/docs/Tools/Network_Monitor#UI_overview

# No matter what browser you're using, you want to copy the cookie values that were
# used for the "https://e-hentai.org/favorites.php" page.
# You don't need them all, just the ones required to fill in those names above.

# Absolute path of the JSON output file. Use forward slashes.
# The file is opened in write mode, so an existing file at this path is overwritten.
FAVORITES_PATH = 'D:/E-Hentai Favorites 2019-07-27.json'

# =================================================================================

import re
import json
from time import time, sleep
from requests import Session

# One HTTP session shared by every request, so the account cookies and the
# User-Agent header are configured in a single place.
session = Session()
session.headers.update(
    {'User-Agent': 'Mozilla/5.0 (compatible; python-favorites-extractor/0.1;)'}
)
session.cookies.update(
    dict(
        __cfduid=__cfduid,
        ipb_session_id=ipb_session_id,
        ipb_member_id=ipb_member_id,
        ipb_pass_hash=ipb_pass_hash,
    )
)

# Endpoints used by the extractor.
FAVORITES_URL = 'https://e-hentai.org/favorites.php'
GALLERY_URL = 'https://e-hentai.org/g/%s/%s/'  # Filled with (gallery ID, gallery token).
API_URL = 'https://api.e-hentai.org/api.php'

# Scraped results, filled in by main().
allMyFavorites = dict()  # Category name -> {gallery ID -> gallery data}.
idToCategory = dict()  # Gallery ID -> the category dictionary that holds it.


def listChunks(l, n):
    """Yield consecutive slices of `l`, each holding at most `n` items.

    Adapted from https://stackoverflow.com/a/312464. The last slice is
    shorter when len(l) is not a multiple of n.
    """
    sliceStarts = range(0, len(l), n)
    yield from (l[begin:begin + n] for begin in sliceStarts)


def waitDelay(start, seconds):
    """Sleep until at least `seconds` have passed since `start`.

    Args:
        start: Timestamp, as returned by time(), marking when the rate-limited
            operation began.
        seconds: Minimum number of seconds that must separate `start` from
            the moment this function returns.

    Returns immediately when that much time has already gone by.
    """
    # Sleep only for what is still missing from the requested delay; sleeping
    # for the elapsed time instead would make fast requests wait the least.
    remaining = seconds - (time() - start)
    if remaining > 0:
        sleep(remaining)


def log(*messages):
    """Print the given messages on one line, prefixed with 'X-EXTRACTOR:'."""
    print('X-EXTRACTOR:', *messages)


def main():
    """Back up the account's favorited galleries to a JSON file.

    Steps:
      1. Fetch the main favorites page and read the total number of
         galleries and of favorites pages from it.
      2. Fetch every favorites page, extract the ID, token, category name
         and favorited time of each gallery, then request the metadata of
         those galleries from the API in chunks of 25.
      3. Write the collected data to FAVORITES_PATH.

    Results accumulate in the module-level `allMyFavorites` and
    `idToCategory` dictionaries.

    Raises:
        Exception: If the main favorites page cannot be fetched, or if it
            does not contain the "Showing N results" text.
    """
    import datetime

    # 1) Load the main favorites page.

    log('Starting ---')
    r = session.get(FAVORITES_URL, timeout=10)
    if not r.ok:
        raise Exception('Failed to GET the main favorites page')
    pageHTML = r.text

    # Total number of galleries.
    totalMatch = re.search(r'Showing ([0-9,]+) results', pageHTML)
    if totalMatch is None:
        raise Exception(
            'Could not find the total number of favorites in the main '
            'favorites page (are the cookie values correct?)'
        )
    totalGalleries = totalMatch.group(1).replace(',', '')
    log('Total favorited galleries: ' + totalGalleries)

    # Total number of pages. Page links are zero-indexed, so the count is the
    # highest linked index plus one. A page without any "?page=" link is
    # treated as a single page of favorites.
    lastPageIndex = max(
        (int(match.group(1)) for match in re.finditer(r'favorites\.php\?page=(\d+)', pageHTML)),
        default=0
    )
    totalPages = 1 + lastPageIndex
    log('Total favorite pages: ', str(totalPages), '\n')

    # 2) Begin (gently) scraping each favorites page.

    START_PAGE = 1 # Begin at the first page (aka "main page").

    # Pattern to find some gallery data (see https://ehwiki.org/wiki/API#Gallery_Tokens).
    idTokenCatPattern = re.compile('onclick="popUp.*?gid=(.*?)&amp;t=(.*?)&amp;act=addfav.*?title="(.*?)".*?glfav">(.*?)<')

    for currentPage in range(START_PAGE - 1, totalPages):
        log('Fetching page %i / %i...' % ((currentPage + 1), totalPages))

        startTime = time()

        # The main page ("favorites.php") is also the first page (page zero,
        # as pages are zero-indexed), and it is already loaded in pageHTML.
        if currentPage > 0:
            r = session.get(FAVORITES_URL + '?page=' + str(currentPage), timeout=10)
            if not r.ok:
                log('Failed to get favorites for page %i' % (currentPage + 1))
                log('>', r.text)
                break
            pageHTML = r.text

            waitDelay(startTime, 3.0) # Limit seems to be 2.5 seconds, so we use 3 seconds to be safe.

        # Read all gallery IDs, tokens and their categories (eg favorites 1, 2, 3) in this favorites page.
        # If the table marker is missing, find() returns -1 and the slice
        # below is empty, so no galleries are matched for this page.
        startIndex = pageHTML.find('class="itg gltm"')
        pageHTML = pageHTML[startIndex : pageHTML.find('</table', startIndex)]

        galleriesToQuery = [ ]

        for match in idTokenCatPattern.finditer(pageHTML):
            galleryId, token, categoryName, favoritedTime = match.groups()
            favCategory = allMyFavorites.setdefault(categoryName, { })
            if galleryId not in favCategory:
                favCategory[galleryId] = {'favorited': favoritedTime, 'url': GALLERY_URL % (galleryId, token)}
                idToCategory[galleryId] = favCategory
                galleriesToQuery.append((galleryId, token))
            else:
                log('Duplicate gallery found and ignored')

        # Use the E-H API to get the metadata of the galleries in this page.
        log('Galleries in this page:', str(len(galleriesToQuery)))

        totalObtained = 0
        for requestIndex, chunk in enumerate(listChunks(galleriesToQuery, 25), 1):
            r = session.post(API_URL, json={'method': 'gdata', 'namespace': 1, 'gidlist': chunk}, timeout=10)
            if r.ok:
                metaItems = r.json()['gmetadata']
                totalObtained += len(metaItems)

                # Merge the API metadata into the entry created while scraping.
                for galleryMeta in metaItems:
                    galleryId = str(galleryMeta['gid'])
                    idToCategory[galleryId][galleryId].update(galleryMeta)
                if not (requestIndex % 4):
                    log('4 requests done, delaying for 5 seconds...')
                    sleep(5.0)
            else:
                log('Chunk request failed (HTTP %i)' % r.status_code)
                log(r.text, '\n')
                log('Delaying for 5 seconds...')
                sleep(5.0)

        log('Obtained metadata for: %i' % totalObtained, '\n')
        if totalObtained != len(galleriesToQuery):
            log('WARNING!')
            log('Could not find metadata for all of page %i\'s galleries' % (currentPage + 1), '\n')

        # Force at least a 1-second delay, to avoid hammering the source.
        # If you try to mess with the delays you might be IP-banned.
        # It should take less than 5 minutes for a full backup anyway.
        sleep(1.0)

    # Finally, write your favorites to the favorites file.
    output = {
        'Information': {
            'About': (
                'This is a JSON backup of your favorited galleries in '
                'your E-Hentai account, for archival purposes.'
            ),
            'Total Galleries': int(totalGalleries),
            'Total Categories': len(allMyFavorites),
            'Items Per Category': {key: len(value) for key, value in allMyFavorites.items()},
            'Archive Date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        },
        'E-Hentai Favorites': allMyFavorites
    }
    # Serialize before opening the file, so that a serialization error cannot
    # leave a truncated file behind.
    log('Generating JSON file, please wait...')
    serialized = json.dumps(output, indent=4, ensure_ascii=False)
    with open(FAVORITES_PATH, 'w', encoding='utf-8') as f:
        f.write(serialized)
    log('Created file:', FAVORITES_PATH)
    log('FINISHED ---')


# Entry point: run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()