Advertisement
Guest User

E-Hentai Favorites Extractor

a guest
Jul 27th, 2019
1,148
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.62 KB | None | 0 0
  1. # E-Hentai Favorites Extractor 0.1 for Python
  2. # 2019-07-27
  3.  
  4. # ========================== FILL IN THESE VALUES ==================================
  5.  
  6. # Before running this script, replace the values below with your own.
  7. # All the values are of string type, so they should be wrapped in either single or
  8. # double quotes. Just follow the example of the pre-filled, dummy values below:
  9.  
  10. __cfduid = "0000000000000000000000000000000000000000000000"  # sent as the '__cfduid' cookie
  11. ipb_member_id = "0000000"  # sent as the 'ipb_member_id' cookie
  12. ipb_pass_hash = "000000000000000000000000000000000"  # sent as the 'ipb_pass_hash' cookie
  13. ipb_session_id = "000000000000000000000000000000000"  # sent as the 'ipb_session_id' cookie
  14.  
  15. # Those values must be taken from the COOKIES of the web request of your
  16. # favourites page in E-Hentai. You can see them by using the developer console
  17. # of your web browser.
  18. # This developer console can only be found in desktop browsers (usually F12 key).
  19.  
  20. # How to find the active cookies in Chrome:
  21. # https://developer.chrome.com/devtools/docs/resources#cookies
  22.  
  23. # The same, but in Firefox:
  24. # https://developer.mozilla.org/en-US/docs/Tools/Network_Monitor#UI_overview
  25.  
  26. # No matter what browser you're using, you want to copy the cookie values that were
  27. # used for the "https://e-hentai.org/favorites.php" page.
  28. # You don't need them all, just the ones required to fill in those names above.
  29.  
  30. # Absolute path to save the output file (opened in 'w' mode, so an existing file is overwritten). Use forward slashes:
  31. FAVORITES_PATH = 'D:/E-Hentai Favorites 2019-07-27.json'
  32.  
  33. # =================================================================================
  34.  
  35. import re
  36. import json
  37. from time import time, sleep
  38. from requests import Session
  39.  
  40. session = Session()
  41. session.headers['User-Agent'] = 'Mozilla/5.0 (compatible; python-favorites-extractor/0.1;)'
  42. session.cookies.update(
  43.     {
  44.         '__cfduid': __cfduid,
  45.         'ipb_session_id': ipb_session_id,
  46.         'ipb_member_id': ipb_member_id,
  47.         'ipb_pass_hash': ipb_pass_hash
  48.     }
  49. )
  50. FAVORITES_URL = 'https://e-hentai.org/favorites.php'
  51. GALLERY_URL = 'https://e-hentai.org/g/%s/%s/'
  52. API_URL = 'https://api.e-hentai.org/api.php'
  53.  
  54. allMyFavorites = { } # Dictionary of favorite categories.
  55. idToCategory = { } # Dictionary of gallery IDs mapping to their favorite categories.
  56.  
  57.  
  58. def listChunks(l, n):
  59.     # From (https://stackoverflow.com/a/312464).
  60.     # Yield successive n-sized chunks from l.
  61.     for i in range(0, len(l), n):
  62.         yield l[i:i + n]
  63.    
  64.        
  65. def waitDelay(start, seconds):
  66.     elapsed = time() - start
  67.     if elapsed < seconds:
  68.         sleep(max(elapsed, 0.1))
  69.  
  70.        
  71. def log(*messages):
  72.     print(*(['X-EXTRACTOR:'] + list(messages)))
  73.  
  74.  
  75. def main():
  76.     # 1) Load the main favorites page.
  77.    
  78.     log('Starting ---')
  79.     r = session.get(FAVORITES_URL)
  80.     if r.ok:
  81.         pageHTML = r.text
  82.     else:
  83.         raise Exception('Failed to GET the main favorites page')
  84.     # Total number of galleries.
  85.     totalGalleries = re.search('Showing ([0-9,]+) results', pageHTML).group(1)
  86.     totalGalleries = totalGalleries.replace(',', '')
  87.     log('Total favorited galleries: ' + totalGalleries)
  88.  
  89.     # Total number of pages.
  90.     temp = pageHTML
  91.     totalPages = 1 + max(int(match.group(1)) for match in re.finditer('favorites\.php\?page=(\d+)', pageHTML))
  92.     log('Total favorite pages: ', str(totalPages), '\n')
  93.        
  94.     # 2) Begin (gently) scraping each favorites page.
  95.  
  96.     START_PAGE = 1 # Begin at the first page (aka "main page").
  97.    
  98.     # Pattern to find some gallery data (see https://ehwiki.org/wiki/API#Gallery_Tokens).
  99.     idTokenCatPattern = re.compile('onclick="popUp.*?gid=(.*?)&amp;t=(.*?)&amp;act=addfav.*?title="(.*?)".*?glfav">(.*?)<')
  100.  
  101.     for currentPage in range(START_PAGE - 1, totalPages):
  102.         log('Fetching page %i / %i...' % ((currentPage + 1), totalPages))
  103.  
  104.         startTime = time()
  105.        
  106.         if currentPage > 0:
  107.             r = session.get(FAVORITES_URL + '?page=' + str(currentPage), timeout=10)
  108.             if r.ok:
  109.                 pageHTML = r.text
  110.             else:
  111.                 log('Failed to get favorites for page %i' % (currentPage + 1))
  112.                 log('>', r.text)
  113.                 break
  114.      
  115.             waitDelay(startTime, 3.0) # Limit seems to be 2.5 seconds, so we use 3 seconds to be safe.
  116.         else:
  117.             # The main page ("favorites.php") is also the first page (page zero, as pages are zero-indexed).
  118.             pass    
  119.  
  120.         # Read all gallery IDs, tokens and their categories (eg favorites 1, 2, 3) in this favorites page.
  121.         startIndex = pageHTML.find('class="itg gltm"')
  122.         pageHTML = pageHTML[startIndex : pageHTML.find('</table', startIndex)]
  123.        
  124.         galleriesToQuery = [ ]
  125.        
  126.         for match in idTokenCatPattern.finditer(pageHTML):
  127.             id, token, categoryName, favoritedTime = match.groups()
  128.             favCategory = allMyFavorites.setdefault(categoryName, { })
  129.             if id not in favCategory:
  130.                 favCategory[id] = {'favorited': favoritedTime, 'url': GALLERY_URL % (id, token)}
  131.                 idToCategory[id] = favCategory
  132.                 galleriesToQuery.append((id, token))
  133.             else:
  134.                 log('Duplicate gallery found and ignored')
  135.                
  136.         # Use the E-H API to get the metadata of the galleries in this page.
  137.         log('Galleries in this page:', str(len(galleriesToQuery)))
  138.  
  139.         requestGalleries = listChunks(galleriesToQuery, 25)
  140.         totalObtained = 0
  141.         for requestIndex, chunk in enumerate(requestGalleries, 1):
  142.             r = session.post(API_URL, json={'method': 'gdata', 'namespace': 1, 'gidlist': chunk}, timeout=10)
  143.             if r.ok:
  144.                 metaItems = r.json()['gmetadata']
  145.                 totalObtained += len(metaItems)
  146.                
  147.                 for galleryMeta in metaItems:
  148.                     id = str(galleryMeta['gid'])
  149.                     favCategory = idToCategory[id]
  150.                     favCategory[id].update(galleryMeta)
  151.                 if not (requestIndex % 4):
  152.                     log('4 requests done, delaying for 5 seconds...')
  153.                     sleep(5.0)
  154.             else:
  155.                 log('Chunk request failed (HTTP %i)' % r.status_code)
  156.                 log(r.text, '\n')
  157.                 log('Delaying for 5 seconds...')
  158.                 sleep(5.0)
  159.                
  160.         log('Obtained metadata for: %i' % totalObtained, '\n')
  161.         if totalObtained != len(galleriesToQuery):
  162.             log('WARNING!')
  163.             log('Could not find metadata for all of page %i\'s galleries' % (currentPage + 1), '\n')
  164.  
  165.         # Force at least a 1-second delay, to avoid hammering the source.
  166.         # If you try to mess with the delays you might be IP-banned.
  167.         # It should take less than 5 minutes for a full backup anyway.
  168.         sleep(1.0)
  169.  
  170.     # Finally, write your favorites to the favorites file.
  171.     import datetime
  172.     output = {
  173.         'Information': {
  174.             'About': 'This is a JSON backup of your favorited galleries in ' \
  175.             'your E-Hentai account, for archival purposes.',
  176.             'Total Galleries': int(totalGalleries),
  177.             'Total Categories': len(allMyFavorites.keys()),
  178.             'Items Per Category': {key: len(value) for key, value in allMyFavorites.items()},
  179.             'Archive Date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  180.         },
  181.         'E-Hentai Favorites': allMyFavorites
  182.     }
  183.     with open(FAVORITES_PATH, 'w', encoding='utf-8') as f:
  184.         log('Generating JSON file, please wait...')
  185.         f.write(json.dumps(output, indent=4, ensure_ascii=False))
  186.         log('Created file:', FAVORITES_PATH)
  187.         log('FINISHED ---')
  188.  
  189.  
  190. # Entry point: main() runs only when this file is executed as a script, not when it is imported.
  191. if __name__ == '__main__':
  192.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement