claim_itch
Mar 26th, 2020
'''
ClaimItch/0.4

requirements:
- python (tested on 3.8)
- requests
- beautiful soup
- lxml
- selenium
- firefox
- geckodriver

files and variables:
- SOURCES variable:   includes itch sales/collections or reddit threads you want to check, pass --recheck to recheck them
- history file:       includes the results of the current run so they can be used in future runs
                      see the HISTORY_KEYS variable
- log file

todo:
- download non-claimable games?
- login?
- proper log
- proper config
- claim() return values
- "selenium.common.exceptions.ElementNotInteractableException: Message: Element <a class="button buy_btn" href=".."> could not be scrolled into view"
- selenium's performance?
- less strict parsing / navigation (use .lower)
- pylint
- a claimable game was recorded as dl_only, was it changed? https://melessthanthree.itch.io/lucah
'''

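# Usage sketch ('claim_itch.py' stands for whatever name this script is saved
# under; flags per the argparse setup in main()):
#   python claim_itch.py                    # run with the default history file
#   python claim_itch.py my_history.json    # run with a specific history file
#   python claim_itch.py --show-history     # print a summary of the history and exit
#   python claim_itch.py --recheck          # reload game urls from SOURCES
#   python claim_itch.py --recheck-groups   # reload urls from discovered collections/sales
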
import os
import sys
import re
import json
import html
import argparse
import requests
from time import sleep, time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException


# add any itch sale/collection or reddit thread to this set
SOURCES = {
    'https://itch.io/c/757294/games-to-help-you-stay-inside',
    'https://itch.io/c/759545/self-isolation-on-a-budget',
    'https://old.reddit.com/r/FreeGameFindings/comments/fka4be/itchio_mega_thread/'
}


PATTERNS = {
    'itch_collection': r'.+itch\.io/c/.+',
    'itch_sale': r'.+itch\.io/s/.+',
    'itch_group': r'.+itch\.io/[sc]/\d+/.+', # sale or collection
    'reddit_thread': r'.+(?P<thread>reddit\.com/r/.+/comments/.+)/.+',
    'itch_game': r'(http://|https://)?(?P<game>.+\.itch\.io/[^/]+)'
}
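# Examples of what the patterns match (hypothetical urls):
#   itch_group:    https://itch.io/s/12345/some-sale or https://itch.io/c/12345/some-collection
#   reddit_thread: https://old.reddit.com/r/sub/comments/abc123/title/
#                  (the 'thread' group typically captures from 'reddit.com' through the post id)
#   itch_game:     https://some-dev.itch.io/some-game (the 'game' group drops the scheme)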


USER_AGENT = 'ClaimItch/0.4'


HISTORY_KEYS = [
    'urls',           # discovered game urls
    'claimed',        # claimed games
    'has_more',       # a sale, collection, or game that is connected to more sales
    'checked_groups', # a sale/collection that was checked for games, pass --recheck-groups to recheck it
    'dl_only',        # game is not claimable
    'downloaded',     # games that were downloaded (edit this manually)
    'buy',            # game is not free
    'removed',        # game does not exist
    'error',          # games that broke the script
]
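# Sketch of the history file that save_history() below writes: one json object
# with the keys above, each mapping to a list of urls, e.g. (urls made up):
#   {
#     "urls": ["https://some-dev.itch.io/some-game"],
#     "claimed": ["https://some-dev.itch.io/some-game"],
#     "has_more": [],
#     ...
#   }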


class ParsingError(Exception):
    def __init__(self, url, *args, **kwargs):
        breakpoint()  # drop into the debugger: the page at `url` didn't parse as expected
        self.url = url
        super().__init__(url, *args, **kwargs)


def extract_from_itch_group(group_page):
    '''
    INPUT  html sale or collection page
    OUTPUT urls of all games, urls of games noted as being connected to more sales
    '''
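    # Expected markup, a simplified sketch of an itch.io grid cell (the
    # selectors below are the source of truth, this html is only illustrative):
    #   <div class="game_cell">
    #     <a href="https://some-dev.itch.io/some-game">...</a>
    #     <div class="blurb_outer">...</div>   <- only on games tied to more sales
    #   </div>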
    soup = BeautifulSoup(group_page, 'lxml')
    urls, more = set(), set()
    games = soup.find_all('div', class_='game_cell')
    for game in games:
        url = game.find('a').get('href')
        urls.add(url)
        if game.find('div', class_='blurb_outer') is not None:
            more.add(url)
    return urls, more


def get_from_itch_group(group_url, sleep_time=15, max_page=None, sale=False):
    '''
    INPUT  itch.io sale or collection url
    OUTPUT see extract_from_itch_group
    '''
    if sale:
        max_page = 1 # sales don't seem to have pages
    page = 1
    urls = set()
    has_more = set()
    while max_page is None or page <= max_page:
        print(f' getting page {page}')
        params = {'page': page} if not sale else None
        res = requests.get(group_url, params=params)
        if res.status_code == 404:
            break
        elif res.status_code != 200:
            breakpoint()  # unexpected status code, inspect before bailing out
            break
        page += 1
        new_urls, new_more = extract_from_itch_group(res.text)
        urls.update(new_urls)
        has_more.update(new_more)
        print(f' sleeping for {sleep_time}s')
        sleep(sleep_time)
    print(f' got {len(urls)} games')
    return urls, has_more


def get_from_reddit_thread(url, sleep_time=15):
    '''
    INPUT  reddit thread url
    OUTPUT itch.io game urls, itch.io groups (sales, collections)
    '''
    global USER_AGENT, PATTERNS

    # https://www.reddit.com/dev/api#GET_comments_{article}
    json_url = f"https://{re.match(PATTERNS['reddit_thread'], url)['thread']}.json?threaded=false"
    urls = set()
    has_more = set()
    res = requests.get(json_url, headers={'User-Agent': USER_AGENT})
    if res.status_code != 200:
        res.raise_for_status()
    data = res.json()
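    # Shape of the comments json, per the reddit api doc linked above (a sketch):
    # a two-element array of Listings, the first holding the submission ('t3'),
    # the second the comments ('t1'); ?threaded=false flattens replies into one list.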
    for listing in data:
        if listing['kind'].lower() != 'listing':
            raise ParsingError(json_url)
        children = listing['data']['children']
        for child in children:
            text = None
            if child['kind'] == 't3':
                text = child['data']['selftext_html']
            elif child['kind'] == 't1':
                text = child['data']['body_html']
            else:
                raise ParsingError(json_url)
            soup = BeautifulSoup(html.unescape(text), 'lxml')
            new_urls = set(a.get('href') for a in soup.find_all('a'))
            urls.update(url for url in new_urls if re.match(PATTERNS['itch_game'], url))
            has_more.update(url for url in new_urls if re.match(PATTERNS['itch_group'], url))
    print(f' got {len(urls)} games | {len(has_more)} collections/sales')
    print(f' sleeping for {sleep_time}s')
    sleep(sleep_time)
    return urls, has_more


def get_urls(url, sleep_time=15, max_page=None):
    global PATTERNS

    print(f'getting games from {url}')
    if re.match(PATTERNS['itch_collection'], url):
        return get_from_itch_group(url, sleep_time, max_page)
    elif re.match(PATTERNS['itch_sale'], url):
        return get_from_itch_group(url, sleep_time, sale=True)
    elif re.match(PATTERNS['reddit_thread'], url):
        return get_from_reddit_thread(url, sleep_time)
    else:
        breakpoint()  # unrecognized source url, inspect before failing
        raise NotImplementedError(f'{url} is not supported')


def claim(url, driver):
    '''
    INPUTS
      url     game url
      driver  a webdriver for a browser that is logged in to itch.io
    OUTPUT
      status
        'claimed'           success
        'dl_only'           cannot be claimed
        'buy'               not for sale
        'claimed has_more'  success, and indicates that the game is connected to another sale
        'removed'           game does not exist
    '''
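    # Rough flow (mirrors the blocks below): removed page? -> already owned? ->
    # read the buy button text ('Download Now' -> dl_only, 'buy now' -> buy,
    # 'Download or claim' -> go to /purchase, skip payment, click the claim button).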
    global PATTERNS
    print(f'handling {url}')

    driver.get(f"https://{re.search(PATTERNS['itch_game'], url)['game']}")
    original_window = driver.current_window_handle
    assert len(driver.window_handles) == 1

    # removed game
    try:
        driver.find_element_by_css_selector('div.not_found_game_page')
        return 'removed'
    except NoSuchElementException:
        pass

    # already owned
    try:
        if 'You own this' in driver.find_element_by_css_selector('div.purchase_banner_inner h2').get_attribute('textContent'):
            print(f' already claimed: {url}')
            return 'claimed'
    except NoSuchElementException:
        pass

    # check if claimable
    try:
        buy = driver.find_element_by_css_selector('div.buy_row a.buy_btn')
    except NoSuchElementException:
        try:
            buy = driver.find_element_by_css_selector('section.game_download a.buy_btn')
        except NoSuchElementException:
            driver.find_element_by_css_selector('div.uploads')
            print(f' download only: {url}')
            return 'dl_only'
    if 'Download Now' in buy.get_attribute('textContent'):
        print(f' download only: {url}')
        return 'dl_only'
    elif 'buy now' in buy.get_attribute('textContent').lower():
        print(f' buy: {url}')
        return 'buy'
    # claim
    elif 'Download or claim' in buy.get_attribute('textContent'):
        #buy.location_once_scrolled_into_view
        #buy.click()
        driver.get(f'{url}/purchase')
        no_thanks = driver.find_element_by_css_selector('a.direct_download_btn')
        if 'No thanks, just take me to the downloads' in no_thanks.get_attribute('textContent'):
            no_thanks.click()

            # in case the download page opens in a new window
            sleep(1)
            if len(driver.window_handles) > 1:
                new_handle = None
                for window_handle in driver.window_handles:
                    if window_handle != original_window:
                        new_handle = window_handle
                        break
                driver.close()
                driver.switch_to.window(new_handle)

            claim_btn = driver.find_element_by_css_selector('div.claim_to_download_box form button')
            if 'claim' in claim_btn.get_attribute('textContent').lower():
                claim_btn.click()
                message = driver.find_element_by_css_selector('div.game_download_page div.inner_column p')
                if 'for the promotion' in message.get_attribute('textContent'):
                    print(f' part of a sale: {url}')
                    return 'claimed has_more'
                elif 'You claimed this game' in message.get_attribute('textContent'):
                    print(f' just claimed: {url}')
                    return 'claimed'
                else:
                    raise ParsingError(url)
            else:
                raise ParsingError(url)
        else:
            raise ParsingError(url)
    else:
        raise ParsingError(url)


def create_driver():
    options = webdriver.firefox.options.Options()
    # 2 = block image loading, to speed up page loads
    options.set_preference('permissions.default.image', 2)
    if os.path.exists('geckodriver.exe'):
        driver = webdriver.Firefox(options=options, executable_path='geckodriver.exe')
    else:
        # geckodriver should be in PATH
        driver = webdriver.Firefox(options=options)
    driver.implicitly_wait(10)
    return driver


def log(name, data):
    with open(name, 'a') as f:
        for k, v in data.items():
            f.write(k + ': ' + str(v) + '\n')


def load_history(name):
    global HISTORY_KEYS

    try:
        with open(name, 'r') as f:
            data = json.load(f)
        print(f'loaded history from file {name}')
    except FileNotFoundError:
        data = dict()
        print(f'new history file will be created: {name}')
    history = {k: set(data.get(k, [])) for k in HISTORY_KEYS}
    return history


def save_history(name, data):
    print(f'writing history to file {name}')
    with open(name, 'w') as f:
        json.dump({k: list(v) for k, v in data.items()}, f, indent=2)


def print_summary(history_file, history):
    global SOURCES, PATTERNS

    print('\nSUMMARY')

    if not os.path.exists(history_file):
        print(f'No history is stored in {history_file}')
        return

    print(f'History stored in {history_file}')
    print()

    print(f'Using {len(SOURCES)} main sources (use --recheck to recheck them)')
    print(f"Discovered {len(history['urls'])} games")
    print(f"Claimed {len(history['claimed'])} games")
    not_processed = history['urls'].difference(*map(history.get, ('claimed', 'dl_only', 'downloaded', 'buy', 'removed')))
    print(f"{len(not_processed)} games should be claimed on the next run")
    print()

    itch_groups = set(filter(re.compile(PATTERNS['itch_group']).match, history['has_more']))
    itch_games = set(filter(re.compile(PATTERNS['itch_game']).match, history['has_more']))
    print(f"{len(itch_groups)} discovered collections / sales should be checked on the next run")
    print(f"{len(history['checked_groups'])} discovered collections / sales were checked (use --recheck-groups to recheck them)")
    print(f"{len(itch_games)} discovered games are connected to sales that may not have been checked")
    print(f"{len(history['removed'])} games were removed or invalid")
    print()

    print(f"Download {len(history['dl_only'])} non-claimable games manually:")
    for url in history['dl_only']:
        print(f'  {url}')
    print(f"{len(history['downloaded'])} games were marked as downloaded (to mark games: move them in the history file from 'dl_only' to 'downloaded')")
    print()

    print(f"Buy {len(history['buy'])} non-free games:")
    for url in history['buy']:
        print(f'  {url}')
    print()


def get_urls_and_update_history(history, sources, itch_groups):
    '''
    INPUT
      history      a dict that'll be updated as `sources` are processed
      sources      sources to get links from
      itch_groups  itch sales/collections in `sources` that should be marked as checked in `history`
    '''
    for i, source in enumerate(sources):
        print(f'{i+1}/{len(sources)}')
        new_urls, new_more = get_urls(source)
        history['urls'].update(new_urls)
        history['has_more'].update(new_more)
    history['checked_groups'].update(itch_groups)
    history['has_more'].difference_update(history['checked_groups'])


def main():
    global SOURCES, PATTERNS

    run_time = int(time())
    script_name = os.path.basename(os.path.splitext(sys.argv[0])[0])
    log_file = f'{script_name}.log.txt'
    default_history_file = f'{script_name}.history.json'
    log(log_file, {'# new run': run_time})

    arg_parser = argparse.ArgumentParser(
        description=f'Claim free itch.io games in an itch.io sale/collection or reddit thread. \
                    Writes the results (game links, claimed games, ..) to history_file. Logs to {log_file}')
    arg_parser.add_argument('history_file', nargs='?', help=f'a json file generated by a previous run of this script (default: {default_history_file})')
    arg_parser.add_argument('--show-history', action='store_true', help='show summary of history in history_file and exit')
    arg_parser.add_argument('--recheck', action='store_true', help='reload game links from SOURCES')
    arg_parser.add_argument('--recheck-groups', action='store_true', help='reload game links from discovered itch collections / sales')
    args = arg_parser.parse_args()

    if args.history_file is not None:
        history_file = args.history_file
    else:
        history_file = default_history_file
    history = load_history(history_file)
    log(log_file, {'history_file': history_file})
    log(log_file, {k: len(v) for k, v in history.items()})

    if args.show_history:
        print_summary(history_file, history)
        sys.exit(0)

    # getting game links
    itch_groups = set(filter(re.compile(PATTERNS['itch_group']).match, history['has_more']))
    check_sources = not os.path.exists(history_file) or args.recheck
    check_groups = len(itch_groups) > 0 or args.recheck_groups
    if check_sources or check_groups:
        print('will reload game urls from the internet')
        # keep getting newly discovered sales/collections
        first_pass = True
        while True:
            target_sources = set()
            itch_groups = set(filter(re.compile(PATTERNS['itch_group']).match, history['has_more']))
            if first_pass:
                if check_sources:
                    target_sources.update(SOURCES)
                if args.recheck_groups:
                    itch_groups.update(history['checked_groups'])
            else:
                if len(itch_groups) == 0:
                    break
                else:
                    print('getting links from newly discovered sales/collections')
            target_sources.update(itch_groups)
            get_urls_and_update_history(history, target_sources, itch_groups)
            first_pass = False
            log(log_file, {'## got links': time(), 'sources': target_sources, 'urls': history['urls'], 'has_more': history['has_more']})
    else:
        print('using game urls saved in the history file')
        print(' pass the option --recheck and/or --recheck-groups to reload game urls from the internet')

    # claiming games
    url = None
    sleep_time = 15
    try:
        ignore = set().union(*map(history.get, ('claimed', 'dl_only', 'downloaded', 'buy', 'removed')))
        valid = history['urls'].difference(ignore)
        if len(valid) > 0:
            with create_driver() as driver:
                driver.get('https://itch.io/login')
                # manually log in
                input('A new Firefox window was opened. Log in to itch.io, then press Enter to continue')
                for i, url in enumerate(valid):
                    print(f"{i+1}/{len(valid)} ({len(history['urls'])})")
                    if url not in ignore:
                        result = claim(url, driver)
                        if 'claimed' in result:
                            history['claimed'].add(url)
                        if 'dl_only' in result:
                            history['dl_only'].add(url)
                        if 'has_more' in result:
                            history['has_more'].add(url)
                        if 'buy' in result:
                            history['buy'].add(url)
                        if 'removed' in result:
                            history['removed'].add(url)
                        print(f' sleeping for {sleep_time}s')
                        sleep(sleep_time)
    except ParsingError as pe:
        history['error'].add(pe.url)
    except Exception:
        history['error'].add(url)
        raise
    finally:
        print()
        save_history(history_file, history)
        print_summary(history_file, history)


if __name__ == '__main__':
    main()