claim_itch.py (v0.3)

'''
version (see USER_AGENT)

requirements:
- python (tested on 3.8)
- requests
- beautiful soup
- lxml
- selenium
- firefox
- geckodriver

files and variables:
- SOURCES variable:   includes itch sales/collections or reddit threads you want to check, pass --fetch to recheck them
- claim_itch_*.conf:  includes the results of the current run, can be passed to the script on future runs
                      see the CONFIG_KEYS variable
                      each run creates a new file
- claim_itch.log:     log file

todo:
- download non-claimable games?
- login?
- proper log
- proper config
- claim() return values
- "selenium.common.exceptions.ElementNotInteractableException: Message: Element <a class="button buy_btn" href=".."> could not be scrolled into view"
- selenium's performance?
- less strict parsing / navigation (use .lower)
- pylint
- a claimable game was recorded as dl_only, was it changed? https://melessthanthree.itch.io/lucah
'''
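
# Example invocations (a sketch; the .conf name is generated per run from the
# current timestamp, so the one below is illustrative):
#
#   python claim_itch.py                                      # first run: collect links from SOURCES, then claim
#   python claim_itch.py claim_itch_1585000000.conf           # resume from a previous run's results
#   python claim_itch.py claim_itch_1585000000.conf --fetch   # resume, but also recheck SOURCES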

import re
import json
import html
import argparse
from time import sleep, time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException


# add any itch sale/collection or reddit thread to this set
SOURCES = {
    'https://itch.io/c/757294/games-to-help-you-stay-inside',
    'https://itch.io/c/759545/self-isolation-on-a-budget',
    'https://old.reddit.com/r/FreeGameFindings/comments/fka4be/itchio_mega_thread/'
}


PATTERNS = {
    'itch_collection': r'.+itch\.io/c/.+',
    'itch_sale': r'.+itch\.io/s/.+',
    'itch_group': r'.+itch\.io/[sc]/\d+/.+',  # sale or collection
    'reddit_thread': r'.+(?P<thread>reddit\.com/r/.+/comments/.+)/.+',
    'itch_game': r'(?P<game>[^/]+\.itch\.io/[^/]+)'
}
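
# How the patterns are applied, in sketch form ('itch_game' carries no scheme
# prefix, so it is looked up with re.search rather than re.match):
#   >>> re.match(PATTERNS['itch_group'], 'https://itch.io/c/757294/games-to-help-you-stay-inside') is not None
#   True
#   >>> re.search(PATTERNS['itch_game'], 'https://melessthanthree.itch.io/lucah')['game']
#   'melessthanthree.itch.io/lucah'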


USER_AGENT = 'ClaimItch/0.3'


CONFIG_KEYS = [
    'urls',            # collected urls
    'claimed',         # claimed games
    'has_more',        # a sale, collection, or game that is connected to more sales
    'checked_groups',  # a sale/collection that was checked for games, pass --recheck-groups to recheck it
    'dl_only',         # game is not claimable
    'downloaded',      # games that were downloaded (edit this manually)
    'buy',             # game is not free
    'removed',         # game does not exist
    'error',           # games that broke the script
]
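
# save_config serializes this as json, each key mapping to a list of urls.
# A minimal sketch of a resulting claim_itch_*.conf (entries are hypothetical):
# {
#   "urls": ["someone.itch.io/a-game"],
#   "claimed": ["someone.itch.io/a-game"],
#   "has_more": [], "checked_groups": [], "dl_only": [],
#   "downloaded": [], "buy": [], "removed": [], "error": []
# }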


class ParsingError(Exception):
    def __init__(self, url, *args, **kwargs):
        # drop into the debugger so the offending page can be inspected live
        breakpoint()
        self.url = url
        super().__init__(url, *args, **kwargs)


def extract_from_itch_group(group_page):
    '''
    INPUT  html of a sale or collection page
    OUTPUT urls of all games, urls of games that are noted as being connected to more sales
    '''
    soup = BeautifulSoup(group_page, 'lxml')
    urls, more = set(), set()
    games = soup.find_all('div', class_='game_cell')
    for game in games:
        url = game.find('a').get('href')
        urls.add(url)
        if game.find('div', class_='blurb_outer') is not None:
            more.add(url)
    return urls, more


def get_from_itch_group(group_url, sleep_time=15, max_page=None, sale=False):
    '''
    INPUT  itch.io sale or collection url
    OUTPUT see extract_from_itch_group
    '''
    if sale:
        max_page = 1  # sales don't seem to have pages
    page = 1
    urls = set()
    has_more = set()
    while max_page is None or page <= max_page:
        print(f'getting page {page}')
        params = {'page': page} if not sale else None
        res = requests.get(group_url, params=params)
        if res.status_code == 404:
            break
        elif res.status_code != 200:
            breakpoint()
            break
        page += 1
        new_urls, new_more = extract_from_itch_group(res.text)
        urls.update(new_urls)
        has_more.update(new_more)
        print(f'sleeping for {sleep_time}s')
        sleep(sleep_time)
    print(f'got {len(urls)} games')
    return urls, has_more


def get_from_reddit_thread(url, sleep_time=15):
    '''
    INPUT  reddit thread url
    OUTPUT itch.io game urls, itch.io groups (sales, collections)
    '''
    global USER_AGENT, PATTERNS

    # https://www.reddit.com/dev/api#GET_comments_{article}
    json_url = f"https://{re.match(PATTERNS['reddit_thread'], url)['thread']}.json?threaded=false"
    urls = set()
    has_more = set()
    res = requests.get(json_url, headers={'User-Agent': USER_AGENT})
    if res.status_code != 200:
        res.raise_for_status()
    data = res.json()
    for listing in data:
        if listing['kind'].lower() != 'listing':
            raise ParsingError(json_url)
        children = listing['data']['children']
        for child in children:
            text = None
            if child['kind'] == 't3':    # the submission itself
                text = child['data']['selftext_html']
            elif child['kind'] == 't1':  # a comment
                text = child['data']['body_html']
            else:
                raise ParsingError(json_url)
            soup = BeautifulSoup(html.unescape(text), 'lxml')
            # href=True skips anchors that carry no href attribute
            new_urls = set(a['href'] for a in soup.find_all('a', href=True))
            # re.search, not re.match: the 'itch_game' pattern has no scheme prefix
            urls.update(url for url in new_urls if re.search(PATTERNS['itch_game'], url))
            has_more.update(url for url in new_urls if re.match(PATTERNS['itch_group'], url))
    print(f'got {len(urls)} games')
    print(f'sleeping for {sleep_time}s')
    sleep(sleep_time)
    return urls, has_more
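
# The thread json consumed above looks roughly like this (shape as parsed here,
# html bodies hypothetical and escaped the way reddit returns them):
# [
#   {"kind": "Listing", "data": {"children": [
#     {"kind": "t3", "data": {"selftext_html": "&lt;a href=\"...\"&gt;...&lt;/a&gt;"}}
#   ]}},
#   {"kind": "Listing", "data": {"children": [
#     {"kind": "t1", "data": {"body_html": "..."}}
#   ]}}
# ]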


def get_urls(url, sleep_time=15, max_page=None):
    global PATTERNS

    print(f'getting games from {url}')
    if re.match(PATTERNS['itch_collection'], url):
        return get_from_itch_group(url, sleep_time, max_page)
    elif re.match(PATTERNS['itch_sale'], url):
        return get_from_itch_group(url, sleep_time, sale=True)
    elif re.match(PATTERNS['reddit_thread'], url):
        return get_from_reddit_thread(url, sleep_time)
    else:
        breakpoint()
        raise NotImplementedError(f'{url} is not supported')
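
# For example (a sketch, assuming the collection above is still online):
#   urls, has_more = get_urls('https://itch.io/c/757294/games-to-help-you-stay-inside')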


def claim(url, driver):
    '''
    INPUTS
      url     game url
      driver  a webdriver for a browser that is logged in to itch.io
    OUTPUT
      status
        'claimed'           success
        'dl_only'           cannot be claimed
        'buy'               not free, must be bought
        'claimed has_more'  success, and indicates that the game is connected to another sale
        'removed'           game does not exist
    '''
    global PATTERNS
    print(f'handling {url}')

    driver.get(f"https://{re.search(PATTERNS['itch_game'], url)['game']}")
    original_window = driver.current_window_handle
    assert len(driver.window_handles) == 1

    # removed game
    try:
        driver.find_element_by_css_selector('div.not_found_game_page')
        return 'removed'
    except NoSuchElementException:
        pass

    # already owned
    try:
        if 'You own this' in driver.find_element_by_css_selector('div.purchase_banner_inner h2').get_attribute('textContent'):
            print(f'already claimed: {url}')
            return 'claimed'
    except NoSuchElementException:
        pass

    # check if claimable
    try:
        buy = driver.find_element_by_css_selector('div.buy_row a.buy_btn')
    except NoSuchElementException:
        try:
            buy = driver.find_element_by_css_selector('section.game_download a.buy_btn')
        except NoSuchElementException:
            # presence check: the page has an uploads box but no buy button
            driver.find_element_by_css_selector('div.uploads')
            print(f'dl only uploads: {url}')
            return 'dl_only'
    if 'Download Now' in buy.get_attribute('textContent'):
        print(f'dl only: {url}')
        return 'dl_only'
    elif 'buy now' in buy.get_attribute('textContent').lower():
        print(f'buy: {url}')
        return 'buy'
    # claim
    elif 'Download or claim' in buy.get_attribute('textContent'):
        #buy.location_once_scrolled_into_view
        #buy.click()
        driver.get(f'{url}/purchase')
        no_thanks = driver.find_element_by_css_selector('a.direct_download_btn')
        if 'No thanks, just take me to the downloads' in no_thanks.get_attribute('textContent'):
            no_thanks.click()

            # in case the download page opens in a new window
            sleep(1)
            if len(driver.window_handles) > 1:
                new_handle = None
                for window_handle in driver.window_handles:
                    if window_handle != original_window:
                        new_handle = window_handle
                        break
                driver.close()
                driver.switch_to.window(new_handle)

            claim_btn = driver.find_element_by_css_selector('div.claim_to_download_box form button')
            if 'claim' in claim_btn.get_attribute('textContent').lower():
                claim_btn.click()
                message = driver.find_element_by_css_selector('div.game_download_page div.inner_column p')
                if 'for the promotion' in message.get_attribute('textContent'):
                    print(f'has more after claim: {url}')
                    return 'claimed has_more'
                if 'You claimed this game' in message.get_attribute('textContent'):
                    print(f'new claim: {url}')
                    return 'claimed'
                else:
                    raise ParsingError(url)
            else:
                raise ParsingError(url)
        else:
            raise ParsingError(url)
    else:
        raise ParsingError(url)


def create_driver():
    #input('Start the browser')
    options = webdriver.firefox.options.Options()
    # 2 = block image loading, to speed up page loads
    options.set_preference('permissions.default.image', 2)
    driver = webdriver.Firefox(options=options)
    driver.implicitly_wait(10)
    return driver


def log(name, data):
    with open(name, 'a') as f:
        for k, v in data.items():
            f.write(f'{k} = {v}\n')


def load_config(name):
    global CONFIG_KEYS

    with open(name, 'r') as f:
        data = json.load(f)
    print(f'loaded config from file {name}')
    config = {k: set(data.get(k, [])) for k in CONFIG_KEYS}
    for k, v in config.items():
        print(f'{k}: {len(v)} items')
    return config


def save_config(name, data):
    print(f'writing config to file {name}')
    with open(name, 'w') as f:
        json.dump({k: list(v) for k, v in data.items()}, f, indent=2)


def get_urls_and_update_config(config, sources, itch_groups):
    '''
    INPUT
      config       a dict that will be updated as `sources` are processed
      sources      sources to get links from
      itch_groups  itch sales/collections in `sources` that should be marked as checked in `config`
    '''
    for i, source in enumerate(sources):
        print(f'{i+1}/{len(sources)}')
        new_urls, new_more = get_urls(source)
        config['urls'].update(new_urls)
        config['has_more'].update(new_more)
    config['checked_groups'].update(itch_groups)
    config['has_more'].difference_update(itch_groups)


def main():
    global SOURCES, CONFIG_KEYS

    arg_parser = argparse.ArgumentParser(
        description='Claim free itch.io games in an itch.io sale/collection or reddit thread. \
                    Writes the results (game links, claimed games, ..) to a new file (claim_itch_*.conf) and logs to claim_itch.log')
    arg_parser.add_argument('old_config', nargs='?', help='A json file generated by a previous run of this script')
    arg_parser.add_argument('--fetch', action='store_true', help='Get game links from SOURCES instead of old_config')
    arg_parser.add_argument('--recheck-groups', action='store_true', help='Get game links from already checked_groups')
    args = arg_parser.parse_args()

    run_time = int(time())
    log_file = 'claim_itch.log'
    log(log_file, {'# new run': run_time})

    config = {k: set() for k in CONFIG_KEYS}
    new_config = f'claim_itch_{run_time}.conf'
    if args.old_config is not None:
        config = load_config(args.old_config)
    else:
        input('will run with no config (press enter to continue)')

    # getting game links
    itch_groups = set(filter(re.compile(PATTERNS['itch_group']).match, config['has_more']))
    check_sources = args.old_config is None or args.fetch
    check_groups = len(itch_groups) > 0 or args.recheck_groups
    if check_sources or check_groups:
        input('will reload urls from online pages (press enter to continue)')
        # keep getting newly discovered sales/collections
        first_pass = True
        while True:
            target_sources = set()
            itch_groups = set(filter(re.compile(PATTERNS['itch_group']).match, config['has_more']))
            if first_pass:
                if check_sources:
                    target_sources.update(SOURCES)
                if args.recheck_groups:
                    itch_groups.update(config['checked_groups'])
            else:
                if len(itch_groups) == 0:
                    break
                else:
                    print('getting links from newly discovered sales/collections')
            target_sources.update(itch_groups)
            get_urls_and_update_config(config, target_sources, itch_groups)
            first_pass = False
            log(log_file, {'## got links': time(), 'sources': target_sources, 'urls': config['urls'], 'has_more': config['has_more']})

    # claiming games
    url = None
    sleep_time = 15
    try:
        ignore = set().union(*map(config.get, ('claimed', 'dl_only', 'downloaded', 'buy', 'removed')))
        valid = config['urls'].difference(ignore)
        if len(valid) > 0:
            with create_driver() as driver:
                driver.get('https://itch.io/login')
                # manually log in
                input('Log in then press enter to continue')
                for i, url in enumerate(valid):
                    print(f"{i+1}/{len(valid)} ({len(config['urls'])})")
                    if url not in ignore:
                        result = claim(url, driver)
                        if 'claimed' in result:
                            config['claimed'].add(url)
                        if 'dl_only' in result:
                            config['dl_only'].add(url)
                        if 'has_more' in result:
                            config['has_more'].add(url)
                        if 'buy' in result:
                            config['buy'].add(url)
                        if 'removed' in result:
                            config['removed'].add(url)
                        print(f'sleeping for {sleep_time}s')
                        sleep(sleep_time)
    except ParsingError as pe:
        config['error'].add(pe.url)
    except Exception:
        config['error'].add(url)
        raise
    finally:
        save_config(new_config, config)


if __name__ == '__main__':
    main()