claim_itch.py (v0.2)

'''
version (see USER_AGENT)

requirements:
- python (tested on 3.8)
- requests
- beautiful soup
- lxml
- selenium
- firefox
- geckodriver

todo:
- download non-claimable games?
- login?
- proper log
- proper config
- claim() return values
- "selenium.common.exceptions.ElementNotInteractableException: Message: Element <a class="button buy_btn" href=".."> could not be scrolled into view"
- selenium's performance?
- less strict parsing / navigation (use .lower)
- pylint
- a claimable game was recorded as dl_only, was it changed? https://melessthanthree.itch.io/lucah
'''
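# usage sketch, derived from the argparse setup in main() (file names are examples):
#   python claim_itch.py                                      # first run, no config yet
#   python claim_itch.py claim_itch_1584920000.conf           # resume from a previous run
#   python claim_itch.py claim_itch_1584920000.conf --fetch   # also re-fetch links from SOURCES
#   python claim_itch.py claim_itch_1584920000.conf --recheck-groups  # re-scan checked groups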

import re
import json
import html
import argparse
import requests
from time import sleep, time
from functools import reduce
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException


# add any itch sale/collection or reddit thread to this set
SOURCES = {
    'https://itch.io/c/757294/games-to-help-you-stay-inside',
    'https://itch.io/c/759545/self-isolation-on-a-budget',
    'https://old.reddit.com/r/FreeGameFindings/comments/fka4be/itchio_mega_thread/'
}


PATTERNS = {
    'itch_collection': r'.+itch\.io/c/.+',
    'itch_sale': r'.+itch\.io/s/.+',
    'itch_group': r'.+itch\.io/[sc]/\d+/.+',  # sale or collection
    'reddit_thread': r'.+(?P<thread>reddit\.com/r/.+/comments/.+)/.+',
    'itch_game': r'(?P<game>[^/]+\.itch\.io/[^/]+)'
}
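
# illustrative urls for the patterns above (the sale url is made up, the others
# come from SOURCES; itch_game is used with re.search, so it also finds the game
# part inside a full 'https://...' url):
#   itch_game:       some-dev.itch.io/some-game
#   itch_collection: https://itch.io/c/757294/games-to-help-you-stay-inside
#   itch_sale:       https://itch.io/s/12345/some-sale
#   reddit_thread:   https://old.reddit.com/r/FreeGameFindings/comments/fka4be/itchio_mega_thread/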


USER_AGENT = 'ClaimItch/0.2'


CONFIG_KEYS = [
    'urls',
    'claimed',
    'has_more',        # a sale, collection, or game that is connected to more sales
    'checked_groups',  # a sale/collection that was checked for games
    'dl_only',         # game is not claimable
    'downloaded',      # in the generated *.conf file, manually move urls from 'dl_only' to 'downloaded' if you downloaded them manually
    'buy',             # game is not free
    'removed',
    'error',
]
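
# the *.conf files written by save_config() are plain json mapping each key in
# CONFIG_KEYS to a list of urls, e.g. (abridged, made-up url):
# {
#   "urls": ["https://some-dev.itch.io/some-game"],
#   "claimed": [],
#   "has_more": [],
#   ...
# }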


class ParsingError(Exception):
    def __init__(self, url, *args, **kwargs):
        self.url = url
        super().__init__(url, *args, **kwargs)


def extract_from_itch_group(group_page):
    '''
    INPUT  html of a sale or collection page
    OUTPUT urls of all games, and urls of the games whose cell carries a blurb
           (which marks them as connected to more sales)
    '''
    soup = BeautifulSoup(group_page, 'lxml')
    urls, more = set(), set()
    games = soup.find_all('div', class_='game_cell')
    for game in games:
        url = game.find('a').get('href')
        urls.add(url)
        if game.find('div', class_='blurb_outer') is not None:
            more.add(url)
    return urls, more
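
# sketch of the markup the function above expects; real itch.io pages carry much
# more, this shows only what the parser reads:
# <div class="game_cell">
#   <a href="https://some-dev.itch.io/some-game">...</a>
#   <div class="blurb_outer">...</div>  <!-- only present when tied to more sales -->
# </div>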


def get_from_itch_group(group_url, sleep_time=15, max_page=None, sale=False):
    '''
    INPUT  itch.io collection or sale url
    OUTPUT see extract_from_itch_group
    '''
    if sale:
        max_page = 1  # sales don't seem to have pages
    page = 1
    urls = set()
    has_more = set()
    while max_page is None or page <= max_page:
        print(f'getting page {page}')
        params = {'page': page} if not sale else None
        res = requests.get(group_url, params=params)
        if res.status_code == 404:
            break
        elif res.status_code != 200:
            print(f'unexpected status {res.status_code} for {group_url}')
            break
        page += 1
        new_urls, new_more = extract_from_itch_group(res.text)
        urls = urls.union(new_urls)
        has_more = has_more.union(new_more)
        print(f'sleeping for {sleep_time}s')
        sleep(sleep_time)
    print(f'got {len(urls)} games')
    return urls, has_more


def get_from_reddit_thread(url, sleep_time=15):
    '''
    INPUT  reddit thread url
    OUTPUT itch.io game urls, itch.io groups (sales, collections)
    '''
    global USER_AGENT, PATTERNS

    # https://www.reddit.com/dev/api#GET_comments_{article}
    json_url = f"https://{re.match(PATTERNS['reddit_thread'], url)['thread']}.json?threaded=false"
    urls = set()
    has_more = set()
    res = requests.get(json_url, headers={'User-Agent': USER_AGENT})
    if res.status_code != 200:
        res.raise_for_status()
    data = res.json()
    for listing in data:
        if listing['kind'].lower() != 'listing':
            raise ParsingError(json_url)
        children = listing['data']['children']
        for child in children:
            text = None
            if child['kind'] == 't3':    # the submission itself
                text = child['data']['selftext_html']
            elif child['kind'] == 't1':  # a comment
                text = child['data']['body_html']
            else:
                raise ParsingError(json_url)
            soup = BeautifulSoup(html.unescape(text), 'lxml')
            new_urls = set(a.get('href') for a in soup.find_all('a'))
            # use search, not match: hrefs in comments are usually absolute urls
            # ('https://...'), and the unanchored itch_game pattern only occurs mid-string
            urls = urls.union(url for url in new_urls if re.search(PATTERNS['itch_game'], url))
            has_more = has_more.union(url for url in new_urls if re.match(PATTERNS['itch_group'], url))
    print(f'got {len(urls)} games')
    print(f'sleeping for {sleep_time}s')
    sleep(sleep_time)
    return urls, has_more
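
# shape of the reddit json walked above (heavily abridged; 't3' is the submission,
# 't1' a comment, per the reddit api docs linked above):
# [
#   {"kind": "Listing", "data": {"children": [
#     {"kind": "t3", "data": {"selftext_html": "...escaped html..."}}
#   ]}},
#   {"kind": "Listing", "data": {"children": [
#     {"kind": "t1", "data": {"body_html": "...escaped html with <a href> links..."}}
#   ]}}
# ]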


def get_urls(url, sleep_time=15, max_page=None):
    global PATTERNS

    print(f'getting games from {url}')
    if re.match(PATTERNS['itch_collection'], url):
        return get_from_itch_group(url, sleep_time, max_page)
    elif re.match(PATTERNS['itch_sale'], url):
        return get_from_itch_group(url, sleep_time, sale=True)
    elif re.match(PATTERNS['reddit_thread'], url):
        return get_from_reddit_thread(url, sleep_time)
    else:
        raise NotImplementedError(f'{url} is not supported')


def claim(url, driver):
    '''
    INPUTS
      url     game url
      driver  a webdriver for a browser that is logged in to itch.io
    OUTPUT
      status
        'claimed'           success
        'dl_only'           cannot be claimed
        'buy'               game is not free
        'claimed has_more'  success, and indicates that the game is connected to another sale
        'removed'           game does not exist
    '''
    global PATTERNS
    print(f'handling {url}')

    driver.get(f"https://{re.search(PATTERNS['itch_game'], url)['game']}")
    original_window = driver.current_window_handle
    assert len(driver.window_handles) == 1

    # removed game
    try:
        driver.find_element_by_css_selector('div.not_found_game_page')
        return 'removed'
    except NoSuchElementException:
        pass

    # already owned
    try:
        if 'You own this' in driver.find_element_by_css_selector('div.purchase_banner_inner h2').get_attribute('textContent'):
            print(f'already claimed: {url}')
            return 'claimed'
    except NoSuchElementException:
        pass

    # check if claimable
    try:
        buy = driver.find_element_by_css_selector('div.buy_row a.buy_btn')
    except NoSuchElementException:
        try:
            buy = driver.find_element_by_css_selector('section.game_download a.buy_btn')
        except NoSuchElementException:
            driver.find_element_by_css_selector('div.uploads')
            print(f'dl only uploads: {url}')
            return 'dl_only'
    if 'Download Now' in buy.get_attribute('textContent'):
        print(f'dl only: {url}')
        return 'dl_only'
    elif 'buy now' in buy.get_attribute('textContent').lower():
        print(f'buy: {url}')
        return 'buy'
    # claim
    elif 'Download or claim' in buy.get_attribute('textContent'):
        # clicking the buy button can fail with ElementNotInteractableException
        # (see todo), so navigate to the purchase page directly instead
        driver.get(f'{url}/purchase')
        no_thanks = driver.find_element_by_css_selector('a.direct_download_btn')
        if 'No thanks, just take me to the downloads' in no_thanks.get_attribute('textContent'):
            no_thanks.click()

            # in case the download page opens in a new window
            sleep(1)
            if len(driver.window_handles) > 1:
                new_handle = None
                for window_handle in driver.window_handles:
                    if window_handle != original_window:
                        new_handle = window_handle
                        break
                driver.close()
                driver.switch_to.window(new_handle)

            claim_btn = driver.find_element_by_css_selector('div.claim_to_download_box form button')
            if 'claim' in claim_btn.get_attribute('textContent').lower():
                claim_btn.click()
                message = driver.find_element_by_css_selector('div.game_download_page div.inner_column p')
                if 'for the promotion' in message.get_attribute('textContent'):
                    print(f'has more after claim: {url}')
                    return 'claimed has_more'
                if 'You claimed this game' in message.get_attribute('textContent'):
                    print(f'new claim: {url}')
                    return 'claimed'
                else:
                    raise ParsingError(url)
            else:
                raise ParsingError(url)
        else:
            raise ParsingError(url)
    else:
        raise ParsingError(url)
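
# minimal usage sketch for claim(), mirroring what main() does below (the game
# url is an example):
#   with create_driver() as driver:
#       driver.get('https://itch.io/login')
#       input('log in manually, then press enter')
#       status = claim('https://some-dev.itch.io/some-game', driver)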


def create_driver():
    options = webdriver.firefox.options.Options()
    # 2 = block all images, to speed up page loads
    options.set_preference('permissions.default.image', 2)
    driver = webdriver.Firefox(options=options)
    driver.implicitly_wait(10)
    return driver


def log(name, data):
    with open(name, 'a') as f:
        for k, v in data.items():
            f.write(k + ' = ' + str(v) + '\n')


def load_config(name):
    global CONFIG_KEYS

    with open(name, 'r') as f:
        data = json.load(f)
    print(f'loaded config from file {name}')
    config = {k: set(data.get(k, [])) for k in CONFIG_KEYS}
    for k, v in config.items():
        print(f'{k}: {len(v)} items')
    return config


def save_config(name, data):
    print(f'writing config to file {name}')
    with open(name, 'w') as f:
        json.dump({k: list(v) for k, v in data.items()}, f, indent=2)


def main():
    global SOURCES, CONFIG_KEYS

    arg_parser = argparse.ArgumentParser(
        description='Claim free itch.io games in an itch.io sale/collection or reddit thread. \
                    Writes the results (game links, claimed games, ..) to a new file (claim_itch_*.conf) and logs to claim_itch.log')
    arg_parser.add_argument('old_config', nargs='?', help='A json file generated by a previous run of this script')
    arg_parser.add_argument('--fetch', action='store_true', help='Get game links from SOURCES and sales/collections in has_more instead of old_config')
    arg_parser.add_argument('--recheck-groups', action='store_true', help='Get links from already checked_groups (implies --fetch)')
    args = arg_parser.parse_args()

    run_time = int(time())
    log_file = 'claim_itch.log'
    log(log_file, {'# new run': run_time})

    config = {k: set() for k in CONFIG_KEYS}
    new_config = f'claim_itch_{run_time}.conf'
    if args.old_config is not None:
        config = load_config(args.old_config)
    else:
        input('will run with no config (press enter to continue)')

    # getting game links
    itch_groups = set(filter(re.compile(PATTERNS['itch_group']).match, config['has_more']))
    if args.old_config is None or args.fetch or args.recheck_groups:
        input('will reload urls from online pages (press enter to continue)')
        if args.recheck_groups:
            itch_groups = itch_groups.union(config['checked_groups'])
        all_sources = SOURCES.union(itch_groups)
        for i, source in enumerate(all_sources):
            print(f'{i+1}/{len(all_sources)}')
            new_urls, new_more = get_urls(source, max_page=None)
            config['urls'] = config['urls'].union(new_urls)
            config['has_more'] = config['has_more'].union(new_more)
        config['checked_groups'] = config['checked_groups'].union(itch_groups)
        config['has_more'] = config['has_more'].difference(itch_groups)
    log(log_file, {'collections': SOURCES.union(itch_groups), 'urls': config['urls'], 'has_more': config['has_more']})

    # claiming games
    url = None
    sleep_time = 15
    try:
        ignore = reduce(set.union, map(config.get, ('claimed', 'dl_only', 'downloaded', 'buy', 'removed')))
        valid = config['urls'].difference(ignore)
        if len(valid) > 0:
            with create_driver() as driver:
                driver.get('https://itch.io/login')
                # log in manually in the browser window
                input('Log in, then press enter to continue')
                for i, url in enumerate(valid):
                    print(f"{i+1}/{len(valid)} ({len(config['urls'])})")
                    if url not in ignore:
                        result = claim(url, driver)
                        if 'claimed' in result:
                            config['claimed'].add(url)
                        if 'dl_only' in result:
                            config['dl_only'].add(url)
                        if 'has_more' in result:
                            config['has_more'].add(url)
                        if 'buy' in result:
                            config['buy'].add(url)
                        if 'removed' in result:
                            config['removed'].add(url)
                        print(f'sleeping for {sleep_time}s')
                        sleep(sleep_time)
    except ParsingError as pe:
        config['error'].add(pe.url)
    except Exception:
        config['error'].add(url)
        raise
    finally:
        save_config(new_config, config)


if __name__ == '__main__':
    main()