claim_itch.py (v0.3)

'''
version (see USER_AGENT)

requirements:
- python (tested on 3.8)
- requests
- beautiful soup
- lxml
- selenium
- firefox
- geckodriver

files and variables:
- SOURCES variable:   includes itch sales/collections or reddit threads you want to check, pass --fetch to recheck them
- claim_itch_*.conf:  includes the results of the current run, can be passed to the script on future runs
                      see the CONFIG_KEYS variable
                      each run creates a new file
- claim_itch.log:     log file

todo:
- download non-claimable games?
- login?
- proper log
- proper config
- claim() return values
- "selenium.common.exceptions.ElementNotInteractableException: Message: Element <a class="button buy_btn" href=".."> could not be scrolled into view"
- selenium's performance?
- less strict parsing / navigation (use .lower)
- pylint
- a claimable game was recorded as dl_only, was it changed? https://melessthanthree.itch.io/lucah
'''
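
# Example invocations (a sketch; the .conf name is generated per run from the
# current timestamp, so the one below is illustrative):
#
#   python claim_itch.py                                      # first run: collect links from SOURCES, then claim
#   python claim_itch.py claim_itch_1585000000.conf           # resume from a previous run's results
#   python claim_itch.py claim_itch_1585000000.conf --fetch   # resume, but also recheck SOURCES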

import re
import json
import html
import argparse
from time import sleep, time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException


# add any itch sale/collection or reddit thread to this set
SOURCES = {
    'https://itch.io/c/757294/games-to-help-you-stay-inside',
    'https://itch.io/c/759545/self-isolation-on-a-budget',
    'https://old.reddit.com/r/FreeGameFindings/comments/fka4be/itchio_mega_thread/'
}


PATTERNS = {
    'itch_collection': r'.+itch\.io/c/.+',
    'itch_sale': r'.+itch\.io/s/.+',
    'itch_group': r'.+itch\.io/[sc]/\d+/.+',  # sale or collection
    'reddit_thread': r'.+(?P<thread>reddit\.com/r/.+/comments/.+)/.+',
    'itch_game': r'(?P<game>[^/]+\.itch\.io/[^/]+)'
}
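
# How the patterns are applied, in sketch form ('itch_game' carries no scheme
# prefix, so it is looked up with re.search rather than re.match):
#   >>> re.match(PATTERNS['itch_group'], 'https://itch.io/c/757294/games-to-help-you-stay-inside') is not None
#   True
#   >>> re.search(PATTERNS['itch_game'], 'https://melessthanthree.itch.io/lucah')['game']
#   'melessthanthree.itch.io/lucah'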


USER_AGENT = 'ClaimItch/0.3'


CONFIG_KEYS = [
    'urls',            # collected urls
    'claimed',         # claimed games
    'has_more',        # a sale, collection, or game that is connected to more sales
    'checked_groups',  # a sale/collection that was checked for games, pass --recheck-groups to recheck it
    'dl_only',         # game is not claimable
    'downloaded',      # games that were downloaded (edit this manually)
    'buy',             # game is not free
    'removed',         # game does not exist
    'error',           # games that broke the script
]
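
# save_config serializes this as json, each key mapping to a list of urls.
# A minimal sketch of a resulting claim_itch_*.conf (entries are hypothetical):
# {
#   "urls": ["someone.itch.io/a-game"],
#   "claimed": ["someone.itch.io/a-game"],
#   "has_more": [], "checked_groups": [], "dl_only": [],
#   "downloaded": [], "buy": [], "removed": [], "error": []
# }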


class ParsingError(Exception):
    def __init__(self, url, *args, **kwargs):
        # drop into the debugger so the offending page can be inspected live
        breakpoint()
        self.url = url
        super().__init__(url, *args, **kwargs)


def extract_from_itch_group(group_page):
    '''
    INPUT  html of a sale or collection page
    OUTPUT urls of all games, urls of games that are noted as being connected to more sales
    '''
    soup = BeautifulSoup(group_page, 'lxml')
    urls, more = set(), set()
    games = soup.find_all('div', class_='game_cell')
    for game in games:
        url = game.find('a').get('href')
        urls.add(url)
        if game.find('div', class_='blurb_outer') is not None:
            more.add(url)
    return urls, more


def get_from_itch_group(group_url, sleep_time=15, max_page=None, sale=False):
    '''
    INPUT  itch.io sale or collection url
    OUTPUT see extract_from_itch_group
    '''
    if sale:
        max_page = 1  # sales don't seem to have pages
    page = 1
    urls = set()
    has_more = set()
    while max_page is None or page <= max_page:
        print(f'getting page {page}')
        params = {'page': page} if not sale else None
        res = requests.get(group_url, params=params)
        if res.status_code == 404:
            break
        elif res.status_code != 200:
            breakpoint()
            break
        page += 1
        new_urls, new_more = extract_from_itch_group(res.text)
        urls.update(new_urls)
        has_more.update(new_more)
        print(f'sleeping for {sleep_time}s')
        sleep(sleep_time)
    print(f'got {len(urls)} games')
    return urls, has_more


def get_from_reddit_thread(url, sleep_time=15):
    '''
    INPUT  reddit thread url
    OUTPUT itch.io game urls, itch.io groups (sales, collections)
    '''
    global USER_AGENT, PATTERNS

    # https://www.reddit.com/dev/api#GET_comments_{article}
    json_url = f"https://{re.match(PATTERNS['reddit_thread'], url)['thread']}.json?threaded=false"
    urls = set()
    has_more = set()
    res = requests.get(json_url, headers={'User-Agent': USER_AGENT})
    if res.status_code != 200:
        res.raise_for_status()
    data = res.json()
    for listing in data:
        if listing['kind'].lower() != 'listing':
            raise ParsingError(json_url)
        children = listing['data']['children']
        for child in children:
            text = None
            if child['kind'] == 't3':    # the submission itself
                text = child['data']['selftext_html']
            elif child['kind'] == 't1':  # a comment
                text = child['data']['body_html']
            else:
                raise ParsingError(json_url)
            soup = BeautifulSoup(html.unescape(text), 'lxml')
            # href=True skips anchors that carry no href attribute
            new_urls = set(a['href'] for a in soup.find_all('a', href=True))
            # re.search, not re.match: the 'itch_game' pattern has no scheme prefix
            urls.update(url for url in new_urls if re.search(PATTERNS['itch_game'], url))
            has_more.update(url for url in new_urls if re.match(PATTERNS['itch_group'], url))
    print(f'got {len(urls)} games')
    print(f'sleeping for {sleep_time}s')
    sleep(sleep_time)
    return urls, has_more
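
# The thread json consumed above looks roughly like this (shape as parsed here,
# html bodies hypothetical and escaped the way reddit returns them):
# [
#   {"kind": "Listing", "data": {"children": [
#     {"kind": "t3", "data": {"selftext_html": "&lt;a href=\"...\"&gt;...&lt;/a&gt;"}}
#   ]}},
#   {"kind": "Listing", "data": {"children": [
#     {"kind": "t1", "data": {"body_html": "..."}}
#   ]}}
# ]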


def get_urls(url, sleep_time=15, max_page=None):
    global PATTERNS

    print(f'getting games from {url}')
    if re.match(PATTERNS['itch_collection'], url):
        return get_from_itch_group(url, sleep_time, max_page)
    elif re.match(PATTERNS['itch_sale'], url):
        return get_from_itch_group(url, sleep_time, sale=True)
    elif re.match(PATTERNS['reddit_thread'], url):
        return get_from_reddit_thread(url, sleep_time)
    else:
        breakpoint()
        raise NotImplementedError(f'{url} is not supported')
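
# For example (a sketch, assuming the collection above is still online):
#   urls, has_more = get_urls('https://itch.io/c/757294/games-to-help-you-stay-inside')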


def claim(url, driver):
    '''
    INPUTS
      url     game url
      driver  a webdriver for a browser that is logged in to itch.io
    OUTPUT
      status
        'claimed'           success
        'dl_only'           cannot be claimed
        'buy'               not free, must be bought
        'claimed has_more'  success, and indicates that the game is connected to another sale
        'removed'           game does not exist
    '''
    global PATTERNS
    print(f'handling {url}')

    driver.get(f"https://{re.search(PATTERNS['itch_game'], url)['game']}")
    original_window = driver.current_window_handle
    assert len(driver.window_handles) == 1

    # removed game
    try:
        driver.find_element_by_css_selector('div.not_found_game_page')
        return 'removed'
    except NoSuchElementException:
        pass

    # already owned
    try:
        if 'You own this' in driver.find_element_by_css_selector('div.purchase_banner_inner h2').get_attribute('textContent'):
            print(f'already claimed: {url}')
            return 'claimed'
    except NoSuchElementException:
        pass

    # check if claimable
    try:
        buy = driver.find_element_by_css_selector('div.buy_row a.buy_btn')
    except NoSuchElementException:
        try:
            buy = driver.find_element_by_css_selector('section.game_download a.buy_btn')
        except NoSuchElementException:
            # presence check: the page has an uploads box but no buy button
            driver.find_element_by_css_selector('div.uploads')
            print(f'dl only uploads: {url}')
            return 'dl_only'
    if 'Download Now' in buy.get_attribute('textContent'):
        print(f'dl only: {url}')
        return 'dl_only'
    elif 'buy now' in buy.get_attribute('textContent').lower():
        print(f'buy: {url}')
        return 'buy'
    # claim
    elif 'Download or claim' in buy.get_attribute('textContent'):
        #buy.location_once_scrolled_into_view
        #buy.click()
        driver.get(f'{url}/purchase')
        no_thanks = driver.find_element_by_css_selector('a.direct_download_btn')
        if 'No thanks, just take me to the downloads' in no_thanks.get_attribute('textContent'):
            no_thanks.click()

            # in case the download page opens in a new window
            sleep(1)
            if len(driver.window_handles) > 1:
                new_handle = None
                for window_handle in driver.window_handles:
                    if window_handle != original_window:
                        new_handle = window_handle
                        break
                driver.close()
                driver.switch_to.window(new_handle)

            claim_btn = driver.find_element_by_css_selector('div.claim_to_download_box form button')
            if 'claim' in claim_btn.get_attribute('textContent').lower():
                claim_btn.click()
                message = driver.find_element_by_css_selector('div.game_download_page div.inner_column p')
                if 'for the promotion' in message.get_attribute('textContent'):
                    print(f'has more after claim: {url}')
                    return 'claimed has_more'
                if 'You claimed this game' in message.get_attribute('textContent'):
                    print(f'new claim: {url}')
                    return 'claimed'
                else:
                    raise ParsingError(url)
            else:
                raise ParsingError(url)
        else:
            raise ParsingError(url)
    else:
        raise ParsingError(url)


def create_driver():
    #input('Start the browser')
    options = webdriver.firefox.options.Options()
    # 2 = block image loading, to speed up page loads
    options.set_preference('permissions.default.image', 2)
    driver = webdriver.Firefox(options=options)
    driver.implicitly_wait(10)
    return driver


def log(name, data):
    with open(name, 'a') as f:
        for k, v in data.items():
            f.write(f'{k} = {v}\n')


def load_config(name):
    global CONFIG_KEYS

    with open(name, 'r') as f:
        data = json.load(f)
    print(f'loaded config from file {name}')
    config = {k: set(data.get(k, [])) for k in CONFIG_KEYS}
    for k, v in config.items():
        print(f'{k}: {len(v)} items')
    return config


def save_config(name, data):
    print(f'writing config to file {name}')
    with open(name, 'w') as f:
        json.dump({k: list(v) for k, v in data.items()}, f, indent=2)


def get_urls_and_update_config(config, sources, itch_groups):
    '''
    INPUT
      config       a dict that will be updated as `sources` are processed
      sources      sources to get links from
      itch_groups  itch sales/collections in `sources` that should be marked as checked in `config`
    '''
    for i, source in enumerate(sources):
        print(f'{i+1}/{len(sources)}')
        new_urls, new_more = get_urls(source)
        config['urls'].update(new_urls)
        config['has_more'].update(new_more)
    config['checked_groups'].update(itch_groups)
    config['has_more'].difference_update(itch_groups)


def main():
    global SOURCES, CONFIG_KEYS

    arg_parser = argparse.ArgumentParser(
        description='Claim free itch.io games in an itch.io sale/collection or reddit thread. \
                    Writes the results (game links, claimed games, ..) to a new file (claim_itch_*.conf) and logs to claim_itch.log')
    arg_parser.add_argument('old_config', nargs='?', help='A json file generated by a previous run of this script')
    arg_parser.add_argument('--fetch', action='store_true', help='Get game links from SOURCES instead of old_config')
    arg_parser.add_argument('--recheck-groups', action='store_true', help='Get game links from already checked_groups')
    args = arg_parser.parse_args()

    run_time = int(time())
    log_file = 'claim_itch.log'
    log(log_file, {'# new run': run_time})

    config = {k: set() for k in CONFIG_KEYS}
    new_config = f'claim_itch_{run_time}.conf'
    if args.old_config is not None:
        config = load_config(args.old_config)
    else:
        input('will run with no config (press enter to continue)')

    # getting game links
    itch_groups = set(filter(re.compile(PATTERNS['itch_group']).match, config['has_more']))
    check_sources = args.old_config is None or args.fetch
    check_groups = len(itch_groups) > 0 or args.recheck_groups
    if check_sources or check_groups:
        input('will reload urls from online pages (press enter to continue)')
        # keep getting newly discovered sales/collections
        first_pass = True
        while True:
            target_sources = set()
            itch_groups = set(filter(re.compile(PATTERNS['itch_group']).match, config['has_more']))
            if first_pass:
                if check_sources:
                    target_sources.update(SOURCES)
                if args.recheck_groups:
                    itch_groups.update(config['checked_groups'])
            else:
                if len(itch_groups) == 0:
                    break
                else:
                    print('getting links from newly discovered sales/collections')
            target_sources.update(itch_groups)
            get_urls_and_update_config(config, target_sources, itch_groups)
            first_pass = False
            log(log_file, {'## got links': time(), 'sources': target_sources, 'urls': config['urls'], 'has_more': config['has_more']})

    # claiming games
    url = None
    sleep_time = 15
    try:
        ignore = set().union(*map(config.get, ('claimed', 'dl_only', 'downloaded', 'buy', 'removed')))
        valid = config['urls'].difference(ignore)
        if len(valid) > 0:
            with create_driver() as driver:
                driver.get('https://itch.io/login')
                # manually log in
                input('Log in then press enter to continue')
                for i, url in enumerate(valid):
                    print(f"{i+1}/{len(valid)} ({len(config['urls'])})")
                    if url not in ignore:
                        result = claim(url, driver)
                        if 'claimed' in result:
                            config['claimed'].add(url)
                        if 'dl_only' in result:
                            config['dl_only'].add(url)
                        if 'has_more' in result:
                            config['has_more'].add(url)
                        if 'buy' in result:
                            config['buy'].add(url)
                        if 'removed' in result:
                            config['removed'].add(url)
                        print(f'sleeping for {sleep_time}s')
                        sleep(sleep_time)
    except ParsingError as pe:
        config['error'].add(pe.url)
    except Exception:
        config['error'].add(url)
        raise
    finally:
        save_config(new_config, config)


if __name__ == '__main__':
    main()