claim_itch.py (v0.2)

'''
version (see USER_AGENT)

requirements:
- python (tested on 3.8)
- requests
- beautiful soup
- lxml
- selenium
- firefox
- geckodriver

todo:
- download non-claimable games?
- login?
- proper log
- proper config
- claim() return values
- "selenium.common.exceptions.ElementNotInteractableException: Message: Element <a class="button buy_btn" href=".."> could not be scrolled into view"
- selenium's performance?
- less strict parsing / navigation (use .lower)
- pylint
- a claimable game was recorded as dl_only, was it changed? https://melessthanthree.itch.io/lucah
'''
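# usage sketch, derived from the argparse setup in main() (file names are examples):
#   python claim_itch.py                                      # first run, no config yet
#   python claim_itch.py claim_itch_1584920000.conf           # resume from a previous run
#   python claim_itch.py claim_itch_1584920000.conf --fetch   # also re-fetch links from SOURCES
#   python claim_itch.py claim_itch_1584920000.conf --recheck-groups  # re-scan checked groups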

import re
import json
import html
import argparse
import requests
from time import sleep, time
from functools import reduce
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException


# add any itch sale/collection or reddit thread to this set
SOURCES = {
    'https://itch.io/c/757294/games-to-help-you-stay-inside',
    'https://itch.io/c/759545/self-isolation-on-a-budget',
    'https://old.reddit.com/r/FreeGameFindings/comments/fka4be/itchio_mega_thread/'
}


PATTERNS = {
    'itch_collection': r'.+itch\.io/c/.+',
    'itch_sale': r'.+itch\.io/s/.+',
    'itch_group': r'.+itch\.io/[sc]/\d+/.+',  # sale or collection
    'reddit_thread': r'.+(?P<thread>reddit\.com/r/.+/comments/.+)/.+',
    'itch_game': r'(?P<game>[^/]+\.itch\.io/[^/]+)'
}
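
# illustrative urls for the patterns above (the sale url is made up, the others
# come from SOURCES; itch_game is used with re.search, so it also finds the game
# part inside a full 'https://...' url):
#   itch_game:       some-dev.itch.io/some-game
#   itch_collection: https://itch.io/c/757294/games-to-help-you-stay-inside
#   itch_sale:       https://itch.io/s/12345/some-sale
#   reddit_thread:   https://old.reddit.com/r/FreeGameFindings/comments/fka4be/itchio_mega_thread/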


USER_AGENT = 'ClaimItch/0.2'


CONFIG_KEYS = [
    'urls',
    'claimed',
    'has_more',        # a sale, collection, or game that is connected to more sales
    'checked_groups',  # a sale/collection that was checked for games
    'dl_only',         # game is not claimable
    'downloaded',      # in the generated *.conf file, manually move urls from 'dl_only' to 'downloaded' if you downloaded them manually
    'buy',             # game is not free
    'removed',
    'error',
]
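
# the *.conf files written by save_config() are plain json mapping each key in
# CONFIG_KEYS to a list of urls, e.g. (abridged, made-up url):
# {
#   "urls": ["https://some-dev.itch.io/some-game"],
#   "claimed": [],
#   "has_more": [],
#   ...
# }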


class ParsingError(Exception):
    def __init__(self, url, *args, **kwargs):
        self.url = url
        super().__init__(url, *args, **kwargs)


def extract_from_itch_group(group_page):
    '''
    INPUT  html of a sale or collection page
    OUTPUT urls of all games, and urls of the games whose cell carries a blurb
           (which marks them as connected to more sales)
    '''
    soup = BeautifulSoup(group_page, 'lxml')
    urls, more = set(), set()
    games = soup.find_all('div', class_='game_cell')
    for game in games:
        url = game.find('a').get('href')
        urls.add(url)
        if game.find('div', class_='blurb_outer') is not None:
            more.add(url)
    return urls, more
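
# sketch of the markup the function above expects; real itch.io pages carry much
# more, this shows only what the parser reads:
# <div class="game_cell">
#   <a href="https://some-dev.itch.io/some-game">...</a>
#   <div class="blurb_outer">...</div>  <!-- only present when tied to more sales -->
# </div>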


def get_from_itch_group(group_url, sleep_time=15, max_page=None, sale=False):
    '''
    INPUT  itch.io collection or sale url
    OUTPUT see extract_from_itch_group
    '''
    if sale:
        max_page = 1  # sales don't seem to have pages
    page = 1
    urls = set()
    has_more = set()
    while max_page is None or page <= max_page:
        print(f'getting page {page}')
        params = {'page': page} if not sale else None
        res = requests.get(group_url, params=params)
        if res.status_code == 404:
            break
        elif res.status_code != 200:
            print(f'unexpected status {res.status_code} for {group_url}')
            break
        page += 1
        new_urls, new_more = extract_from_itch_group(res.text)
        urls = urls.union(new_urls)
        has_more = has_more.union(new_more)
        print(f'sleeping for {sleep_time}s')
        sleep(sleep_time)
    print(f'got {len(urls)} games')
    return urls, has_more


def get_from_reddit_thread(url, sleep_time=15):
    '''
    INPUT  reddit thread url
    OUTPUT itch.io game urls, itch.io groups (sales, collections)
    '''
    global USER_AGENT, PATTERNS

    # https://www.reddit.com/dev/api#GET_comments_{article}
    json_url = f"https://{re.match(PATTERNS['reddit_thread'], url)['thread']}.json?threaded=false"
    urls = set()
    has_more = set()
    res = requests.get(json_url, headers={'User-Agent': USER_AGENT})
    if res.status_code != 200:
        res.raise_for_status()
    data = res.json()
    for listing in data:
        if listing['kind'].lower() != 'listing':
            raise ParsingError(json_url)
        children = listing['data']['children']
        for child in children:
            text = None
            if child['kind'] == 't3':    # the submission itself
                text = child['data']['selftext_html']
            elif child['kind'] == 't1':  # a comment
                text = child['data']['body_html']
            else:
                raise ParsingError(json_url)
            soup = BeautifulSoup(html.unescape(text), 'lxml')
            new_urls = set(a.get('href') for a in soup.find_all('a'))
            # use search, not match: hrefs in comments are usually absolute urls
            # ('https://...'), and the unanchored itch_game pattern only occurs mid-string
            urls = urls.union(url for url in new_urls if re.search(PATTERNS['itch_game'], url))
            has_more = has_more.union(url for url in new_urls if re.match(PATTERNS['itch_group'], url))
    print(f'got {len(urls)} games')
    print(f'sleeping for {sleep_time}s')
    sleep(sleep_time)
    return urls, has_more
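
# shape of the reddit json walked above (heavily abridged; 't3' is the submission,
# 't1' a comment, per the reddit api docs linked above):
# [
#   {"kind": "Listing", "data": {"children": [
#     {"kind": "t3", "data": {"selftext_html": "...escaped html..."}}
#   ]}},
#   {"kind": "Listing", "data": {"children": [
#     {"kind": "t1", "data": {"body_html": "...escaped html with <a href> links..."}}
#   ]}}
# ]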


def get_urls(url, sleep_time=15, max_page=None):
    global PATTERNS

    print(f'getting games from {url}')
    if re.match(PATTERNS['itch_collection'], url):
        return get_from_itch_group(url, sleep_time, max_page)
    elif re.match(PATTERNS['itch_sale'], url):
        return get_from_itch_group(url, sleep_time, sale=True)
    elif re.match(PATTERNS['reddit_thread'], url):
        return get_from_reddit_thread(url, sleep_time)
    else:
        raise NotImplementedError(f'{url} is not supported')


def claim(url, driver):
    '''
    INPUTS
      url     game url
      driver  a webdriver for a browser that is logged in to itch.io
    OUTPUT
      status
        'claimed'           success
        'dl_only'           cannot be claimed
        'buy'               game is not free
        'claimed has_more'  success, and indicates that the game is connected to another sale
        'removed'           game does not exist
    '''
    global PATTERNS
    print(f'handling {url}')

    driver.get(f"https://{re.search(PATTERNS['itch_game'], url)['game']}")
    original_window = driver.current_window_handle
    assert len(driver.window_handles) == 1

    # removed game
    try:
        driver.find_element_by_css_selector('div.not_found_game_page')
        return 'removed'
    except NoSuchElementException:
        pass

    # already owned
    try:
        if 'You own this' in driver.find_element_by_css_selector('div.purchase_banner_inner h2').get_attribute('textContent'):
            print(f'already claimed: {url}')
            return 'claimed'
    except NoSuchElementException:
        pass

    # check if claimable
    try:
        buy = driver.find_element_by_css_selector('div.buy_row a.buy_btn')
    except NoSuchElementException:
        try:
            buy = driver.find_element_by_css_selector('section.game_download a.buy_btn')
        except NoSuchElementException:
            driver.find_element_by_css_selector('div.uploads')
            print(f'dl only uploads: {url}')
            return 'dl_only'
    if 'Download Now' in buy.get_attribute('textContent'):
        print(f'dl only: {url}')
        return 'dl_only'
    elif 'buy now' in buy.get_attribute('textContent').lower():
        print(f'buy: {url}')
        return 'buy'
    # claim
    elif 'Download or claim' in buy.get_attribute('textContent'):
        # clicking the buy button can fail with ElementNotInteractableException
        # (see todo), so navigate to the purchase page directly instead
        driver.get(f'{url}/purchase')
        no_thanks = driver.find_element_by_css_selector('a.direct_download_btn')
        if 'No thanks, just take me to the downloads' in no_thanks.get_attribute('textContent'):
            no_thanks.click()

            # in case the download page opens in a new window
            sleep(1)
            if len(driver.window_handles) > 1:
                new_handle = None
                for window_handle in driver.window_handles:
                    if window_handle != original_window:
                        new_handle = window_handle
                        break
                driver.close()
                driver.switch_to.window(new_handle)

            claim_btn = driver.find_element_by_css_selector('div.claim_to_download_box form button')
            if 'claim' in claim_btn.get_attribute('textContent').lower():
                claim_btn.click()
                message = driver.find_element_by_css_selector('div.game_download_page div.inner_column p')
                if 'for the promotion' in message.get_attribute('textContent'):
                    print(f'has more after claim: {url}')
                    return 'claimed has_more'
                if 'You claimed this game' in message.get_attribute('textContent'):
                    print(f'new claim: {url}')
                    return 'claimed'
                else:
                    raise ParsingError(url)
            else:
                raise ParsingError(url)
        else:
            raise ParsingError(url)
    else:
        raise ParsingError(url)
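
# minimal usage sketch for claim(), mirroring what main() does below (the game
# url is an example):
#   with create_driver() as driver:
#       driver.get('https://itch.io/login')
#       input('log in manually, then press enter')
#       status = claim('https://some-dev.itch.io/some-game', driver)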


def create_driver():
    options = webdriver.firefox.options.Options()
    # 2 = block all images, to speed up page loads
    options.set_preference('permissions.default.image', 2)
    driver = webdriver.Firefox(options=options)
    driver.implicitly_wait(10)
    return driver


def log(name, data):
    with open(name, 'a') as f:
        for k, v in data.items():
            f.write(k + ' = ' + str(v) + '\n')


def load_config(name):
    global CONFIG_KEYS

    with open(name, 'r') as f:
        data = json.load(f)
    print(f'loaded config from file {name}')
    config = {k: set(data.get(k, [])) for k in CONFIG_KEYS}
    for k, v in config.items():
        print(f'{k}: {len(v)} items')
    return config


def save_config(name, data):
    print(f'writing config to file {name}')
    with open(name, 'w') as f:
        json.dump({k: list(v) for k, v in data.items()}, f, indent=2)


def main():
    global SOURCES, CONFIG_KEYS

    arg_parser = argparse.ArgumentParser(
        description='Claim free itch.io games in an itch.io sale/collection or reddit thread. \
                    Writes the results (game links, claimed games, ..) to a new file (claim_itch_*.conf) and logs to claim_itch.log')
    arg_parser.add_argument('old_config', nargs='?', help='A json file generated by a previous run of this script')
    arg_parser.add_argument('--fetch', action='store_true', help='Get game links from SOURCES and sales/collections in has_more instead of old_config')
    arg_parser.add_argument('--recheck-groups', action='store_true', help='Get links from already checked_groups (implies --fetch)')
    args = arg_parser.parse_args()

    run_time = int(time())
    log_file = 'claim_itch.log'
    log(log_file, {'# new run': run_time})

    config = {k: set() for k in CONFIG_KEYS}
    new_config = f'claim_itch_{run_time}.conf'
    if args.old_config is not None:
        config = load_config(args.old_config)
    else:
        input('will run with no config (press enter to continue)')

    # getting game links
    itch_groups = set(filter(re.compile(PATTERNS['itch_group']).match, config['has_more']))
    if args.old_config is None or args.fetch or args.recheck_groups:
        input('will reload urls from online pages (press enter to continue)')
        if args.recheck_groups:
            itch_groups = itch_groups.union(config['checked_groups'])
        all_sources = SOURCES.union(itch_groups)
        for i, source in enumerate(all_sources):
            print(f'{i+1}/{len(all_sources)}')
            new_urls, new_more = get_urls(source, max_page=None)
            config['urls'] = config['urls'].union(new_urls)
            config['has_more'] = config['has_more'].union(new_more)
        config['checked_groups'] = config['checked_groups'].union(itch_groups)
        config['has_more'] = config['has_more'].difference(itch_groups)
    log(log_file, {'collections': SOURCES.union(itch_groups), 'urls': config['urls'], 'has_more': config['has_more']})

    # claiming games
    url = None
    sleep_time = 15
    try:
        ignore = reduce(set.union, map(config.get, ('claimed', 'dl_only', 'downloaded', 'buy', 'removed')))
        valid = config['urls'].difference(ignore)
        if len(valid) > 0:
            with create_driver() as driver:
                driver.get('https://itch.io/login')
                # log in manually in the browser window
                input('Log in, then press enter to continue')
                for i, url in enumerate(valid):
                    print(f"{i+1}/{len(valid)} ({len(config['urls'])})")
                    if url not in ignore:
                        result = claim(url, driver)
                        if 'claimed' in result:
                            config['claimed'].add(url)
                        if 'dl_only' in result:
                            config['dl_only'].add(url)
                        if 'has_more' in result:
                            config['has_more'].add(url)
                        if 'buy' in result:
                            config['buy'].add(url)
                        if 'removed' in result:
                            config['removed'].add(url)
                        print(f'sleeping for {sleep_time}s')
                        sleep(sleep_time)
    except ParsingError as pe:
        config['error'].add(pe.url)
    except Exception:
        config['error'].add(url)
        raise
    finally:
        save_config(new_config, config)


if __name__ == '__main__':
    main()