# Author Affiliation

import json
from functools import reduce
from pathlib import Path

import httpx
from loguru import logger
from lxml.etree import fromstring
from retrying import retry
from tqdm import tqdm
from xmljson import badgerfish

def _tqdm_sink(message):
    """Loguru sink that prints each formatted record via ``tqdm.write``."""
    tqdm.write(message, end="")


# Replace loguru's default handler so that log lines are emitted through tqdm
# (the script shows tqdm progress bars while it logs).
logger.remove()
logger.add(_tqdm_sink, colorize=True)


# One HTTP client instance shared by every request made in this module.
client = httpx.Client()


# Root directory of the on-disk cache for everything downloaded by this script.
DB = Path('inst-data')
# JSON file holding the author cache (a mapping of author pid -> person record).
author_db = DB / 'authors.json'

# Create the cache directory before seeding the author file: on a fresh
# checkout 'inst-data' does not exist yet and write_text() would otherwise
# fail with FileNotFoundError.
DB.mkdir(parents=True, exist_ok=True)
if not author_db.exists():
    author_db.write_text('{}', encoding='utf-8')

# In-memory copy of the author cache, loaded once at import time.
author_list = json.loads(author_db.read_bytes())


def fetch_publications(key):
    """Return the list of search hits for the DBLP table of contents ``key``.

    The hits are cached as JSON in the file ``DB / key``. When that file is
    missing, the DBLP publication search API is queried and the ``hit`` list
    of the response is stored there first. The returned value is always
    re-read from the cache file.

    Raises:
        httpx.HTTPStatusError: if the API answers with an error status.
        KeyError: if the response lacks ``result.hits.hit``; the complete
            response is saved to ``DB / 'error.json'`` before re-raising.
    """
    cache_file: Path = DB / key

    # Cache hit: nothing to download.
    if cache_file.exists():
        return json.loads(cache_file.read_text(encoding='utf-8'))

    resp = client.get(f'https://dblp.org/search/publ/api?q=toc%3A{key}.bht%3A&h=1000&format=json')
    logger.debug(f'{resp.request.method} {resp.status_code} {resp.url}')
    resp.raise_for_status()

    try:
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        payload = resp.json()
        hits = payload['result']['hits']['hit']
        cache_file.write_text(json.dumps(hits), encoding='utf-8')
    except KeyError:
        # Keep the unexpected response around for inspection, then fail.
        error_file = Path(DB / 'error.json')
        error_file.write_text(json.dumps(resp.json()), encoding='utf-8')
        raise

    return json.loads(cache_file.read_text(encoding='utf-8'))


@retry(stop_max_attempt_number=10, wait_exponential_multiplier=100, wait_exponential_max=10000)
def fetch_author(pid):
    """Return the DBLP person record for ``pid``, downloading it if needed.

    Records already present in ``author_list`` are returned as-is. Otherwise
    ``https://dblp.org/pid/<pid>.xml`` is fetched, converted to a
    BadgerFish-style dict, stored in ``author_list`` and the whole cache is
    written back to ``author_db``.

    The ``retry`` decorator re-invokes the function when it raises (up to 10
    attempts with exponential back-off, per the decorator arguments).

    Args:
        pid: DBLP person identifier, used both as URL path and cache key.

    Returns:
        The ``dblpperson.person`` dict, or ``None`` when DBLP answers with
        HTTP 410. A ``None`` result is not cached.

    Raises:
        httpx.HTTPStatusError: for any other error status, once retries are
            exhausted.
    """
    if pid in author_list:
        return author_list[pid]

    resp = client.get(f'https://dblp.org/pid/{pid}.xml')
    logger.debug(f'{resp.request.method} {resp.status_code} {resp.url}')
    if resp.status_code == 410:
        return None

    resp.raise_for_status()

    # Parse the raw bytes rather than resp.text: lxml refuses str input that
    # carries an XML encoding declaration, and with bytes the parser applies
    # the document's own declared encoding.
    xml = fromstring(resp.content)
    author_list[pid] = badgerfish.data(xml)['dblpperson']['person']
    # Persist after every new author so an interrupted run keeps its progress.
    author_db.write_text(json.dumps(author_list), encoding='utf-8')

    return author_list[pid]


def on_authors(key, authors):
    """Yield one record per author of the publication ``key``.

    Each record is a dict with the publication ``key``, the 1-based
    ``sequence`` of the author within ``authors`` and the ``author`` person
    record returned by ``fetch_author``.

    An entry without an ``'@pid'`` field is printed for diagnosis; the lookup
    that follows then raises ``KeyError``.
    """
    for position, entry in enumerate(authors, start=1):
        if '@pid' not in entry:
            print(entry)
        person = fetch_author(entry['@pid'])
        yield {
            'key': key,
            'sequence': position,
            'author': person,
        }


def on_publication(key, title, authors, **kwargs):
    """Return a generator of author records for a single publication.

    The parameters mirror the fields of a search hit's ``info`` dict, which
    callers unpack with ``**``; ``title`` and any extra fields are accepted
    but unused.

    Args:
        key: publication key, copied into every yielded record.
        title: publication title (unused).
        authors: dict whose ``'author'`` entry is either a single author dict
            or a list of author dicts.

    Returns:
        The generator produced by ``on_authors``.

    Raises:
        TypeError: if ``authors['author']`` is neither a dict nor a list.
    """
    author = authors['author']
    if isinstance(author, dict):
        # A single author is not wrapped in a list; normalise it.
        author = [author]
    elif not isinstance(author, list):
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        raise TypeError(
            f"unexpected 'author' value for {key}: {type(author).__name__}"
        )
    return on_authors(key, author)


def on_publications(key):
    """Yield, per publication of the table of contents ``key``, its author records.

    Publications are obtained from ``fetch_publications``; hits whose ``info``
    has no ``'authors'`` field are skipped. Every yielded item is the fully
    materialised list of records produced by ``on_publication``.
    """
    for hit in tqdm(fetch_publications(key), desc=key):
        info = hit['info']
        if 'authors' not in info:
            continue
        yield list(on_publication(**info))


def main(pubs):
    """Yield, for each table-of-contents key in ``pubs``, a flat list of author records.

    ``on_publications`` produces one list of records per publication; these
    are flattened into a single new list per key. Flattening is done in one
    pass instead of repeated ``list.__add__`` calls, which copy the growing
    accumulator on every step.

    Args:
        pubs: iterable of DBLP table-of-contents keys.
    """
    for pub in tqdm(pubs, desc='main'):
        yield [
            record
            for publication in on_publications(pub)
            for record in publication
        ]


if __name__ == '__main__':
    # DBLP table-of-contents keys to scan. Commented-out entries are venues
    # that are currently switched off.
    pubs = [
        'db/conf/ccs/ccs2018',
        'db/conf/ccs/ccs2019',
        'db/conf/ccs/ccs2020',
        # 'db/conf/crypto/crypto2018-1',
        # 'db/conf/crypto/crypto2018-2',
        # 'db/conf/crypto/crypto2018-3',
        # 'db/conf/crypto/crypto2019-1',
        # 'db/conf/crypto/crypto2019-2',
        # 'db/conf/crypto/crypto2019-3',
        # 'db/conf/crypto/crypto2020-1',
        # 'db/conf/crypto/crypto2020-2',
        # 'db/conf/crypto/crypto2020-3',
        # 'db/conf/eurocrypt/eurocrypt2018-1',
        # 'db/conf/eurocrypt/eurocrypt2018-2',
        # 'db/conf/eurocrypt/eurocrypt2018-3',
        # 'db/conf/eurocrypt/eurocrypt2019-1',
        # 'db/conf/eurocrypt/eurocrypt2019-2',
        # 'db/conf/eurocrypt/eurocrypt2019-3',
        # 'db/conf/eurocrypt/eurocrypt2020-1',
        # 'db/conf/eurocrypt/eurocrypt2020-2',
        # 'db/conf/eurocrypt/eurocrypt2020-3',
        'db/conf/ndss/ndss2020',
        'db/conf/ndss/ndss2019',
        'db/conf/ndss/ndss2018',
        'db/conf/sp/sp2018',
        'db/conf/sp/sp2019',
        'db/conf/sp/sp2020',
        'db/conf/uss/uss2018',
        'db/conf/uss/uss2019',
        'db/conf/uss/uss2020',
        'db/journals/joc/joc31',
        'db/journals/joc/joc32',
        'db/journals/joc/joc33',
        'db/journals/joc/joc34',
        'db/journals/tdsc/tdsc15',
        'db/journals/tdsc/tdsc16',
        'db/journals/tdsc/tdsc17',
        'db/journals/tdsc/tdsc18',
        'db/journals/tifs/tifs13',
        'db/journals/tifs/tifs14',
        'db/journals/tifs/tifs15',
        'db/journals/tifs/tifs16',
    ]

    def _as_list(value):
        """Wrap a lone value in a list; the XML-to-dict conversion yields a
        bare dict for a single child element and a list for repeated ones."""
        return value if isinstance(value, list) else [value]

    def filter_affiliation(notes):
        """Return True if any note's text mentions Shanghai Jiao Tong."""
        # A note's text may be absent or not a string, so read it defensively.
        return any(
            'shanghai jiao tong' in str(note.get('$', '')).lower()
            for note in notes
        )

    # Fetch everything first: main() fills author_list as a side effect of
    # fetch_author(), and the affiliation filter below must see the authors
    # discovered during this run, not only those cached by earlier runs.
    records = [record for venue in main(pubs) for record in venue]

    # pids of every cached person whose notes match the affiliation filter.
    # A person may carry several 'author' entries, so collect all of them.
    interested_authors = set()
    for person in author_list.values():
        if 'note' not in person:
            continue
        if not filter_affiliation(_as_list(person['note'])):
            continue
        for alias in _as_list(person.get('author', [])):
            if '@pid' in alias:
                interested_authors.add(alias['@pid'])

    interested_publications = []
    for record in records:
        person = record.get('author')
        if not person:
            # fetch_author() returned None for this author (HTTP 410).
            continue
        aliases = _as_list(person.get('author', []))
        # any() keeps a record from being appended once per matching alias.
        if any(alias.get('@pid') in interested_authors for alias in aliases):
            interested_publications.append(record)

    for record in interested_publications:
        # With several 'author' entries, print the first listed name.
        name = _as_list(record['author']['author'])[0]['$']
        print(record['key'], name, record['sequence'], sep=',')