Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import json
- from functools import reduce
- from pathlib import Path
- import httpx
- from loguru import logger
- from lxml.etree import fromstring
- from retrying import retry
- from tqdm import tqdm
- from xmljson import badgerfish
# Replace loguru's default sink with one that writes through tqdm.write, so
# log lines are emitted via tqdm instead of directly to the stream the
# progress bars are drawn on.
logger.remove()
logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True)

# Shared HTTP client used by every DBLP request in this script.
client = httpx.Client()

# Root of the on-disk cache; fetched tables of contents and the author
# database are stored beneath it.
DB = Path('inst-data')
author_db = DB / 'authors.json'

# The cache directory must exist before the author database can be seeded;
# without this, a first run fails with FileNotFoundError in write_text().
DB.mkdir(parents=True, exist_ok=True)
if not author_db.exists():
    author_db.write_text('{}', encoding='utf-8')

# In-memory author cache (pid -> person record), persisted to author_db.
author_list = json.loads(author_db.read_text(encoding='utf-8'))
def fetch_publications(key):
    """Return the DBLP search hits for the table of contents *key*.

    Results are cached as JSON under ``DB / key``; the network is only
    contacted when that cache file does not exist yet.

    Args:
        key: DBLP table-of-contents key, e.g. ``'db/conf/ccs/ccs2018'``.

    Returns:
        The decoded ``result.hits.hit`` value of the search response, as read
        back from the cache file.

    Raises:
        KeyError: if the response lacks ``result.hits.hit``. The full response
            is dumped to ``DB / 'error.json'`` before re-raising.
        Exception: whatever ``raise_for_status()`` raises on an HTTP error.
    """
    cache_file = DB / key
    if cache_file.exists():
        return json.loads(cache_file.read_text(encoding='utf-8'))

    # NOTE(review): `h=1000` presumably caps the number of returned hits —
    # confirm against the DBLP search API documentation.
    response = client.get(f'https://dblp.org/search/publ/api?q=toc%3A{key}.bht%3A&h=1000&format=json')
    logger.debug(f'{response.request.method} {response.status_code} {response.url}')
    response.raise_for_status()

    try:
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        hits = response.json()['result']['hits']['hit']
        cache_file.write_text(json.dumps(hits), encoding='utf-8')
    except KeyError:
        # Keep the unexpected payload around for inspection, then fail.
        error_dump = DB / 'error.json'
        error_dump.write_text(json.dumps(response.json()), encoding='utf-8')
        raise

    return json.loads(cache_file.read_text(encoding='utf-8'))
@retry(stop_max_attempt_number=10, wait_exponential_multiplier=100, wait_exponential_max=10000)
def fetch_author(pid):
    """Return the DBLP person record for *pid*, fetching it on a cache miss.

    Records are cached in the module-level ``author_list`` dict and the whole
    dict is rewritten to ``author_db`` after every newly fetched author. The
    ``retry`` decorator re-invokes the function when it raises, for at most
    10 attempts with exponential back-off.

    Args:
        pid: DBLP person id, e.g. ``'65/9612'``.

    Returns:
        The ``dblpperson.person`` element converted to a dict with the
        BadgerFish convention, or ``None`` when DBLP answers 410 for this pid
        (that answer is not cached).

    Raises:
        Exception: whatever ``raise_for_status()`` or the XML parser raises,
            once the retry budget is exhausted.
    """
    if pid in author_list:
        return author_list[pid]

    resp = client.get(f'https://dblp.org/pid/{pid}.xml')
    logger.debug(f'{resp.request.method} {resp.status_code} {resp.url}')
    if resp.status_code == 410:
        return None
    resp.raise_for_status()

    # Parse the raw bytes rather than the decoded text: lxml refuses str input
    # that carries an XML encoding declaration, and with bytes it applies the
    # declared encoding itself.
    xml = fromstring(resp.content)
    person = badgerfish.data(xml)['dblpperson']['person']

    author_list[pid] = person
    author_db.write_text(json.dumps(author_list), encoding='utf-8')
    return person
def on_authors(key, authors):
    """Yield one record per entry of *authors* for the publication *key*.

    Each record is a dict with the publication ``key``, the author's 1-based
    ``sequence`` within *authors*, and ``author`` set to the result of
    ``fetch_author`` for the entry's ``'@pid'``.

    An entry without ``'@pid'`` is printed for diagnosis; the lookup that
    follows then raises ``KeyError``.
    """
    for position, entry in enumerate(authors, start=1):
        if '@pid' not in entry:
            print(entry)
        person = fetch_author(entry['@pid'])
        yield {'key': key, 'sequence': position, 'author': person}
def on_publication(key, title, authors, **kwargs):
    """Return the author records for a single publication.

    Intended to be called with a hit's ``info`` mapping unpacked as keyword
    arguments; fields other than the three named ones are accepted and
    ignored.

    Args:
        key: publication key, forwarded to ``on_authors``.
        title: publication title (unused here).
        authors: mapping whose ``'author'`` value is either a single author
            dict or a list of author dicts.
        **kwargs: remaining ``info`` fields, ignored.

    Returns:
        The generator produced by ``on_authors``.

    Raises:
        TypeError: if ``authors['author']`` is neither a list nor a dict.
    """
    author = authors['author']
    if isinstance(author, list):
        return on_authors(key, author)
    if isinstance(author, dict):
        # A single author is delivered as a bare dict; normalise to a list.
        return on_authors(key, [author])
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    raise TypeError(
        f"expected authors['author'] to be a list or dict, got {type(author).__name__}"
    )
def on_publications(key):
    """Yield, per publication in the table of contents *key*, its author records.

    Publications are obtained from ``fetch_publications`` and iterated under a
    tqdm progress bar labelled with *key*. Hits whose ``info`` has no
    ``'authors'`` field are skipped. Each yielded item is the list of records
    produced by ``on_publication`` for one hit.
    """
    hits = fetch_publications(key)
    for hit in tqdm(hits, desc=key):
        info = hit['info']
        if 'authors' not in info:
            continue
        yield list(on_publication(**info))
def main(pubs):
    """Yield, for each table-of-contents key in *pubs*, a flat list of author records.

    Args:
        pubs: iterable of DBLP table-of-contents keys.

    Yields:
        One list per key, containing every record that ``on_publications``
        produced for that key, in order.
    """
    for pub in tqdm(pubs, desc='main'):
        # Flatten in a single pass; repeated list concatenation copies the
        # accumulated list on every step.
        yield [record for records in on_publications(pub) for record in records]
def _as_list(value):
    """Return *value* itself if it is a list, otherwise wrap it in a list."""
    return value if isinstance(value, list) else [value]


def filter_affiliation(notes, affiliation='shanghai jiao tong'):
    """Return True if any note's text contains *affiliation*, ignoring case.

    Args:
        notes: iterable of note dicts whose text, when present, is stored
            under the ``'$'`` key.
        affiliation: substring to look for; matching is case-insensitive.

    Returns:
        ``True`` when at least one note matches, else ``False``. Notes that
        are not dicts or that carry no text never match; non-string text is
        converted with ``str()`` before matching.
    """
    needle = affiliation.lower()
    return any(
        needle in str(note.get('$', '')).lower()
        for note in notes
        if isinstance(note, dict)
    )


def _author_entries(person):
    """Return the ``'author'`` entries of a person record as a list.

    A record stores either a single entry dict or a list of them; a record
    without the key yields an empty list.
    """
    return _as_list(person.get('author', []))


def _person_pids(person):
    """Return the ``'@pid'`` values found on a person record's author entries."""
    return [entry['@pid'] for entry in _author_entries(person) if '@pid' in entry]


def _is_affiliated(person):
    """Return True if the person record has a note matching the default affiliation."""
    return 'note' in person and filter_affiliation(_as_list(person['note']))


if __name__ == '__main__':
    pubs = [
        'db/conf/ccs/ccs2018',
        'db/conf/ccs/ccs2019',
        'db/conf/ccs/ccs2020',
        # 'db/conf/crypto/crypto2018-1',
        # 'db/conf/crypto/crypto2018-2',
        # 'db/conf/crypto/crypto2018-3',
        # 'db/conf/crypto/crypto2019-1',
        # 'db/conf/crypto/crypto2019-2',
        # 'db/conf/crypto/crypto2019-3',
        # 'db/conf/crypto/crypto2020-1',
        # 'db/conf/crypto/crypto2020-2',
        # 'db/conf/crypto/crypto2020-3',
        # 'db/conf/eurocrypt/eurocrypt2018-1',
        # 'db/conf/eurocrypt/eurocrypt2018-2',
        # 'db/conf/eurocrypt/eurocrypt2018-3',
        # 'db/conf/eurocrypt/eurocrypt2019-1',
        # 'db/conf/eurocrypt/eurocrypt2019-2',
        # 'db/conf/eurocrypt/eurocrypt2019-3',
        # 'db/conf/eurocrypt/eurocrypt2020-1',
        # 'db/conf/eurocrypt/eurocrypt2020-2',
        # 'db/conf/eurocrypt/eurocrypt2020-3',
        'db/conf/ndss/ndss2020',
        'db/conf/ndss/ndss2019',
        'db/conf/ndss/ndss2018',
        'db/conf/sp/sp2018',
        'db/conf/sp/sp2019',
        'db/conf/sp/sp2020',
        'db/conf/uss/uss2018',
        'db/conf/uss/uss2019',
        'db/conf/uss/uss2020',
        'db/journals/joc/joc31',
        'db/journals/joc/joc32',
        'db/journals/joc/joc33',
        'db/journals/joc/joc34',
        'db/journals/tdsc/tdsc15',
        'db/journals/tdsc/tdsc16',
        'db/journals/tdsc/tdsc17',
        'db/journals/tdsc/tdsc18',
        'db/journals/tifs/tifs13',
        'db/journals/tifs/tifs14',
        'db/journals/tifs/tifs15',
        'db/journals/tifs/tifs16',
    ]

    # Fetch everything first. Walking the publications fills `author_list`
    # as a side effect, so the affiliation filter below must run afterwards;
    # otherwise a run with an empty cache would find no authors at all.
    records = [record for batch in main(pubs) for record in batch]

    # pids of every cached person with a matching affiliation note.
    interested_authors = {
        pid
        for person in author_list.values()
        if _is_affiliated(person)
        for pid in _person_pids(person)
    }

    # Keep each record at most once, even when a person record carries
    # several author entries. Records whose author lookup returned None
    # are skipped.
    interested_publications = [
        record
        for record in records
        if record.get('author')
        and not interested_authors.isdisjoint(_person_pids(record['author']))
    ]

    # Output format: publication key, author name, author position.
    for record in interested_publications:
        names = _author_entries(record['author'])
        # With several author entries, report the first one listed.
        display_name = names[0].get('$', '') if names else ''
        print(record['key'], display_name, record['sequence'], sep=',')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement