Taraxacum

Author Affiliation

Apr 22nd, 2021
573
Never
Not a member of Pastebin yet? Sign up; it unlocks many cool features!
  1. import json
  2. from functools import reduce
  3. from pathlib import Path
  4.  
  5. import httpx
  6. from loguru import logger
  7. from lxml.etree import fromstring
  8. from retrying import retry
  9. from tqdm import tqdm
  10. from xmljson import badgerfish
  11.  
  12. logger.remove()
  13. logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True)
  14.  
  15.  
  16. client = httpx.Client()
  17.  
  18.  
  19. DB = Path('inst-data')
  20. author_db = DB / 'authors.json'
  21.  
  22. if not author_db.exists():
  23.     author_db.write_text('{}', encoding='utf-8')
  24.  
  25. author_list = json.loads(Path(author_db).read_bytes())
  26.  
  27.  
  28. def fetch_publications(key):
  29.     path: Path = DB / key
  30.     if not path.exists():
  31.         resp = client.get(f'https://dblp.org/search/publ/api?q=toc%3A{key}.bht%3A&h=1000&format=json')
  32.         logger.debug(f'{resp.request.method} {resp.status_code} {resp.url}')
  33.         resp.raise_for_status()
  34.  
  35.         try:
  36.             path.parent.mkdir(exist_ok=True, parents=True)
  37.             path.write_text(json.dumps(resp.json()['result']['hits']['hit']), encoding='utf-8')
  38.         except KeyError:
  39.             Path(DB / 'error.json').write_text(json.dumps(resp.json()), encoding='utf-8')
  40.             raise
  41.  
  42.     return json.loads(path.read_text(encoding='utf-8'))
  43.  
  44.  
  45. @retry(stop_max_attempt_number=10, wait_exponential_multiplier=100, wait_exponential_max=10000)
  46. def fetch_author(pid):
  47.     if pid not in author_list:
  48.         resp = client.get(f'https://dblp.org/pid/{pid}.xml')
  49.         logger.debug(f'{resp.request.method} {resp.status_code} {resp.url}')
  50.         if resp.status_code == 410:
  51.             return None
  52.  
  53.         resp.raise_for_status()
  54.  
  55.         xml = fromstring(resp.text)
  56.         author_list[pid] = badgerfish.data(xml)['dblpperson']['person']
  57.         Path(author_db).write_text(json.dumps(author_list), encoding='utf-8')
  58.  
  59.     return author_list[pid]
  60.  
  61.  
  62. def on_authors(key, authors):
  63.     for idx, author in enumerate(authors):
  64.         if '@pid' not in author:
  65.             print(author)
  66.         yield {
  67.             'key': key,
  68.             'sequence': idx + 1,
  69.             'author': fetch_author(author['@pid'])
  70.         }
  71.  
  72.  
  73. def on_publication(key, title, authors, **kwargs):
  74.     # logger.info(f'{key} - {title}')
  75.     author = authors['author']
  76.     if type(author) == list:
  77.         return on_authors(key, author)
  78.     else:
  79.         assert type(author) == dict
  80.         return on_authors(key, [author])
  81.  
  82.  
  83. def on_publications(key):
  84.     publ_list = fetch_publications(key)
  85.     for publ in tqdm(publ_list, desc=key):
  86.         if 'authors' in publ['info']:
  87.             yield list(on_publication(**publ['info']))
  88.  
  89.  
  90. def main(pubs):
  91.     for pub in tqdm(pubs, desc='main'):
  92.         yield reduce(list.__add__, list(on_publications(pub)), [])
  93.  
  94.  
  95. if __name__ == '__main__':
  96.     pubs = [
  97.         'db/conf/ccs/ccs2018',
  98.         'db/conf/ccs/ccs2019',
  99.         'db/conf/ccs/ccs2020',
  100.         # 'db/conf/crypto/crypto2018-1',
  101.         # 'db/conf/crypto/crypto2018-2',
  102.         # 'db/conf/crypto/crypto2018-3',
  103.         # 'db/conf/crypto/crypto2019-1',
  104.         # 'db/conf/crypto/crypto2019-2',
  105.         # 'db/conf/crypto/crypto2019-3',
  106.         # 'db/conf/crypto/crypto2020-1',
  107.         # 'db/conf/crypto/crypto2020-2',
  108.         # 'db/conf/crypto/crypto2020-3',
  109.         # 'db/conf/eurocrypt/eurocrypt2018-1',
  110.         # 'db/conf/eurocrypt/eurocrypt2018-2',
  111.         # 'db/conf/eurocrypt/eurocrypt2018-3',
  112.         # 'db/conf/eurocrypt/eurocrypt2019-1',
  113.         # 'db/conf/eurocrypt/eurocrypt2019-2',
  114.         # 'db/conf/eurocrypt/eurocrypt2019-3',
  115.         # 'db/conf/eurocrypt/eurocrypt2020-1',
  116.         # 'db/conf/eurocrypt/eurocrypt2020-2',
  117.         # 'db/conf/eurocrypt/eurocrypt2020-3',
  118.         'db/conf/ndss/ndss2020',
  119.         'db/conf/ndss/ndss2019',
  120.         'db/conf/ndss/ndss2018',
  121.         'db/conf/sp/sp2018',
  122.         'db/conf/sp/sp2019',
  123.         'db/conf/sp/sp2020',
  124.         'db/conf/uss/uss2018',
  125.         'db/conf/uss/uss2019',
  126.         'db/conf/uss/uss2020',
  127.         'db/journals/joc/joc31',
  128.         'db/journals/joc/joc32',
  129.         'db/journals/joc/joc33',
  130.         'db/journals/joc/joc34',
  131.         'db/journals/tdsc/tdsc15',
  132.         'db/journals/tdsc/tdsc16',
  133.         'db/journals/tdsc/tdsc17',
  134.         'db/journals/tdsc/tdsc18',
  135.         'db/journals/tifs/tifs13',
  136.         'db/journals/tifs/tifs14',
  137.         'db/journals/tifs/tifs15',
  138.         'db/journals/tifs/tifs16',
  139.     ]
  140.  
  141.     def filter_affiliation(notes):
  142.         for note in notes:
  143.             if 'shanghai jiao tong' in note["$"].lower():
  144.                 return True
  145.  
  146.         return False
  147.  
  148.     interested_authors = set(
  149.         map(
  150.             lambda author: author['author']['@pid'],
  151.             filter(
  152.                 lambda author: filter_affiliation(author['note']) if type(author['note']) == list else filter_affiliation([author['note']]),
  153.                 filter(lambda author: 'note' in author, author_list.values())
  154.             )
  155.         )
  156.     )
  157.  
  158.     interested_publications = []
  159.  
  160.     for pub in reduce(list.__add__, main(pubs), []):
  161.         try:
  162.             if pub.get('author'):
  163.                 author = pub['author']['author']
  164.                 if type(author) == list:
  165.                     for a in pub['author']['author']:
  166.                         if a['@pid'] in interested_authors:
  167.                             interested_publications.append(pub)
  168.                 elif pub['author']['author']['@pid'] in interested_authors:
  169.                     interested_publications.append(pub)
  170.  
  171.         except TypeError:
  172.             print(json.dumps(pub))
  173.             raise
  174.  
  175.     for pub in interested_publications:
  176.         print(pub['key'], pub['author']['author']['$'], pub['sequence'], sep=',')
  177.  
RAW Paste Data

Adblocker detected! Please consider disabling it...

We've detected AdBlock Plus or some other adblocking software preventing Pastebin.com from fully loading.

We don't have any obnoxious sound or popup ads; we actively block these annoying types of ads!

Please add Pastebin.com to your ad blocker whitelist or disable your adblocking software.

×