## for https://stackoverflow.com/q/75828625/6146136
import pandas as pd

output_fp = 'digital-innovation-hubs.csv'
base_url = 'https://s3platform.jrc.ec.europa.eu/digital-innovation-hubs-tool'
dfList, pg_num, max_pg = [], 0, None  ## set max_pg to an int to cap the page count

## request pages until max_pg is reached or a page raises (e.g., no table found)
while (pg_num := pg_num + 1) and (not isinstance(max_pg, int) or pg_num < max_pg):
    pg_url = f'{base_url}?_eu_europa_ec_jrc_dih_web_DihWebPortlet_cur={pg_num}'
    try:
        # dfList += pd.read_html(pg_url, extract_links='all')[:1]  ## [needs v1.5.0.]
        dfList += pd.read_html(pg_url)[:1]  ## keep only the first table on the page
    except Exception as e:
        print(f'\n{e!r} from {pg_url}')
        pg_num = -1  ## the walrus above makes this 0 on the next pass -> falsy -> loop ends
    else:
        print('', end=f'\rScraped {len(dfList[-1])} rows from {pg_url}')

# pd.concat(dfList).to_csv(output_fp, index=False)  ## save without page numbers
## tag every row with its source page, then drop the within-page row index
df = pd.concat(dfList, keys=list(range(1, len(dfList) + 1)), names=['from_pg', 'pgi'])
df.reset_index().drop('pgi', axis='columns').to_csv(output_fp, index=False)
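
## Note: with the commented-out extract_links='all' variant above (pandas>=1.5.0),
## every cell and column header comes back as a (text, href) tuple rather than a
## string. A minimal sketch, assuming that variant was used, of how the tuples
## could be flattened into a text frame and a matching frame of link targets:
# df.columns = [c[0] if isinstance(c, tuple) else c for c in df.columns]
# links = df.applymap(lambda c: c[1] if isinstance(c, tuple) else None)
# texts = df.applymap(lambda c: c[0] if isinstance(c, tuple) else c)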