SHOW:
|
|
- or go back to the newest paste.
1 | ## for https://stackoverflow.com/q/75828625/6146136 | |
2 | ||
3 | # !!pip install --upgrade --pre pandas==1.5.0 ## this solution requires pandas 1.5.0 | |
4 | import pandas as pd | |
5 | ||
6 | - | dfList, pg_num, max_pg = [], 0, None |
6 | + | |
7 | dfList, pg_num, max_pg = [], 0, 3 | |
8 | base_url = 'https://s3platform.jrc.ec.europa.eu/digital-innovation-hubs-tool' | |
9 | while (pg_num:=pg_num+1) and (not isinstance(max_pg,int) or pg_num<max_pg): | |
10 | - | # try: dfList += pd.read_html(pg_url, extract_links='all')[:1] ## [needs v1.5.0.] |
10 | + | |
11 | - | try: dfList += pd.read_html(pg_url)[:1] |
11 | + | try: dfList += pd.read_html(pg_url, extract_links='all')[:1] |
12 | except Exception as e: pg_num, _ = -1, print(f'\n{e!r} from {pg_url}') | |
13 | - | else: print('', end=f'\rScraped {len(dfList[-1])} rows from {pg_url}') |
13 | + | else: print('', end=f'\rScraped {len(dfList[-1])} rows from {pg_url}') |
14 | ||
15 | - | # pd.concat(dfList).to_csv(output_fp, index=False) ## save without page numbers |
15 | + | # df = pd.concat(dfList) ## if you don't want to add a column for page numbers |
16 | - | df=pd.concat(dfList, keys=list(range(1,len(dfList)+1)),names=['from_pg','pgi']) |
16 | + | df = pd.concat( |
17 | - | df.reset_index().drop('pgi',axis='columns').to_csv(output_fp, index=False) |
17 | + | dfList, keys=list(range(1,len(dfList)+1)), names=['from_pg','pgi'] |
18 | ).reset_index().drop('pgi',axis='columns') | |
19 | ||
20 | ## clean up - separate links from text | |
21 | orig_cols = [c for c in df.columns if c != 'from_pg'] | |
22 | for ocn in orig_cols: | |
23 | if any(vals:=[cv for cv,*_ in df[ocn]]): df[ocn[0]] = vals | |
24 | if any(links:=[c[1] for c in df[ocn]]): df[ocn[0].split()[0]+' Links'] = links | |
25 | if 'Email Links' in df.columns: | |
26 | df['Email'] = df['Email Links'].str.replace('mailto:', '', 1) | |
27 | df = df.drop('Email Links', axis='columns') | |
28 | df = df.drop(orig_cols, axis='columns') | |
29 | ||
30 | ## save csv file | |
31 | df.to_csv(output_fp, index=False) |