Advertisement
Try95th

s3platform_pandas_w_links for so_q_75828625

Mar 25th, 2023 (edited)
92
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. ## for https://stackoverflow.com/q/75828625/6146136
  2.  
  3. # !!pip install --upgrade --pre pandas==1.5.0 ## this solution requires pandas 1.5.0
  4. import pandas as pd
  5.  
  6. output_fp = 'digital-innovation-hubs.csv'
  7. dfList, pg_num, max_pg = [], 0, 3
  8. base_url = 'https://s3platform.jrc.ec.europa.eu/digital-innovation-hubs-tool'
  9. while (pg_num:=pg_num+1) and (not isinstance(max_pg,int) or pg_num<max_pg):
  10.     pg_url = f'{base_url}?_eu_europa_ec_jrc_dih_web_DihWebPortlet_cur={pg_num}'
  11.     try: dfList += pd.read_html(pg_url, extract_links='all')[:1]
  12.     except Exception as e: pg_num, _ = -1, print(f'\n{e!r} from {pg_url}')
  13.     else: print('', end=f'\rScraped {len(dfList[-1])} rows from {pg_url}')
  14.  
  15. # df = pd.concat(dfList) ## if you don't want to add a column for page numbers
  16. df = pd.concat(
  17.     dfList, keys=list(range(1,len(dfList)+1)), names=['from_pg','pgi']
  18. ).reset_index().drop('pgi',axis='columns')
  19.  
  20. ## clean up - separate links from text
  21. orig_cols = [c for c in df.columns if c != 'from_pg']
  22. for ocn in orig_cols:
  23.     if any(vals:=[cv for cv,*_ in df[ocn]]): df[ocn[0]] = vals
  24.     if any(links:=[c[1] for c in df[ocn]]): df[ocn[0].split()[0]+' Links'] = links
  25. if 'Email Links' in df.columns:
  26.     df['Email'] = df['Email Links'].str.replace('mailto:', '', 1)
  27.     df = df.drop('Email Links', axis='columns')
  28. df = df.drop(orig_cols, axis='columns')
  29.  
  30. ## save csv file
  31. df.to_csv(output_fp, index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement