Advertisement
Try95th

s3platform_reqs+bs4+pandas for so_q_75828625

Mar 25th, 2023
103
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.91 KB | None | 0 0
  1. ## for https://stackoverflow.com/q/75828625/6146136
  2.  
  3.  
  4. import requests
  5. import pandas as pd
  6. from bs4 import BeautifulSoup
  7.  
  def get_row_dict(trTag) -> dict:
      """Flatten one table row into a ``{column name: value}`` dict.

      Every ``td`` that carries a ``data-ecl-table-header`` attribute
      contributes its text (joined with single spaces, stripped) under that
      header name.  Cells that also contain an ``<a href>`` contribute the
      first link's ``href`` as well:

      * under ``'<first word of header> Link'`` when the cell has text;
      * under the header name itself when the cell text is empty, so the
        link replaces the empty value;
      * under plain ``'Link'`` when the header is blank, which previously
        raised ``IndexError``.

      For the ``Email`` column a leading ``mailto:`` is removed from the link.

      Args:
          trTag: a row element exposing ``select(css)``; its cells must
              support ``cell[attr]``, ``get_text`` and ``find`` the way
              Beautiful Soup tags do.

      Returns:
          dict whose text columns come first and link columns after them
          (insertion order).
      """
      header_attr = 'data-ecl-table-header'

      row = {td[header_attr]: td.get_text(' ', strip=True)
             for td in trTag.select(f'td[{header_attr}]')}

      # Deliberately a second pass: link keys must be appended after *all*
      # text keys so the resulting column order stays stable.
      for td in trTag.select(f'td[{header_attr}]:has(a[href])'):
          k = td[header_attr]
          link = td.find('a', href=True)['href']
          if k == 'Email' and link.startswith('mailto:'):
              link = link.replace('mailto:', '', 1)

          if row[k]:
              # Cell has visible text: keep it and store the link separately.
              words = k.split()
              link_key = f'{words[0]} Link' if words else 'Link'
          else:
              # Cell text is empty: the link becomes the cell's value.
              link_key = k
          row[link_key] = link
      return row
  17.  
  # Scrape every page of the listing by following the "Go to next page" link,
  # collect one dict per table row, and write everything to a CSV file.
  from urllib.parse import urljoin  # used below to resolve the "next page" href

  output_fp = 'digital-innovation-hubs.csv'
  request_timeout = 30  # seconds; without a timeout a stalled server hangs the script

  # max_pg: set to an int to stop after that many pages; None follows the
  # "next page" link until there is none.
  all_rows, pg_num, max_pg = [], 0, None
  requested_links = set()  # guards against a "next" link that loops back
  next_link = 'https://s3platform.jrc.ec.europa.eu/digital-innovation-hubs-tool'
  while next_link:
      pg_num += 1
      if isinstance(max_pg, int) and pg_num > max_pg:
          break

      requested_links.add(next_link)
      pgReq = requests.get(next_link, timeout=request_timeout)
      pgSoup = BeautifulSoup(pgReq.content, 'lxml')
      rows = pgSoup.select('tr:has(td[data-ecl-table-header])')
      all_rows += [{'from_pg': pg_num, **get_row_dict(r)} for r in rows]
      # all_rows += [get_row_dict(r) for r in rows] # no "from_pg" column

      ## just for printing ##
      # Page number as shown by the site: last whitespace-separated token of
      # the pagination <span> marked aria-current="true".
      pgNum = pgSoup.find('span', {'aria-current': "true", 'aria-label': True})
      if pgNum:
          pgNum = ['', *pgNum.get_text(' ', strip=True).split()][-1]
      # NOTE(review): from_pg is computed but not used anywhere in the visible
      # code (the 'from_pg' column above uses pg_num) - kept as-is.
      from_pg = int(pgNum) if isinstance(pgNum, str) and pgNum.isdigit() else pg_num
      # Total as reported by the page: text after the last ':' in the
      # 'ecl-u-type-prolonged-s' <div>, if that element exists.
      rowCt = pgSoup.find('div', class_='ecl-u-type-prolonged-s')
      rowCt = rowCt.text.split(':')[-1].strip() if rowCt else 'UNKNOWN'
      vStr = f'{len(rows)} scraped [total: {len(all_rows)} of {rowCt}] - '
      vStr += f'<{pgReq.status_code} {pgReq.reason}> from {pgReq.url}'
      print(f'\r[{pg_num}][{pgNum}] {vStr}', end='')

      next_a = pgSoup.find('a', {'href': True, 'aria-label': 'Go to next page'})
      next_href = next_a['href'] if next_a is not None else ''
      # Resolve against the URL actually fetched so a relative href works too;
      # an empty href still ends the loop, exactly as before.
      next_link = urljoin(pgReq.url, next_href) if next_href else None
      if next_link in requested_links:
          next_link = None  # already fetched: stop instead of looping forever

  pd.DataFrame(all_rows).to_csv(output_fp, index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement