Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Written for https://stackoverflow.com/q/75828625/6146136
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
def get_row_dict(trTag):
    """Flatten one table row into a ``{column name: value}`` dict.

    Every ``td`` cell carrying a ``data-ecl-table-header`` attribute
    contributes its visible text under that header name.  A cell that also
    contains an ``<a href=...>`` contributes the link target too:

    * if the cell has visible text, the URL is stored under an extra
      ``"<first word of header> Link"`` key;
    * if the cell has no visible text, the URL becomes the value stored
      under the header key itself.

    For the ``Email`` column a leading ``mailto:`` is stripped from the link.

    Args:
        trTag: the row element (a BeautifulSoup tag in this script).  It only
            needs ``select()`` returning cells that support item access,
            ``get_text()`` and ``find()``.

    Returns:
        dict of strings.  Text entries are inserted first, in cell order;
        newly created ``"... Link"`` entries follow.
    """
    header_attr = 'data-ecl-table-header'

    # Select the cells once and reuse them for both the text and link passes.
    cells = trTag.select(f'td[{header_attr}]')
    row = {td[header_attr]: td.get_text(' ', strip=True) for td in cells}

    # Second pass (kept separate so link columns land after all text columns).
    for td in cells:
        anchor = td.find('a', href=True)
        if anchor is None:
            continue

        header, link = td[header_attr], anchor['href']
        if header == 'Email' and link.startswith('mailto:'):
            link = link[len('mailto:'):]

        if row[header]:
            # An empty / whitespace-only header has no "first word"; fall back
            # to the raw header instead of raising IndexError.
            words = header.split()
            row[(words[0] if words else header) + ' Link'] = link
        else:
            # No visible text in the cell: the link is the cell's value.
            row[header] = link
    return row
# Scrape every page of the listing, following the "next page" link, and save
# all table rows to a single CSV file.
output_fp = 'digital-innovation-hubs.csv'
all_rows, pg_num, max_pg = [], 0, None  # set max_pg to an int to cap the pages fetched
next_link = 'https://s3platform.jrc.ec.europa.eu/digital-innovation-hubs-tool'
request_timeout = 30  # seconds; without a timeout requests.get can block forever
seen_links = set()  # guards against a "next" link that points at a visited page

while next_link and next_link not in seen_links:
    pg_num += 1
    if isinstance(max_pg, int) and pg_num > max_pg:
        break
    seen_links.add(next_link)

    try:
        pgReq = requests.get(next_link, timeout=request_timeout)
    except requests.RequestException as err:
        # Stop paging but fall through to to_csv so the rows gathered so far
        # are still written out.
        print(f'\n[{pg_num}] request to {next_link} failed: {err!r}')
        break

    # Non-2xx responses are still parsed (best effort); the status is shown
    # in the progress line below.
    pgSoup = BeautifulSoup(pgReq.content, 'lxml')
    rows = pgSoup.select('tr:has(td[data-ecl-table-header])')
    # Drop the 'from_pg' entry here to omit the page-number column.
    all_rows += [{'from_pg': pg_num, **get_row_dict(r)} for r in rows]

    ## just for printing ##
    # Last whitespace-separated token of the element marked aria-current="true"
    # (presumably the page number as displayed by the site); stays None when
    # no such element exists.
    pgNum = pgSoup.find('span', {'aria-current': "true", 'aria-label': True})
    if pgNum is not None:
        pgNum = ['', *pgNum.get_text(' ', strip=True).split()][-1]
    # Text after the last ':' of the counter element (presumably the total
    # number of results reported by the site).
    rowCt = pgSoup.find('div', class_='ecl-u-type-prolonged-s')
    rowCt = rowCt.text.split(':')[-1].strip() if rowCt is not None else 'UNKNOWN'
    vStr = f'{len(rows)} scraped [total: {len(all_rows)} of {rowCt}] - '
    vStr += f'<{pgReq.status_code} {pgReq.reason}> from {pgReq.url}'
    # '\r' rewrites the same console line; flush because no newline is emitted.
    print(f'\r[{pg_num}][{pgNum}] {vStr}', end='', flush=True)

    next_anchor = pgSoup.find('a', {'href': True, 'aria-label': 'Go to next page'})
    # urljoin leaves absolute hrefs untouched and resolves relative ones
    # against the URL that was actually fetched.
    next_link = urljoin(pgReq.url, next_anchor['href']) if next_anchor is not None else None

pd.DataFrame(all_rows).to_csv(output_fp, index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement