Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
## example of usage: https://stackoverflow.com/a/75206102/6146136
## for nested headers, see https://pastebin.com/vsKc9Qq5
def _css_quote(value):
    """Escape *value* so it can sit inside a double-quoted CSS string."""
    # Backslashes first, otherwise the ones added for quotes get doubled.
    return value.replace('\\', '\\\\').replace('"', '\\"')


def get_wikiSection(header, wSoup, sec1Header='? Abstract ?'):
    """Extract one top-level section from a parsed page.

    Sections are delimited by ``h2`` elements that contain a
    ``span.mw-headline[id]``; a section is every sibling between its
    heading and the next ``h2``.

    Args:
        header: Which section to extract. One of
            * a falsy value - the leading section, i.e. everything before
              the first heading (it has no heading of its own);
            * a ``str`` - the ``id`` of the heading's ``span.mw-headline``;
            * a tag-like object containing a ``span.mw-headline[id]``
              (it must offer ``select_one`` and ``get_text``).
        wSoup: Parsed document offering ``select`` and ``select_one`` with
            CSS selectors (presumably a BeautifulSoup object - confirm
            against the caller).
        sec1Header: Label reported as ``sectionHeader`` for the leading
            section.

    Returns:
        ``{'errorMsg': ...}`` when the requested heading (or the headline
        span inside a given tag) cannot be found; otherwise a dict with
        ``headerId``, ``sectionHeader``, ``sectionText`` and
        ``sectionHtml``. The last two are ``None`` when no element matched.
    """
    anyHeading = 'h2:has(span.mw-headline[id])'

    if not header:
        hId = None  # first section has no header
    elif isinstance(header, str):
        hId = header
        hSel = f'h2:has(span.mw-headline[id="{_css_quote(hId)}"])'
        header = wSoup.select_one(hSel)
        if header is None:
            return {'errorMsg': f'Not found: {hSel}'}
    else:
        headline = header.select_one('span.mw-headline[id]')
        if headline is None:
            # Mirror the string path instead of failing on None['id'].
            return {'errorMsg': 'Not found: span.mw-headline[id] in header'}
        hId = headline['id']

    if hId:
        hSel = f'h2:has(span.mw-headline[id="{_css_quote(hId)}"])'
        # Siblings after the heading, minus anything that comes after a
        # later h2, minus h2 elements themselves.
        sSel = f'{hSel}~*:not({hSel}~h2~*):not(h2)'
        header = header.get_text(' ').strip()
    else:
        # Elements followed by some heading but not preceded by one.
        sSel = f'*:has(~{anyHeading}):not({anyHeading}~*)'
        header = sec1Header

    sect = wSoup.select(sSel)
    if sect:
        sText = '\n'.join(s.get_text(' ').strip() for s in sect)
        # Collapse each element's pretty-printed markup onto one line.
        sHtml = '\n'.join(''.join(s.prettify().splitlines()) for s in sect)
    else:
        sText = sHtml = None

    return {
        'headerId': hId,
        'sectionHeader': header,
        'sectionText': sText,
        'sectionHtml': sHtml,
    }
Advertisement
Add Comment
Please, Sign In to add comment