Try95th

get_wikiSection

Jan 23rd, 2023 (edited)
143
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.21 KB | None | 0 0
  1. ## example of usage: https://stackoverflow.com/a/75206102/6146136
  2. ## for nested headers, see https://pastebin.com/vsKc9Qq5
  3.  
  4. def get_wikiSection(header, wSoup, sec1Header='? Abstract ?'):
  5.     sSel = 'h2:has(span.mw-headline[id])'
  6.     sSel = f'*:has(~{sSel}):not({sSel}~*)'
  7.     if not header: hId = hSel = None # [first section has no header]
  8.     elif isinstance(header, str):
  9.         hId, hSel = header, f'h2:has(span.mw-headline[id="{header}"])'
  10.         header = wSoup.select_one(hSel)
  11.         if not header: return {'errorMsg': f'Not found: {hSel}'}
  12.     else: hId = header.select_one('span.mw-headline[id]')['id']
  13.     ## header SHOULD BE: None/hId/a tag containing span.mw-headline[id] ##
  14.  
  15.     if hId:
  16.         hSel = f'h2:has(span.mw-headline[id="{hId}"])'
  17.         sSel = f'{hSel}~*:not({hSel}~h2~*):not(h2)'
  18.         header = header.get_text(' ').strip()
  19.     else: header = sec1Header
  20.    
  21.     sect = wSoup.select(sSel)
  22.     sText = '\n'.join([s.get_text(' ').strip() for s in sect])
  23.     sHtml = '\n'.join([''.join(s.prettify().splitlines()) for s in sect])
  24.     if not sect: sText = sHtml = None
  25.     return {'headerId': hId, 'sectionHeader': header,
  26.             'sectionText': sText, 'sectionHtml': sHtml}
Advertisement
Add Comment
Please, Sign In to add comment