Try95th

read_htmlTable

Dec 5th, 2022 (edited)
195
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.27 KB | None | 0 0
  1. ## extracts table data from bs4 table Tag
  2.  
  3. def read_htmlTable(tSoup,tSep=' ',colWidth_max=None,minUniq=2,minFilld=2,asObj=False):
  4.     # returns str unless asObj is "lists" or "tuples" or "dicts" or "str+"
  5.     if len(tSoup.select('thead tr')) == 1:
  6.         simpleHeader = True
  7.     elif tSoup.find('tr') and tSoup.find('tr').find('th'):
  8.         simpleHeader = len(tSoup.select('tr:has(th)')) == 1
  9.     else:
  10.         simpleHeader = False
  11.  
  12.     tData = [[
  13.         (tSep if tSep==' ' else '\n').join([sl for sl in [
  14.             ' '.join([w for w in l.split() if w]).strip()
  15.             for l in c.get_text(tSep).splitlines()
  16.         ] if sl]) for c in r.select('th, td')
  17.     ] for r in tSoup.select('tr')]
  18.  
  19.     colCt = max([len(r) for r in tData])
  20.     if not simpleHeader:
  21.         tData = [[
  22.             f'col_{i}' for i in range(1, colCt+1)
  23.         ]] + tData
  24.     else: tData[0] = [c if c else f'col_{i}' for i, c in enumerate(tData[0])]
  25.     if minFilld > 0 or minUniq > 0:
  26.         colData = [[
  27.             r[i] for r in tData[1:] if len(r) > i
  28.         ] for i in range(colCt)]
  29.         colData = [(
  30.             list(set(cd)), len([d for d in cd if d])
  31.         ) for cd in colData]
  32.         dropCols = [i for i, cu in enumerate(colData) if
  33.             len(cu[0]) < minUniq or cu[1] < minFilld]
  34.         tData = [[
  35.             c for i, c in enumerate(r) if i not in dropCols
  36.         ] for r in tData]
  37.         colCt = max([len(r) for r in tData])
  38.  
  39.     if asObj == 'lists': return tData
  40.     if asObj == 'tuples': return [tuple(r) for r in tData]
  41.     if asObj in ['dicts', 'str+']:
  42.         tDict = [{
  43.             k: v for k, v in zip(tData[0], r)
  44.         } for r in tData[1:]]
  45.         if asObj == 'dicts': return tDict
  46.  
  47.     if type(colWidth_max) == int and colWidth_max > 5:
  48.         cw = [colWidth_max]*colCt
  49.     else:
  50.         cw = [max([
  51.             len(r[i]) for r in tData if len(r) > i
  52.         ]) for i in range(colCt)]
  53.      
  54.     tData_f = [[
  55.         f'{{:{w}}}'.format(c[:w]) for c, w in zip(r, cw)
  56.     ] for r in tData]
  57.  
  58.     tData_f = [tData_f[0]] + [['-'*w for w in cw]] + tData_f[1:]
  59.     tData_f = [' | '.join(r) for r in tData_f]
  60.  
  61.     tStr = '\n'.join([f'| {r} |' for r in tData_f])
  62.     return {'asText': tStr, 'asDict': tDict} if asObj == 'str+' else tStr
Advertisement
Add Comment
Please, Sign In to add comment