Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
## extracts table data from bs4 table Tag
def read_htmlTable(tSoup, tSep=' ', colWidth_max=None, minUniq=2, minFilld=2, asObj=False):
    """Extract the cell text of an HTML table as rows, dicts or a text grid.

    Args:
        tSoup: bs4 ``Tag`` of the table. Only ``select``/``find`` on the
            table, ``select``/``find`` on its rows and ``get_text`` on its
            cells are used.
        tSep: separator passed to ``get_text`` for every cell. Lines inside a
            cell are re-joined with a space when ``tSep`` is ``' '`` and with
            a newline otherwise.
        colWidth_max: when an int greater than 5, every column of the text
            grid is rendered exactly this wide (longer cells are cut);
            otherwise each column is as wide as its longest cell.
        minUniq: columns with fewer distinct data values than this are
            dropped (an empty cell counts as a value).
        minFilld: columns with fewer non-empty data cells than this are
            dropped. Filtering is skipped only when both limits are <= 0.
        asObj: ``'lists'`` / ``'tuples'`` return the rows (header first),
            ``'dicts'`` returns one dict per data row keyed by header,
            ``'str+'`` returns ``{'asText': grid, 'asDict': dicts}``;
            anything else returns the text grid alone.

    Returns:
        The table in the shape selected by ``asObj``. A table without any
        ``<tr>`` yields the empty value of that shape instead of raising.

    Note:
        With the default limits, a table holding a single data row loses all
        of its columns, because no column can reach two distinct values.
    """
    rows = [
        [_table_cell_text(cell, tSep) for cell in tr.select('th, td')]
        for tr in tSoup.select('tr')
    ]
    if not rows:
        # max() over zero rows would raise; hand back an empty result instead.
        return _empty_table_result(asObj)

    colCt = max(len(r) for r in rows)
    if _has_simple_header(tSoup):
        # Placeholders are 1-based, matching the header-less branch below.
        header = [c if c else f'col_{i}' for i, c in enumerate(rows[0], 1)]
        # A header narrower than the data (e.g. colspan) is padded so that no
        # trailing column is lost when rows are zipped with the header.
        header += [f'col_{i}' for i in range(len(header) + 1, colCt + 1)]
        body = rows[1:]
    else:
        header = [f'col_{i}' for i in range(1, colCt + 1)]
        body = rows
    tData = [header] + body

    if minFilld > 0 or minUniq > 0:
        dropCols = set()
        for i in range(colCt):
            # Only data rows that actually reach column i contribute.
            values = [r[i] for r in body if len(r) > i]
            filled = sum(1 for v in values if v)
            if len(set(values)) < minUniq or filled < minFilld:
                dropCols.add(i)
        tData = [
            [c for i, c in enumerate(r) if i not in dropCols] for r in tData
        ]
        colCt = max(len(r) for r in tData)

    if asObj == 'lists':
        return tData
    if asObj == 'tuples':
        return [tuple(r) for r in tData]

    tDict = None
    if asObj in ('dicts', 'str+'):
        tDict = [dict(zip(tData[0], r)) for r in tData[1:]]
        if asObj == 'dicts':
            return tDict

    tStr = _render_table_grid(tData, colCt, colWidth_max)
    return {'asText': tStr, 'asDict': tDict} if asObj == 'str+' else tStr


def _table_cell_text(cell, tSep):
    """Return a cell's text with whitespace collapsed and blank lines removed."""
    line_sep = tSep if tSep == ' ' else '\n'
    lines = (' '.join(ln.split()) for ln in cell.get_text(tSep).splitlines())
    return line_sep.join(ln for ln in lines if ln)


def _has_simple_header(tSoup):
    """Tell whether the table has exactly one header row."""
    if len(tSoup.select('thead tr')) == 1:
        return True
    first_row = tSoup.find('tr')
    if first_row is not None and first_row.find('th') is not None:
        # The first row has header cells: simple only if no other row does.
        return len(tSoup.select('tr:has(th)')) == 1
    return False


def _empty_table_result(asObj):
    """Return the empty value matching the output shape chosen by ``asObj``."""
    if asObj in ('lists', 'tuples', 'dicts'):
        return []
    if asObj == 'str+':
        return {'asText': '', 'asDict': []}
    return ''


def _render_table_grid(tData, colCt, colWidth_max):
    """Render rows (header first) as a pipe-delimited, column-aligned grid."""
    if isinstance(colWidth_max, int) and colWidth_max > 5:
        widths = [colWidth_max] * colCt
    else:
        widths = [
            max(len(r[i]) for r in tData if len(r) > i) for i in range(colCt)
        ]
    grid = [
        # Short rows are padded with blanks so every line has colCt cells.
        [f'{c[:w]:{w}}' for c, w in zip(r + [''] * (colCt - len(r)), widths)]
        for r in tData
    ]
    # Separator line goes directly under the header row.
    grid.insert(1, ['-' * w for w in widths])
    return '\n'.join(f"| {' | '.join(r)} |" for r in grid)
Advertisement
Add Comment
Please, Sign In to add comment