## extract table rows from pdf

## demonstrated at https://stackoverflow.com/a/74973195/6146136
## for building table rows from pdf documents based on character positions extracted with minecart
## [each row is represented as a dictionary in a list of dictionaries]
### [merged cells cannot be detected]
### [merged cells across multiple rows are blank after the first row]
### [merged cells across multiple columns end up broken]


#!pip install minecart # https://github.com/felipeochoa/minecart#installation
import minecart

######################################### GENERAL FUNCTIONS #########################################
## [mostly] for printing error and returning some null value ##
def vRet(toPrint, toReturn=None):
    """Print a message (shortened when long) and hand back a fallback value.

    Messages whose string form is 200 characters or longer are cut to the
    first 197 characters followed by '...'; shorter ones are printed as-is.
    `toReturn` is returned untouched so callers can `return vRet(msg, value)`.
    """
    asText = str(toPrint)
    if len(asText) < 200:
        print(toPrint)
    else:
        print(asText[:197] + '...')
    return toReturn

## for reducing/normalizing whitespace ##
def miniStr(obj, lineSep=' ', wordSep=' '):
    """Return `str(obj)` with whitespace normalized.

    Each non-blank line is reduced to its words joined by `wordSep`; blank
    (or whitespace-only) lines are dropped and the remaining lines are joined
    by `lineSep`.
    """
    cleanedLines = []
    for rawLine in str(obj).splitlines():
        words = rawLine.split()
        if not words:
            # whitespace-only line: nothing to keep
            continue
        cleanedLines.append(wordSep.join(words))
    return lineSep.join(cleanedLines)
#####################################################################################################


######################################## GET ROWS  FROM PAGE ########################################
#################################### ABOUT ARGS ####################################
# pdfDoc: a minecart.miner.Document object or a string with pdf filepath
# colCt: number of columns or list of columns' left-border positions
## [if colCt is an integer it will try to guess left-border positions from frequencies]
# pgNum, startPos: page to traverse and lettering index to start at
####################################################################################
def pdfPg_table(pdfDoc, colCt=None, pgNum=0, startPos=0):
    """Build table rows from one pdf page based on lettering positions.

    Every lettering is assigned to a column by counting how many column
    left-borders are at or to the left of the lettering's left edge
    (`get_bbox()[0]`). Text is accumulated per cell; a new row starts whenever
    the column index drops (i.e. the text jumps back to the left).

    Args:
        pdfDoc: an object with `get_page(index)` (a minecart Document) or a
            string with the pdf filepath. A file opened here from a filepath
            is closed again before returning.
        colCt: number of columns (positive int) or a list/tuple of the
            columns' left-border positions. With an int, the `colCt` most
            frequent left-edge positions on the page are used as borders.
        pgNum: index of the page to traverse.
        startPos: index of the lettering to start building rows at.

    Returns:
        A list of dicts, one per row, with keys 'col_0' .. 'col_<n>' where n
        is the number of columns. 'col_0' holds text found left of the first
        border; cells a row never reached are None. On invalid arguments a
        message is printed and an empty list is returned.
    """
    from collections import Counter

    if not (colCt and pdfDoc):
        return vRet('invalid pdfDoc and/or colCt', [])
    countGiven = isinstance(colCt, int)
    # a negative count would silently slice from the wrong end and yield empty rows
    if (countGiven and colCt < 0) or not (countGiven or isinstance(colCt, (list, tuple))):
        return vRet(f'invalid colCt argument: {miniStr(colCt)}', [])

    ownedFile = None  # only set when this call opened the file itself
    try:
        if isinstance(pdfDoc, str):
            ownedFile = open(pdfDoc, 'rb')
            pdfDoc = minecart.Document(ownedFile)

        # fetch the page once and reuse it for border guessing and row building
        page = pdfDoc.get_page(pgNum)
        if page is None:
            # NOTE(review): minecart appears to return None for an
            # out-of-range page index - confirm against the library
            return vRet(f'page {pgNum} not found', [])
        letterings = page.letterings

        if countGiven:
            # most frequent left edges are taken as the column borders
            edgeFreq = Counter(l.get_bbox()[0] for l in letterings)
            colStarts = [edge for edge, _ in edgeFreq.most_common(colCt)]
        else:
            colStarts, colCt = list(colCt), len(colCt)

        rows, curRow, curCol, cellParts = [], {}, 0, []
        for l in letterings[startPos:]:
            leftEdge = l.get_bbox()[0]
            cp = sum(1 for p in colStarts if not p > leftEdge)
            if curCol != cp:
                # column changed: close the current cell
                curRow[f'col_{curCol}'] = ''.join(cellParts).strip()
                if curCol > cp:
                    # moved back to the left: the current row is finished
                    rows.append(curRow)
                    curRow = {}
                cellParts, curCol = [], cp
            cellParts.append(f'{l}')
        cellTxt = ''.join(cellParts)
        if cellTxt: curRow[f'col_{curCol}'] = cellTxt.strip()
        if curRow: rows.append(curRow)

        colKeys = [f'col_{i}' for i in range(colCt + 1)]
        return [{c: r.get(c) for c in colKeys} for r in rows]
    finally:
        if ownedFile is not None:
            ownedFile.close()
#####################################################################################################


########################################### GET PAGE TEXT ###########################################
#################################### ABOUT ARGS ####################################
# pdfPg: a page or page.letterings
# startPos, endPos: start and end indices [if you don't want the whole page]
# norm_ws: set True to normalize whitespaces
# mkLowr: set True to make all chars lowercase (for case-insensitive search)
####################################################################################
def getPdfPgText(pdfPg, startPos=0, endPos=None, norm_ws=False, mkLowr=False):
    """Join the letterings of a page (or a slice of them) into one string.

    Args:
        pdfPg: a minecart page or a page's `letterings` collection; anything
            else prints an error and yields ''.
        startPos: index of the first lettering to include.
        endPos: index to stop before; None means up to the last lettering and
            0 gives an empty range (like ordinary slicing).
        norm_ws: set True to normalize whitespace with `miniStr`.
        mkLowr: set True to lowercase the text (for case-insensitive search).

    Returns:
        The concatenated text, '' when `pdfPg` is not an accepted type.
    """
    if not isinstance(pdfPg, (minecart.content.GraphicsCollection, minecart.content.Page)):
        return vRet(f'invalid pdfPg argument: {miniStr(pdfPg)}', '')
    # NOTE(review): presumably GraphicsCollection is list-like, so a letterings
    # collection is used directly while a page contributes its .letterings
    ppll = pdfPg if isinstance(pdfPg, list) else pdfPg.letterings
    # slice with endPos as given: a truthiness test here used to turn
    # endPos=0 into "no end" and return the rest of the page
    txt = ''.join(ppll[startPos:endPos])
    if norm_ws: txt = miniStr(txt)
    return txt.lower() if mkLowr else txt
#####################################################################################################


############################################# FIND TEXT #############################################
####################################### ABOUT ARGS #######################################
# dPdf & sf: a minecart.miner.Document object to search through and a string to search for
# pgi: a page to focus on [if null search all pages, if int search only that page]
## [if pgi is a stringified int start search there, but continue to others if no matches]
# findAll: set True to keep searching even after finding the first match
# norm_ws: set True to normalize whitespaces before search
# ignoreCase: set True for case-insensitive search
##########################################################################################
## output will be the first "match" [or None], or [if findAll is True] a list of "matches" ##
## each "match" is a tuple with page-number as well as lettering index, bbox, and lettering of 1st char ##
def findInPdf(dPdf, sf, pgi=None, findAll=False, norm_ws=True, ignoreCase=False):
    """Search the letterings of a pdf document for a string.

    Args:
        dPdf: document object providing `iter_pages()` and `get_page(index)`.
        sf: the string to search for.
        pgi: page to focus on. None searches all pages; an int searches only
            that page; a stringified int tries that page first and falls back
            to searching every page when it has no match there.
        findAll: set True to collect every match instead of stopping at the
            first one.
        norm_ws: set True to normalize whitespace before comparing.
        ignoreCase: set True for case-insensitive search.

    Returns:
        A match tuple `(page_number, lettering_index, bbox, lettering)` for the
        first match, or None when nothing is found; with `findAll=True`, a
        (possibly empty) list of such tuples.
    """
    if norm_ws: sf = miniStr(sf)
    if ignoreCase: sf = sf.lower()
    sflen = len(sf)
    strict_pgi = isinstance(pgi, int)
    pgi = int(str(pgi)) if str(pgi).isdigit() else None

    firstPgNum = 0  # page number of the first item in spp
    if pgi is None:
        spp = dPdf.iter_pages()
    else:
        lpgi = dPdf.get_page(pgi)
        if sf in getPdfPgText(lpgi, norm_ws=norm_ws, mkLowr=ignoreCase):
            spp, firstPgNum = [lpgi], pgi
        elif strict_pgi:
            # keep the return shape consistent with the end of the function
            return [] if findAll else None
        else:
            # fallback covers the whole document, so numbering restarts at 0
            spp = dPdf.iter_pages()

    sfHead = sf[:1]
    matchList = []
    for pgNum, pg in enumerate(spp, firstPgNum):
        lpgi = pg.letterings
        if sf not in getPdfPgText(lpgi, norm_ws=norm_ws, mkLowr=ignoreCase):
            continue
        for i, l in enumerate(lpgi):
            # cheap first-character filter; sf is already lowercased when
            # ignoreCase is set, so the lettering must be lowercased as well
            lHead = str(l).lower()[:1] if ignoreCase else str(l)[:1]
            if sfHead != lHead: continue
            txt_i = getPdfPgText(
                lpgi, i, i+sflen, norm_ws=norm_ws, mkLowr=ignoreCase)
            if txt_i.startswith(sf):
                match = (pgNum, i, l.get_bbox(), l)
                if not findAll: return match
                matchList.append(match)
    return matchList if findAll else None
#####################################################################################################


######################################## GET ROWS ACROSS DOC ########################################
#################################### ABOUT ARGS ####################################
# pDoc: a minecart.miner.Document object or a string with pdf filepath
# startSeq: expected text from first row or line before
## [if startSeq is empty, whole page is treated like a table]
# colHint: integer [number of columns] or list of floats [left-border positions] or
## a list of sample strings - one from each column to find and pinpoint positions
## [try to get unique strings, and they should be from the first page of table]
# skipRows: number of rows to skip [including row with startSeq]
# pages: can be 'all' or an integer or a list of page indices [1st index is 0]
####################################################################################
def _getRows_fromDoc(pDoc, startSeq, colHint, skipRows, pages):
    """Validate the arguments and collect table rows page by page."""
    if not isinstance(pDoc, minecart.miner.Document):
        return vRet(f'invalid document: {miniStr(pDoc)[:100]}', [])
    if not colHint: return vRet('columns hint is required', [])

    sampleHints = False  # True when colHint holds sample strings to locate
    if isinstance(colHint, int):
        colStarts = colHint
    elif isinstance(colHint, list) and all(isinstance(ch, str) for ch in colHint):
        colStarts, sampleHints = [], True
    elif isinstance(colHint, list) and all(
            isinstance(ch, (int, float)) and not isinstance(ch, bool)
            for ch in colHint):
        colStarts = colHint
    else:
        colStarts = None
    if colStarts is None: return vRet(f'invalid columns hint: {colHint}', [])
    if not (isinstance(skipRows, int) and skipRows > -1):
        return vRet(f'invalid skipRows argument: {skipRows}', [])

    # only page indices are needed below; pages are re-read by index
    if pages == 'all':
        pgNums = (pi for pi, _ in enumerate(pDoc.iter_pages()))
    elif isinstance(pages, int):
        pgNums = [pages]
    elif isinstance(pages, list) and all(isinstance(p, int) for p in pages):
        wanted = set(pages)
        pgNums = [pi for pi, _ in enumerate(pDoc.iter_pages()) if pi in wanted]
    else:
        pgNums = []
    if not pgNums: return vRet(f'invalid pages argument: {pages}', [])

    rowsList, matchPages = [], 0
    for pi in pgNums:
        startPos = 0
        if startSeq:
            pgStart = findInPdf(pDoc, str(startSeq), pi)
            if not pgStart: continue  # no table start on this page
            matchPages, startPos = (matchPages + 1), pgStart[1]
        if sampleHints and not colStarts:
            # resolve the sample strings once, on the first table page
            found = [findInPdf(pDoc, csSamp, pi) for csSamp in colHint]
            for csSamp, hit in zip(colHint, found):
                if not hit: return vRet(
                    f'failed to find column hint "{csSamp}" on page {pi}', [])
            # left edge of each sample's first character
            colStarts = [hit[2][0] for hit in found]
        rowsList += pdfPg_table(pDoc, colStarts, pi, startPos)[skipRows:]

    if startSeq and not matchPages: print(f'failed to find "{startSeq}"')
    return rowsList


def getRows_fromPdfDoc(pDoc, startSeq='', colHint='', skipRows=0, pages='all'):
    """Collect table rows across the pages of a pdf document.

    Args:
        pDoc: a minecart.miner.Document object or a string with pdf filepath.
            A file opened here from a filepath is closed before returning.
        startSeq: expected text from the first row or the line before it; it
            is searched on every page and pages without it are skipped. When
            empty, the whole page is treated like a table.
        colHint: an integer (number of columns), a list of numbers (columns'
            left-border positions), or a list of sample strings - one from
            each column - that are located on the first table page to
            pinpoint the borders.
        skipRows: number of rows to drop from the start of each page's rows.
        pages: 'all', a page index, or a list of page indices [1st index is 0].

    Returns:
        A list of row dicts as produced by `pdfPg_table`; an empty list (after
        printing a message) when an argument is invalid or a column sample
        cannot be found.
    """
    ownedFile = None  # only set when this call opened the file itself
    if isinstance(pDoc, str):
        try:
            ownedFile = open(pDoc, 'rb')
            pDoc = minecart.Document(ownedFile)
        except Exception as e:
            # best-effort: surfaces through the invalid-document message
            pDoc = f'{type(e)} {e}'
    try:
        return _getRows_fromDoc(pDoc, startSeq, colHint, skipRows, pages)
    finally:
        if ownedFile is not None:
            ownedFile.close()
#####################################################################################################