Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## demonstrated at https://stackoverflow.com/a/74973195/6146136
- ## for building table rows from pdf documents based on character positions extracted with minecart
- ## [each row is represented as a dictionary in a list of dictionaries]
- ### [merged cells cannot be detected]
- ### [merged cells across multiple rows are blank after the first row]
- ### [merged cells across multiple columns end up broken]
- #!pip install minecart # https://github.com/felipeochoa/minecart#installation
- import minecart
- ######################################### GENERAL FUNCTIONS #########################################
- ## [mostly] for printing error and returning some null value ##
def vRet(toPrint, toReturn=None):
    """Print a message (shortened if very long) and return a fallback value.

    Mostly used to report an error and bail out of a function in one line.

    Args:
        toPrint: object to print; if its string form is 200 characters or
            longer, only the first 197 characters plus '...' are printed.
        toReturn: value handed back to the caller unchanged.

    Returns:
        ``toReturn``.
    """
    message = str(toPrint)
    if len(message) >= 200:
        # Keep console output readable: cap at 200 characters in total.
        toPrint = message[:197] + '...'
    print(toPrint)
    return toReturn
- ## for reducing/normalizing whitespace ##
def miniStr(obj, lineSep=' ', wordSep=' '):
    """Collapse the whitespace in the string form of ``obj``.

    Args:
        obj: any object; it is converted with ``str()`` first.
        lineSep: separator placed between the non-blank lines.
        wordSep: separator placed between the words of each line.

    Returns:
        A string in which blank lines are dropped, the words of every
        remaining line are joined by ``wordSep`` and the lines themselves
        are joined by ``lineSep``.
    """
    compacted = []
    for line in str(obj).splitlines():
        words = line.split()
        if words:  # skip lines that hold nothing but whitespace
            compacted.append(wordSep.join(words))
    return lineSep.join(compacted)
- #####################################################################################################
- ######################################## GET ROWS FROM PAGE ########################################
- #################################### ABOUT ARGS ####################################
- # pdfDoc: a minecart.miner.Document object or a string with pdf filepath
- # colCt: number of columns or list of columns' left-border positions
- ## [if colCt is an integer it will try to guess left-border positions from frequencies]
- # pgNum, startPos: page to traverse and lettering index to start at
- ####################################################################################
def pdfPg_table(pdfDoc, colCt=None, pgNum=0, startPos=0):
    """Build table rows from the letterings of one PDF page.

    Every lettering is assigned to a column by comparing the first value of
    its bounding box (its left edge) with the columns' left-border
    positions. Text is accumulated per cell; a new row starts whenever the
    column index drops below the previous one.

    Args:
        pdfDoc: a document object offering ``get_page(n).letterings`` (e.g.
            a ``minecart`` document) or a string holding a pdf file path.
        colCt: either a list of left-border positions, or a positive
            integer N, in which case the N most frequent left-edge values
            on the page are used as column borders.
        pgNum: index of the page to traverse.
        startPos: index of the lettering to start at.

    Returns:
        A list of dicts, one per row, with keys ``col_0`` .. ``col_N``.
        ``col_0`` holds text lying left of the first border; cells that
        were never reached in a row are ``None``. An empty list is returned
        (after printing a message) for invalid arguments.
    """
    from collections import Counter

    if not (colCt and pdfDoc):
        return vRet('invalid pdfDoc and/or colCt', [])
    if not isinstance(colCt, (int, list)) or (
            isinstance(colCt, int) and colCt < 1):
        return vRet(f'invalid colCt argument: {miniStr(colCt)}', [])
    if isinstance(pdfDoc, str):
        # NOTE(review): this file handle is never closed; presumably the
        # document reads from it lazily - confirm before adding a close().
        pdfDoc = minecart.Document(open(pdfDoc, 'rb'))

    letterings = pdfDoc.get_page(pgNum).letterings
    if isinstance(colCt, int):
        # Guess the borders: the left-edge values shared by most letterings.
        leftEdges = Counter(l.get_bbox()[0] for l in letterings)
        colStarts = [pos for pos, _ in leftEdges.most_common(colCt)]
    else:
        colStarts, colCt = colCt, len(colCt)

    rows, curRow, curCol, cellParts = [], {}, 0, []
    for l in letterings[startPos:]:
        left = l.get_bbox()[0]
        # Column index = number of borders at or left of this lettering.
        cp = sum(1 for p in colStarts if not p > left)
        if curCol != cp:
            curRow[f'col_{curCol}'] = ''.join(cellParts).strip()
            if curCol > cp:
                # Moving back to an earlier column means a new row began.
                rows.append(curRow)
                curRow = {}
            cellParts, curCol = [], cp
        cellParts.append(f'{l}')

    cellTxt = ''.join(cellParts)
    if cellTxt:
        curRow[f'col_{curCol}'] = cellTxt.strip()
    if curRow:
        rows.append(curRow)

    colKeys = [f'col_{i}' for i in range(colCt + 1)]
    return [{c: r.get(c) for c in colKeys} for r in rows]
- #####################################################################################################
- ########################################### GET PAGE TEXT ###########################################
- #################################### ABOUT ARGS ####################################
- # pdfPg: a page or page.letterings
- # startPos, endPos: start and end indices [if you don't want the whole page]
- # norm_ws: set True to normalize whitespaces
- # mkLowr: set True to make all chars lowercase (for case-insensitive search)
- ####################################################################################
def getPdfPgText(pdfPg, startPos=0, endPos=None, norm_ws=False, mkLowr=False):
    """Return the text of a page (or of a slice of its letterings).

    Args:
        pdfPg: a ``minecart.content.Page`` or a
            ``minecart.content.GraphicsCollection`` of letterings.
        startPos: index of the first lettering to include.
        endPos: index one past the last lettering to include; ``None``
            means "up to the end". ``0`` yields an empty string.
        norm_ws: set True to normalize whitespace with ``miniStr``.
        mkLowr: set True to lowercase the result (for case-insensitive
            search).

    Returns:
        The joined lettering text, or '' (after printing a message) when
        ``pdfPg`` is of an unsupported type.
    """
    if not isinstance(pdfPg, (minecart.content.GraphicsCollection, minecart.content.Page)):
        return vRet(f'invalid pdfPg argument: {miniStr(pdfPg)}', '')
    ppll = pdfPg if isinstance(pdfPg, list) else pdfPg.letterings
    # Slice with endPos directly: a None bound already means "to the end",
    # and testing endPos for truthiness would wrongly treat 0 as "no end".
    txt = ''.join(ppll[startPos:endPos])
    if norm_ws:
        txt = miniStr(txt)
    return txt.lower() if mkLowr else txt
- #####################################################################################################
- ############################################# FIND TEXT #############################################
- ####################################### ABOUT ARGS #######################################
- # dPdf & sf: a minecart.miner.Document object to search through and a string to search for
- # pgi: a page to focus on [if null search all pages, if int search only that page]
- ## [if pgi is a stringified int start search there, but continue to others if no matches]
- # findAll: set True to keep searching even after finding the first match
- # norm_ws: set True to normalize whitespaces before search
- # ignoreCase: set True for case-insensitive search
- ##########################################################################################
- ## output will be the first "match" [or None], or [if findAll is True] a list of "matches" ##
- ## each "match" is a tuple with page-number as well as lettering index, bbox, and lettering of 1st char ##
def findInPdf(dPdf, sf, pgi=None, findAll=False, norm_ws=True, ignoreCase=False):
    """Search a pdf document for a string and locate its first character.

    Args:
        dPdf: document object offering ``iter_pages()`` and ``get_page(n)``.
        sf: the string to search for.
        pgi: page to focus on. ``None`` searches all pages; an int searches
            only that page; a stringified int starts with that page but
            falls back to all pages when it holds no match.
        findAll: set True to collect every match instead of stopping at
            the first one.
        norm_ws: set True to normalize whitespace (in ``sf`` and in the
            page text) before comparing.
        ignoreCase: set True for a case-insensitive search.

    Returns:
        A match tuple ``(page number, lettering index, bbox, lettering)``
        describing the first character of the match, or ``None`` when
        nothing is found. With ``findAll=True`` a (possibly empty) list of
        such tuples is returned instead.
    """
    if norm_ws:
        sf = miniStr(sf)
    if ignoreCase:
        sf = sf.lower()
    sflen = len(sf)

    strict_pgi = isinstance(pgi, int)
    pgi = int(str(pgi)) if str(pgi).isdigit() else None
    # Page number of the first item in spp; it is only non-zero when the
    # search is narrowed down to the single requested page.
    firstPgNum = 0
    if pgi is None:
        spp = dPdf.iter_pages()
    else:
        lpgi = dPdf.get_page(pgi)
        if sf in getPdfPgText(lpgi, norm_ws=norm_ws, mkLowr=ignoreCase):
            spp, firstPgNum = [lpgi], pgi
        elif strict_pgi:
            return [] if findAll else None
        else:
            spp = dPdf.iter_pages()

    matchList = []
    for pgNum, pg in enumerate(spp, firstPgNum):
        lpgi = pg.letterings
        # Cheap page-level check before scanning individual letterings.
        if sf not in getPdfPgText(lpgi, norm_ws=norm_ws, mkLowr=ignoreCase):
            continue
        for i, l in enumerate(lpgi):
            firstCh = str(l)[:1]
            if ignoreCase:
                # sf was lowercased above, so the candidate must be too.
                firstCh = firstCh.lower()[:1]
            if sf[:1] != firstCh:
                continue
            # NOTE(review): the window spans sflen letterings; with norm_ws
            # a run of repeated whitespace inside it can make the
            # normalized text shorter than sf, so such matches are missed.
            txt_i = getPdfPgText(
                lpgi, i, i + sflen, norm_ws=norm_ws, mkLowr=ignoreCase)
            if txt_i.startswith(sf):
                match = (pgNum, i, l.get_bbox(), l)
                if not findAll:
                    return match
                matchList.append(match)
    return matchList if findAll else None
- #####################################################################################################
- ######################################## GET ROWS ACROSS DOC ########################################
- #################################### ABOUT ARGS ####################################
- # pDoc: a minecart.miner.Document object or a string with pdf filepath
- # startSeq: expected text from first row or line before
- ## [if startSeq is empty, whole page is treated like a table]
- # colHint: integer [number of columns] or list of floats [left-border positions] or
- ## a list of sample strings - one from each column to find and pinpoint positions
- ## [try to get unique strings, and they should be from the first page of table]
- # skipRows: number of rows to skip [including row with startSeq]
- # pages: can be 'all' or an integer or a list of page indices [1st index is 0]
- ####################################################################################
def getRows_fromPdfDoc(pDoc, startSeq='', colHint='', skipRows=0, pages='all'):
    """Collect table rows from one or more pages of a pdf document.

    Args:
        pDoc: a ``minecart.miner.Document`` or a string with a pdf file path.
        startSeq: text expected in the first row (or the line before it).
            Pages on which it cannot be found are skipped; if empty, every
            selected page is treated as a table from its first lettering.
        colHint: an integer (number of columns), a list of numbers
            (left-border positions), or a list of sample strings - one per
            column - that are searched for to pinpoint the positions.
        skipRows: number of rows to drop from the start of each page's
            rows (including the row containing ``startSeq``).
        pages: ``'all'``, a page index, or a list of page indices
            (the first index is 0).

    Returns:
        A list of row dicts as produced by ``pdfPg_table``; an empty list
        (after printing a message) when an argument is invalid or a column
        sample cannot be found.
    """
    if isinstance(pDoc, str):
        # NOTE(review): this file handle is never closed; presumably the
        # document reads from it lazily - confirm before adding a close().
        try:
            pDoc = minecart.Document(open(pDoc, 'rb'))
        except Exception as e:
            # Keep the error text so it shows up in the message below.
            pDoc = f'{type(e)} {e}'
    if not isinstance(pDoc, minecart.miner.Document):
        return vRet(f'invalid document: {miniStr(pDoc)[:100]}', [])

    if not colHint:
        return vRet('columns hint is required', [])
    if isinstance(colHint, int):
        colStarts = colHint
    elif isinstance(colHint, list):
        if all(isinstance(ch, str) for ch in colHint):
            colStarts = []  # positions get located on the first table page
        elif all(isinstance(ch, (int, float)) and not isinstance(ch, bool)
                 for ch in colHint):
            colStarts = colHint
        else:
            colStarts = None
    else:
        colStarts = None
    if colStarts is None:
        return vRet(f'invalid columns hint: {colHint}', [])

    if not (isinstance(skipRows, int) and skipRows > -1):
        return vRet(f'invalid skipRows argument: {skipRows}', [])

    if pages == 'all':
        pgList = enumerate(pDoc.iter_pages())
    elif isinstance(pages, int):
        pgList = [(pages, pDoc.get_page(pages))]
    elif isinstance(pages, list):
        if [p for p in pages if not isinstance(p, int)]:
            pgList = []
        else:
            pgList = [(pi, pg) for pi, pg in enumerate(
                pDoc.iter_pages()) if pi in pages]
    else:
        pgList = []
    if not pgList:
        return vRet(f'invalid pages argument: {pages}', [])

    rowsList, matchPages = [], 0
    for pi, pg in pgList:
        startPos = 0
        if startSeq:
            pgStart = findInPdf(pDoc, str(startSeq), pi)
            if not pgStart:
                continue
            matchPages, startPos = (matchPages + 1), pgStart[1]
        if not colStarts:
            # Locate every sample string; the left edge of its first
            # character becomes that column's left border.
            found = [findInPdf(pDoc, csSamp, pi) for csSamp in colHint]
            for csSamp, hit in zip(colHint, found):
                # Test the match itself, not the position: a border at
                # x == 0.0 is falsy but perfectly valid.
                if not hit:
                    return vRet(
                        f'failed to find column hint "{csSamp}" on page {pi}', [])
            colStarts = [hit[2][0] for hit in found]
        rowsList += pdfPg_table(pDoc, colStarts, pi, startPos)[skipRows:]
    if startSeq and not matchPages:
        print(f'failed to find "{startSeq}"')
    return rowsList
- #####################################################################################################
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement