Advertisement
Try95th

extract table rows from pdf

Dec 31st, 2022 (edited)
1,104
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 9.77 KB | None | 0 0
  1. ## demonstrated at https://stackoverflow.com/a/74973195/6146136
  2. ## for building table rows from pdf documents based on character positions extracted with minecart
  3. ## [each row is represented as a dictionary in a list of dictionaries]
  4. ### [merged cells cannot be detected]
  5. ### [merged cells across multiple rows are blank after the first row]
  6. ### [merged cells across multiple columns end up broken]
  7.  
  8.  
  9. #!pip install minecart # https://github.com/felipeochoa/minecart#installation
  10. import minecart
  11.  
  12. ######################################### GENERAL FUNCTIONS #########################################
  13. ## [mostly] for printing error and returning some null value ##
  14. def vRet(toPrint, toReturn=None):
  15.     print(toPrint if len(str(toPrint)) < 200 else (str(toPrint)[:197]+'...'))
  16.     return toReturn
  17.  
  18. ## for reducing/normalizing whitespace ##
  19. def miniStr(obj, lineSep=' ', wordSep=' '):
  20.     return lineSep.join(
  21.         wordSep.join(w for w in l.split() if w)
  22.         for l in str(obj).splitlines() if l.strip()
  23.     )
  24. #####################################################################################################
  25.  
  26.  
  27. ######################################## GET ROWS  FROM PAGE ########################################
  28. #################################### ABOUT ARGS ####################################
  29. # pdfDoc: a minecart.miner.Document object or a string with pdf filepath
  30. # colCt: number of columns or list of columns' left-border positions
  31. ## [if colCt is an integer it will try to guess left-border positions from frequencies]
  32. # pgNum, startPos: page to traverse and lettering index to start at
  33. ####################################################################################
  34. def pdfPg_table(pdfDoc, colCt=[], pgNum=0, startPos=0):
  35.     if not (colCt and pdfDoc): return vRet('invalid pdfDoc and/or colCt', [])
  36.     if isinstance(pdfDoc, str):
  37.         pdfDoc = minecart.Document(open(pdfDoc, 'rb'))    
  38.     if isinstance(colCt, int):
  39.         pos_y1s = [l.get_bbox()[0] for l in pdfDoc.get_page(pgNum).letterings]
  40.         colStarts = sorted(set(pos_y1s), key=(
  41.             lambda p: pos_y1s.count(p)), reverse=True)[:colCt]
  42.     elif isinstance(colCt, list): colStarts, colCt = colCt, len(colCt)
  43.     else: return vRet(f'invalid colCt argument: {miniStr(colCt)}', [])
  44.    
  45.     rows, curRow, curCol, cellTxt = [], {}, 0, ''
  46.     for l in pdfDoc.get_page(pgNum).letterings[startPos:]:
  47.         cp = len([p for p in colStarts if not p > l.get_bbox()[0]])
  48.         if curCol != cp:
  49.             curRow[f'col_{curCol}'] = cellTxt.strip()
  50.             if curCol > cp:
  51.                 rows.append(curRow)
  52.                 curRow = {}
  53.             cellTxt, curCol = '', cp
  54.         cellTxt += f'{l}'
  55.     if cellTxt: curRow[f'col_{curCol}'] = cellTxt.strip()
  56.     if curRow: rows.append(curRow)
  57.     return [{c: r[c] if c in r else None for c in [
  58.         f'col_{i}' for i in range(colCt+1)
  59.     ]} for r in rows]
  60. #####################################################################################################
  61.  
  62.  
  63. ########################################### GET PAGE TEXT ###########################################
  64. #################################### ABOUT ARGS ####################################
  65. # pdfPg: a page or page.letterings
  66. # startPos, endPos: start and end indices [if you don't want the whole page]
  67. # norm_ws: set True to normalize whitespaces
  68. # mkLowr: set True to make all chars lowercase (for case-insensitive search)
  69. ####################################################################################
  70. def getPdfPgText(pdfPg, startPos=0, endPos=None, norm_ws=False, mkLowr=False):
  71.     if not isinstance(pdfPg, (minecart.content.GraphicsCollection, minecart.content.Page)):
  72.         return vRet(f'invalid pdfPg argument: {miniStr(pdfPg)}', '')
  73.     ppll = pdfPg if isinstance(pdfPg, list) else pdfPg.letterings
  74.     txt = ''.join(ppll[startPos:endPos] if endPos else ppll[startPos:])
  75.     if norm_ws: txt = miniStr(txt)
  76.     return txt.lower() if mkLowr else txt
  77. #####################################################################################################
  78.  
  79.  
  80. ############################################# FIND TEXT #############################################
  81. ####################################### ABOUT ARGS #######################################
  82. # dPdf & sf: a minecart.miner.Document object to search through and a string to search for
  83. # pgi: a page to focus on [if null search all pages, if int search only that page]
  84. ## [if pgi is a stringified int start search there, but continue to others if no matches]
  85. # findAll: set True to keep searching even after finding the first match
  86. # norm_ws: set True to normalize whitespaces before search
  87. # ignoreCase: set True for case-insensitive search
  88. ##########################################################################################
  89. ## output will be the first "match" [or None], or [if findAll is True] a list of "matches" ##
  90. ## each "match" is a tuple with page-number as well as lettering-index, bbox, and lettering of 1st char ##
  91. def findInPdf(dPdf, sf, pgi=None, findAll=False, norm_ws=True, ignoreCase=False):
  92.     if norm_ws: sf = miniStr(sf)
  93.     if ignoreCase: sf = sf.lower()
  94.     sflen = len(sf)
  95.     strict_pgi = isinstance(pgi, int)
  96.     pgi = int(str(pgi)) if str(pgi).isdigit() else None
  97.     if pgi is None: spp = dPdf.iter_pages()
  98.     else:
  99.         lpgi = dPdf.get_page(pgi)
  100.         if sf not in getPdfPgText(lpgi, norm_ws=norm_ws, mkLowr=ignoreCase):
  101.             if strict_pgi: return None
  102.             spp = dPdf.iter_pages()
  103.         else: spp = [lpgi]
  104.  
  105.     matchList = []
  106.     for pgNum, pg in enumerate(spp, pgi if pgi else 0):
  107.         lpgi = pg.letterings
  108.         if sf not in getPdfPgText(lpgi, norm_ws=norm_ws, mkLowr=ignoreCase):
  109.             # print(f'"{sf}" not in page', pgNum+1)
  110.             continue
  111.         # print('~'*30, 'start page', pgNum+1, '~'*30)
  112.         for i, l in enumerate(lpgi):
  113.             if not sf[:1] == str(l)[:1]: continue
  114.             txt_i = getPdfPgText(
  115.                 lpgi, i, i+sflen, norm_ws=norm_ws, mkLowr=ignoreCase)
  116.             tp = f'getPdfPgText(pdfDoc.get_page({pgNum}).letterings,{i},'
  117.             tp += f'{sflen},norm_ws={norm_ws},mkLowr={ignoreCase})'
  118.             if txt_i.startswith(sf):
  119.                 if findAll: matchList.append((pgNum, i, l.get_bbox(), l))
  120.                 else: return (pgNum, i, l.get_bbox(), l)
  121.             # else: print(f'"{sf}" not at start of', txt_i, '<---', tp)
  122.         # print('~'*30, 'endof page', pgNum+1, '~'*30, '\n')
  123.     return matchList if findAll else None
  124. #####################################################################################################
  125.  
  126.  
  127. ######################################## GET ROWS ACROSS DOC ########################################
  128. #################################### ABOUT ARGS ####################################
  129. # pDoc: a minecart.miner.Document object or a string with pdf filepath
  130. # startSeq: expected text from first row or line before
  131. ## [if startSeq is empty, whole page is treated like a table]
  132. # colHint: integer [number of columns] or list of floats [left-border positions] or
  133. ## a list of sample strings - one from each column to find and pinpoint positions
  134. ## [try to get unique strings, and they should be from the first page of table]
  135. # skipRows: number of rows to skip [including row with startSeq]
  136. # pages: can be 'all' or an integer or a list of page indices [1st index is 0]
  137. ####################################################################################
  138. def getRows_fromPdfDoc(pDoc, startSeq='', colHint='', skipRows=0, pages='all'):
  139.     if isinstance(pDoc, str):
  140.         try: pDoc = minecart.Document(open(pDoc, 'rb'))  
  141.         except Exception as e: pDoc = f'{type(e)} {e}'
  142.    
  143.     if not isinstance(pDoc, minecart.miner.Document):
  144.         return vRet(f'invalid document: {miniStr(pDoc)[:100]}', [])
  145.     if not colHint: return vRet('columns hint is required', [])
  146.     if isinstance(colHint, int): colStarts = colHint
  147.     elif isinstance(colHint, list):
  148.         invch = [ch for ch in colHint if not isinstance(ch, type(colHint[0]))]
  149.         if invch: colStarts = None
  150.         elif isinstance(colHint[0], float): colStarts = colHint
  151.         elif isinstance(colHint[0], str): colStarts = []
  152.         else: colStarts = None  
  153.     else: colStarts = None
  154.     if colStarts is None: return vRet(f'invalid columns hint: {colHint}', [])
  155.     if not (isinstance(skipRows, int) and skipRows > -1):
  156.         return vRet(f'invalid skipRows argument: {skipRows}', [])
  157.  
  158.     if pages == 'all': pgList = enumerate(pDoc.iter_pages())
  159.     elif isinstance(pages, int): pgList = [(pages, pDoc.get_page(pages))]
  160.     elif isinstance(pages, list):
  161.         if [p for p in pages if not isinstance(p, int)]: pgList = []
  162.         else:
  163.             pgList = [(pi, pg) for pi, pg in enumerate(
  164.                 pDoc.iter_pages()) if pi in pages]
  165.     else: pgList = []
  166.     if not pgList: return vRet(f'invalid pages argument: {pages}', [])
  167.  
  168.     rowsList, matchPages = [], 0
  169.     for pi, pg in pgList:
  170.         startPos = 0
  171.         if startSeq:  
  172.             pgStart = findInPdf(pDoc, str(startSeq), pi)
  173.             if pgStart: matchPages, startPos = (matchPages + 1), pgStart[1]
  174.             else: continue
  175.         if not colStarts:
  176.             colStarts = [findInPdf(pDoc, csSamp, pi) for csSamp in colHint]
  177.             colStarts = [(clst[2][0] if clst else clst) for clst in colStarts]
  178.         for csSamp, clst in zip(colHint, colStarts):
  179.             if not clst: return vRet(
  180.                 f'failed to find column hint "{csSamp}" on page {pi}', [])
  181.         rowsList += pdfPg_table(pDoc, colStarts, pi, startPos)[skipRows:]
  182.    
  183.     if startSeq and not matchPages: print(f'failed to find "{startSeq}"')
  184.     return rowsList
  185. #####################################################################################################
  186.  
  187.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement