Try95th

pdf2csv so_q_74924464

Dec 27th, 2022 (edited)
480
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.88 KB | None | 0 0
  1. ## for https://stackoverflow.com/q/74924464/6146136
  2. ## sample usage:
  3. #### extract_drugsDf('https://www.premera.com/documents/052166_2023.pdf') # [link to output below]
  4. #### https://docs.google.com/spreadsheets/d/1VLQRwX1gIPzJSZnypJCZmgziOL0HFkZVW8VqvnnH11I/edit?usp=sharing
  5.  
  6. import requests
  7. import pandas
  8.  
  9. # !pip install tabula-py
  10. from tabula import read_pdf
  11.  
  12. def float_or_miniStr(val):
  13.     try: return float(str(val).strip())
  14.     except: return ' '.join(w for w in str(val).split() if w)
  15.  
  16. def extract_drugsDf(pdfLink, saveCsv=True, saveUnfixed=True):
  17.     ### FETCH PDF DATA ###
  18.     r = requests.get(pdfLink, headers={'user-agent': (
  19.         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
  20.         + ' (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
  21.     )})
  22.     r.raise_for_status()
  23.  
  24.     ### SAVE&READ PDF ###
  25.     fileName = pdfLink.split('/')[-1].strip('.pdf').replace('.', '-')+'.pdf'
  26.     with open(fileName, 'wb') as f: f.write(r.content)
  27.     dfList = read_pdf(fileName, pages='all')
  28.  
  29.     ### put together the drug list dataframes ###
  30.     ddf = pandas.concat([
  31.         d for d in dfList if d.columns[0] == 'Drug Name'
  32.     ], ignore_index=True)
  33.     ## needs fixing - see example at https://i.stack.imgur.com/b1u3B.png
  34.  
  35.     ### [conditionally] save extracted table before cleaning ###
  36.     if saveUnfixed:
  37.         u_fn = f"concat_unfixed[minified]-{fileName.strip('.pdf')}.csv"
  38.         pandas.DataFrame([{k: None if pandas.isna(v) else (
  39.             float_or_miniStr(v) if v else v ## [because multiline cells can break csv rows]
  40.         ) for k, v in rd.items()} for rd in ddf.to_dict('records')]).to_csv(u_fn, index=False)    
  41.    
  42.     ### convert to list of tuples ###
  43.     dd = ddf[list(ddf.columns)[:3]].dropna(axis='rows', how='all')
  44.     dd = [tuple(
  45.         [None if pandas.isna(c) else float_or_miniStr(c) for c in r]
  46.     ) for r in dd.to_dict('split')['data']]
  47.  
  48.     ### loop through the tuples to re-build rows ###
  49.     dictList, prSect, mSect, sSect = [], False, None, None
  50.     for dname, dTier, dReq in dd:
  51.         if type(dTier) == str and len(dTier) > 2 and not dname and not dReq:
  52.             if prSect: mSect = sSect
  53.             sSect, prSect = dTier, not prSect
  54.             continue
  55.         prSect = False
  56.  
  57.         if dname and dname != dTier:  
  58.             dictList.append({
  59.                 'Section': mSect, 'Subsection': sSect, 'Drug Name': dname,
  60.                 'Drug Tier': None, 'Requirements/Limits': dReq
  61.             })
  62.             continue
  63.        
  64.         if (dTier or dReq) and dictList:
  65.             dictList[-1]['Drug Tier'] = dReq if dReq else dTier    
  66.  
  67.     ### form output dataframe from re-built rows ###
  68.     fdf = pandas.DataFrame(dictList)
  69.  
  70.     ### [conditionally] save extracted output to csv ###
  71.     if saveCsv:
  72.         fdf.to_csv(f"drugs_list-{fileName.strip('.pdf')}.csv", index=False)
  73.    
  74.     return fdf
  75.  
Advertisement
Add Comment
Please, Sign In to add comment