Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## for https://stackoverflow.com/q/74924464/6146136
- ## sample usage:
- #### extract_drugsDf('https://www.premera.com/documents/052166_2023.pdf') # [link to output below]
- #### https://docs.google.com/spreadsheets/d/1VLQRwX1gIPzJSZnypJCZmgziOL0HFkZVW8VqvnnH11I/edit?usp=sharing
- import requests
- import pandas
- # !pip install tabula-py
- from tabula import read_pdf
def float_or_miniStr(val):
    """Return ``val`` as a float when it parses as one, else as minified text.

    The value is first converted with ``str()``. If the stripped text is
    accepted by ``float()`` the resulting float is returned; otherwise the
    text is returned with every run of whitespace (including newlines)
    collapsed to a single space and leading/trailing whitespace removed.

    Args:
        val: Any object; only its ``str()`` representation is inspected.

    Returns:
        A ``float`` or a single-line ``str``.
    """
    text = str(val)
    try:
        return float(text.strip())
    except ValueError:
        # Only a failed numeric parse is expected here; catching ValueError
        # (instead of a bare ``except``) lets KeyboardInterrupt/SystemExit
        # propagate. ``split()`` with no argument never yields empty strings,
        # so no extra filtering is needed before joining.
        return ' '.join(text.split())
def extract_drugsDf(pdfLink, saveCsv=True, saveUnfixed=True):
    """Download a PDF, extract its 'Drug Name' tables and rebuild the rows.

    The PDF is fetched with ``requests``, written to the current working
    directory, and parsed with ``tabula.read_pdf(..., pages='all')``. Every
    extracted table whose first column is named ``'Drug Name'`` is
    concatenated, and the first three columns are then walked row by row to
    rebuild one record per drug.

    Args:
        pdfLink: URL of the PDF; its last path segment is used to derive the
            local file names.
        saveCsv: When true, write the rebuilt table to
            ``drugs_list-<stem>.csv``.
        saveUnfixed: When true, write the concatenated but not-yet-rebuilt
            table to ``concat_unfixed[minified]-<stem>.csv``.

    Returns:
        A ``pandas.DataFrame`` built from dicts with the keys ``Section``,
        ``Subsection``, ``Drug Name``, ``Drug Tier`` and
        ``Requirements/Limits`` (empty if no drug rows were rebuilt).

    Raises:
        requests.HTTPError: Propagated from ``raise_for_status()`` when the
            download returns an error status.
        ValueError: If no extracted table starts with a ``'Drug Name'``
            column.
    """
    ### FETCH PDF DATA ###
    r = requests.get(pdfLink, headers={'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        + ' (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    )})
    r.raise_for_status()

    ### SAVE&READ PDF ###
    # Remove the '.pdf' extension as a suffix. ``str.strip('.pdf')`` would
    # instead strip any of the characters '.', 'p', 'd', 'f' from BOTH ends
    # (e.g. 'drugs.pdf' -> 'rugs'), mangling the derived file names.
    baseName = pdfLink.split('/')[-1]
    if baseName.endswith('.pdf'):
        baseName = baseName[:-len('.pdf')]
    fileStem = baseName.replace('.', '-')
    fileName = fileStem + '.pdf'
    with open(fileName, 'wb') as f:
        f.write(r.content)
    dfList = read_pdf(fileName, pages='all')

    ### put together the drug list dataframes ###
    # The length guard avoids an IndexError on a table without columns.
    drugTables = [
        d for d in dfList
        if len(d.columns) > 0 and d.columns[0] == 'Drug Name'
    ]
    if not drugTables:
        # pandas.concat([]) would raise a less helpful ValueError here.
        raise ValueError(
            f"no table starting with a 'Drug Name' column found in {fileName}"
        )
    ddf = pandas.concat(drugTables, ignore_index=True)
    ## needs fixing - see example at https://i.stack.imgur.com/b1u3B.png

    ### [conditionally] save extracted table before cleaning ###
    if saveUnfixed:
        u_fn = f"concat_unfixed[minified]-{fileStem}.csv"
        pandas.DataFrame([{k: None if pandas.isna(v) else (
            float_or_miniStr(v) if v else v  ## [because multiline cells can break csv rows]
        ) for k, v in rd.items()} for rd in ddf.to_dict('records')]).to_csv(u_fn, index=False)

    ### convert to list of tuples ###
    # Only the first three columns are used; rows where all three are NaN
    # are dropped, and remaining NaN cells become None.
    dd = ddf[list(ddf.columns)[:3]].dropna(axis='rows', how='all')
    dd = [tuple(
        [None if pandas.isna(c) else float_or_miniStr(c) for c in row]
    ) for row in dd.to_dict('split')['data']]

    ### loop through the tuples to re-build rows ###
    # prSect: whether the previous row was a lone heading row
    # mSect / sSect: current section / subsection headings
    # NOTE(review): block nesting below was reconstructed from a paste that
    # lost its indentation - confirm against the original script.
    dictList, prSect, mSect, sSect = [], False, None, None
    for dname, dTier, dReq in dd:
        # A row with only a longer string in the middle column is treated as
        # a heading. When two such rows follow each other, the earlier one
        # is promoted to section and the later one becomes the subsection.
        if isinstance(dTier, str) and len(dTier) > 2 and not dname and not dReq:
            if prSect:
                mSect = sSect
            sSect, prSect = dTier, not prSect
            continue
        prSect = False

        # A row with a drug name starts a new record; its tier is filled in
        # by a following name-less row (presumably how tabula splits each
        # table row - see the screenshot linked above).
        if dname and dname != dTier:
            dictList.append({
                'Section': mSect, 'Subsection': sSect, 'Drug Name': dname,
                'Drug Tier': None, 'Requirements/Limits': dReq
            })
            continue

        # Name-less row: attach its value to the most recent record,
        # preferring the third column over the second.
        if (dTier or dReq) and dictList:
            dictList[-1]['Drug Tier'] = dReq if dReq else dTier

    ### form output dataframe from re-built rows ###
    fdf = pandas.DataFrame(dictList)

    ### [conditionally] save extracted output to csv ###
    if saveCsv:
        fdf.to_csv(f"drugs_list-{fileStem}.csv", index=False)
    return fdf
Advertisement
Add Comment
Please, Sign In to add comment