Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## for https://stackoverflow.com/q/74939979/6146136
- ## sample outputs [as csv] on Sheet "[API] dfn_prTimeline_api" at
- #### https://docs.google.com/spreadsheets/d/1yiDCrMi3Sog7qLcT_6pS-71xL1FkgNQnOS5ngPlMf_M
- import cloudscraper
- from datetime import datetime
- ########################################## [needed for main scraper] ##########################################
- ## part of getNestedVal [ at https://pastebin.com/NRxKRJPn ] ##
- ## will be called by danskfolkeparti_apiScraper to get values from json according to pathsRef ##
def getVal_byPath(obj, kPath: list, printError=False, returnError=False):
    """Walk into a nested object by following a sequence of keys/indices.

    Args:
        obj: The object to descend into (indexed with ``[]`` at every step).
        kPath: Keys and/or indices applied one after another.
        printError: When true, print the failure message if a step fails.
        returnError: When true, return the failure message instead of None.

    Returns:
        The value found at the end of ``kPath``; on any failure either the
        message ``'could not access <kPath> - <error>'`` (``returnError``)
        or None.
    """
    current = obj
    try:
        for step in kPath:
            current = current[step]
    except Exception as err:
        # Any failure (missing key, bad index, non-iterable path, ...) is
        # reported the same way instead of being raised.
        message = f'could not access {kPath} - {err}'
        if printError:
            print(message)
        if returnError:
            return message
        return None
    return current
- ################################################################################################################
- ################################################# MAIN SCRAPER #################################################
def danskfolkeparti_apiScraper(searchFor={}, perPage=99, pathsRef={}):
    """Fetch posts from the danskfolkeparti.dk wp-json API and extract fields.

    Args:
        searchFor: Either a keyword string or a dict of query overrides.
            * str  -> the ``/wp/v2/search`` endpoint is queried first and the
              ids it returns are passed to ``/wp/v2/posts`` as ``include``.
            * dict -> valid ``before`` / ``after`` (ISO datetimes),
              ``categories`` (list of ints or comma-separated string) and
              ``page`` (positive int) entries replace the default query
              parameters; invalid or unknown entries are ignored.
            Any other type is treated as an empty dict.
        perPage: Page size, an int with 0 < perPage < 100; otherwise 99.
        pathsRef: Maps output column name -> key path inside one post's json.
            ``{}`` selects the built-in default paths; anything that is not a
            dict (e.g. None) returns the posts' json unchanged.

    Returns:
        A tuple ``(posts, info)``. ``posts`` is a list (empty on failure) and
        ``info`` is a dict with the keys ``'error'`` (None on success),
        ``'warnings'`` (list of dicts, or None when there are none on
        success) and ``'response_status'``.

    Note:
        Exceptions raised while sending either request are not caught here;
        only HTTP error statuses and json decoding problems are turned into
        the ``'error'`` entry.
    """
    from urllib.parse import quote  # local import: only needed for the search term

    # The `{}` defaults are never mutated; `pathsRef == {}` is the marker
    # for "use the default paths", so the defaults are kept as they are.
    searchFor = searchFor if isinstance(searchFor, (dict, str)) else {}
    # bool is a subclass of int - reject it so `True` does not end up in the url
    perPageOk = (isinstance(perPage, int) and not isinstance(perPage, bool)
                 and 0 < perPage < 100)
    perPage = perPage if perPageOk else 99

    qParams = {
        'categories': '20,85,15,83,73,84,', 'before': '2022-12-28T23:59:59',
        'after': '1990-05-08T01:01:01', 'per_page': perPage, 'page': '1'
    }

    if not isinstance(pathsRef, dict):
        pathsRef = None
    elif pathsRef == {}:
        pathsRef = {
            'id': ['id'], 'title': ['title', 'rendered'], 'url': ['link'],
            'date': ['date'],  # ['date_gmt'],
            'modified': ['modified'],  # ['modified_gmt'],
            'category': ['categories'], 'excerpt': ['acf', 'excerpt'],
            'content': ['content', 'rendered']  # html
        }

    wMsgs = []

    #### PREPARE QUERY PARAMETERS ####
    if isinstance(searchFor, str):
        ## search keywords --> fetch postIds for query ##
        # percent-encode the whole term (not only spaces) so characters such
        # as '&' or '#' cannot break the query string
        sf = quote(searchFor, safe='')
        sUrl = 'https://danskfolkeparti.dk/wp-json/wp/v2/search'
        sUrl = f'{sUrl}?search={sf}&per_page={perPage}'
        sResp = cloudscraper.create_scraper().get(sUrl)
        rsMsg = f'{sResp.status_code} {sResp.reason}'
        try:
            sResp.raise_for_status()  # check for request error
            srj, pIds = sResp.json(), []
            if srj and isinstance(srj, list):
                # keep only integer ids of results whose type is 'post'
                pIds = [
                    p['id'] for p in srj
                    if isinstance(p['id'], int) and p['type'] == 'post'
                ]
            if not pIds:
                wMsgs.append({
                    'msg': f'no postIds - "{searchFor}"', 'search_json': srj
                })
            qParams = {
                'include': ','.join(str(p) for p in pIds),
                'per_page': perPage
            }
        except Exception as e:
            return [], {'error': f'searchError: {type(e)} {e}',
                        'warnings': wMsgs, 'response_status': rsMsg}
    else:
        ## no search keywords --> replace default parameters with VALID inputs ##
        for k, v in searchFor.items():
            try:
                if k in ('before', 'after'):
                    qParams[k] = datetime.fromisoformat(v).isoformat()
                elif k == 'categories':
                    if isinstance(v, str):
                        v = [int(i) for i in v.split(',')
                             if i.strip().isdigit()]
                    v = [i for i in v if isinstance(i, int)]
                    # serialise as "1,2,3" - formatting the list itself
                    # would put "[1, 2, 3]" into the url
                    if v:
                        qParams[k] = ','.join(str(i) for i in v)
                elif k == 'page' and int(v) > 0:
                    qParams[k] = int(v)
            except Exception:
                # best effort: an invalid value keeps the default parameter
                continue

    #### PREPARE API URL FOR QUERY AND CALL API ####
    qStr = '&'.join(f'{k}={v}' for k, v in qParams.items())
    api_url = f'https://danskfolkeparti.dk/wp-json/wp/v2/posts?{qStr}'
    apiResp = cloudscraper.create_scraper().get(api_url)
    rsMsg = f'{apiResp.status_code} {apiResp.reason}'
    try:
        apiResp.raise_for_status()  # check for request error
        arjData = apiResp.json()
    except Exception as e:
        return [], {'error': f'searchError: {type(e)} {e}',
                    'warnings': wMsgs, 'response_status': rsMsg}

    #### EXTRACT FROM API RESPONSE ACCORDING TO pathsRef ARGUMENT ####
    if not isinstance(arjData, list):
        tprList = []
    elif pathsRef:
        tprList = [
            {k: getVal_byPath(pr, v) for k, v in pathsRef.items()}
            for pr in arjData if 'id' in pr
        ]
    else:
        tprList = arjData[:]  ## set pathsRef=None to get full json response

    if not tprList:
        wMsgs.append({'msg': 'no posts with ids', 'json': arjData})

    wMsgs = wMsgs if wMsgs else None
    return tprList, {'error': None, 'warnings': wMsgs, 'response_status': rsMsg}
- ################################################################################################################
- ################################################ GET CATEGORIES ################################################
def danskfolkeparti_categories(cidList='all', printCategories=False):
    """Fetch the site's categories and return them as flat dicts.

    Args:
        cidList: A non-empty list of category ids to keep; any other value
            (including the default ``'all'``) keeps every fetched category.
        printCategories: When true, print one line per kept category and a
            notice listing requested ids that were not found.

    Returns:
        A list that starts with ``{'id': <id>, 'error': 'not found'}`` for
        every requested id missing from the response, followed by one dict
        per kept category containing its non-list/non-dict fields plus
        ``'parent_name'`` (None when no fetched category has the parent id).

    Raises:
        Whatever ``raise_for_status()`` / ``json()`` raise for a failed
        request; nothing is caught here.

    Note:
        A single request with ``per_page=50`` is made, so only the
        categories contained in that one response are considered.
    """
    cat_url = 'https://danskfolkeparti.dk/wp-json/wp/v2/categories?per_page=50'
    catResp = cloudscraper.create_scraper().get(cat_url)
    catResp.raise_for_status()
    carjData = catResp.json()

    if cidList and isinstance(cidList, list):
        catList = [c for c in carjData if c['id'] in cidList]
        foundIds = [c['id'] for c in catList]
        mIds = [i for i in cidList if i not in foundIds]
    else:
        # "all" mode: nothing was requested explicitly, so nothing can be
        # missing (iterating the string 'all' would report 'a', 'l', 'l')
        catList, mIds = list(carjData), []

    if mIds and printCategories:
        print('could not find IDs: ', mIds)

    clen = len(catList)
    for ci, c in enumerate(catList):
        cpi = c['parent'] if 'parent' in c else None
        cpn = [p['name'] for p in carjData if p['id'] == cpi]
        c['parent_name'] = cpn[0] if cpn else None
        if not printCategories:
            continue
        parentTxt = '' if cpi is None else f"<-- [{cpi}] {cpn[0] if cpn else ''}"
        position = f"[category {ci+1:>2} of {clen}]"
        print(f"# {position} {c['id']:>2} --> {c['name']} {parentTxt}")

    notFound = [{'id': i, 'error': 'not found'} for i in mIds]
    # drop nested values (lists/dicts) so every returned row is flat
    flatCats = [
        {k: v for k, v in c.items() if not isinstance(v, (list, dict))}
        for c in catList
    ]
    return notFound + flatCats
- ################################################################################################################
- ################################################ CATEGORY NAMES ################################################
- ############################## examples: ##############################
- # danskfolkeparti_catNames(20) #--> "Dansk Folkeblad"
- # danskfolkeparti_catNames([20, 85]) #--> ['Dansk Folkeblad', 'Fokus']
- # danskfolkeparti_catNames(85, namePar=True) #--> "Fokus [Nyheder]"
- #######################################################################
def danskfolkeparti_catNames(catIds, namePar=False, cDict=None, printErr=False):
    """Translate category id(s) into category name(s).

    Args:
        catIds: One category id or a list of ids. A one-element list is
            treated like its single element.
        namePar: When true, append the parent name as ``'name [parent]'``
            if the category has a truthy ``'parent_name'``.
        cDict: Mapping of category id -> category dict; when it is not a
            dict it is built from ``danskfolkeparti_categories()``.
        printErr: When true, print any exception that was swallowed.

    Returns:
        The name for a single id, a list of results for a list of ids, None
        for an empty list, and the (unwrapped) input itself when the id is
        unknown or an error occurred.
    """
    try:
        lookup = cDict
        if not isinstance(lookup, dict):
            lookup = {}
            for cat in danskfolkeparti_categories():
                if 'id' in cat:
                    lookup[cat['id']] = cat

        # a single-item list is handled as that item
        if isinstance(catIds, list) and len(catIds) == 1:
            catIds = catIds[0]

        if isinstance(catIds, list):
            if not catIds:
                return None
            names = []
            for cid in catIds:
                names.append(danskfolkeparti_catNames(
                    cid, namePar=namePar, cDict=lookup, printErr=printErr
                ))
            return names

        if catIds in lookup:
            entry = lookup[catIds]
            if 'name' in entry:
                name = entry['name']
                parent = entry['parent_name'] if 'parent_name' in entry else None
                if namePar and parent:
                    return f'{name} [{parent}]'
                return name
    except Exception as err:
        if printErr:
            print(f'{type(err)} {err}')
    # unknown id or failure: hand the input back
    return catIds
- ################################################################################################################
- ################################################################################################################
- ################################################ ALL CATEGORIES ################################################
- # danskfolkeparti_categories(printCategories=True) ## prints all categories
- # [category 1 of 47] 22 --> Ansat <-- [0]
- # [category 2 of 47] 54 --> Arrangement <-- [53] Kategorier - lokalforeninger
- # [category 3 of 47] 64 --> Artikler <-- [53] Kategorier - lokalforeninger
- # [category 4 of 47] 43 --> Byrådskandidat <-- [18] Politiker
- # [category 5 of 47] 42 --> Byrådsmedlem <-- [18] Politiker
- # [category 6 of 47] 20 --> Dansk Folkeblad <-- [31] Nyheder
- # [category 7 of 47] 85 --> Fokus <-- [31] Nyheder
- # [category 8 of 47] 40 --> Folketingskandidat <-- [18] Politiker
- # [category 9 of 47] 39 --> Folketingsmedlem <-- [18] Politiker
- # [category 10 of 47] 62 --> Forside <-- [53] Kategorier - lokalforeninger
- # [category 11 of 47] 47 --> Hovedbestyrelsen <-- [18] Politiker
- # [category 12 of 47] 61 --> Hovedmenu <-- [53] Kategorier - lokalforeninger
- # [category 13 of 47] 86 --> Kampagne <-- [0]
- # [category 14 of 47] 49 --> Kandidat til Europa-Parlamentsvalg <-- [18] Politiker
- # [category 15 of 47] 50 --> Kandidat til regionsrådet <-- [18] Politiker
- # [category 16 of 47] 53 --> Kategorier - lokalforeninger <-- [0]
- # [category 17 of 47] 28 --> Kurser for nye medlemmer <-- [27] Kurser og Arrangementer
- # [category 18 of 47] 27 --> Kurser og Arrangementer <-- [0]
- # [category 19 of 47] 41 --> Lokalbestyrelsesmedlem <-- [18] Politiker
- # [category 20 of 47] 46 --> Lokalformand <-- [18] Politiker
- # [category 21 of 47] 65 --> Lokalnæstformand <-- [18] Politiker
- # [category 22 of 47] 51 --> Medlem af Europa-Parlamentet <-- [18] Politiker
- # [category 23 of 47] 15 --> Mortens Nyhedsbrev <-- [31] Nyheder
- # [category 24 of 47] 29 --> new test <-- [27] Kurser og Arrangementer
- # [category 25 of 47] 31 --> Nyheder <-- [0]
- # [category 26 of 47] 26 --> Partiets historie <-- [0]
- # [category 27 of 47] 83 --> Pias Blog <-- [31] Nyheder
- # [category 28 of 47] 63 --> Politik <-- [53] Kategorier - lokalforeninger
- # [category 29 of 47] 4 --> Politik <-- [0]
- # [category 30 of 47] 18 --> Politiker <-- [0]
- # [category 31 of 47] 59 --> Politiske Assistenter <-- [22] Ansat
- # [category 32 of 47] 73 --> Politiske udspil <-- [31] Nyheder
- # [category 33 of 47] 57 --> Presseafdeling <-- [22] Ansat
- # [category 34 of 47] 84 --> Pressemeddelelser <-- [31] Nyheder
- # [category 35 of 47] 34 --> Region Hovedstaden <-- [33] Regioner - Lokalforening
- # [category 36 of 47] 37 --> Region Midtjylland <-- [33] Regioner - Lokalforening
- # [category 37 of 47] 38 --> Region Nordjylland <-- [33] Regioner - Lokalforening
- # [category 38 of 47] 35 --> Region Sjælland <-- [33] Regioner - Lokalforening
- # [category 39 of 47] 36 --> Region Syddanmark <-- [33] Regioner - Lokalforening
- # [category 40 of 47] 33 --> Regioner - Lokalforening <-- [0]
- # [category 41 of 47] 48 --> Regionsrådsmedlem <-- [18] Politiker
- # [category 42 of 47] 60 --> Sekretærer <-- [22] Ansat
- # [category 43 of 47] 58 --> Sekretariatet <-- [22] Ansat
- # [category 44 of 47] 30 --> test pdf <-- [0]
- # [category 45 of 47] 16 --> Ugebrev <-- [0]
- # [category 46 of 47] 1 --> Uncategorized <-- [0]
- # [category 47 of 47] 87 --> Underside <-- [53] Kategorier - lokalforeninger
Advertisement
Add Comment
Please, Sign In to add comment