Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
## example usage of the confParse function [ defined at https://pastebin.com/c0TjDmNE ] ########
## [ confParse recursively extracts details from a bs4 document or Tag ] #######################
## [ structure of output mimics the tSel argument - a reference object containing selectors ] ##
################################################################################################

## REQUIREMENTS:
import json  # [ just for saving output ] ## OUTPUT at https://pastebin.com/4mcmzCiP
from urllib.parse import urljoin  # required for selRef['hyperlinks']['mFunc']
# linkToSoup [+requirements] # PASTE FROM https://pastebin.com/rBTr06vy
# confParse [+requirements] # PASTE FROM https://pastebin.com/c0TjDmNE
#################

url = 'https://en.wikipedia.org/wiki/White_Nile'
# [ can try other URLs, this selRef is quite generic ]

# Reference object handed to confParse; per the header above, the output is
# expected to mirror this structure key for key.
# NOTE(review): confParse is not defined in this file. The meaning of the
# 'tSel' / 'tAttr' / 'listSel' / 'mFunc' keys and of the positional tuple
# forms is assumed from how they are used here - confirm against the
# confParse paste before relying on the per-entry notes below.
selRef = {
    # Plain CSS selector string targeting the document <title>.
    'page_title': 'head title',

    # Same target, with 'split' given as mFunc (presumably applied to the
    # extracted text - confirm).
    'title_words': {'tSel': 'head title', 'mFunc': 'split'},

    # Positional (tuple) form: three leading None placeholders, then
    # 'button', getattr and 'attrs'.
    'button_attrs': (*[None]*3, 'button', getattr, 'attrs'),

    # Anchors whose href starts with an explicit http(s) scheme.
    'full_links': {
        'tSel': None, 'tAttr': 'href',
        'listSel': 'a[href^="https://"], a[href^="http://"]',
    },

    # Every anchor that has an href, except javascript: pseudo-links. The
    # selector guarantees 'href' exists, so a['href'] cannot raise KeyError.
    'hyperlinks': {
        'tSel': None, 'tAttr': None,
        'listSel': 'a[href]:not([href^="javascript:"])',
        'mFunc': lambda a: {
            'link': a['href'],
            # Resolve relative links against the page URL set above.
            'full_link': urljoin(url, a['href']),
            'link_text': a.get_text(' ', strip=True),
        },
    },

    # A pair per image - the alt text and the image source - with dict
    # applied to the result set (presumably yielding {alt: source}; confirm).
    'images_dict': {
        'tSel': (
            (None, 'alt'),
            {
                'tSel': None,
                # Prefer a lazy-load 'data-src', fall back to plain 'src'.
                'mFunc': lambda m: m.get('data-src', m.get('src')),
            },
        ),
        # Select images carrying either attribute, matching the mFunc's
        # fallback; listing only data-src would skip images that have just
        # a plain src.
        'listSel': 'img[alt]:is(img[src],img[data-src])',
        '__apply2resultSet__': dict,
    },

    # Positional form again: a dict of per-attribute (None, attr) specs,
    # then '', None and the 'img' selector.
    'images_list': (
        {a: (None, a) for a in ['alt', 'src', 'data-src']}, '', None, 'img'),
}

data = confParse(linkToSoup(url), selRef)  ## see https://pastebin.com/4mcmzCiP

with open('confParse-wiki-White_Nile.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4)

## TO VIEW JSON OUTPUT COMFORTABLY:
# go to https://codebeautify.org/jsonviewer
# click the 🔗 URL button
# enter https://pastebin.com/raw/4mcmzCiP
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement