Advertisement
Try95th

confParse example

Mar 19th, 2023 (edited)
207
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.07 KB | None | 0 0
  ## example usage of the confParse function [ defined at https://pastebin.com/c0TjDmNE ] ########
## [ confParse recursively extracts details from a bs4 document or Tag ] #######################
## [ structure of output mimics the tSel argument - a reference object containing selectors ] ###
################################################################################################

## REQUIREMENTS:
import json # [ just for saving output ] ## OUTPUT at https://pastebin.com/4mcmzCiP
from urllib.parse import urljoin # required for selRef['hyperlinks']['mFunc']
# linkToSoup [+requirements] # PASTE FROM https://pastebin.com/rBTr06vy
# confParse [+requirements] # PASTE FROM https://pastebin.com/c0TjDmNE
#################

url = 'https://en.wikipedia.org/wiki/White_Nile'
# [ can try other URLs, this selRef is quite generic ]

# NOTE(review): the meaning of the keys ('tSel', 'tAttr', 'listSel', 'mFunc',
# '__apply2resultSet__') and of the tuple positions used below is defined by
# confParse, which is pasted in from elsewhere - the per-entry notes that say
# "presumably" are assumptions to confirm against that definition.
selRef = {
    # a bare CSS selector string
    'page_title': 'head title',

    # same selector, with 'split' named as the mFunc
    # (presumably applied to the extracted text - confirm in confParse)
    'title_words': {'tSel': 'head title', 'mFunc': 'split'},

    # tuple form: three None placeholders, then 'button', getattr, 'attrs'
    # (presumably positional equivalents of the dict keys - confirm in confParse)
    'button_attrs': (*[None]*3, 'button', getattr, 'attrs'),

    # anchors whose href starts with an absolute http(s) scheme
    'full_links': {
        'tSel': None, 'tAttr': 'href',
        'listSel': 'a[href^="https://"], a[href^="http://"]'
    },

    # every anchor with an href, except javascript: pseudo-links
    'hyperlinks': {
        'tSel': None, 'tAttr': None,
        'listSel': 'a[href]:not([href^="javascript:"])',
        # reads the module-level `url` to resolve relative hrefs
        'mFunc': lambda a: {
            'link': a['href'], 'full_link': urljoin(url, a['href']),
            'link_text': a.get_text(' ', strip=True)
        }
    },

    # pairs of (alt, image source) handed to dict
    # (presumably giving an {alt: source} mapping - confirm in confParse)
    'images_dict': {
        'tSel': ((None, 'alt'), {'tSel': None, 'mFunc': (
            # prefer a lazy-load 'data-src', fall back to the plain 'src'
            lambda m: m.get('data-src', m.get('src'))
        )}),
        # FIX: the second alternative used to repeat img[data-src], so images
        # carrying only `src` were never selected and the `src` fallback in
        # the lambda above could never be reached.
        'listSel': 'img[alt]:is(img[src],img[data-src])',
        '__apply2resultSet__': dict
    },

    # tuple form again: one {attribute: (None, attribute)} entry per listed
    # attribute, followed by '', None and the 'img' selector
    'images_list': (
        {a: (None, a) for a in ['alt', 'src', 'data-src']}, '', None, 'img')
}

data = confParse(linkToSoup(url), selRef) ## see https://pastebin.com/4mcmzCiP
# explicit encoding so the saved file does not depend on the platform default
with open('confParse-wiki-White_Nile.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4)

## TO VIEW JSON OUTPUT COMFORTABLY:
# go to https://codebeautify.org/jsonviewer
# click the 🔗 URL button
# enter https://pastebin.com/raw/4mcmzCiP
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement