Try95th

htreeToDict_sampleUsage

Nov 17th, 2022 (edited)
197
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 8.65 KB | None | 0 0
  1. import requests
  2. from bs4 import BeautifulSoup
  3.  
  4. ####### FIRST COPY&PASTE FUNCTION DEFINITIONS [OR DOWNLOAD&IMPORT] #######
  5. ### alterTextVals                                  <-- from https://pastebin.com/DSXcgQyC  
  6. ### getElementAttr, selectElement___, htreeToDict  <-- from https://pastebin.com/BpjZSQPi
  7. ### [htreeToDict is the function we're calling, but it depends on the others]
  8.  
  9. # fetching and parsing html
  10. r = requests.get('https://www.gelbeseiten.de/suche/architekturb%c3%bcros/aachen?umkreis=21000')
  11. print(f'Error Message "{r.raise_for_status()}" to GET', r.url)
  12. soup = BeautifulSoup(r.content, 'html5lib')
  13.  
  14. # preparing the second input (a dictionary of selectors and attributes)
  15. selRef = {
  16.     'docTitle': ('head title', '', ''),  
  17.     'headText': ('head', '', [('elimWhite', '')]),
  18.     'headLinks': ('head *[href]', ('localName', 'href'), 'list'),
  19.     'bodyText': ('body', '', [('elimWhite', ' '), ('truncSplit...', 100)]),
  20.     'resultsCt': ('#mod-TrefferlisteInfo', '', ''),  
  21.     'resultStr': ('.mod-TrefferlisteInfo', '', ''),
  22.     'listings1': (
  23.         'article[id^="treffer_"]:has(p.mod-Treffer--besteBranche)'
  24.         , {
  25.             'articleId': ('', 'id', [('replace', ('treffer_', 1))]),
  26.             'Title': ('h2[data-wipe-name="Titel"]', '', ''),
  27.             'Branch': ('p.mod-Treffer--besteBranche', '', ''),
  28.             'Address': ('p[data-wipe-name="Adresse"]', '', ''),
  29.             'Contact': ('p[data-wipe-name="Kontaktdaten"]', '', ''),
  30.             'Website': ('a.contains-icon-homepage[href]', 'href', ''),
  31.             'Email': ('a.contains-icon-email[href^="mailto:"]', 'href', [
  32.                 ('replace', ('mailto:', 1)), ('word1', '?')
  33.             ]), 'DetailsPage': ('a.contains-icon-details[href]', 'href', '')
  34.         } , 'list'),
  35.     'listings2': (
  36.         'article[id^="treffer_"]:not(:has(p.mod-Treffer--besteBranche))'
  37.         , {
  38.             'articleId': ('', 'id', [('replace', ('treffer_', 1))]),
  39.             'Title': ('h2[data-wipe-name="Titel"]', '', '')
  40.         }, 'list'),
  41. }
  42.  
  43. ## calling the function
  44. htreeToDict(soup, selRef)
  45.  
  46.  
  47. ################# THE OUTPUT (pasted from terminal) #################
  48. {
  49.  'docTitle': 'ᐅ Top 10 Architekturbüros  Aachen | ✉ Adresse | ☎ Telefonnummer | 📝 Kontakt | ✅ Bewertungen ➤ Jetzt auf GelbeSeiten.de ansehen.',
  50.  'headText': 'ᐅTop10ArchitekturbürosAachen|✉Adresse|☎Telefonnummer|📝Kontakt|✅Bewertungen➤JetztaufGelbeSeiten.deansehen.',
  51.  'headLinks': [('base', '/'),
  52.   ('link', '/webgs/css/global_above.css?1668512157093'),
  53.   ('link', '/webgs/css/global_above.css?1668512157093'),
  54.   ('link', 'https://wwa.wipe.de/wwa.js'),
  55.   ('link', 'https://a.delivery.consentmanager.net/delivery/cmp.php?id=15760'),
  56.   ('link', 'https://cdn.consentmanager.net/delivery/js/cmp_de.min.js'),
  57.   ('link', 'https://consentmanager.mgr.consensu.org'),
  58.   ('link', '/webgs/css/trefferliste_above.css?1668512157093'),
  59.   ('link', '/webgs/css/trefferliste_above.css?1668512157093'),
  60.   ('link', '/webgs/css/schaufenster_global.css?1668512157093'),
  61.   ('link', 'https://www.gelbeseiten.de/suche/architekturb%c3%bcros/aachen?umkreis=21000'),
  62.   ('link', '/webgs/images/fav/[email protected]'),
  63.   ('link', '/webgs/images/fav/[email protected]'),
  64.   ('link', '/webgs/images/fav/[email protected]'),
  65.   ('link', '/webgs/images/fav/[email protected]'),
  66.   ('link', '/webgs/images/fav/[email protected]'),
  67.   ('link', '/webgs/images/fav/[email protected]'),
  68.   ('link', '/webgs/images/fav/[email protected]'),
  69.   ('link', '/webgs/images/fav/[email protected]'),
  70.   ('link', '/webgs/images/fav/[email protected]'),
  71.   ('link', '/webgs/fonts/TheSansB4-3_Light.woff2'),
  72.   ('link', '/webgs/fonts/TheSansB4-5_Plain.woff2'),
  73.   ('link', '/webgs/fonts/TheSansB4-7_Bold.woff2'),
  74.   ('link', 'https://data-a0f0ae1310.gelbeseiten.de/sensor.modern.ncl.min.js'),
  75.   ('link', '/webgs/css/global_below.css?1668512157093'),
  76.   ('link', '/webgs/css/global_below.css?1668512157093'),
  77.   ('link', '/webgs/libraries/manifest.json'),
  78.   ('link', 'https://ad13.adfarm1.adition.com'),
  79.   ('link', '/webgs/css/trefferliste_below.css?1668512157093'),
  80.   ('link', '/webgs/css/trefferliste_below.css?1668512157093'),
  81.   ('link', 'https://imagesrv.adition.com')],
  82.  'bodyText': 'Suchen>Was & Wo SucheBranchenkatalogService>FÜR SIEV...experten.deEin Service IhrerGelbe Seiten Verlage',
  83.  'resultsCt': '326',
  84.  'resultStr': '326Treffer\n\tfürArchitekturbürosinAachen',
  85.  'listings1': [{'articleId': '128049362580',
  86.    'Title': 'JOCHUM MARTIN Dipl.Ing. Architekt AKNW',
  87.    'Branch': 'Architektur',
  88.    'Address': 'Kardinalstr. 7,52070 Aachen922 m',
  89.    'Contact': '0241 15 87 07',
  90.    'Website': 'http://www.bauplanungsbuero.net',
  91.    'Email': '[email protected]',
  92.    'DetailsPage': 'https://www.gelbeseiten.de/gsbiz/f52ce153-0e47-410c-b357-fd4c69a15d9c'},
  93.   {'articleId': '128038111883',
  94.    'Title': 'Architektur Hammers',
  95.    'Branch': 'Architektur',
  96.    'Address': 'Melatener Str. 82,52074 Aachen1,5 km',
  97.    'Contact': '0241 87 79 37',
  98.    'Website': 'http://www.architektur-hammers.de',
  99.    'Email': '[email protected]',
  100.    'DetailsPage': 'https://www.gelbeseiten.de/gsbiz/fa71f80d-db96-4279-b97c-46afbd7bd356'}],
  101.  'listings2': [
  102.   {'articleId': '128046113791', 'Title': 'Hautmann + Metz Architekten Partnerschaft mbB'},
  103.   {'articleId': '128026414465', 'Title': 'Werrens M. Dipl.-Ing.'},
  104.   {'articleId': '128026414714', 'Title': 'Leidinger U. Dipl.-Ing.'},
  105.   {'articleId': '128055874705', 'Title': 'Ortmanns Edith'},
  106.   {'articleId': '128026416297', 'Title': 'Ince H. Dipl.-Ing.'},
  107.   {'articleId': '128027601198', 'Title': 'g.u.t. Architekten Gerhards u. Thomé'},
  108.   {'articleId': '128026423021', 'Title': 'Barhainski HorstHorst Architekt'},
  109.   {'articleId': '128027879691', 'Title': 'Daheim + Uppenkamp Architektur-Ing.-Büro Architekten'},
  110.   {'articleId': '128026421674', 'Title': 'gmp Generalplanungsgesellschaft mbH Architekturbüro'},
  111.   {'articleId': '128026418543', 'Title': 'Aymen S.'},
  112.   {'articleId': '128036948668', 'Title': 'Ritter Karl-Heinz Architekt'},
  113.   {'articleId': '128026418055', 'Title': 'Rennecke G. Dipl.-Ing.'},
  114.   {'articleId': '128027945884', 'Title': 'IParch GmbH'},
  115.   {'articleId': '9190128871613690', 'Title': 'Michael Golombek GAP Architekturbüro und Projektsteuerung'},
  116.   {'articleId': '128026417128', 'Title': 'Klinge Manfred Architekturbüro'},
  117.   {'articleId': '128026420536', 'Title': 'Stollmann P.P.H. Dipl.-Ing.'},
  118.   {'articleId': '128026419763', 'Title': 'Kasaci Okyay Architekt'},
  119.   {'articleId': '128047603432', 'Title': 'pfeiffer.volland architekten'},
  120.   {'articleId': '9190128866480780', 'Title': 'Dipl.-Ing. D. Körkel-Greuel und Dipl.-Ing. J. Greuel Architekturbüro'},
  121.   {'articleId': '128026420850', 'Title': 'Glashaus Architekturbüro'},
  122.   {'articleId': '9190128869930960', 'Title': 'Dipl.-Ing. Jan Konwinski Architekturbüro'},
  123.   {'articleId': '128026416136', 'Title': 'Spielmann M.'},
  124.   {'articleId': '128026416418', 'Title': 'ecf architekten gmbh'},
  125.   {'articleId': '128026412934', 'Title': 'Invention Industriedesign und Entwicklungen'},
  126.   {'articleId': '128030495940', 'Title': 'Heuer, Faust Dipl.-Ing.'},
  127.   {'articleId': '128026417904', 'Title': 'Kadawittfeldarchitektur'},
  128.   {'articleId': '128048126846', 'Title': 'Klösgen H.'},
  129.   {'articleId': '128026414591', 'Title': 'Karsten u. Partner'},
  130.   {'articleId': '128044241455', 'Title': 'SW-Häuser GmbH Architekt'},
  131.   {'articleId': '128036891386', 'Title': 'Rudolph W. Dipl.-Ing.'},
  132.   {'articleId': '128030377819', 'Title': 'Bernardi K.'},
  133.   {'articleId': '128026421328', 'Title': 'Kessler G. Dipl.-Ing.'},
  134.   {'articleId': '128026422117', 'Title': 'Borgmann Architekten und Ingenieure GmbH'},
  135.   {'articleId': '128050235257', 'Title': 'M Bauträger und Projektentwicklung UG (haftungsbeschränkt)'},
  136.   {'articleId': '128026992787', 'Title': 'OX2architekten, Orawiec I.-M., M. Prof.'},
  137.   {'articleId': '128026415299', 'Title': 'Frey A. Dipl.-Ing.'},
  138.   {'articleId': '128050790865', 'Title': 'Hermsen Robert Dipl.-Ing. Architekt'},
  139.   {'articleId': '128026415216', 'Title': 'Morauke A.'},
  140.   {'articleId': '128050804669', 'Title': 'Schommers H. Dipl.-Ing.'},
  141.   {'articleId': '128055904022', 'Title': 'Schommers Hans-Joachim Dipl.-Ing. Architekt Innenarchitekt'},
  142.   {'articleId': '128026421171', 'Title': 'Üstündag H. Dipl.-Ing.'},
  143.   {'articleId': '128026414775', 'Title': 'Hirsch Eberhard Dipl.-Ing. Architekt'},
  144.   {'articleId': '128037170468', 'Title': 'Greuel D.'},
  145.   {'articleId': '128026412827', 'Title': 'Fuss U.'},
  146.   {'articleId': '128026422187', 'Title': 'Hansen P.'},
  147.   {'articleId': '128055287150', 'Title': 'Hansen Peter Architekt'},
  148.   {'articleId': '128026422039', 'Title': 'Völlings-Grube D. Dipl.-Ing.'},
  149.   {'articleId': '9190128864101650', 'Title': 'Wolfgang Kleicker Architekturbüro'}
  150.  ]
  151. }
  152.  
  153.  
  154.  
Advertisement
Add Comment
Please, Sign In to add comment