LordBlick

WorldOMeters coronas scraper

Aug 7th, 2020 (edited)
179
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.68 KB | None | 0 0
  1. #!/usr/bin/env python3
  2. # -*- encoding: utf-8 -*-
  3. # -*- coding: utf-8 -*-
  4. # -*- tabstop: 2 -*-
  5.  
  6. from os import path as ph
  7. H = ph.expanduser('~') # Home dir
  8. hh = lambda s: s.replace(H, '~')
  9. from sys import stdout as sto
  10. _p = lambda _str: sto.write(hh(str(_str)))
  11. debug = (False, True)[0]
  12. def _d(_str):
  13.     if debug: _p(_str)
  14.  
  15. url_base_wdm_crv = r'https://www.worldometers.info/coronavirus/'
  16. view_countries = "Poland", "New Zealand", "USA", "MS Zaandam"
  17.  
  18. from urllib.request import Request as ulrq, urlopen as urlo
  19. from urllib.parse import urlparse as urlp
  20. from lxml import html as lxHtml
  21. url_base_wdm_crv_p = urlp(url_base_wdm_crv)
  22. url_base_wdm_crv_path_l = len(url_base_wdm_crv_p.path)
  23.  
  24. class URLparse():
  25.     def __init__(it, url):
  26.         purl = urlp(url)
  27.         for _attr in dir(purl):
  28.             r_atrr = getattr(purl, _attr)
  29.             if not(callable(r_atrr)):
  30.                 try:
  31.                     setattr(it, _attr, r_atrr[:])
  32.                 except TypeError:
  33.                     setattr(it, _attr, r_atrr)
  34.  
  35.     get_no_query_url = lambda it: f"{it.scheme}://{it.netloc}{it.path}"
  36.  
  37.  
  38. class CoroWorldMeter:
  39.     def __init__(mn):
  40.         mn.go()
  41.  
  42.     def go(mn):
  43.         response = None
  44.         url_p = URLparse(f"{url_base_wdm_crv}")
  45.         r_url = ulrq(url_p.get_no_query_url(), headers={'User-Agent': 'Coronzilla/5.0'})
  46.         h_url = urlo(r_url)
  47.         if h_url.code == 200:
  48.             response = h_url.read()
  49.         h_url.close()
  50.         if response:
  51.             html = response.decode('utf-8')
  52.             tree = lxHtml.fromstring(html)
  53.         for cntr in view_countries:
  54.             cnt, act = mn.wordmeter_row(tree, cntr)
  55.             if cnt:
  56.                 _p(f"{cntr} total: {cnt}, active: {act}\n")
  57.  
  58.     def wordmeter_row(mn, tree, country):
  59.         rowElement = tree.xpath(f'//table[@id="main_table_countries_today"]/tbody[1]/tr/td//text()[contains(.,"{country}")]/..')
  60.         if len(rowElement):
  61.             rowElement = rowElement[0]
  62.             if hasattr(rowElement, 'tag'):
  63.                 tag = rowElement.tag.lower()
  64.                 _d(f"Tag:{tag}\n")
  65.             while tag!='tr':
  66.                 if hasattr(rowElement, 'getparent'):
  67.                     rowElement = rowElement.getparent()
  68.                 else:
  69.                     _p(f"Strange situation, ")
  70.                     if hasattr(rowElement, 'tag'):
  71.                         tag = rowElement.tag.lower()
  72.                         _p(f"tag:{tag} wthout parent…\n")
  73.                     else:
  74.                         _p(f"no tag, no parent…\n")
  75.                        
  76.                     break
  77.                 if hasattr(rowElement, 'tag'):
  78.                     tag = rowElement.tag.lower()
  79.                     _d(f"Tag:{tag}\n")
  80.             if tag=='tr':
  81.                 cntry_stats = tuple(map(lambda s: s.text if s.text else '0', rowElement.getchildren()))
  82.                 _d(f"Childs:\n")
  83.                 for idx, txt in enumerate(cntry_stats):
  84.                     _d(f"\t[{idx}]{txt}\n")
  85.                 CoronaTotal = cntry_stats[2]
  86.                 CoronaActive = cntry_stats[8]
  87.                 return CoronaTotal.replace(',', ' '), CoronaActive.replace(',', ' ')
  88.         return '', ''
  89.  
  90. # Entry point
  91. if __name__ == "__main__":
  92.     CoroWorldMeter()
  93.  
Add Comment
Please, Sign In to add comment