Masoko

Python script to count and compare HTML tags

Dec 17th, 2020 (edited)
815
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.31 KB | None | 0 0
  1. import re
  2. from xml.sax.saxutils import unescape
  3.  
  # Sample HTML fragment (a product description whose accented characters are
  # written as HTML entities); it is the input handed to check_html() when this
  # file is run as a script.
  4. html_str = """\
  5. <div class="productdetailcontainer" id="detailtab1"><div class="r nom"><div class="c10"><div class="space ptb"><b>Description</b><br />Tr&egrave;s confortable pour un sommeil r&eacute;parateur. Id&eacute;al pour soulager les tensions : rev&ecirc;tement en mousse confortable et 7 zones de confort constitu&eacute;es de 600 ressorts ensach&eacute;s pour un soutien cibl&eacute; et optimal de l&#39;ensemble du corps. Position de couchage sur&eacute;lev&eacute;e : hauteur totale du matelas : env. 19 cm. R&eacute;gulation optimale de la temp&eacute;rature : la housse surpiqu&eacute;e de fibres respirantes de chaque c&ocirc;t&eacute; et les poches d&#39;air dans les ressorts ensach&eacute;s permettent une a&eacute;ration parfaite du matelas. Facile &agrave; transporter et rapide &agrave; d&eacute;baller. Se d&eacute;place et se retourne facilement gr&acirc;ce aux 4 poign&eacute;es.<p fontsize:1.0em="" style=""><strong>Vous h&eacute;sitez dans votre choix du meilleur matelas? Cliquez <a advies="" frbe="" href="https://www.lidlshop.be/frBE/wonen/slaapkamer/matrassen/advies" https:="" matrassen="" slaapkamer="" wonen="" www.lidlshop.be="">ici</a> pour obtenir plus d&#39;infos.</strong></p></div></div><div><ul class="attributes" id="mattresslist"> <li class="r nom"><strong class="c2 space pr">Noyau</strong><span class="c8">Ressorts ensach&eacute;s</span></li>   <li class="r nom"><strong class="c2 space pr">Housse</strong><span class="c8">100% polyester</span></li>    <li class="r nom"><strong class="c2 space pr">Mesures</strong><span class="c8">140 x 200 cm</span></li> <li class="r nom"><strong class="c2 space pr">Hauteur</strong><span class="c8">19 cm</span></li>    <li class="r nom"><strong class="c2 space pr">Poids</strong><span class="c8">Convient pour un poids corporel de 80100 kg</span></li>    <li class="r nom"><strong class="c2 space pr">Zones de confort</strong><span class="c8">7</span></li>   <li class="r nom"><strong class="c2 space pr">Degr&eacute; de 
fermet&eacute;</strong><span class="c8">H3</span></li>    <li class="r nom"><strong class="c2 space pr">Antiallergique</strong><span class="c8">Oui</span></li>   <li class="r nom"><strong class="c2 space pr">Housse amovible</strong><span class="c8">Oui</span></li>  <li class="r nom"><strong class="c2 space pr">Fermeture &agrave; glissi&egrave;re</strong><span class="c8">Fermeture zipp&eacute;e sur les 4 c&ocirc;t&eacute;s</span></li> <li class="r nom"><strong class="c2 space pr">Instructions d&#39;entretien</strong><span class="c8">Lavable en machine &agrave; 60 &deg;C sur le programme linge d&eacute;licat</span></li> <li class="r nom"><strong class="c2 space pr">Labels de qualit&eacute;</strong><span class="c8">5</span></li></ul></div></div></div>"""
  6.  
  7. def check_html(html):
  8.  
  9.     def remove_space_in_closing_tags(html):
  10.         thtml = unescape(html.replace("'","\"")).replace("&#xa;","")
  11.         while "</ " in thtml:
  12.             thtml = thtml.replace("</ ","</")
  13.         return thtml
  14.  
  15.     def build_tags(html):
  16.         closing_tag_list = []
  17.         opening_tag_list = []
  18.  
  19.         # ultimate_regexp = "(?i)<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>" # used to extract htlm tags from html
  20.         ultimate_regexp = "<.+?>"
  21.  
  22.         for match in re.finditer(ultimate_regexp, html):
  23.             try:
  24.                 tag = repr(match.group()).split(" ")[0]
  25.             except:
  26.                 tag = repr(match.group())
  27.             tag = tag.replace("<","").replace(">","").replace("'","")
  28.             if tag.startswith("/"):
  29.                 closing_tag_list.append(tag.lower())
  30.             else:
  31.                 opening_tag_list.append(tag.lower())
  32.         return opening_tag_list, closing_tag_list
  33.  
  34.     def compare_tags(opening_tag_list, closing_tag_list):
  35.         errors = []
  36.         self_closing = ['area', 'base', 'br', 'col', 'embed', 'hr', 'hr/', 'img','input', 'link', 'meta', 'param', 'source', 'track', 'wbr', 'bgsound']
  37.  
  38.         unique_opening_tag_list = list(dict.fromkeys(opening_tag_list))
  39.         unique_closing_tag_list = list(dict.fromkeys(closing_tag_list))
  40.  
  41.         for tag in unique_opening_tag_list:
  42.             opening_tags_count = opening_tag_list.count(tag)
  43.             closing_tags_couunt = closing_tag_list.count("/" + tag)
  44.             if opening_tags_count == closing_tags_couunt or tag in self_closing or '!' in tag or ":" in tag:
  45.                 pass
  46.             else:
  47.                 errors.append("tag {} - KO - {} opening tags and {} closing tags".format(tag, opening_tags_count, closing_tags_couunt))
  48.         for tag in unique_closing_tag_list:
  49.             opening_tags_count = opening_tag_list.count(tag[1:])
  50.             closing_tags_couunt = closing_tag_list.count(tag)
  51.             if opening_tags_count == closing_tags_couunt or tag in self_closing or tag[1:] in unique_opening_tag_list or '!' in tag or ":" in tag:
  52.                 pass
  53.             else:
  54.                 errors.append("tag {} - KO - {} opening tags and {} closing tags".format(tag, opening_tags_count, closing_tags_couunt))
  55.         return errors
  56.  
  57.     opening_tags, closing_tags = build_tags(remove_space_in_closing_tags(html))
  58.     return compare_tags(opening_tags, closing_tags)
  59.  
  60. if __name__ == '__main__':
  61.  
  62.     result = check_html(html_str)
  63.     if result :
  64.         print(result)
  65.     else:
  66.         print("OK")
Add Comment
Please, Sign In to add comment