Advertisement
Guest User

possible bug in lxml XMLParser

a guest
Oct 16th, 2014
335
0
Never
Not a member of Pastebin yet? Sign up; it unlocks many cool features!
Python 2.08 KB | None | 0 0
  1. import sys
  2. import tempfile
  3. from lxml import etree
  4.  
  5.  
  6. print("%-20s: %s" % ('Python',           sys.version_info))
  7. print("%-20s: %s" % ('lxml.etree',       etree.LXML_VERSION))
  8. print("%-20s: %s" % ('libxml used',      etree.LIBXML_VERSION))
  9. print("%-20s: %s" % ('libxml compiled',  etree.LIBXML_COMPILED_VERSION))
  10. print("%-20s: %s" % ('libxslt used',     etree.LIBXSLT_VERSION))
  11. print("%-20s: %s" % ('libxslt compiled', etree.LIBXSLT_COMPILED_VERSION))
  12.  
  13. print
  14. print "parsing with resolve_entities=False, no DOCTYPE"
  15.  
  16. parser = etree.XMLParser(resolve_entities=False)
  17.  
  18. try:
  19.     tree = etree.XML("""<test>1<a href="&uuml;bel">&ouml;</a></test>""", parser=parser)
  20. except etree.XMLSyntaxError as e:
  21.     print e
  22.  
  23. print
  24. print "parsing with resolve_entities=False, dummy DOCTYPE"
  25.    
  26. try:
  27.     tree = etree.XML("""<!DOCTYPE test SYSTEM ""><test>1<a href="&uuml;bel">&ouml;</a></test>""", parser=parser)
  28.     print tree[:]
  29.     print tree.find('.//a').attrib['href']
  30.     print etree.tostring(tree)
  31. except etree.XMLSyntaxError as e:
  32.     print e
  33.  
  34.  
  35.  
  36. print
  37. print "parsing with actual DOCTYPE declaring the entities"
  38.  
  39. dtdfile = tempfile.NamedTemporaryFile(suffix='.dtd', delete=False)
  40. dtdfile.write("""<!ELEMENT test (#PCDATA|a)*> <!ELEMENT a (#PCDATA)> <!ATTLIST a href CDATA #REQUIRED> <!ENTITY uuml "&#xfc;"> <!ENTITY ouml "&#xf6;">""")
  41. dtdfile.close()
  42.  
  43. try:
  44.     tree = etree.XML("""<!DOCTYPE test SYSTEM "{}"><test>1<a href="&uuml;bel">&ouml;</a></test>""".format(dtdfile.name), parser=parser)
  45.     print tree[:]
  46.     print tree.find('.//a').attrib['href']
  47.     print etree.tostring(tree)
  48. except etree.XMLSyntaxError as e:
  49.     print e
  50.  
  51.    
  52. print
  53. print "parsing with actual DOCTYPE declaring the entities -- and dtd_validate=True"
  54.  
  55. parser = etree.XMLParser(resolve_entities=False, dtd_validation=True)
  56. try:
  57.     tree = etree.XML("""<!DOCTYPE test SYSTEM "{}"><test>1<a href="&uuml;bel">&ouml;</a></test>""".format(dtdfile.name), parser=parser)
  58.     print tree[:]
  59.     print tree.find('.//a').attrib['href']
  60.     print etree.tostring(tree)
  61. except etree.XMLSyntaxError as e:
  62.     print e
Advertisement
Add Comment
Please sign in to add a comment
Advertisement