SHARE
TWEET

Untitled

a guest Apr 26th, 2019 62 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1.    
  2. -- vim: ft=lua ts=2 sw=2
  3.  
  -- Escape every Lua-pattern magic character in `s` by prefixing it with "%".
  -- Passes through both results of string.gsub: the escaped string and the
  -- number of substitutions made.
  local function esc(s)
      local magic = "([%^%$%(%)%%%.%[%]%*%+%-%?])"
      local escaped = "%%" .. "%1"
      return string.gsub(s, magic, escaped)
  end

  -- Short aliases for standard-library functions.
  local str = tostring
  local char = string.char

  -- Write `s` verbatim (no newline is appended) to stderr.
  local function err(s)
      io.stderr:write(s)
  end

  -- Write `s` verbatim (no newline is appended) to stdout.
  local function out(s)
      io.stdout:write(s)
  end
  9.  
  10. local ElementNode = require("htmlparser.ElementNode")
  11. local voidelements = require("htmlparser.voidelements")
  12.  
  -- Module table; populated further down and returned at the end of the file.
  local HtmlParser = {}

  -- Placeholder table for "confusing" angle brackets, i.e. things that look
  -- like tag delimiters but appear where a tag cannot start or end.
  -- Each bracket maps to a byte sequence that is not valid UTF-8 (two lead
  -- bytes back to back), so it should not occur in real input; the
  -- placeholders are swapped back to the original brackets after parsing.
  local tpr = {}
  tpr["<"] = char(208, 209, 208, 209)
  tpr[">"] = char(209, 208, 209, 208)
  22.  
  --- Parse an HTML string into a tree of ElementNode objects.
  --
  -- The text is first scanned for angle brackets that appear where a tag
  -- delimiter cannot (e.g. a ">" inside a quoted attribute value); those are
  -- temporarily replaced with the placeholders from `tpr` and restored in the
  -- attribute values and in the root text once parsing is done.
  --
  -- @param text   value to parse; converted with tostring()
  -- @param limit  optional iteration cap applied to the main loop and to each
  --               inner loop; defaults to the global `htmlparser_looplimit`
  --               if set, otherwise 1000
  -- @return the root ElementNode (created with index 0 and the masked text)
  local function parse(text, limit)
      text = str(text)
      limit = limit or htmlparser_looplimit or 1000

      -- Set to true as soon as any bracket has been masked, so the
      -- (comparatively costly) restore passes can be skipped otherwise.
      local masked = false

      -- gsub callback helper: receives every capture of a match, swaps
      -- capture number `id` (a "<" or ">") for its placeholder and returns
      -- the re-joined match.
      local function mask(id, ...)
          local parts = {...}
          parts[id] = tpr[parts[id]]
          masked = true
          return table.concat(parts)
      end

      text = text
          -- A "<" that follows another "<" with no ">" in between:
          -- mask the second one.
          :gsub(
              "(<)" ..
              "([^>]-)" ..
              "(<)",
              function(...) return mask(3, ...) end
          )
          -- After an already-masked "<": a punctuation character, some text,
          -- the same punctuation character again, then two ">"; mask the
          -- first ">".
          -- NOTE(review): inside a character class "%2" is not a
          -- back-reference (the reference implementation treats it as the
          -- literal character "2"), so "[^%2]-" probably does not mean what
          -- was intended. Left unchanged to preserve existing behaviour.
          :gsub(
              "(" .. tpr["<"] .. ")" ..
              "([^%w%s])" ..
              "([^%2]-)" ..
              "(%2)" ..
              "(>)" ..
              "([^>]-)" ..
              "(>)",
              function(...) return mask(5, ...) end
          )
          -- A ">" enclosed in a quoted run that contains no spaces, quotes
          -- or other ">": mask it.
          :gsub(
              [=[(['"])]=] ..
              [=[([^'">%s]-)]=] ..
              "(>)" ..
              [=[([^'">%s]-)]=] ..
              [=[(['"])]=],
              function(...) return mask(3, ...) end
          )

      local index = 0
      local root = ElementNode:new(index, str(text))

      -- node:     most recently created element (or its parent after a close)
      -- descend:  flag handed to ElementNode:new; presumably "new node is a
      --           child of `node`" vs. "sibling" -- confirm in ElementNode
      -- tpos:     position in root._text where the next tag search starts
      -- opentags: per-name stacks of elements that have not been closed yet
      local node, descend, tpos, opentags = root, true, 1, {}

      while true do
          if index == limit then
              err("[HTMLParser] [ERR] Main loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors")
              break
          end

          local openstart, name
          openstart, tpos, name = root._text:find(
              "<" ..        -- an uncaptured starting "<"
              "([%w-]+)" .. -- name = the first word, directly following the "<"
              "[^>]*>",     -- include, but not capture everything up to the next ">"
          tpos)

          if not name then break end

          index = index + 1

          local tag = ElementNode:new(index, str(name), node, descend, openstart, tpos)
          node = tag

          -- ---- attributes of the opening tag ----
          local tagloop = 0
          local tagst, apos = tag:gettext(), 1
          while true do
              if tagloop == limit then
                  err("[HTMLParser] [ERR] tag parsing loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors")
                  break
              end

              local _, kend, k, eq, quote = tagst:find(
                  "%s+" ..         -- some uncaptured space
                  "([^%s=/>]+)" .. -- k = an unspaced string up to an optional "=" or the "/" or ">"
                  "(=?)" ..        -- eq = the optional; "=", else ""
                  "(['\"]?)",      -- quote = an optional "'" or '"' following the "=", or ""
              apos)

              if not k or k == "/>" or k == ">" then break end
              apos = kend

              local v
              if eq == "=" then
                  -- Unquoted value: everything after "=" up to space or ">".
                  local pattern = "=([^%s>]*)"
                  if quote ~= "" then
                      -- Quoted value: everything between the matching quotes.
                      pattern = quote .. "([^" .. quote .. "]*)" .. quote
                  end
                  local _, vend, value = tagst:find(pattern, apos)
                  -- Only advance when the value was actually found. If it
                  -- was not (e.g. an unterminated quote), keeping `apos`
                  -- prevents the scan from restarting at position 1 and
                  -- re-adding the same attributes until the loop limit.
                  if vend then
                      apos, v = vend, value
                  end
              end

              v = v or ""

              -- Put the real brackets back into the attribute value.
              if masked then
                  for bracket, placeholder in pairs(tpr) do
                      v = v:gsub(placeholder, bracket)
                  end
              end

              tag:addattribute(k, v)
              tagloop = tagloop + 1
          end

          if voidelements[tag.name:lower()] then
              -- Void elements have no content and are closed immediately.
              descend = false
              tag:close()
          else
              opentags[tag.name] = opentags[tag.name] or {}
              table.insert(opentags[tag.name], tag)
          end

          -- ---- consume any closing tags that directly follow ----
          local closeend = tpos
          local closingloop = 0
          while true do
              if closingloop == limit then
                  err("[HTMLParser] [ERR] tag closing loop reached loop limit ("..limit.."). Please, consider increasing it or check the code for errors")
                  break
              end

              local closestart, closing, closename
              closestart, closeend, closing, closename = root._text:find("[^<]*<(/?)([%w-]+)", closeend)

              -- Stop at end of text or when the next tag is an opening one.
              if not closing or closing == "" then break end

              tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags
              -- The match above starts at the text preceding "<"; move the
              -- start to the "<" itself.
              closestart = root._text:find("<", closestart)
              tag:close(closestart, closeend + 1)
              node = tag.parent
              descend = true
              closingloop = closingloop + 1
          end
      end

      -- Put the real brackets back into the stored document text.
      if masked then
          for bracket, placeholder in pairs(tpr) do
              root._text = root._text:gsub(placeholder, bracket)
          end
      end

      return root
  end
  HtmlParser.parse = parse

  return HtmlParser
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top