Advertisement
tom2018

HTML parser (not mine, a friend's)

Nov 4th, 2012
72
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Lua 8.78 KB | None | 0 0
  1.  
-- $Id: html.lua,v 1.2 2007/05/12 04:37:20 tclua Exp $

-- Declare this file as a Lua 5.1 module named after the argument passed by
-- require(). From here on, every non-local definition in the file becomes a
-- field of the module table; package.seeall lets the module body still read
-- the standard globals (string, table, coroutine, ...).
-- NOTE(review): module() is deprecated from Lua 5.2 onwards; this file
-- assumes a 5.1-compatible runtime.
module(..., package.seeall)
  5.  
  6. entity = {
  7.   nbsp = " ",
  8.   lt = "<",
  9.   gt = ">",
  10.   quot = "\"",
  11.   amp = "&",
  12. }
  13.  
  14. -- keep unknown entity as is
  15. setmetatable(entity, {
  16.   __index = function (t, key)
  17.     return "&" .. key .. ";"
  18.   end
  19. })
  20.  
  21. block = {
  22.   "address",
  23.   "blockquote",
  24.   "center",
  25.   "dir", "div", "dl",
  26.   "fieldset", "form",
  27.   "h1", "h2", "h3", "h4", "h5", "h6", "hr",
  28.   "isindex",
  29.   "menu",
  30.   "noframes",
  31.   "ol",
  32.   "p",
  33.   "pre",
  34.   "table",
  35.   "ul",
  36. }
  37.  
  38. inline = {
  39.   "a", "abbr", "acronym", "applet",
  40.   "b", "basefont", "bdo", "big", "br", "button",
  41.   "cite", "code",
  42.   "dfn",
  43.   "em",
  44.   "font",
  45.   "i", "iframe", "img", "input",
  46.   "kbd",
  47.   "label",
  48.   "map",
  49.   "object",
  50.   "q",
  51.   "s", "samp", "select", "small", "span", "strike", "strong", "sub", "sup",
  52.   "textarea", "tt",
  53.   "u",
  54.   "var",
  55. }
  56.  
  57. tags = {
  58.   a = { empty = false },
  59.   abbr = {empty = false} ,
  60.   acronym = {empty = false} ,
  61.   address = {empty = false} ,
  62.   applet = {empty = false} ,
  63.   area = {empty = true} ,
  64.   b = {empty = false} ,
  65.   base = {empty = true} ,
  66.   basefont = {empty = true} ,
  67.   bdo = {empty = false} ,
  68.   big = {empty = false} ,
  69.   blockquote = {empty = false} ,
  70.   body = { empty = false, },
  71.   br = {empty = true} ,
  72.   button = {empty = false} ,
  73.   caption = {empty = false} ,
  74.   center = {empty = false} ,
  75.   cite = {empty = false} ,
  76.   code = {empty = false} ,
  77.   col = {empty = true} ,
  78.   colgroup = {
  79.     empty = false,
  80.     optional_end = true,
  81.     child = {"col",},
  82.   },
  83.   dd = {empty = false} ,
  84.   del = {empty = false} ,
  85.   dfn = {empty = false} ,
  86.   dir = {empty = false} ,
  87.   div = {empty = false} ,
  88.   dl = {empty = false} ,
  89.   dt = {
  90.     empty = false,
  91.     optional_end = true,
  92.     child = {
  93.       inline,
  94.       "del",
  95.       "ins",
  96.       "noscript",
  97.       "script",
  98.     },
  99.   },
  100.   em = {empty = false} ,
  101.   fieldset = {empty = false} ,
  102.   font = {empty = false} ,
  103.   form = {empty = false} ,
  104.   frame = {empty = true} ,
  105.   frameset = {empty = false} ,
  106.   h1 = {empty = false} ,
  107.   h2 = {empty = false} ,
  108.   h3 = {empty = false} ,
  109.   h4 = {empty = false} ,
  110.   h5 = {empty = false} ,
  111.   h6 = {empty = false} ,
  112.   head = {empty = false} ,
  113.   hr = {empty = true} ,
  114.   html = {empty = false} ,
  115.   i = {empty = false} ,
  116.   iframe = {empty = false} ,
  117.   img = {empty = true} ,
  118.   input = {empty = true} ,
  119.   ins = {empty = false} ,
  120.   isindex = {empty = true} ,
  121.   kbd = {empty = false} ,
  122.   label = {empty = false} ,
  123.   legend = {empty = false} ,
  124.   li = {
  125.     empty = false,
  126.     optional_end = true,
  127.     child = {
  128.       inline,
  129.       block,
  130.       "del",
  131.       "ins",
  132.       "noscript",
  133.       "script",
  134.     },
  135.   },
  136.   link = {empty = true} ,
  137.   map = {empty = false} ,
  138.   menu = {empty = false} ,
  139.   meta = {empty = true} ,
  140.   noframes = {empty = false} ,
  141.   noscript = {empty = false} ,
  142.   object = {empty = false} ,
  143.   ol = {empty = false} ,
  144.   optgroup = {empty = false} ,
  145.   option = {
  146.     empty = false,
  147.     optional_end = true,
  148.     child = {},
  149.   },
  150.   p = {
  151.     empty = false,
  152.     optional_end = true,
  153.     child = {
  154.       inline,
  155.       "del",
  156.       "ins",
  157.       "noscript",
  158.       "script",
  159.     },
  160.   } ,
  161.   param = {empty = true} ,
  162.   pre = {empty = false} ,
  163.   q = {empty = false} ,
  164.   s =  {empty = false} ,
  165.   samp = {empty = false} ,
  166.   script = {empty = false} ,
  167.   select = {empty = false} ,
  168.   small = {empty = false} ,
  169.   span = {empty = false} ,
  170.   strike = {empty = false} ,
  171.   strong = {empty = false} ,
  172.   style = {empty = false} ,
  173.   sub = {empty = false} ,
  174.   sup = {empty = false} ,
  175.   table = {empty = false} ,
  176.   tbody = {empty = false} ,
  177.   td = {
  178.     empty = false,
  179.     optional_end = true,
  180.     child = {
  181.       inline,
  182.       block,
  183.       "del",
  184.       "ins",
  185.       "noscript",
  186.       "script",
  187.     },
  188.   },
  189.   textarea = {empty = false} ,
  190.   tfoot = {
  191.     empty = false,
  192.     optional_end = true,
  193.     child = {"tr",},
  194.   },
  195.   th = {
  196.     empty = false,
  197.     optional_end = true,
  198.     child = {
  199.       inline,
  200.       block,
  201.       "del",
  202.       "ins",
  203.       "noscript",
  204.       "script",
  205.     },
  206.   },
  207.   thead = {
  208.     empty = false,
  209.     optional_end = true,
  210.     child = {"tr",},
  211.   },
  212.   title = {empty = false} ,
  213.   tr = {
  214.     empty = false,
  215.     optional_end = true,
  216.     child = {
  217.       "td", "th",
  218.     },
  219.   },
  220.   tt = {empty = false} ,
  221.   u = {empty = false} ,
  222.   ul = {empty = false} ,
  223.   var = {empty = false} ,
  224. }
  225.  
  226. setmetatable(tags, {
  227.   __index = function (t, key)
  228.     return {empty = false}
  229.   end
  230. })
  231.  
  232. -- string buffer implementation
  233. function newbuf ()
  234.   local buf = {
  235.     _buf = {},
  236.     clear =   function (self) self._buf = {}; return self end,
  237.     content = function (self) return table.concat(self._buf) end,
  238.     append =  function (self, s)
  239.       self._buf[#(self._buf) + 1] = s
  240.       return self
  241.     end,
  242.     set =     function (self, s) self._buf = {s}; return self end,
  243.   }
  244.   return buf
  245. end
  246.  
  247. -- unescape character entities
  248. function unescape (s)
  249.   function entity2string (e)
  250.     return entity[e]
  251.   end
  252.   return s.gsub(s, "&(#?%w+);", entity2string)
  253. end
  254.  
  255. -- iterator factory
  256. function makeiter (f)
  257.   local co = coroutine.create(f)
  258.   return function ()
  259.     local code, res = coroutine.resume(co)
  260.     return res
  261.   end
  262. end
  263.  
  264. -- constructors for token
  265. function Tag (s)
  266.   return string.find(s, "^</") and
  267.     {type = "End",   value = s} or
  268.     {type = "Start", value = s}
  269. end
  270.  
  271. function Text (s)
  272.   local unescaped = unescape(s)
  273.   return {type = "Text", value = unescaped}
  274. end
  275.  
  276. -- lexer: text mode
  277. function text (f, buf)
  278.   local c = f:read(1)
  279.   if c == "<" then
  280.     if buf:content() ~= "" then coroutine.yield(Text(buf:content())) end
  281.     buf:set(c)
  282.     return tag(f, buf)
  283.   elseif c then
  284.     buf:append(c)
  285.     return text(f, buf)
  286.   else
  287.     if buf:content() ~= "" then coroutine.yield(Text(buf:content())) end
  288.   end
  289. end
  290.  
  291. -- lexer: tag mode
  292. function tag (f, buf)
  293.   local c = f:read(1)
  294.   if c == ">" then
  295.     coroutine.yield(Tag(buf:append(c):content()))
  296.     buf:clear()
  297.     return text(f, buf)
  298.   elseif c then
  299.     buf:append(c)
  300.     return tag(f, buf)
  301.   else
  302.     if buf:content() ~= "" then coroutine.yield(Tag(buf:content())) end
  303.   end
  304. end
  305.  
  306. function parse_starttag(tag)
  307.   local tagname = string.match(tag, "<%s*(%w+)")
  308.   local elem = {_attr = {}}
  309.   elem._tag = tagname
  310.   for key, _, val in string.gmatch(tag, "(%w+)%s*=%s*([\"'])(.-)%2") do
  311.     local unescaped = unescape(val)
  312.     elem._attr[key] = unescaped
  313.   end
  314.  
  315.   return elem
  316. end
  317.  
  318. function parse_endtag(tag)
  319.   local tagname = string.match(tag, "<%s*/%s*(%w+)")
  320.   return tagname
  321. end
  322.  
  323. -- find last element that satisfies given predicate
  324. function rfind(t, pred)
  325.   local length = #t
  326.   for i=length,1,-1 do
  327.     if pred(t[i]) then
  328.       return i, t[i]
  329.     end
  330.   end
  331. end
  332.  
  333. function flatten(t, acc)
  334.   acc = acc or {}
  335.   for i,v in ipairs(t) do
  336.     if type(v) == "table" then
  337.       flatten(v, acc)
  338.     else
  339.       acc[#acc + 1] = v
  340.     end
  341.   end
  342.   return acc
  343. end
  344.  
  345. function optional_end_p(elem)
  346.   if tags[elem._tag].optional_end then
  347.     return true
  348.   else
  349.     return false
  350.   end
  351. end
  352.  
  353. function valid_child_p(child, parent)
  354.   local schema = tags[parent._tag].child
  355.   if not schema then return true end
  356.  
  357.   for i,v in ipairs(flatten(schema)) do
  358.     if v == child._tag then
  359.       return true
  360.     end
  361.   end
  362.  
  363.   return false
  364. end
  365.  
  366. -- tree builder
  367. function parse(f)
  368.   local root = {_tag = "#document", _attr = {}}
  369.   local stack = {root}
  370.   for i in makeiter(function () return text(f, newbuf()) end) do
  371.     if i.type == "Start" then
  372.       local new = parse_starttag(i.value)
  373.       local top = stack[#stack]
  374.  
  375.       while
  376.         top._tag ~= "#document" and
  377.         optional_end_p(top) and
  378.         not valid_child_p(new, top)
  379.       do
  380.         stack[#stack] = nil
  381.         top = stack[#stack]
  382.       end
  383.  
  384.       top[#top+1] = new -- appendchild
  385.       if not tags[new._tag].empty then
  386.         stack[#stack+1] = new -- push
  387.       end
  388.     elseif i.type == "End" then
  389.       local tag = parse_endtag(i.value)
  390.       local openingpos = rfind(stack, function(v)
  391.           if v._tag == tag then
  392.             return true
  393.           else
  394.             return false
  395.           end
  396.         end)
  397.       if openingpos then
  398.         local length = #stack
  399.         for j=length,openingpos,-1 do
  400.           table.remove(stack, j)
  401.         end
  402.       end
  403.     else -- Text
  404.       local top = stack[#stack]
  405.       top[#top+1] = i.value
  406.     end
  407.   end
  408.   return root
  409. end
  410.  
  411. function parsestr(s)
  412.   local handle = {
  413.     _content = s,
  414.     _pos = 1,
  415.     read = function (self, length)
  416.       if self._pos > string.len(self._content) then return end
  417.       local ret = string.sub(self._content, self._pos, self._pos + length - 1)
  418.       self._pos = self._pos + length
  419.       return ret
  420.     end
  421.   }
  422.   return parse(handle)
  423. end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement