Advertisement
Guest User

Untitled

a guest
May 28th, 2016
353
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Lua 10.67 KB | None | 0 0
  -- Parsed as per W3's HTML Standard
  --
  -- Tokenizer for a small HTML-like markup whose tags name ROBLOX GUI classes.
  -- The module returns a single function: given the markup source as a string,
  -- it returns a flat, ordered list of tokens. Nesting is NOT resolved here;
  -- closing tags are emitted as tokens too (IsClosingTag = true) so a later
  -- pass can rebuild the tree from the open/close sequence.
  --
  -- Every token has the shape:
  --   TagName         string   class name from VALID_TAGS, "Comment" or "__Text"
  --   SelfClosingFlag boolean  true for "<tag/>", comments and text tokens
  --   IsClosingTag    boolean  true for "</tag>"
  --   Data            string   text content (text and comment tokens only)
  --   Attributes      table    list of { AttributeName = string, Value = string }
  --   Style           table    created empty; never filled in by the tokenizer

  -- Accepted tag names (lower-case) mapped to the emitted class name.
  -- Tags that are not listed here are tokenized and then silently discarded.
  local VALID_TAGS = {
      ['scrollingframe'] = "ScrollingFrame";
      ['frame'] = "Frame";
      ['imagebutton'] = "ImageButton";
      ['imagelabel'] = "ImageLabel";
      ['textlabel'] = "TextLabel";
      ['textbox'] = "TextBox";
      ['textbutton'] = "TextButton";
  }

  -- True for the characters that separate tag names and attributes.
  local function IsWhitespace(Character)
      return Character == " " or Character == "\t" or Character == "\n"
          or Character == "\r" or Character == "\f"
  end

  -- @param HTMLSource string  markup to tokenize
  -- @return table             list of tokens in source order (see header)
  -- Raises an error if HTMLSource is not a string. Malformed markup never
  -- raises: an unterminated tag or comment at end of input is dropped.
  return function(HTMLSource)
      if (type(HTMLSource) ~= "string") then
          error("HTMLSource must be a string, got " .. type(HTMLSource), 2)
      end

      local SOURCE_LENGTH = #HTMLSource
      local PARSER_POSITION = 1
      local DATA_STATE = "" -- State of byte consumption ("" is the plain-text state)
      local DOCUMENT = {} -- The final, tokenized, document structure.
      local ACTIVE_TOKEN -- Reference to the token currently being built
      local ACTIVE_ATTRIBUTE -- Reference to the attribute currently being built

      local function CreateToken(Type)
          return {
              TagName = Type;
              SelfClosingFlag = false;
              IsClosingTag = false; -- Key element to making the tree.
              Data = "";
              Attributes = {};
              Style = {};
          }
      end

      local function AddAttribute(TOKEN)
          local Attr = {AttributeName = "", Value = ""}
          table.insert(TOKEN.Attributes, Attr)
          return Attr
      end

      -- Appends TOKEN to DOCUMENT when its tag name is supported, then clears
      -- the active token/attribute. Tag names are matched case-insensitively
      -- and rewritten to the class name stored in VALID_TAGS.
      -- You may remove the validity check to accept arbitrary HTML tag names
      -- not supported via ROBLOX GUIs.
      local function EmitToken(TOKEN)
          if (TOKEN) then
              local LowerName = TOKEN.TagName:lower()
              local ClassName = VALID_TAGS[LowerName]
              if (ClassName) then
                  TOKEN.TagName = ClassName
              end
              if (ClassName or LowerName == "comment" or LowerName == "__text") then
                  table.insert(DOCUMENT, TOKEN)
              end
          end
          ACTIVE_TOKEN = nil
          ACTIVE_ATTRIBUTE = nil
      end

      -- Emits a run of character data as one self-closing "__Text" token.
      local function EmitText(Text)
          local Token = CreateToken("__Text")
          Token.Data = Text
          Token.SelfClosingFlag = true
          EmitToken(Token)
      end

      -- Begin parsing
      while (PARSER_POSITION <= SOURCE_LENGTH) do
          local CurrentChar = HTMLSource:sub(PARSER_POSITION, PARSER_POSITION)
          -- Most branches consume the current character. A branch that wants
          -- the next state to look at the same character again ("DO NOT
          -- CONSUME"), or that moves the position itself, clears this flag.
          local Advance = true

          if (DATA_STATE == "") then
              -- Everything up to the next "<" (or the end of input) is text.
              local TagStart = HTMLSource:find("<", PARSER_POSITION, true)
              local TextEnd = (TagStart or (SOURCE_LENGTH + 1)) - 1
              if (TextEnd >= PARSER_POSITION) then
                  EmitText(HTMLSource:sub(PARSER_POSITION, TextEnd))
              end
              if (TagStart) then
                  DATA_STATE = "TAG_OPEN"
              end
              -- Skip past the "<", or past the end of input when none is left.
              PARSER_POSITION = (TagStart or SOURCE_LENGTH) + 1
              Advance = false
          elseif (DATA_STATE == "TAG_OPEN") then
              if (CurrentChar == "!") then
                  DATA_STATE = "MARKUP_DECLARATION_OPEN"
              elseif (CurrentChar == "/") then
                  DATA_STATE = "END_TAG_OPEN"
              else
                  DATA_STATE = "TAG_NAME"
                  ACTIVE_TOKEN = CreateToken("")
                  Advance = false
              end
          elseif (DATA_STATE == "END_TAG_OPEN") then
              if (CurrentChar == ">") then
                  -- Parse error ("</>"): ignore it
                  DATA_STATE = ""
              else
                  DATA_STATE = "TAG_NAME"
                  ACTIVE_TOKEN = CreateToken("")
                  ACTIVE_TOKEN.IsClosingTag = true
                  Advance = false
              end
          elseif (DATA_STATE == "TAG_NAME") then
              if (IsWhitespace(CurrentChar)) then
                  DATA_STATE = "BEFORE_ATTRIBUTE_NAME"
              elseif (CurrentChar == "/") then
                  DATA_STATE = "SELF_CLOSING_START_TAG"
              elseif (CurrentChar == ">") then
                  DATA_STATE = ""
                  EmitToken(ACTIVE_TOKEN)
              else
                  ACTIVE_TOKEN.TagName = ACTIVE_TOKEN.TagName .. CurrentChar
              end
          elseif (DATA_STATE == "BEFORE_ATTRIBUTE_NAME") then
              if (IsWhitespace(CurrentChar) or CurrentChar == '"' or CurrentChar == "'" or CurrentChar == "<") then
                  -- Ignore
              elseif (CurrentChar == "/") then
                  DATA_STATE = "SELF_CLOSING_START_TAG"
              elseif (CurrentChar == ">") then
                  DATA_STATE = ""
                  EmitToken(ACTIVE_TOKEN)
              else
                  DATA_STATE = "ATTRIBUTE_NAME"
                  ACTIVE_ATTRIBUTE = AddAttribute(ACTIVE_TOKEN)
                  Advance = false
              end
          elseif (DATA_STATE == "ATTRIBUTE_NAME") then
              if (IsWhitespace(CurrentChar)) then
                  DATA_STATE = "AFTER_ATTRIBUTE_NAME"
              elseif (CurrentChar == "/") then
                  DATA_STATE = "SELF_CLOSING_START_TAG"
              elseif (CurrentChar == '"' or CurrentChar == "'") then
                  -- Ignore stray quotes inside a name
              elseif (CurrentChar == "=") then
                  DATA_STATE = "BEFORE_ATTRIBUTE_VALUE"
              elseif (CurrentChar == ">") then
                  DATA_STATE = ""
                  EmitToken(ACTIVE_TOKEN)
              else
                  ACTIVE_ATTRIBUTE.AttributeName = ACTIVE_ATTRIBUTE.AttributeName .. CurrentChar
              end
          elseif (DATA_STATE == "AFTER_ATTRIBUTE_NAME") then
              -- Reached after "name " : either "=" follows, or the attribute
              -- has no value and something else starts here.
              if (IsWhitespace(CurrentChar)) then
                  -- Ignore
              elseif (CurrentChar == "/") then
                  DATA_STATE = "SELF_CLOSING_START_TAG"
              elseif (CurrentChar == "=") then
                  DATA_STATE = "BEFORE_ATTRIBUTE_VALUE"
              elseif (CurrentChar == ">") then
                  DATA_STATE = ""
                  EmitToken(ACTIVE_TOKEN)
              else
                  DATA_STATE = "ATTRIBUTE_NAME"
                  ACTIVE_ATTRIBUTE = AddAttribute(ACTIVE_TOKEN)
                  Advance = false
              end
          elseif (DATA_STATE == "BEFORE_ATTRIBUTE_VALUE") then
              if (IsWhitespace(CurrentChar) or CurrentChar == "<" or CurrentChar == "=") then
                  -- Ignore
              elseif (CurrentChar == '"') then
                  DATA_STATE = "ATTRIBUTE_VALUE_DOUBLE_QUOTE"
              elseif (CurrentChar == "'") then
                  DATA_STATE = "ATTRIBUTE_VALUE_SINGLE_QUOTE"
              elseif (CurrentChar == ">") then
                  DATA_STATE = ""
                  EmitToken(ACTIVE_TOKEN)
              else
                  -- First character of an unquoted value (including "&")
                  DATA_STATE = "ATTRIBUTE_VALUE_UNQUOTED"
                  Advance = false
              end
          elseif (DATA_STATE == "ATTRIBUTE_VALUE_DOUBLE_QUOTE") then
              if (CurrentChar == '"') then
                  DATA_STATE = "AFTER_ATTRIBUTE_VALUE_QUOTED"
              else
                  ACTIVE_ATTRIBUTE.Value = ACTIVE_ATTRIBUTE.Value .. CurrentChar
              end
          elseif (DATA_STATE == "ATTRIBUTE_VALUE_SINGLE_QUOTE") then
              if (CurrentChar == "'") then
                  DATA_STATE = "AFTER_ATTRIBUTE_VALUE_QUOTED"
              else
                  ACTIVE_ATTRIBUTE.Value = ACTIVE_ATTRIBUTE.Value .. CurrentChar
              end
          elseif (DATA_STATE == "ATTRIBUTE_VALUE_UNQUOTED") then
              -- An unquoted value runs until whitespace or ">"; a "/" is part
              -- of the value, as in the HTML standard.
              if (IsWhitespace(CurrentChar)) then
                  DATA_STATE = "BEFORE_ATTRIBUTE_NAME"
              elseif (CurrentChar == ">") then
                  DATA_STATE = ""
                  EmitToken(ACTIVE_TOKEN)
              else
                  ACTIVE_ATTRIBUTE.Value = ACTIVE_ATTRIBUTE.Value .. CurrentChar
              end
          elseif (DATA_STATE == "AFTER_ATTRIBUTE_VALUE_QUOTED") then
              if (IsWhitespace(CurrentChar)) then
                  DATA_STATE = "BEFORE_ATTRIBUTE_NAME"
              elseif (CurrentChar == "/") then
                  DATA_STATE = "SELF_CLOSING_START_TAG"
              else
                  DATA_STATE = "BEFORE_ATTRIBUTE_NAME"
                  Advance = false
              end
          elseif (DATA_STATE == "SELF_CLOSING_START_TAG") then
              if (CurrentChar == ">") then
                  DATA_STATE = ""
                  ACTIVE_TOKEN.SelfClosingFlag = true
                  EmitToken(ACTIVE_TOKEN)
              else
                  -- A lone "/" inside a tag: carry on reading attributes
                  -- without losing the current character.
                  DATA_STATE = "BEFORE_ATTRIBUTE_NAME"
                  Advance = false
              end
          elseif (DATA_STATE == "MARKUP_DECLARATION_OPEN") then
              -- Are next two bytes '--' ?
              if (CurrentChar == "-" and HTMLSource:sub(PARSER_POSITION + 1, PARSER_POSITION + 1) == "-") then
                  DATA_STATE = "COMMENT_START"
                  ACTIVE_TOKEN = CreateToken("Comment")
                  -- Comments are self-closing
                  ACTIVE_TOKEN.SelfClosingFlag = true
                  PARSER_POSITION = PARSER_POSITION + 2
                  Advance = false
              else
                  -- e.g. "<!DOCTYPE ...>" or "<!>": skipped up to the next ">".
                  -- Not consumed here so that "<!>" ends on its own ">".
                  DATA_STATE = "BOGUS_COMMENT"
                  Advance = false
              end
          elseif (DATA_STATE == "BOGUS_COMMENT") then
              if (CurrentChar == ">") then
                  DATA_STATE = ""
              end
          elseif (DATA_STATE == "COMMENT_START") then
              -- Body of a comment
              if (CurrentChar == "-") then
                  DATA_STATE = "COMMENT_START_DASH"
              else
                  ACTIVE_TOKEN.Data = ACTIVE_TOKEN.Data .. CurrentChar
              end
          elseif (DATA_STATE == "COMMENT_START_DASH") then
              -- One "-" has been seen inside a comment
              if (CurrentChar == "-") then
                  DATA_STATE = "COMMENT_END"
              elseif (CurrentChar == ">" and ACTIVE_TOKEN.Data == "") then
                  -- "<!--->": abruptly closed empty comment
                  DATA_STATE = ""
                  EmitToken(ACTIVE_TOKEN)
              else
                  -- The dash was ordinary comment text
                  ACTIVE_TOKEN.Data = ACTIVE_TOKEN.Data .. "-" .. CurrentChar
                  DATA_STATE = "COMMENT_START"
              end
          elseif (DATA_STATE == "COMMENT_END") then
              -- "--" has been seen inside a comment
              if (CurrentChar == ">") then
                  DATA_STATE = ""
                  EmitToken(ACTIVE_TOKEN)
              elseif (CurrentChar == "-") then
                  -- "---": the oldest dash is text, still two dashes pending
                  ACTIVE_TOKEN.Data = ACTIVE_TOKEN.Data .. "-"
              else
                  -- The two dashes were ordinary comment text
                  ACTIVE_TOKEN.Data = ACTIVE_TOKEN.Data .. "--"
                  DATA_STATE = "COMMENT_START"
                  Advance = false
              end
          end

          if (Advance) then
              PARSER_POSITION = PARSER_POSITION + 1
          end
      end
      --print("DOC_FINISH");
      return DOCUMENT
  end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement