Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
-- Parsed as per W3's HTML Standard
--
-- Tokenizer for a small HTML-like markup whose tag names map onto ROBLOX GUI
-- classes. The module returns a single function:
--
--   DOCUMENT = Parse(HTMLSource)
--
-- HTMLSource : string containing the markup.
-- DOCUMENT   : array of tokens in source order. Every token is a table with
--     TagName         - canonical name from VALID_TAGS, "Comment" or "__Text"
--     SelfClosingFlag - true for "<tag/>", comments and text tokens
--     IsClosingTag    - true for "</tag>"
--     Data            - text of a "__Text" or "Comment" token, "" otherwise
--     Attributes      - array of {AttributeName = string, Value = string}
--     Style           - empty table (never filled in by the tokenizer)
--
-- Closing tags are emitted as tokens too, so a consumer can rebuild the tree
-- from the open/close order:
--     OPEN TAG A
--         OPEN TAG B
--         CLOSE TAG B
--     CLOSE TAG A
--
-- Raises an error if HTMLSource is not a string.
return function(HTMLSource)
	if (type(HTMLSource) ~= "string") then
		error("HTMLSource must be a string, got " .. type(HTMLSource), 2)
	end

	local SOURCE_LENGTH = #HTMLSource
	local PARSER_POSITION = 1
	local DATA_STATE = "" -- State of byte consumption ("" is the data state)
	local DOCUMENT = {} -- The final, tokenized, document structure.
	local ACTIVE_TOKEN -- Reference to the currently active token
	local ACTIVE_ATTRIBUTE -- Reference to the currently active attribute

	-- Lower-case tag name -> canonical ROBLOX class name.
	local VALID_TAGS = {
		['scrollingframe'] = "ScrollingFrame";
		['frame'] = "Frame";
		['imagebutton'] = "ImageButton";
		['imagelabel'] = "ImageLabel";
		['textlabel'] = "TextLabel";
		['textbox'] = "TextBox";
		['textbutton'] = "TextButton";
	}

	-- Characters treated as whitespace inside a tag.
	local WHITESPACE = {
		[" "] = true;
		["\t"] = true;
		["\n"] = true;
		["\r"] = true;
		["\f"] = true;
	}

	-- Builds a fresh token with the given tag name.
	local function CreateToken(Type)
		return {
			TagName = Type;
			SelfClosingFlag = false;
			IsClosingTag = false; -- Key element to making the tree.
			Data = "";
			Attributes = {};
			Style = {};
		}
	end

	-- Appends an empty attribute to TOKEN and returns it so it can be filled in.
	local function AddAttribute(TOKEN)
		local Attr = {AttributeName = "", Value = ""}
		table.insert(TOKEN.Attributes, Attr)
		return Attr
	end

	-- Appends TOKEN to DOCUMENT when its name is a valid tag (matched
	-- case-insensitively), a comment or a text token, then clears the active
	-- token/attribute. Tokens with any other name are dropped.
	-- You may remove the name check to accept arbitrary HTML tag names not
	-- supported via ROBLOX GUIs.
	local function EmitToken(TOKEN)
		if (TOKEN) then
			local LowerName = TOKEN.TagName:lower()
			local CanonicalName = VALID_TAGS[LowerName]
			if (CanonicalName) then
				TOKEN.TagName = CanonicalName
			end
			if (CanonicalName or LowerName == "comment" or LowerName == "__text") then
				DOCUMENT[#DOCUMENT + 1] = TOKEN
			end
		end
		ACTIVE_TOKEN = nil
		ACTIVE_ATTRIBUTE = nil
	end

	-- Emits a run of character data as a single self-closing "__Text" token.
	local function EmitText(Text)
		local Token = CreateToken("__Text")
		Token.Data = Text
		Token.SelfClosingFlag = true
		EmitToken(Token)
	end

	-- Begin parsing
	while (PARSER_POSITION <= SOURCE_LENGTH) do
		local CurrentChar = HTMLSource:sub(PARSER_POSITION, PARSER_POSITION)
		-- Characters consumed by this iteration. 0 means "do not consume":
		-- the same character is processed again in the new state.
		local Step = 1

		if (DATA_STATE == "") then
			-- Collect everything up to the next "<" (or the end of the
			-- source) as one text token instead of one token per character.
			local TagStart = HTMLSource:find("<", PARSER_POSITION, true)
			local TextEnd = (TagStart or (SOURCE_LENGTH + 1)) - 1
			if (TextEnd >= PARSER_POSITION) then
				EmitText(HTMLSource:sub(PARSER_POSITION, TextEnd))
			end
			if (TagStart) then
				DATA_STATE = "TAG_OPEN"
			end
			-- Now at the "<" (consumed by the default Step) or past the end.
			PARSER_POSITION = TextEnd + 1
		elseif (DATA_STATE == "TAG_OPEN") then
			if (CurrentChar == "!") then
				DATA_STATE = "MARKUP_DECLARATION_OPEN"
			elseif (CurrentChar == "/") then
				DATA_STATE = "END_TAG_OPEN"
			else
				DATA_STATE = "TAG_NAME"
				ACTIVE_TOKEN = CreateToken("")
				Step = 0
			end
		elseif (DATA_STATE == "END_TAG_OPEN") then
			if (CurrentChar == ">") then
				-- Parse error: "</>" is ignored
				DATA_STATE = ""
			else
				DATA_STATE = "TAG_NAME"
				ACTIVE_TOKEN = CreateToken("")
				ACTIVE_TOKEN.IsClosingTag = true
				Step = 0
			end
		elseif (DATA_STATE == "TAG_NAME") then
			if (WHITESPACE[CurrentChar]) then
				DATA_STATE = "BEFORE_ATTRIBUTE_NAME"
			elseif (CurrentChar == "/") then
				DATA_STATE = "SELF_CLOSING_START_TAG"
			elseif (CurrentChar == ">") then
				DATA_STATE = ""
				EmitToken(ACTIVE_TOKEN)
			else
				ACTIVE_TOKEN.TagName = ACTIVE_TOKEN.TagName .. CurrentChar
			end
		elseif (DATA_STATE == "BEFORE_ATTRIBUTE_NAME") then
			if (WHITESPACE[CurrentChar]) then
				-- Ignore
			elseif (CurrentChar == "/") then
				DATA_STATE = "SELF_CLOSING_START_TAG"
			elseif (CurrentChar == '"' or CurrentChar == "'" or CurrentChar == "<") then
				-- Parse error: stray character is skipped
			elseif (CurrentChar == ">") then
				DATA_STATE = ""
				EmitToken(ACTIVE_TOKEN)
			else
				DATA_STATE = "ATTRIBUTE_NAME"
				ACTIVE_ATTRIBUTE = AddAttribute(ACTIVE_TOKEN)
				Step = 0
			end
		elseif (DATA_STATE == "ATTRIBUTE_NAME") then
			if (WHITESPACE[CurrentChar]) then
				DATA_STATE = "AFTER_ATTRIBUTE_NAME"
			elseif (CurrentChar == "/") then
				DATA_STATE = "SELF_CLOSING_START_TAG"
			elseif (CurrentChar == '"' or CurrentChar == "'") then
				-- Parse error: stray quote is skipped
			elseif (CurrentChar == "=") then
				DATA_STATE = "BEFORE_ATTRIBUTE_VALUE"
			elseif (CurrentChar == ">") then
				DATA_STATE = ""
				EmitToken(ACTIVE_TOKEN)
			else
				ACTIVE_ATTRIBUTE.AttributeName = ACTIVE_ATTRIBUTE.AttributeName .. CurrentChar
			end
		elseif (DATA_STATE == "AFTER_ATTRIBUTE_NAME") then
			if (WHITESPACE[CurrentChar]) then
				-- Ignore
			elseif (CurrentChar == "/") then
				DATA_STATE = "SELF_CLOSING_START_TAG"
			elseif (CurrentChar == "=") then
				DATA_STATE = "BEFORE_ATTRIBUTE_VALUE"
			elseif (CurrentChar == ">") then
				DATA_STATE = ""
				EmitToken(ACTIVE_TOKEN)
			else
				-- The previous attribute had no value; a new one starts here.
				DATA_STATE = "ATTRIBUTE_NAME"
				ACTIVE_ATTRIBUTE = AddAttribute(ACTIVE_TOKEN)
				Step = 0
			end
		elseif (DATA_STATE == "BEFORE_ATTRIBUTE_VALUE") then
			if (WHITESPACE[CurrentChar]) then
				-- Ignore
			elseif (CurrentChar == '"') then
				DATA_STATE = "ATTRIBUTE_VALUE_DOUBLE_QUOTE"
			elseif (CurrentChar == "'") then
				DATA_STATE = "ATTRIBUTE_VALUE_SINGLE_QUOTE"
			elseif (CurrentChar == ">") then
				DATA_STATE = ""
				EmitToken(ACTIVE_TOKEN)
			elseif (CurrentChar == "<" or CurrentChar == "=") then
				-- Parse error: stray character is skipped
			else
				-- Anything else (including "&") begins an unquoted value.
				DATA_STATE = "ATTRIBUTE_VALUE_UNQUOTED"
				Step = 0
			end
		elseif (DATA_STATE == "ATTRIBUTE_VALUE_DOUBLE_QUOTE") then
			if (CurrentChar == '"') then
				DATA_STATE = "AFTER_ATTRIBUTE_VALUE_QUOTED"
			else
				ACTIVE_ATTRIBUTE.Value = ACTIVE_ATTRIBUTE.Value .. CurrentChar
			end
		elseif (DATA_STATE == "ATTRIBUTE_VALUE_SINGLE_QUOTE") then
			if (CurrentChar == "'") then
				DATA_STATE = "AFTER_ATTRIBUTE_VALUE_QUOTED"
			else
				ACTIVE_ATTRIBUTE.Value = ACTIVE_ATTRIBUTE.Value .. CurrentChar
			end
		elseif (DATA_STATE == "ATTRIBUTE_VALUE_UNQUOTED") then
			if (WHITESPACE[CurrentChar]) then
				DATA_STATE = "BEFORE_ATTRIBUTE_NAME"
			elseif (CurrentChar == ">") then
				DATA_STATE = ""
				EmitToken(ACTIVE_TOKEN)
			else
				ACTIVE_ATTRIBUTE.Value = ACTIVE_ATTRIBUTE.Value .. CurrentChar
			end
		elseif (DATA_STATE == "AFTER_ATTRIBUTE_VALUE_QUOTED") then
			if (WHITESPACE[CurrentChar]) then
				DATA_STATE = "BEFORE_ATTRIBUTE_NAME"
			elseif (CurrentChar == "/") then
				DATA_STATE = "SELF_CLOSING_START_TAG"
			else
				DATA_STATE = "BEFORE_ATTRIBUTE_NAME"
				Step = 0
			end
		elseif (DATA_STATE == "SELF_CLOSING_START_TAG") then
			if (CurrentChar == ">") then
				DATA_STATE = ""
				ACTIVE_TOKEN.SelfClosingFlag = true
				EmitToken(ACTIVE_TOKEN)
			else
				-- The "/" was not followed by ">": treat the character as
				-- the start of whatever comes next instead of dropping it.
				DATA_STATE = "BEFORE_ATTRIBUTE_NAME"
				Step = 0
			end
		elseif (DATA_STATE == "MARKUP_DECLARATION_OPEN") then
			-- Are next two bytes '--' ?
			if (CurrentChar == "-" and HTMLSource:sub(PARSER_POSITION + 1, PARSER_POSITION + 1) == "-") then
				DATA_STATE = "COMMENT_START"
				Step = 2
				ACTIVE_TOKEN = CreateToken("Comment")
				-- Comments are self-closing
				ACTIVE_TOKEN.SelfClosingFlag = true
			else
				-- Not consumed, so that "<!>" ends at its own ">".
				DATA_STATE = "BOGUS_COMMENT"
				Step = 0
			end
		elseif (DATA_STATE == "BOGUS_COMMENT") then
			-- Everything up to and including the next ">" is discarded.
			if (CurrentChar == ">") then
				DATA_STATE = ""
			end
		elseif (DATA_STATE == "COMMENT_START") then
			-- Comment body. No data collected yet means we are still right
			-- after "<!--", where a ">" closes the (empty) comment.
			if (CurrentChar == "-") then
				DATA_STATE = "COMMENT_START_DASH"
			elseif (CurrentChar == ">" and ACTIVE_TOKEN.Data == "") then
				DATA_STATE = ""
				EmitToken(ACTIVE_TOKEN)
			else
				ACTIVE_TOKEN.Data = ACTIVE_TOKEN.Data .. CurrentChar
			end
		elseif (DATA_STATE == "COMMENT_START_DASH") then
			-- One "-" has been seen inside the comment.
			if (CurrentChar == "-") then
				DATA_STATE = "COMMENT_END"
			elseif (CurrentChar == ">" and ACTIVE_TOKEN.Data == "") then
				-- "<!--->" closes an empty comment
				DATA_STATE = ""
				EmitToken(ACTIVE_TOKEN)
			else
				-- The lone dash was ordinary comment text.
				ACTIVE_TOKEN.Data = ACTIVE_TOKEN.Data .. "-" .. CurrentChar
				DATA_STATE = "COMMENT_START"
			end
		elseif (DATA_STATE == "COMMENT_END") then
			-- "--" has been seen inside the comment.
			if (CurrentChar == ">") then
				DATA_STATE = ""
				EmitToken(ACTIVE_TOKEN)
			elseif (CurrentChar == "-") then
				-- "---": the oldest dash becomes comment text.
				ACTIVE_TOKEN.Data = ACTIVE_TOKEN.Data .. "-"
			else
				-- The two dashes were ordinary comment text.
				ACTIVE_TOKEN.Data = ACTIVE_TOKEN.Data .. "--"
				DATA_STATE = "COMMENT_START"
				Step = 0
			end
		end

		PARSER_POSITION = PARSER_POSITION + Step
	end
	--print("DOC_FINISH");
	return DOCUMENT
end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement