Advertisement
StefanBashkir

Tag Parser

Jun 14th, 2017
83
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 9.89 KB | None | 0 0
<?php

// Tokenizer states follow the tokenization rules of the W3C HTML Standard.

  /**
   * Tokenize an HTML-like markup string into a flat, ordered list of tokens.
   *
   * The scanner is a byte-wise state machine whose states are named after the
   * tokenization states of the HTML Standard. Opening tags, closing tags,
   * comments and text runs are all appended to the same list in source order;
   * nesting is not resolved here, so callers pair an opening token with the
   * following token that has the same 'TagName' and 'IsClosingTag' === true.
   *
   * Every token is an array with these keys:
   *  - 'TagName'         string  canonical tag name, 'Comment', or '__Text'
   *  - 'SelfClosingFlag' bool    true for `<tag/>`, comments and text runs
   *  - 'IsClosingTag'    bool    true for `</tag>`
   *  - 'Data'            string  text / comment payload ('' for tags)
   *  - 'Attributes'      list of ['AttributeName' => string, 'Value' => string]
   *  - 'Style'           array   always empty here; nothing in this function fills it
   *
   * Tags whose name is not whitelisted are dropped silently; the text between
   * them is still emitted. The whitelist lookup is case-insensitive and the
   * emitted 'TagName' is the whitelist's canonical spelling. A tag or comment
   * that is still unterminated at end of input is discarded.
   *
   * @param mixed $HTMLSource Markup to tokenize; cast to string before scanning.
   *
   * @return array List of token arrays in source order (empty for empty input).
   */
  function parseBBCode($HTMLSource): array
  {
      $source = (string) $HTMLSource;
      $length = strlen($source);

      // Accepted tag names (lower-case key) mapped to their canonical name.
      $validTags = [
          'spoiler' => 'spoiler',
      ];

      $document = [];  // Emitted tokens, in source order.
      $state = '';     // Current tokenizer state; '' is the plain-text (data) state.
      $text = '';      // Pending run of plain text not yet turned into a token.
      $token = null;   // Token currently being built, or null.
      $attr = null;    // Index into $token['Attributes'] of the attribute being built.
      $position = 0;   // Byte offset of the next character to examine.

      // Whitespace that separates a tag name from attributes, and attributes from each other.
      $isSpace = static function (string $c): bool {
          return $c === ' ' || $c === "\t" || $c === "\n" || $c === "\r" || $c === "\f";
      };

      // Build an empty token with the given name.
      $newToken = static function (string $tagName): array {
          return [
              'TagName' => $tagName,
              'SelfClosingFlag' => false,
              'IsClosingTag' => false,
              'Data' => '',
              'Attributes' => [],
              'Style' => [],
          ];
      };

      // Append the active token to the document if its name is acceptable,
      // then clear the active token/attribute either way.
      $emit = static function () use (&$document, &$token, &$attr, $validTags): void {
          if ($token !== null) {
              $lowerName = strtolower($token['TagName']);
              if (isset($validTags[$lowerName])) {
                  $token['TagName'] = $validTags[$lowerName];
                  $document[] = $token;
              } elseif ($lowerName === 'comment' || $lowerName === '__text') {
                  $document[] = $token;
              }
          }
          $token = null;
          $attr = null;
      };

      // Turn any pending text into a '__Text' token. Compared against '' rather
      // than by truthiness so that a text run of "0" is not lost.
      $flushText = static function () use (&$text, &$token, $newToken, $emit): void {
          if ($text === '') {
              return;
          }
          $token = $newToken('__Text');
          $token['Data'] = $text;
          $token['SelfClosingFlag'] = true;
          $text = '';
          $emit();
      };

      // Start a new, empty attribute on the active token and make it current.
      $startAttribute = static function () use (&$token, &$attr): void {
          $attr = count($token['Attributes']);
          $token['Attributes'][] = ['AttributeName' => '', 'Value' => ''];
      };

      while ($position < $length) {
          $char = $source[$position];
          // A state sets this to false to have the same character re-examined
          // in the state it just switched to.
          $consume = true;

          switch ($state) {
              case '':
                  if ($char === '<') {
                      $flushText();
                      $state = 'TAG_OPEN';
                  } else {
                      $text .= $char;
                  }
                  break;

              case 'TAG_OPEN':
                  if ($char === '!') {
                      $state = 'MARKUP_DECLARATION_OPEN';
                  } elseif ($char === '/') {
                      $state = 'END_TAG_OPEN';
                  } else {
                      $token = $newToken('');
                      $state = 'TAG_NAME';
                      $consume = false;
                  }
                  break;

              case 'END_TAG_OPEN':
                  if ($char === '>') {
                      // "</>" is a parse error; nothing is emitted.
                      $state = '';
                  } else {
                      $token = $newToken('');
                      $token['IsClosingTag'] = true;
                      $state = 'TAG_NAME';
                      $consume = false;
                  }
                  break;

              case 'TAG_NAME':
                  if ($isSpace($char)) {
                      $state = 'BEFORE_ATTRIBUTE_NAME';
                  } elseif ($char === '/') {
                      $state = 'SELF_CLOSING_START_TAG';
                  } elseif ($char === '>') {
                      $state = '';
                      $emit();
                  } else {
                      $token['TagName'] .= $char;
                  }
                  break;

              case 'BEFORE_ATTRIBUTE_NAME':
                  if ($isSpace($char) || $char === '"' || $char === "'" || $char === '<') {
                      // Whitespace and stray quote/"<" characters are skipped.
                  } elseif ($char === '/') {
                      $state = 'SELF_CLOSING_START_TAG';
                  } elseif ($char === '>') {
                      $state = '';
                      $emit();
                  } else {
                      $startAttribute();
                      $state = 'ATTRIBUTE_NAME';
                      $consume = false;
                  }
                  break;

              case 'ATTRIBUTE_NAME':
                  if ($isSpace($char)) {
                      $state = 'AFTER_ATTRIBUTE_NAME';
                  } elseif ($char === '/') {
                      $state = 'SELF_CLOSING_START_TAG';
                  } elseif ($char === '"' || $char === "'") {
                      // Stray quote inside a name is skipped.
                  } elseif ($char === '=') {
                      $state = 'BEFORE_ATTRIBUTE_VALUE';
                  } elseif ($char === '>') {
                      $state = '';
                      $emit();
                  } else {
                      $token['Attributes'][$attr]['AttributeName'] .= $char;
                  }
                  break;

              case 'AFTER_ATTRIBUTE_NAME':
                  if ($isSpace($char)) {
                      // Skip whitespace between a name and "=" or the next name.
                  } elseif ($char === '/') {
                      $state = 'SELF_CLOSING_START_TAG';
                  } elseif ($char === '=') {
                      $state = 'BEFORE_ATTRIBUTE_VALUE';
                  } elseif ($char === '>') {
                      $state = '';
                      $emit();
                  } else {
                      // Previous attribute had no value; a new one starts here.
                      $startAttribute();
                      $state = 'ATTRIBUTE_NAME';
                      $consume = false;
                  }
                  break;

              case 'BEFORE_ATTRIBUTE_VALUE':
                  if ($isSpace($char) || $char === '<' || $char === '=') {
                      // Skipped.
                  } elseif ($char === '"') {
                      $state = 'ATTRIBUTE_VALUE_DOUBLE_QUOTE';
                  } elseif ($char === "'") {
                      $state = 'ATTRIBUTE_VALUE_SINGLE_QUOTE';
                  } elseif ($char === '>') {
                      $state = '';
                      $emit();
                  } else {
                      // First character of an unquoted value; let that state append it.
                      $state = 'ATTRIBUTE_VALUE_UNQUOTED';
                      $consume = false;
                  }
                  break;

              case 'ATTRIBUTE_VALUE_DOUBLE_QUOTE':
                  if ($char === '"') {
                      $state = 'AFTER_ATTRIBUTE_VALUE_QUOTED';
                  } else {
                      $token['Attributes'][$attr]['Value'] .= $char;
                  }
                  break;

              case 'ATTRIBUTE_VALUE_SINGLE_QUOTE':
                  if ($char === "'") {
                      $state = 'AFTER_ATTRIBUTE_VALUE_QUOTED';
                  } else {
                      $token['Attributes'][$attr]['Value'] .= $char;
                  }
                  break;

              case 'ATTRIBUTE_VALUE_UNQUOTED':
                  if ($isSpace($char)) {
                      $state = 'BEFORE_ATTRIBUTE_NAME';
                  } elseif ($char === '>') {
                      $state = '';
                      $emit();
                  } else {
                      $token['Attributes'][$attr]['Value'] .= $char;
                  }
                  break;

              case 'AFTER_ATTRIBUTE_VALUE_QUOTED':
                  if ($isSpace($char)) {
                      $state = 'BEFORE_ATTRIBUTE_NAME';
                  } elseif ($char === '/') {
                      $state = 'SELF_CLOSING_START_TAG';
                  } else {
                      $state = 'BEFORE_ATTRIBUTE_NAME';
                      $consume = false;
                  }
                  break;

              case 'SELF_CLOSING_START_TAG':
                  if ($char === '>') {
                      $state = '';
                      $token['SelfClosingFlag'] = true;
                      $emit();
                  } else {
                      $state = 'BEFORE_ATTRIBUTE_NAME';
                  }
                  break;

              case 'MARKUP_DECLARATION_OPEN':
                  // "<!--" opens a comment; any other "<!..." is skipped up to ">".
                  if ($char === '-' && $position + 1 < $length && $source[$position + 1] === '-') {
                      $token = $newToken('Comment');
                      $token['SelfClosingFlag'] = true; // Comments never have a closing tag.
                      $state = 'COMMENT_START';
                      $position++; // Second dash; the first is consumed below.
                  } else {
                      $state = 'BOGUS_COMMENT';
                      $consume = false; // So that "<!>" ends immediately.
                  }
                  break;

              case 'BOGUS_COMMENT':
                  if ($char === '>') {
                      $state = '';
                  }
                  break;

              case 'COMMENT_START':
                  if ($char === '-') {
                      $state = 'COMMENT_START_DASH';
                  } else {
                      $token['Data'] .= $char;
                  }
                  break;

              case 'COMMENT_START_DASH':
                  if ($char === '-') {
                      $state = 'COMMENT_END';
                  } elseif ($char === '>') {
                      // "->" is accepted as a (malformed) comment terminator.
                      $state = '';
                      $emit();
                  } else {
                      // Lone dash inside the comment: keep it and carry on.
                      $token['Data'] .= '-';
                      $state = 'COMMENT_START';
                      $consume = false;
                  }
                  break;

              case 'COMMENT_END':
                  if ($char === '>') {
                      $state = '';
                      $emit();
                  } elseif ($char === '-') {
                      // "---": the extra dash belongs to the comment text.
                      $token['Data'] .= '-';
                  } else {
                      // "--" not followed by ">" is ordinary comment text.
                      $token['Data'] .= '--';
                      $state = 'COMMENT_START';
                      $consume = false;
                  }
                  break;
          }

          if ($consume) {
              $position++;
          }
      }

      // Text after the last tag would otherwise be lost.
      $flushText();

      return $document;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement