Advertisement
JoshDreamland

BB Parser Alpha

Jan 23rd, 2014
88
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 12.83 KB | None | 0 0
  1. <?php
  2. /*!
  3.  * @file bbcode.php
  4.  *
  5.  * A BBCode tag is defined in terms of two parameters, a parse type,
  6.  * and an argument count. The parse type, "type", is given by one of
  7.  * the tag type ("TT_") constants.
  8.  *
  9.  * The argument count, "args", is in the range [0,4]. A zero-argument tag
  10.  * does not have an end tag. A one-argument tag has its argument between
  11.  * the start and end tag. A two-argument tag is identical to a one-argument
  12.  * tag with the addition of an =arg1 in the opening tag, eg,
  13.  *   [url=arg1]arg0[/url]
  14.  * A three-argument tag is given an additional parameter at tag close.
  15.  * A four-argument tag actually has unlimited arguments and operates as
  16.  * an HTML tag. The parameters are passed in an associative array to
  17.  * a callback, which is the only valid type for this count.
  18.  *
  19.  * Tags also contain an "enclosed" flag which denotes what is wrapped
  20.  * inside the tags; this is one of the content type ("CT_") constants.
  21.  *
  22.  * From an implementation perspective, the argument count controls how
  23.  * the tag is decomposed into memory, ie, parsed. The type determines
  24.  * how the tag is evaluated after parse finishes. The argument count is
  25.  * used in evaluation to pass the correct number of parameters, but the
  26.  * type is not considered during parse, at all.
  27.  *
  28.  * Other notes: Tag names may not contain spaces or "=".
  29.  *
  30.  * @section License
  31.  *
  32.  * Copyright (C) 2014 Josh Ventura <josh at dreamland im>
  33.  *
  34.  * This file is a BBCode parser. Permission is hereby granted,
  35.  * free of charge, to any person obtaining a copy of this software
  36.  * and associated documentation files (the "Software"), to deal in
  37.  * the Software without restriction, including without limitation
  38.  * the rights to use, copy, modify, merge, publish, distribute,
  39.  * sublicense, and/or sell copies of the Software, and to permit
  40.  * persons to whom the Software is furnished to do so, subject to
  41.  * the following conditions:
  42.  *
  43.  * The above copyright notice and this permission notice shall be
  44.  * included in all copies or substantial portions of the Software.
  45.  *
  46.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  47.  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  48.  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  49.  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  50.  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  51.  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  52.  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
  53.  * OR OTHER DEALINGS IN THE SOFTWARE.
  54.  *
  55.  * Amen.
  56. */
  57.  
  58. const TT_TRIVIAL = 0;    ///< This tag is a direct HTML wrapper tag
  59. const TT_INTERLEAVE = 1; ///< This tag becomes "html"[0] . arg0 . "html"[1] . arg1 ...
  60. const TT_CALLBACK = 2;   ///< This tag gets evaluated by a callback function
  61.  
  62. const CT_MARKDOWN = 0; ///< The inside of the tag is Markdown-format text as non-BB. This is the default.
  63. const CT_RAW = 1; ///< The inside of the tag is raw data and should not be formatted.
  64.  
  65. include('tags.php');
  66.  
  67. function notice($s) { echo "<!-- Notice: $s -->\n"; }
  68.  
  69. /*!
  70.  * This is a recursive function; call it with i = 0, and it will
  71.  * call itself recursively until all tags are parsed. It returns the
  72.  * parsed string, with $i set to the position of the first-read closing tag.
  73.  * @param  $str  The string to parse.
  74.  * @param  $i    [in/out] The position from which to start parsing.
  75.  *               Set at the end of the function to denote the first
  76.  *               unparsed character, or FALSE if all characters have
  77.  *               been consumed.
  78.  * @return Returns the HTML parsed substring of the given input.
  79. */
  80. function parse_sub($str, &$i, $contenttype, $opentags)
  81. {
  82.   global $bbtags;
  83.   $outstr = "";
  84.   $bookmark = $i;
  85.   for ($i = strpos($str, '[', $i); $i !== FALSE; $i = strpos($str, '[', $i))
  86.   {
  87.     $close = strpos($str, ']', $i + 1);
  88.     if ($close == FALSE) {
  89.       $i = FALSE;
  90.       return $outstr . parse_nonbb(substr($str, $bookmark, $i - $bookmark), $contenttype);
  91.     }
  92.    
  93.     // Look inside our tag, now
  94.     $stag = substr($str, $i+1, $close - $i - 1);
  95.    
  96.     // If it's a closing tag, return and let the parent handle that
  97.     if ($stag[0] == '/')
  98.       return $outstr . parse_nonbb(substr($str, $bookmark, $i - $bookmark), $contenttype);
  99.    
  100.     if ($contenttype == CT_RAW) {
  101.       ++$i;
  102.       continue;
  103.     }
  104.    
  105.     // Doplegänger
  106.     if ($stag[0] == '[') {
  107.       ++$i;
  108.       continue;
  109.     }
  110.    
  111.     // Make sure we're safe if args=1; we don't know, yet
  112.     $tagc = preg_split('/\s*[\s=]\s*/', $stag, 2);
  113.     if (count($tagc) == 2)
  114.       $arg1 = $tagc[1]; // This technically allows [tag x]y[/tag]
  115.     else
  116.       $arg1 = NULL; // Don't reuse old arg values
  117.     $tname = strtolower($tagc[0]); // Deliberately not trimmed
  118.    
  119.     // Look up tag
  120.     $tstart = $i;
  121.     $i = $close + 1;
  122.     if (!array_key_exists($tname, $bbtags)) {
  123.       notice("No bbtag called [$tname] ($stag)");
  124.       continue;
  125.     }
  126.    
  127.     // Tag found
  128.     $bbtag = $bbtags[$tname];
  129.     if ($bbtag['args'] > 0)
  130.     {
  131.       $tlen = strlen($tname) + 2;
  132.      
  133.       // Handle associative tags
  134.       if ($bbtag['args'] > 3)
  135.       {
  136.         $i = $tstart + $tlen;
  137.         if (ctype_space($str[$i-1])) {
  138.           $args = read_attr_list($str, $i);
  139.           if ($args === NULL) { // Bail on failure
  140.             $i = FALSE;
  141.             return $outstr . parse_nonbb(substr($str, $bookmark), $contenttype);
  142.           }
  143.           ++$i;
  144.         }
  145.         else if ($str[$i-1] == ']')
  146.           $args = array();
  147.         else continue;
  148.       }
  149.       else {
  150.         $args = NULL;
  151.         if ($bbtag['args'] == 1 && $str[$tstart + $tlen - 1] != ']') {
  152.           notice("$str [$tstart+$tlen-1] != ']'");
  153.           continue;
  154.         }
  155.       }
  156.      
  157.       $arg0 = '';
  158.       $stend = $i;
  159.       for (;;)
  160.       {
  161.         // This is where shit gets interesting
  162.         array_push($opentags, $tname);
  163.         $arg0 .= parse_sub($str, $i, isset($bbtag['content'])? $bbtag['content'] : CT_MARKDOWN, $opentags);
  164.         array_pop($opentags);
  165.        
  166.         // Make sure we arrived at a closing tag
  167.         if ($i == FALSE)
  168.           return $outstr . parse_nonbb(substr($str, $bookmark, $stend - $bookmark), $contenttype) . $arg0;
  169.        
  170.         $close = strpos($str, ']', $i);
  171.         $ctag = substr($str, $i, $close - $i);
  172.        
  173.         // Make sure this is *our* closing tag
  174.         if (strncasecmp($ctag, '[/'.$tname, $tlen) != 0)
  175.         {
  176.           // If someone else's closing tag, bail; tags should be closed in order
  177.           foreach ($opentags as $k => $otname)
  178.             if (strncasecmp($ctag, '[/'.$otname, 2 + strlen($otname)) == 0)
  179.               return $outstr . parse_nonbb(substr($str, $bookmark, $stend - $bookmark), $contenttype) . $arg0;
  180.          
  181.           // If not anyone's tag, just keep looking
  182.           $arg0 .= $str[$i++];
  183.           continue;
  184.         }
  185.        
  186.         break;
  187.       }
  188.      
  189.       // Now we have a little more parsing to do for ternary tags
  190.       if ($bbtag['args'] == 3) {
  191.         $arg2 = trim(substr($str, $i + $tlen, $close - $i - $tlen));
  192.         if (strlen($arg2) > 0 && $arg2[0] == '=')
  193.           $arg2 = trim(substr($arg2, 1));
  194.       }
  195.       else
  196.         $arg2 = NULL;
  197.      
  198.       $i = $close + 1;
  199.     }
  200.     else
  201.       $arg0 = $arg1 = $arg2 = $args = NULL;
  202.    
  203.     $outstr .= parse_nonbb(substr($str, $bookmark, $tstart - $bookmark), $contenttype);
  204.     $outstr .= evaluate_tag($tname, $bbtag, $arg0, $arg1, $arg2, $args);
  205.     $bookmark = $i;
  206.   }
  207.   $outstr .= parse_nonbb(substr($str, $bookmark), $contenttype);
  208.   return $outstr;
  209. }
  210.  
  211.  
  212. /*! This function might as well be in that last block of code,
  213.  * but it was getting frighteningly big, so I moved it here.
  214.  * @note Unused tag arguments should be NULL; \p $arg1 and \p $arg2 should
  215.  *       never be non-null at the same time as \p $args.
  216.  * @param $tname  The name of the tag.
  217.  * @param $bbtag  The tag array from the \c bbtags array.
  218.  * @param $arg0   The first argument, the text between the tags, if applicable.
  219.  * @param $arg1   The second argument, the = value in the opening tag, if applicable.
  220.  * @param $arg2   The third argument, the = value in the closing tag, if applicable.
  221.  * @param $args   An associative array of arguments, if applicable.
  222.  * @return Returns the result of evaluating the tag;
  223.  *         a string with which to replace the tag.
  224. */
  225. function evaluate_tag($tname, $bbtag, $arg0, $arg1, $arg2, $args) {
  226.   switch ($bbtag['type'])
  227.   {
  228.     case TT_TRIVIAL: switch ($bbtag['args']) {
  229.       case 0:  return "<$tname />";
  230.       case 1:  return "<$tname>$arg0</$tname>";
  231.       case 2:  return "<!-- This tag is invalid -->";
  232.       case 3:  return "<!-- This tag is invalid -->";
  233.       default: return "<!-- This tag would be a security risk -->";
  234.     }
  235.     case TT_INTERLEAVE: $h = $bbtag['html']; switch ($bbtag['args']) {
  236.       case 0:  return $h[0];
  237.       case 1:  return $h[0] . $arg0 . $h[1];
  238.       case 2:  return $h[0] . $arg1 . $h[1] . $arg0 . $h[2];
  239.       case 3:  return $h[0] . $arg1 . $h[1] . $arg0 . $h[2] . $arg2 . $h[3];
  240.       default: return "<!-- This tag cannot be automated -->";
  241.     }
  242.     case TT_CALLBACK: $f = $bbtag['func']; switch ($bbtag['args']) {
  243.       case 0:  return $f();
  244.       case 1:  return $f($arg0);
  245.       case 2:  return $f($arg0, $arg1);
  246.       case 3:  return $f($arg0, $arg1, $arg2);
  247.       default: return $f($arg0, $args);
  248.     }
  249.   }
  250. }
  251.  
  252.  
  253. /*! This nonsense is to parse associative tags.
  254.  * @param $str The string from which to read attributes.
  255.  * @param $i   [in/out] The position from which to start reading.
  256.  *             Set at end of function call to denote the end of the list.
  257.  * @return  Returns the associative array read in from the string.
  258. */
  259. function read_attr_list($str, &$i) {
  260.   $len = strlen($str);
  261.   $attrs = array();
  262.   while ($i < $len && $str[$i] != ']')
  263.   {
  264.     if (ctype_space($str[$i])) {
  265.       ++$i;
  266.       continue;
  267.     }
  268.    
  269.     // Read an attribute name
  270.     $attr_start = $i;
  271.     while (++$i < $len && $str[$i] != '=' && !ctype_space($str[$i]));
  272.     $attr_name = substr($str, $attr_start, $i-$attr_start);
  273.    
  274.     if ($i >= $len) // Bail if out of bounds
  275.       { notice("OOB0: '$attr_name'"); return NULL; }
  276.      
  277.     // Read past the attribute name
  278.     while ($i < $len && ctype_space($str[$i]))
  279.       { notice("white: '" . $str[$i] . "'"); ++$i; }
  280.     if ($i >= $len) // Bail if out of bounds
  281.       { notice("OOB1: '$attr_name'"); return NULL; }
  282.    
  283.     if ($str[$i] == '=')
  284.     {
  285.       while (++$i < $len && ctype_space($str[$i]));
  286.       if ($str[$i] == '"' || $str[$i] == '\'')
  287.       {
  288.         $val_start = $i + 1;
  289.         $ochar = $str[$i];
  290.         while (++$i < $len && $str[$i] != $ochar)
  291.           if ($str[$i] == '\\') ++$i;
  292.         if ($i >= $len)
  293.           { notice("OOB2"); return NULL; }
  294.         $val = str_replace(
  295.           array("\\\\", "\\\'", "\\\"", "\\r", "\\n", "\\t"),
  296.           array("\\",     "\'",   "\"",   "\r",  "\n", "\t"),
  297.           substr($str, $val_start, $i - $val_start)
  298.         );
  299.         ++$i;
  300.       }
  301.       else {
  302.         $val_start = $i;
  303.         while (++$i < $len && $str[$i] != ']' && !ctype_space($str[$i]));
  304.         if ($i >= $len)
  305.           { notice("OOB3"); return NULL; }
  306.         $val = substr($str, $val_start, $i - $val_start);
  307.       }
  308.     }
  309.     else {
  310.       $val = NULL;
  311.       ++$i;
  312.     }
  313.     $attrs[$attr_name] = $val;
  314.   }
  315.   return $attrs;
  316. }
  317.  
  318. /*!
  319.  * Parse for non-BBCode elements, such as URLs or (God forbid) Markdown.
  320.  * @param $str  HTML-escaped plain text input.
  321.  * @return  Returns HTML-formatted parsed text.
  322. */
  323. function parse_nonbb($str, $contenttype)
  324. {
  325.   if ($contenttype != CT_MARKDOWN)
  326.     return $str;
  327.  
  328.   $urlexp = "/(?i)\b("
  329.     . "(?:https?:\/\/|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)"
  330.     . "(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+"
  331.     . "(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])"
  332.   .")/";
  333.   return preg_replace($urlexp, "<a href=\"$1\">$1</a>", $str);
  334.   return $str;
  335. }
  336.  
  337. /*! Main BBCode parser call.
  338.  * @param  $str  The BBCode string to parse.
  339.  * @return Returns the HTML parsed version of the input.
  340. */
  341. function parse_bbcode($str) {
  342.   $i = 0;
  343.   $res = "";
  344.   while ($i !== false) {
  345.     $res .= parse_sub($str, $i, CT_MARKDOWN, array());
  346.     if ($i !== false)
  347.       $res .= $str[$i++];
  348.   }
  349.   return $res;
  350. }
  351.  
  352. $test = "
  353. [b]Bold[/b]
  354. [assoc one=\"two\" two=\"four\" three=eight four=sixteen]yes[/assoc]
  355. [/code]
  356. www.google.com
  357.  
  358. [b]this should be[i] bold[/b]
  359. [b]this should not be, but this should be: [b] bold[/b]
  360.  
  361. [hr]
  362.  
  363. http://google.com/
  364. www.google.com
  365. ";
  366.  
  367. echo "=================================\nTest:\n=================================\n" . $test . "\n";
  368. echo "=================================\nResult:\n=================================\n";
  369. echo parse_bbcode($test);
  370.  
  371. ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement