Advertisement
JoshDreamland

BB Parser Beta

Jan 28th, 2014
98
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. <?php
  2. /*!
  3.  * @file bbcode.php
  4.  *
  5.  * A BBCode tag is defined in terms of two parameters, a parse type,
  6.  * and an argument count. The parse type, "type", is given by one of
  7.  * the tag type ("TT_") constants.
  8.  *
  9.  * The argument count, "args", is in the range [0,4]. A zero-argument tag
  10.  * does not have an end tag. A one-argument tag has its argument between
  11.  * the start and end tag. A two-argument tag is identical to a one-argument
  12.  * tag with the addition of an =arg1 in the opening tag, e.g.,
  13.  *   [url=arg1]arg0[/url]
  14.  * A three-argument tag is given an additional parameter at tag close.
  15.  * A four-element tag actually has unlimited arguments and operates as
  16.  * an HTML tag. The parameters are passed in an associative array to
  17.  * a callback, which is the only valid type for this count.
  18.  *
  19.  * Tags also contain an "enclosed" flag which denotes what is wrapped
  20.  * inside the tags; this is one of the content type ("CT_") constants.
  21.  *
  22.  * From an implementation perspective, the argument count controls how
  23.  * the tag is decomposed into memory, ie, parsed. The type determines
  24.  * how the tag is evaluated after parse finishes. The argument count is
  25.  * used in evaluation to pass the correct number of parameters, but the
  26.  * type is not considered during parse, at all.
  27.  *
  28.  * Other notes: Tag names may not contain spaces or "=".
  29.  *
  30.  * @section License
  31.  *
  32.  * Copyright (C) 2014 Josh Ventura <josh at dreamland im>
  33.  *
  34.  * This file is a BBCode parser. Permission is hereby granted,
  35.  * free of charge, to any person obtaining a copy of this software
  36.  * and associated documentation files (the "Software"), to deal in
  37.  * the Software without restriction, including without limitation
  38.  * the rights to use, copy, modify, merge, publish, distribute,
  39.  * sublicense, and/or sell copies of the Software, and to permit
  40.  * persons to whom the Software is furnished to do so, subject to
  41.  * the following conditions:
  42.  *
  43.  * The above copyright notice and this permission notice shall be
  44.  * included in all copies or substantial portions of the Software.
  45.  *
  46.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  47.  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  48.  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  49.  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  50.  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  51.  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  52.  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
  53.  * OR OTHER DEALINGS IN THE SOFTWARE.
  54.  *
  55.  * Amen.
  56. */
  57.  
  // Tag type ("TT_") constants: select how a parsed tag is turned into HTML.
  const TT_TRIVIAL    = 0, // The tag is a direct HTML wrapper tag
        TT_INTERLEAVE = 1, // The tag's "html" pieces are interleaved with its arguments
        TT_CALLBACK   = 2; // The tag is evaluated by a callback function

  // Content type ("CT_") constants: describe the text enclosed by a tag.
  const CT_MARKDOWN = 0, // Enclosed text is formatted as non-BB text; this is the default
        CT_RAW      = 1; // Enclosed text is raw data and is not formatted
  64.  
  65. include('tags.php');
  66.  
  /*!
   * Emit a diagnostic message as an HTML comment.
   *
   * Callers pass text taken from the parsed input (tag names, attribute
   * names), so the message is escaped before it is written: markup
   * characters are entity-encoded so the text can never terminate the
   * comment, and runs of dashes are broken up because "--" is not allowed
   * inside a comment.
   * @param $s  The message to report.
  */
  function notice($s) {
    $safe = htmlspecialchars((string) $s, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    // Put a space after every dash that is followed by another dash
    $safe = preg_replace('/-(?=-)/', '- ', $safe);
    echo "<!-- Notice: $safe -->\n";
  }
  68.  
  /*!
   * This is a recursive function; call it with i = 0, and it will
   * call itself recursively until all tags are parsed. It returns the
   * parsed string, with $i set to the position of the first-read closing tag.
   *
   * Text between tags is passed through parse_nonbb(); recognised tags are
   * looked up in the global $bbtags table and rendered by evaluate_tag().
   * When a closing tag is met, the function returns immediately and leaves
   * $i on that tag's '[' so that the caller can decide whose tag it is.
   *
   * @param  $str  The string to parse.
   * @param  $i    [in/out] The position from which to start parsing.
   *               Set at the end of the function to denote the first
   *               unparsed character, or FALSE if all characters have
   *               been consumed.
   * @param  $contenttype  One of the CT_ constants; with CT_RAW, opening
   *               tags are left untouched and only closing tags end the run.
   * @param  $opentags     Names of the tags currently open around this text,
   *               outermost first.
   * @return Returns the HTML parsed substring of the given input.
  */
  function parse_sub($str, &$i, $contenttype, $opentags)
  {
    global $bbtags;
    $outstr = "";
    $bookmark = $i; // Start of the text that has not been copied to $outstr yet
    for ($i = strpos($str, '[', $i); $i !== FALSE; $i = strpos($str, '[', $i))
    {
      $close = strpos($str, ']', $i + 1);
      if ($close === FALSE) {
        $i = FALSE;
        break;
      }

      // Look inside our tag, now
      $stag = substr($str, $i+1, $close - $i - 1);

      // If it's a closing tag, return and let the parent handle that.
      // The isset() keeps an empty tag, "[]", from being indexed.
      if (isset($stag[0]) && $stag[0] == '/')
        return $outstr . parse_nonbb(substr($str, $bookmark, $i - $bookmark), $contenttype);

      if ($contenttype == CT_RAW) {
        ++$i;
        continue;
      }

      // Doppelgänger: another '[' before the ']' means this one was not a tag
      $doppl = strpos($stag, '[');
      if ($doppl !== FALSE) {
        $i += $doppl + 1;
        continue;
      }

      // Make sure we're safe if args=1; we don't know, yet
      $tagc = preg_split('/\s*[\s=]\s*/', $stag, 2);
      if (count($tagc) == 2)
        $arg1 = $tagc[1]; // This technically allows [tag x]y[/tag]
      else
        $arg1 = NULL; // Don't reuse old arg values
      $tname = strtolower($tagc[0]); // Deliberately not trimmed

      // Look up tag
      $tstart = $i;
      $i = $close + 1;
      if (!array_key_exists($tname, $bbtags)) {
        notice("No bbtag called [$tname] ($stag)");
        continue;
      }

      // Tag found
      $bbtag = $bbtags[$tname];
      if ($bbtag['args'] > 0)
      {
        $tlen = strlen($tname) + 2; // Length of "[name" plus the character after it

        // Handle associative tags
        if ($bbtag['args'] > 3)
        {
          $i = $tstart + $tlen;
          if (ctype_space($str[$i-1])) {
            $args = read_attr_list($str, $i);
            if ($args === NULL) { // Bail on failure
              $i = FALSE;
              return $outstr . parse_nonbb(substr($str, $bookmark), $contenttype);
            }
            ++$i; // Step over the ']' that ended the attribute list
          }
          else if ($str[$i-1] == ']')
            $args = array();
          else continue;
        }
        else {
          $args = NULL;
          // A one-argument tag must be closed right after its name
          if ($bbtag['args'] == 1 && $str[$tstart + $tlen - 1] != ']') {
            notice("$str [$tstart+$tlen-1] != ']'");
            continue;
          }
        }

        $arg0 = '';
        $stend = $i; // First character after the opening tag
        for (;;)
        {
          // Parse the tag's content; the nested call stops at the next closing tag
          array_push($opentags, $tname);
          $arg0 .= parse_sub($str, $i, isset($bbtag['content'])? $bbtag['content'] : CT_MARKDOWN, $opentags);
          array_pop($opentags);

          // Make sure we arrived at a closing tag; otherwise the input ran out
          // and the opening tag is emitted as plain text ahead of its content
          if ($i === FALSE)
            return $outstr . parse_nonbb(substr($str, $bookmark, $stend - $bookmark), $contenttype) . $arg0;

          $close = strpos($str, ']', $i);
          $ctag = substr($str, $i, $close - $i);

          // Make sure this is *our* closing tag
          if (!bb_closes_tag($ctag, $tname))
          {
            // If someone else's closing tag, bail; tags should be closed in order
            foreach ($opentags as $k => $otname)
              if (bb_closes_tag($ctag, $otname))
                return $outstr . parse_nonbb(substr($str, $bookmark, $stend - $bookmark), $contenttype) . $arg0;

            // If not anyone's tag, keep its '[' as text and just keep looking
            $arg0 .= $str[$i++];
            continue;
          }

          break;
        }

        // Now we have a little more parsing to do for ternary tags:
        // whatever follows the name in the closing tag is the third argument
        if ($bbtag['args'] == 3) {
          $arg2 = trim(substr($str, $i + $tlen, $close - $i - $tlen));
          if (strlen($arg2) > 0 && $arg2[0] == '=')
            $arg2 = trim(substr($arg2, 1));
        }
        else
          $arg2 = NULL;

        $i = $close + 1;
      }
      else
        $arg0 = $arg1 = $arg2 = $args = NULL;

      $outstr .= parse_nonbb(substr($str, $bookmark, $tstart - $bookmark), $contenttype);
      $outstr .= evaluate_tag($tname, $bbtag, $arg0, $arg1, $arg2, $args);
      $bookmark = $i;
    }
    $outstr .= parse_nonbb(substr($str, $bookmark), $contenttype);
    return $outstr;
  }

  /*!
   * Test whether a closing tag closes the tag with the given name.
   * The name has to be followed by the end of the tag, whitespace, or "=",
   * so that "[/bold" is not taken for the closing tag of "b".
   * @param $ctag   The closing tag text from its '[' up to, but excluding, ']'.
   * @param $tname  The tag name to compare against, case-insensitively.
   * @return Returns TRUE if \p $ctag closes \p $tname, FALSE otherwise.
  */
  function bb_closes_tag($ctag, $tname)
  {
    $plen = strlen($tname) + 2;
    if (strncasecmp($ctag, '[/' . $tname, $plen) != 0)
      return FALSE;
    if (strlen($ctag) <= $plen)
      return TRUE;
    return $ctag[$plen] == '=' || ctype_space($ctag[$plen]);
  }
  211.  
  212.  
  /*! Turn one parsed tag into its replacement text.
   * This function might as well be part of parse_sub(), but that was
   * getting frighteningly big, so it lives here.
   * @note Unused tag arguments should be NULL; \p $arg1 and \p $arg2 should
   *       never be non-null at the same time as \p $args.
   * @param $tname  The name of the tag.
   * @param $bbtag  The tag array from the \c bbtags array; its 'type' and
   *                'args' entries select the branch, and 'html' or 'func'
   *                is read for interleave and callback tags respectively.
   * @param $arg0   The first argument, the text between the tags, if applicable.
   * @param $arg1   The second argument, the = value in the opening tag, if applicable.
   * @param $arg2   The third argument, the = value in the closing tag, if applicable.
   * @param $args   An associative array of arguments, if applicable.
   * @return Returns the result of evaluating the tag;
   *         a string with which to replace the tag. A tag whose type is
   *         none of the TT_ constants evaluates to the empty string.
  */
  function evaluate_tag($tname, $bbtag, $arg0, $arg1, $arg2, $args) {
    switch ($bbtag['type'])
    {
      case TT_TRIVIAL: switch ($bbtag['args']) {
        case 0:  return "<$tname />";
        case 1:  return "<$tname>$arg0</$tname>";
        case 2:  return "<!-- This tag is invalid -->";
        case 3:  return "<!-- This tag is invalid -->";
        default: return "<!-- This tag would be a security risk -->";
      }
      case TT_INTERLEAVE: $h = $bbtag['html']; switch ($bbtag['args']) {
        case 0:  return $h[0];
        case 1:  return $h[0] . $arg0 . $h[1];
        case 2:  return $h[0] . $arg1 . $h[1] . $arg0 . $h[2];
        case 3:  return $h[0] . $arg1 . $h[1] . $arg0 . $h[2] . $arg2 . $h[3];
        default: return "<!-- This tag cannot be automated -->";
      }
      case TT_CALLBACK: $f = $bbtag['func']; switch ($bbtag['args']) {
        case 0:  return $f();
        case 1:  return $f($arg0);
        case 2:  return $f($arg0, $arg1);
        case 3:  return $f($arg0, $arg1, $arg2);
        default: return $f($arg0, $args);
      }
    }
    // Unknown tag type: honour the string return contract instead of
    // falling off the end of the function with NULL.
    return '';
  }
  252.  
  253.  
  /*! Parse the attribute list of an associative tag.
   *
   * Attributes are separated by whitespace and take one of three forms:
   * a bare name, name=value (value ends at whitespace or ']'), or
   * name="value" / name='value', where the quoted value may contain the
   * escapes \\, \', \", \r, \n and \t.
   *
   * @param $str The string from which to read attributes.
   * @param $i   [in/out] The position from which to start reading.
   *             Set at end of function call to denote the end of the list;
   *             on success this is the position of the closing ']'.
   * @return  Returns the associative array read in from the string; a bare
   *          name maps to NULL. Returns NULL if the string ends before the
   *          list is closed by ']'.
  */
  function read_attr_list($str, &$i) {
    // One-pass escape table: strtr() never rescans text it has already
    // replaced, so an escaped backslash followed by "n" stays a backslash
    // and an "n" instead of turning into a newline.
    $escapes = array(
      '\\\\' => '\\',
      "\\'"  => "'",
      '\\"'  => '"',
      '\\r'  => "\r",
      '\\n'  => "\n",
      '\\t'  => "\t",
    );

    $len = strlen($str);
    $attrs = array();
    while ($i < $len && $str[$i] != ']')
    {
      if (ctype_space($str[$i])) {
        ++$i;
        continue;
      }

      // Read an attribute name; it ends at '=', whitespace, or the closing ']'
      $attr_start = $i;
      while (++$i < $len && $str[$i] != '=' && $str[$i] != ']' && !ctype_space($str[$i]));
      $attr_name = substr($str, $attr_start, $i-$attr_start);

      if ($i >= $len) // Bail if out of bounds
        { notice("OOB0: '$attr_name'"); return NULL; }

      // Read past the whitespace after the attribute name
      while ($i < $len && ctype_space($str[$i]))
        { notice("white: '" . $str[$i] . "'"); ++$i; }
      if ($i >= $len) // Bail if out of bounds
        { notice("OOB1: '$attr_name'"); return NULL; }

      if ($str[$i] != '=') {
        // Bare name. $i already sits on the next attribute or on ']',
        // so it must not be advanced here.
        $attrs[$attr_name] = NULL;
        continue;
      }

      // Skip the '=' and any whitespace before the value
      while (++$i < $len && ctype_space($str[$i]));
      if ($i >= $len)
        { notice("OOB3"); return NULL; }

      if ($str[$i] == '"' || $str[$i] == '\'')
      {
        $val_start = $i + 1;
        $ochar = $str[$i];
        // Find the matching quote, stepping over backslash-escaped characters
        while (++$i < $len && $str[$i] != $ochar)
          if ($str[$i] == '\\') ++$i;
        if ($i >= $len)
          { notice("OOB2"); return NULL; }
        $val = strtr(substr($str, $val_start, $i - $val_start), $escapes);
        ++$i; // Step over the closing quote
      }
      else {
        // Unquoted value; may be empty, as in "name=]"
        $val_start = $i;
        while ($i < $len && $str[$i] != ']' && !ctype_space($str[$i]))
          ++$i;
        if ($i >= $len)
          { notice("OOB3"); return NULL; }
        $val = substr($str, $val_start, $i - $val_start);
      }
      $attrs[$attr_name] = $val;
    }

    // The list has to be closed by ']'; running off the end is a failure
    if ($i >= $len)
      { notice("OOB4"); return NULL; }
    return $attrs;
  }
  318.  
  /*!
   * Parse for non-BBCode elements, such as URLs or (God forbid) Markdown.
   * At present this only turns URLs into links. A URL written without a
   * scheme ("www.example.com") is linked with "http://" prepended, since a
   * bare host in an href would be resolved relative to the current page.
   * @param $str  HTML-escaped plain text input.
   * @param $contenttype  One of the CT_ constants; anything other than
   *              CT_MARKDOWN is returned untouched.
   * @return  Returns HTML-formatted parsed text. If the text cannot be
   *          matched (for instance, it is not valid UTF-8), it is returned
   *          unchanged.
  */
  function parse_nonbb($str, $contenttype)
  {
    if ($contenttype != CT_MARKDOWN)
      return $str;

    // The pattern contains multi-byte punctuation inside a character class,
    // so it has to be matched in UTF-8 mode (/u); in byte mode a match
    // could end in the middle of a multi-byte character.
    $urlexp = "/(?i)\b("
      . "(?:https?:\/\/|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)"
      . "(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+"
      . "(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])"
    .")/u";

    $linked = preg_replace_callback($urlexp, function ($m) {
      $url = $m[1];
      $href = preg_match('/^https?:\/\//i', $url) ? $url : 'http://' . $url;
      // The input is expected to be escaped already; this only makes sure
      // a stray double quote cannot end the attribute.
      $href = str_replace('"', '&quot;', $href);
      return "<a href=\"$href\">$url</a>";
    }, $str);

    // preg_replace_callback() yields NULL on failure; keep the text then
    return $linked === NULL ? $str : $linked;
  }
  337.  
  /*! Main BBCode parser call.
   * Runs parse_sub() over the input until it reports that everything has
   * been consumed. Whenever parse_sub() stops early (it does so on a closing
   * tag nobody opened), the character it stopped on is copied through as
   * plain text and parsing resumes right after it.
   * @param  $str  The BBCode string to parse.
   * @return Returns the HTML parsed version of the input.
  */
  function parse_bbcode($str) {
    $html = "";
    for ($pos = 0; $pos !== false; ) {
      $html .= parse_sub($str, $pos, CT_MARKDOWN, array());
      if ($pos === false)
        break;
      $html .= $str[$pos++];
    }
    return $html;
  }
  352.  
  // Built-in smoke test: a sample covering simple, associative, stray,
  // mis-nested and bracketed tags as well as bare URLs. It is printed as-is
  // and then once more after being run through the parser.
  $test = "
[b]Bold[/b]
[assoc one=\"two\" two=\"four\" three=eight four=sixteen]yes[/assoc]
[/code]
www.google.com

[b]this should be[i] bold[/b]
[b]this should not be, but this should be: [b] bold[/b]

[hr]

[ [b]bold[/b] ]
[ [b]bold[/b] ]

http://google.com/
www.google.com [
";

  // Input section
  echo "=================================\n", "Test:\n", "=================================\n", $test, "\n";

  // Output section; the banner is written before parsing starts, so any
  // notices the parser emits appear underneath it
  echo "=================================\n", "Result:\n", "=================================\n";
  echo parse_bbcode($test);
  374.  
  375. ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement