Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- <?
if (!function_exists('mb_substr')) {
    /**
     * Byte-wise fallback for mb_substr(), used only when the mbstring
     * extension is not loaded.
     *
     * The signature mirrors the real function so that calls passing a length
     * and/or an encoding keep working. The encoding is accepted but ignored:
     * the fallback always operates on bytes, so multi-byte input is NOT
     * handled correctly.
     *
     * @param string      $s        Input string.
     * @param int         $n        Start offset (in bytes).
     * @param int|null    $i        Maximum length (in bytes); null means "to the end".
     * @param string|null $encoding Ignored.
     * @return string Always a string; '' when the offset is out of range.
     */
    function mb_substr($s, $n, $i = null, $encoding = null)
    {
        // substr() treats an explicit null length as 0 before PHP 8, so the
        // "no length" case has to be a separate call.
        if ($i === null) {
            return (string) substr($s, $n);
        }

        // Cast because substr() returns false (not '') for out-of-range
        // offsets before PHP 8, whereas mb_substr() returns ''.
        return (string) substr($s, $n, $i);
    }
}
if (!function_exists('mb_strlen')) {
    /**
     * Byte-wise fallback for mb_strlen(), used only when the mbstring
     * extension is not loaded.
     *
     * The encoding argument is accepted for signature compatibility but
     * ignored: the result is the length in bytes, not in characters.
     *
     * @param string      $s        Input string.
     * @param string|null $encoding Ignored.
     * @return int Length of $s in bytes.
     */
    function mb_strlen($s, $encoding = null)
    {
        return strlen($s);
    }
}
/**
 * Forgiving, single-pass HTML tokenizer.
 *
 * check() walks the input character by character and re-emits it through
 * four overridable hooks: text(), comment(), open_tag() and close_tag().
 * The base class reproduces the markup; subclasses override the hooks to
 * filter or validate it. Problems found while parsing are appended to an
 * error log that is available through getError().
 */
class HtmlChecker
{
    /** When true, isEnd() reports "end of input" as soon as an error is logged. */
    const STOP_ON_ERROR = false;

    /** @var string Markup being parsed. */
    private $input = '';

    /** @var int Length of $input in characters, cached by check(). */
    private $length = 0;

    /** @var int Current read position (character offset into $input). */
    protected $index = 0;

    /** @var string Markup produced so far by the current check() run. */
    protected $output = '';

    /** @var string|null Newline-separated error messages, or null if none. */
    protected $error = null;

    /** @var string Encoding name handed to the mb_* functions. */
    protected $source_encode = 'utf-8';

    /**
     * @param string $encode Encoding of the markup that will be checked.
     */
    public function __construct($encode = 'utf-8')
    {
        $this->source_encode = $encode;
        $this->error = null;
    }

    /**
     * Hook: renders a run of text found between tags.
     *
     * @param string $text_tocken
     * @return string
     */
    protected function text($text_tocken)
    {
        return $text_tocken;
    }

    /**
     * Hook: renders a comment.
     *
     * @param string $text Comment body without the surrounding markers.
     * @return string
     */
    protected function comment($text)
    {
        return '<!--' . $text . '-->';
    }

    /**
     * Hook: renders a closing tag.
     *
     * @param string $tag_name
     * @return string
     */
    protected function close_tag($tag_name)
    {
        // trim()'s second argument is a character list, not an encoding, so
        // it must not receive $source_encode (that would eat tag letters).
        return '</' . trim($tag_name) . '>';
    }

    /**
     * Hook: renders an opening tag.
     *
     * @param string $tag_name
     * @param array  $attributes List of [0 => name, 1 => value (null for a
     *                           flag attribute), 2 => quote character used].
     * @param bool   $single_tag Whether the tag is self-closing.
     * @return string
     */
    protected function open_tag($tag_name, $attributes, $single_tag)
    {
        $ans = '<' . trim($tag_name);
        foreach ($attributes as $a) {
            $ans .= ' ' . $this->formatAttribute($a);
        }
        $ans .= ($single_tag ? '/' : '') . '>';

        return $ans;
    }

    /**
     * Renders one attribute triple as `name` or `name=<q>value<q>`.
     *
     * The delimiting quote character is entity-encoded inside the value so
     * that the value cannot terminate the attribute early.
     *
     * @param array $a [name, value|null, quote]
     * @return string
     */
    private function formatAttribute($a)
    {
        $name = trim($a[0]);
        if (is_null($a[1])) {
            return $name;
        }

        // Only the two HTML quote characters are honoured; anything else
        // (including "no quote") falls back to a double quote.
        $quote = (!empty($a[2]) && $a[2] === "'") ? "'" : '"';
        $entity = ($quote === "'") ? '&#39;' : '&quot;';

        return $name . '=' . $quote . str_replace($quote, $entity, $a[1]) . $quote;
    }

    /**
     * Returns the errors logged by the last check() run.
     *
     * @return string|null Newline-separated messages, or null if none.
     */
    public function getError()
    {
        return $this->error;
    }

    /**
     * Appends a message to the error log together with the current offset
     * and a short snippet of the input around it.
     *
     * @param string $mes
     * @return void
     */
    protected function setError($mes)
    {
        $sub_length = 20;
        // Clamp at 0: a negative start would make mb_substr() count from the
        // end of the input and show an unrelated snippet.
        $start = max(0, $this->index - (int) ($sub_length / 2));

        $this->error .= "\n" . $mes . ' on char #' . $this->index . ' snip: '
            . mb_substr($this->input, $start, $sub_length, $this->source_encode);
    }

    /**
     * Tells whether parsing should stop: the input is exhausted, or
     * STOP_ON_ERROR is enabled and an error has been logged.
     *
     * @return bool
     */
    protected function isEnd()
    {
        if (self::STOP_ON_ERROR && !empty($this->error)) {
            return true;
        }

        return $this->index >= $this->length;
    }

    /**
     * Parses a string and returns the re-emitted markup.
     *
     * Every call starts from a clean state (position, error log and output).
     *
     * @param string|null $s Markup to parse; null re-parses the input of the
     *                       previous call.
     * @return string
     */
    public function check($s = null)
    {
        if ($s !== null) {
            $this->input = (string) $s;
        }
        $this->length = mb_strlen($this->input, $this->source_encode);
        $this->index = 0;
        $this->error = null;
        $this->output = '';

        $this->html('root');

        return $this->output;
    }

    /**
     * Returns the character at the current position plus $seek.
     *
     * @param int $seek Offset relative to the current position.
     * @return string One character, or '' when the position is out of range.
     */
    protected function getChar($seek = 0)
    {
        $pos = $this->index + $seek;
        if ($pos < 0) {
            return '';
        }

        return (string) mb_substr($this->input, $pos, 1, $this->source_encode);
    }

    /**
     * Main loop: alternates between a run of text and one tag or comment
     * until the input is consumed.
     *
     * Implemented iteratively so the call depth does not grow with the
     * number of tags in the document.
     *
     * @param string $root_tag Kept for signature compatibility; not used.
     * @return void
     */
    protected function html($root_tag)
    {
        while (!$this->isEnd()) {
            $text_tocken = '';
            while (!$this->isEnd() && $this->getChar() !== '<') {
                $char = $this->getChar();
                // A '>' outside of a tag is stray markup and is dropped.
                if ($char !== '>') {
                    $text_tocken .= $char;
                }
                $this->index++;
            }
            $this->output .= $this->text($text_tocken);

            if ($this->getChar() !== '<') {
                break;
            }
            $this->index++; // consume '<'

            if ($this->getChar() === '!' && $this->getChar(1) === '-' && $this->getChar(2) === '-') {
                $this->parseComment();
            } else {
                $this->parseTag();
            }
        }
    }

    /**
     * Parses a comment. On entry the position is on the '!' of "<!--".
     *
     * @return void
     */
    private function parseComment()
    {
        $this->index += 3; // skip "!--"

        $comment = '';
        while (!$this->isEnd()
            && !($this->getChar() === '-' && $this->getChar(1) === '-' && $this->getChar(2) === '>')
        ) {
            $comment .= $this->getChar();
            $this->index++;
        }

        if ($this->isEnd()) {
            // No "-->" found: everything up to the end was taken as comment.
            $this->setError('unterminated comment');
        } else {
            $this->index += 3; // skip "-->"
        }

        $this->output .= $this->comment($comment);
    }

    /**
     * Parses an opening or closing tag. On entry the position is on the
     * first character after '<'.
     *
     * @return void
     */
    private function parseTag()
    {
        $tag_close = false;
        if ($this->getChar() === '/') {
            $this->index++;
            $tag_close = true;
        }

        $tag_name = '';
        while (!$this->isEnd()) {
            $char = $this->getChar();
            if ($char === '>' || $char === '/' || $this->isSpace($char)) {
                break;
            }
            $tag_name .= $char;
            $this->index++;
        }

        $single_tag = false;
        $attributes = array();
        // Closing tags carry no attributes.
        while (!$tag_close && !$this->isEnd()) {
            $this->skipSpaces();
            if ($this->isEnd()) {
                break;
            }

            $char = $this->getChar();
            if ($char === '/') {
                $this->index++;
                $single_tag = true;
                break;
            }
            if ($char === '>') {
                break;
            }

            $attr = $this->parse_attr();
            // parse_attr() as implemented in this class always returns a
            // triple; the guard covers overriding implementations.
            if (empty($attr)) {
                $this->setError('incorrect attribute list in tag "' . $tag_name . '"');
                break;
            }
            $attributes[] = $attr;
        }

        if ($this->getChar() === '>') {
            $this->index++;
        } else {
            $this->setError('incorrect close tag "' . $tag_name . '" symbol "' . $this->getChar() . '"');
        }

        if ($tag_close) {
            $this->output .= $this->close_tag($tag_name);
        } else {
            $this->output .= $this->open_tag($tag_name, $attributes, $single_tag);
        }
    }

    /**
     * Parses one attribute starting at the current position.
     *
     * Quoted values may contain the quote character when it is preceded by
     * a backslash. Unquoted values are expected to consist of digits only;
     * anything else is still collected but logged as an error.
     *
     * @return array [0 => name, 1 => value or null for a flag attribute,
     *               2 => quote character, '' when the value was unquoted]
     */
    protected function parse_attr()
    {
        $attr_name = '';
        while (!$this->isEnd()) {
            $char = $this->getChar();
            if ($char === '>' || $char === '/' || $char === '=' || $this->isSpace($char)) {
                break;
            }
            $attr_name .= $char;
            $this->index++;
        }

        $this->skipSpaces();
        if ($this->isEnd() || $this->getChar() !== '=') {
            return array($attr_name, null, '');
        }
        $this->index++; // consume '='
        $this->skipSpaces();

        $attr_value = '';
        $bsymbol = '';
        $char = $this->getChar();

        // "name=" directly followed by the end of the tag: empty value.
        if ($char === '>' || $char === '/') {
            return array($attr_name, $attr_value, $bsymbol);
        }

        if ($char === '"' || $char === "'") {
            $bsymbol = $char;
            $this->index++;
            while (!$this->isEnd()) {
                if ($this->getChar() === $bsymbol && $this->getChar(-1) !== '\\') {
                    break;
                }
                $attr_value .= $this->getChar();
                $this->index++;
            }
            if ($this->getChar() !== $bsymbol) {
                $this->setError('incorrect attribute value end (attr name="' . $attr_name . '")');
            }
            $this->index++; // consume the closing quote
        } else {
            $digits_only = true;
            while (!$this->isEnd()) {
                $char = $this->getChar();
                if ($char === '>' || $char === '/' || $this->isSpace($char)) {
                    break;
                }
                if (strlen($char) !== 1 || strpos('0123456789', $char) === false) {
                    $digits_only = false;
                }
                $attr_value .= $char;
                $this->index++;
            }
            if (!$digits_only) {
                $this->setError('incorrect attribute value (attr name="' . $attr_name . '")');
            }
        }

        return array($attr_name, $attr_value, $bsymbol);
    }

    /**
     * Tells whether $char is an HTML whitespace character.
     *
     * @param string $char
     * @return bool
     */
    private function isSpace($char)
    {
        return $char === ' ' || $char === "\t" || $char === "\n" || $char === "\r" || $char === "\f";
    }

    /**
     * Advances the position past any whitespace.
     *
     * @return void
     */
    private function skipSpaces()
    {
        while (!$this->isEnd() && $this->isSpace($this->getChar())) {
            $this->index++;
        }
    }
}
/**
 * Whitelist-based HTML filter built on top of HtmlChecker.
 *
 * The whitelist is given to the constructor as a specification string:
 *
 *   ,         separates elements
 *   [ ... ]   encloses the attribute list of an element
 *   |         separates attributes inside the list
 *   name      the attribute is allowed with any value, or with none
 *   name=     the attribute is allowed only with a non-empty value
 *   @[ ... ]  attributes allowed on every listed element
 *   comments  meta element: comments are kept only when it is listed
 *
 * An element name may be prefixed with "#" or "-"; the prefix is accepted
 * but has no effect. Tags and attributes that are not whitelisted are
 * dropped from the output and reported through getError().
 *
 * EXAMPLE:
 *   $parser = new SimpleHtmlChecker('utf-8', '@[id=|class=|style=|title=],
 *       a[name=|href|target=],
 *       #p,br,strong,b,em,i,u,sub,sup,blockquote,
 *       ol[type=|compact],ul[type=|compact],li,
 *       img[src=|border=|alt=|hspace=|vspace=|width=|height=|align=],
 *       div,span,code,pre,hr[size=|noshade],
 *       font[face=|size=|color=]');
 *   echo $parser->check($html);
 */
class SimpleHtmlChecker extends HtmlChecker
{
    /** @var array Map of element name => list of allowed attribute tokens. */
    private $elements;

    /**
     * @param string $encode Encoding of the markup that will be checked.
     * @param string $s      Whitelist specification (see the class comment).
     *                       An empty specification denies every tag.
     */
    public function __construct($encode = 'utf-8', $s = '')
    {
        parent::__construct($encode);

        $this->elements = array();
        $global_attr = array();

        $tokens = explode(',', str_replace(array("\n", "\r"), '', $s));
        foreach ($tokens as $token) {
            $token = trim($token);
            if ($token === '') {
                // Empty specification or a stray/trailing comma.
                continue;
            }

            if (preg_match('~^@\[([^\]]*)\]$~i', $token, $regs)) {
                $global_attr = array_merge($global_attr, $this->splitAttributes($regs[1]));
            } elseif (preg_match('~^(-|#)?([a-z][a-z0-9]*)(\[([^\]]*)\])?$~i', $token, $regs)) {
                // Group 4 is absent when the element has no attribute list.
                $this->elements[$regs[2]] = isset($regs[4])
                    ? $this->splitAttributes($regs[4])
                    : array();
            } else {
                trigger_error($token . ' incorrect tocken');
            }
        }

        foreach ($this->elements as $tag => $attrs) {
            $this->elements[$tag] = array_values(array_unique(array_merge($attrs, $global_attr)));
        }
    }

    /**
     * Splits a "a|b=|c" attribute list into its non-empty, trimmed tokens.
     *
     * @param string $list
     * @return array
     */
    private function splitAttributes($list)
    {
        $result = array();
        foreach (explode('|', $list) as $attr) {
            $attr = trim($attr);
            if ($attr !== '') {
                $result[] = $attr;
            }
        }

        return $result;
    }

    /**
     * Normalises a run of text: line breaks become spaces and runs of
     * spaces are collapsed into a single space.
     *
     * @param string $text_tocken
     * @return string
     */
    protected function text($text_tocken)
    {
        $text_tocken = str_replace(array("\n", "\r"), ' ', $text_tocken);

        return preg_replace('~ {2,}~', ' ', $text_tocken);
    }

    /**
     * Keeps a comment only when the "comments" meta element is whitelisted.
     *
     * @param string $text
     * @return string
     */
    protected function comment($text)
    {
        if (isset($this->elements['comments'])) {
            return '<!--' . $text . '-->';
        }

        return '';
    }

    /**
     * Renders a closing tag if the element is whitelisted; otherwise logs an
     * error and renders nothing.
     *
     * @param string $tag_name
     * @return string
     */
    protected function close_tag($tag_name)
    {
        // trim()'s second argument is a character list, not an encoding.
        $tag_name = trim($tag_name);
        if (!isset($this->elements[$tag_name])) {
            $this->setError('deny close tag "' . $tag_name . '"');

            return '';
        }

        return '</' . $tag_name . '>';
    }

    /**
     * Renders an opening tag if the element is whitelisted, keeping only the
     * whitelisted attributes. Everything that is dropped is logged.
     *
     * @param string $tag_name
     * @param array  $attributes List of [0 => name, 1 => value (null for a
     *                           flag attribute), 2 => quote character used].
     * @param bool   $single_tag Whether the tag is self-closing.
     * @return string
     */
    protected function open_tag($tag_name, $attributes, $single_tag)
    {
        $tag_name = trim($tag_name);
        if (!isset($this->elements[$tag_name])) {
            $this->setError('deny open tag <' . $tag_name . '>');

            return '';
        }

        $allowed = $this->elements[$tag_name];
        $ans = '<' . $tag_name;
        foreach ($attributes as $a) {
            // Compare explicitly: empty() would also reject the value "0".
            $has_value = !is_null($a[1]) && $a[1] !== '';
            if (in_array($a[0], $allowed, true)
                || ($has_value && in_array($a[0] . '=', $allowed, true))
            ) {
                $ans .= ' ' . $this->renderAttribute($a);
            } else {
                $this->setError('deny arttibute "' . $a[0] . '" for tag <' . $tag_name . '>');
            }
        }
        $ans .= ($single_tag ? '/' : '') . '>';

        return $ans;
    }

    /**
     * Renders one attribute triple as `name` or `name=<q>value<q>`.
     *
     * The delimiting quote character is entity-encoded inside the value;
     * otherwise a value containing it could close the attribute early and
     * smuggle non-whitelisted attributes into the output.
     *
     * @param array $a [name, value|null, quote]
     * @return string
     */
    private function renderAttribute($a)
    {
        $name = trim($a[0]);
        if (is_null($a[1])) {
            return $name;
        }

        $quote = (!empty($a[2]) && $a[2] === "'") ? "'" : '"';
        $entity = ($quote === "'") ? '&#39;' : '&quot;';

        return $name . '=' . $quote . str_replace($quote, $entity, $a[1]) . $quote;
    }
}
- /**
- EXAMPLE:
- $input = '<div /> > <div < HTML bags <p title="<div class=\"tips\"> some html code</div>"> test text <ol compact>
- <!-- pagebreak --> <bee><script>alert("I am hack you!");</script>
- <li> li text <br /></li></ol></p>
- <p onclick="alert(\"<br />!!!\")">text 2 <br incorrect attrib="some wrong">
- <img empty src="1x1.gif" width= 60px height = \'20 px\' style="border: 1px solid red">
- </p>';
- echo "BEFORE:\n $input\n\n AFTER:\n";
- $parser = new SimpleHtmlChecker('cp1251', '@[id=|class=|style=|title=],
- comments,
- a[name=|href|target=],
- #p,br,strong,b,em,i,u,sub,sup,blockquote,
- ol[type=|compact],ul[type=|compact],li,
- img[src=|border=|alt=|hspace=|vspace=|width=|height=|align=],
- div,span,code,pre,hr[size=|noshade],
- font[face=|size=|color=]'
- );
- echo $parser->check($input);
- echo "\n".$parser->getError();
- */
- ?>
Add Comment
Please, Sign In to add comment