Guest User

Untitled

a guest
Apr 20th, 2018
91
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 10.91 KB | None | 0 0
  1. <?
  if (!function_exists('mb_substr')){
      /**
       * Byte-wise fallback used only when the mbstring extension is not loaded.
       *
       * Accepts the same argument list as the real mb_substr() so that callers
       * passing a length and an encoding keep working; the encoding is ignored
       * because substr() operates on bytes.
       *
       * @param string      $s        source string
       * @param int         $n        start offset
       * @param int|null    $i        maximum length, null = up to the end of the string
       * @param string|null $encoding ignored
       * @return string
       */
      function mb_substr($s, $n, $i = null, $encoding = null){
          // Older PHP versions treat an explicit null length as 0, so omit it instead.
          if ($i === null){
              return (string)substr($s, $n);
          }
          // The cast turns the "false" returned by old PHP versions for
          // out-of-range offsets into '', which is what mb_substr() returns.
          return (string)substr($s, $n, $i);
      }
  }
  if (!function_exists('mb_strlen')){
      /**
       * Byte-wise fallback used only when the mbstring extension is not loaded.
       *
       * @param string      $s        string to measure
       * @param string|null $encoding ignored, present so calls that pass an encoding match the signature
       * @return int length of $s in bytes
       */
      function mb_strlen($s, $encoding = null){
          return strlen($s);
      }
  }
  /**
   * Character-by-character HTML tokenizer.
   *
   * check() walks the input and rebuilds the markup by passing every token
   * through an overridable hook: text(), comment(), open_tag() and close_tag().
   * The base class re-emits everything it recognises; subclasses override the
   * hooks to filter or validate. Problems found while parsing are collected
   * and can be read with getError().
   */
  class HtmlChecker{
      /** When true, isEnd() reports the end of input as soon as an error has been recorded. */
      const STOP_ON_ERROR=false;
      /** Text being parsed. */
      private $input='';
      /** Length of $input in characters, computed once per check() call. */
      private $length=0;
      /** Markup produced so far by the current check() call. */
      protected $output='';
      /** Accumulated error messages, or null when there are none. */
      protected $error = null;
      /** Encoding name handed to the mb_* functions. */
      protected $source_encode = 'utf-8';
      /**
       * Current read position in $input, in characters.
       * Declared public because it used to be created implicitly as a dynamic
       * (and therefore public) property.
       */
      public $index = 0;

      /**
       * @param string $encode encoding of the strings that will be parsed
       */
      function __construct($encode='utf-8'){
          $this->source_encode = $encode;
          $this->error = null;
      }

      /**
       * Hook: renders a run of text found between tags.
       *
       * @param string $text_tocken
       * @return string
       */
      protected function text($text_tocken){
          return $text_tocken;
      }

      /**
       * Hook: renders a comment.
       *
       * @param string $text comment body without the surrounding markers
       * @return string
       */
      protected function comment($text){
          return '<!--'.$text.'-->';
      }

      /**
       * Hook: renders a closing tag.
       *
       * @param string $tag_name
       * @return string
       */
      protected function close_tag($tag_name){
          // trim()'s second argument is a character list, not an encoding:
          // passing the encoding name there stripped letters such as
          // "u", "t" and "f" from the ends of tag names.
          return '</'.trim($tag_name).'>';
      }

      /**
       * Hook: renders an opening tag.
       *
       * @param string $tag_name
       * @param array $attributes list of [0 => attribute name, 1 => value (null = flag attribute), 2 => quote character used in the source]
       * @param bool $single_tag whether the tag is self-closing
       * @return string
       */
      protected function open_tag($tag_name, $attributes, $single_tag){
          $ans = '<'.trim($tag_name);
          foreach ($attributes as $a){
              $ans .= $this->format_attribute($a);
          }
          $ans.=($single_tag?'/':'').'>';
          return $ans;
      }

      /**
       * Renders one parsed attribute, including its leading space.
       *
       * @param array $a [name, value or null, quote character]
       * @return string
       */
      protected function format_attribute($a){
          $name = trim($a[0]);
          if (is_null($a[1])){
              return ' '.$name;
          }
          // Values that were not quoted in the source are emitted in double quotes.
          $quote = empty($a[2]) ? '"' : $a[2];
          return ' '.$name.'='.$quote.$this->escape_attr_value($a[1], $quote).$quote;
      }

      /**
       * Replaces the delimiter quote inside an attribute value by its HTML entity.
       *
       * parse_attr() accepts a backslash-escaped quote inside a quoted value,
       * and unquoted values may contain quotes as well. Emitted verbatim, such a
       * quote would end the attribute early and let the rest of the value be
       * read as further attributes.
       *
       * @param string $value
       * @param string $quote quote character the value will be wrapped in
       * @return string
       */
      protected function escape_attr_value($value, $quote){
          if ($quote === '"'){
              $entity = '&quot;';
          }elseif ($quote === '\''){
              $entity = '&#39;';
          }else{
              return $value;
          }
          // The backslash-escaped form is replaced first so the backslash goes away with it.
          return str_replace(array('\\'.$quote, $quote), $entity, $value);
      }

      /**
       * Returns the error messages collected by the last check() call.
       *
       * @return string|null messages separated by newlines, null when there were none
       */
      public function getError(){
          return $this->error;
      }

      /*
       * Internal machinery below.
       */

      /**
       * Appends a message together with the current position and a short
       * excerpt of the input around it.
       *
       * @param string $mes
       */
      protected function setError($mes){
          $sub_lenght=20;
          // A negative start would make mb_substr() count from the end of the string.
          $start = max(0, $this->index - $sub_lenght/2);
          $this->error .= "\n".$mes.' on char #'.$this->index.' snip: '.
              mb_substr($this->input, $start, $sub_lenght, $this->source_encode);
      }

      /**
       * @return bool true when the input is exhausted, or when STOP_ON_ERROR is set and an error was recorded
       */
      protected function isEnd(){
          if (self::STOP_ON_ERROR and !empty($this->error)){
              return true;
          }
          return $this->index >= $this->length;
      }

      /**
       * Parses a string and returns the rebuilt markup.
       *
       * @param string|null $s text to parse; null re-parses the previously supplied text
       * @return string
       */
      public function check($s=null){
          // Compare against null rather than using empty(), which also rejects "0".
          if ($s !== null)
              $this->input = (string)$s;
          $this->length = mb_strlen($this->input, $this->source_encode);
          $this->index = 0;
          $this->error = null;
          // Start from a clean buffer so repeated calls do not concatenate results.
          $this->output = '';
          $this->html('root');
          return $this->output;
      }

      /**
       * Returns the character at the current position plus $seek.
       *
       * @param int $seek offset relative to the current position
       * @return string one character, '' past the end of the input
       */
      protected function getChar($seek=0){
          return mb_substr($this->input, $this->index+$seek, 1, $this->source_encode);
      }

      /**
       * @param string $c
       * @return bool whether $c separates the parts of a tag
       */
      protected function is_space($c){
          return $c === ' ' || $c === "\t" || $c === "\n" || $c === "\r";
      }

      /**
       * Moves the read position past any whitespace.
       */
      protected function skip_spaces(){
          while(!$this->isEnd() and $this->is_space($this->getChar())){
              $this->index++;
          }
      }

      /**
       * Main loop: alternates between a text run and the tag or comment that follows it.
       *
       * Written as a loop because recursing once per token made the call
       * depth grow with the size of the document.
       *
       * @param string $root_tag not used; kept so existing calls keep working
       */
      protected function html($root_tag){
          while(!$this->isEnd()){
              $text_tocken = '';
              while(!$this->isEnd() and $this->getChar() !== '<'){
                  // A stray ">" in text is dropped.
                  if ($this->getChar() !== '>')
                      $text_tocken.=$this->getChar();
                  $this->index++;
              }
              $this->output .= $this->text($text_tocken);
              if ($this->getChar() !== '<'){
                  // The text run stopped at the end of the input; the loop condition ends the parse.
                  continue;
              }
              $this->index++;
              if ($this->getChar()==='!' and $this->getChar(1)==='-' and $this->getChar(2)==='-'){
                  $this->parse_comment();
              }else{
                  $this->parse_tag();
              }
          }
      }

      /**
       * Parses a comment; the read position is on the "!" of "<!--".
       */
      protected function parse_comment(){
          $this->index+=3;
          $comment = '';
          while(!$this->isEnd() and !$this->at_comment_end()){
              $comment.=$this->getChar();
              $this->index++;
          }
          if ($this->at_comment_end()){
              $this->index+=3;
          }else{
              $this->setError('unterminated comment');
          }
          $this->output.=$this->comment($comment);
      }

      /**
       * @return bool whether the next three characters are "-->"
       */
      protected function at_comment_end(){
          return $this->getChar()==='-' and $this->getChar(1)==='-' and $this->getChar(2)==='>';
      }

      /**
       * Parses an opening or closing tag; the read position is just after "<".
       */
      protected function parse_tag(){
          $tag_close = ($this->getChar()==='/');
          if ($tag_close){
              $this->index++;
          }
          $single_tag=false;
          $tag_name = '';
          while(!$this->isEnd() and !$this->is_space($this->getChar()) and !in_array($this->getChar(), array('>', '/'), true)){
              $tag_name.=$this->getChar();
              $this->index++;
          }
          $attributes = array();
          // Closing tags carry no attributes.
          while(!$tag_close and !$this->isEnd()){
              $this->skip_spaces();
              if ($this->getChar()==='/'){
                  $this->index++;
                  $single_tag=true;
                  break;
              }
              if ($this->getChar()==='>'){
                  break;
              }
              $attr = $this->parse_attr();
              if (empty($attr)){
                  $this->setError('incorrect attribute list in tag "'.$tag_name.'"');
                  break;
              }
              $attributes[] = $attr;
          }
          // Allows whitespace before ">" as in "</div >".
          $this->skip_spaces();
          if ($this->getChar()==='>'){
              $this->index++;
          }else{
              $this->setError('incorrect close tag "'.$tag_name.'" symbol "'.$this->getChar().'"');
          }
          if ($tag_close){
              $this->output.=$this->close_tag($tag_name);
          }else{
              $this->output.=$this->open_tag($tag_name, $attributes, $single_tag);
          }
      }

      /**
       * Parses one attribute starting at the current position.
       *
       * @return array [name, value (null for a flag attribute), quote character ('' when unquoted)],
       *               or an empty array when no attribute name could be read
       */
      protected function parse_attr(){
          $attr_name='';
          while(!$this->isEnd() and !$this->is_space($this->getChar()) and !in_array($this->getChar(), array('>','/','='), true)){
              $attr_name.=$this->getChar();
              $this->index++;
          }
          $attr_value=null;
          // Initialised up front so flag attributes do not return an undefined variable.
          $bsymbol='';
          $this->skip_spaces();
          if (!$this->isEnd() and $this->getChar()==='='){
              $this->index++;
              $attr_value='';
              $this->skip_spaces();
              $first = $this->getChar();
              if ($first==='"' or $first==='\''){
                  $bsymbol=$first;
                  $this->index++;
                  while(!$this->isEnd()){
                      // A quote preceded by a backslash is treated as part of the value.
                      if ($this->getChar()===$bsymbol and $this->getChar(-1)!=='\\'){
                          break;
                      }
                      $attr_value.=$this->getChar();
                      $this->index++;
                  }
                  if ($this->getChar()===$bsymbol){
                      $this->index++;
                  }else{
                      $this->setError('incorrect attribute value end (attr name="'.$attr_name.'")');
                  }
              }elseif (!in_array($first, array('>', '/'), true)){
                  // Unquoted values are accepted silently only when they consist of digits.
                  while (!$this->isEnd() and in_array($this->getChar(), array('1','2','3','4','5','6','7','8','9','0'), true)){
                      $attr_value.=$this->getChar();
                      $this->index++;
                  }
                  if (!$this->isEnd() and !$this->ends_unquoted_value($this->getChar())){
                      // Anything else is still collected, but reported as an error.
                      while (!$this->isEnd() and !$this->ends_unquoted_value($this->getChar())){
                          $attr_value.=$this->getChar();
                          $this->index++;
                      }
                      $this->setError('incorrect attribute value (attr name="'.$attr_name.'")');
                  }
              }
          }
          if ($attr_name===''){
              // Nothing usable was read (end of input or a stray "="); the caller reports it.
              return array();
          }
          return array($attr_name, $attr_value, $bsymbol);
      }

      /**
       * @param string $c
       * @return bool whether $c terminates an unquoted attribute value
       */
      protected function ends_unquoted_value($c){
          return $c === '>' || $c === '/' || $this->is_space($c);
      }
  }
  241.  
  /**
   * Whitelist-based HTML filter and validator.
   *
   * Rule language:
   *   @ - the attribute list that follows applies to every listed element
   *   , - separates elements
   *   | - separates attributes
   *   [ - opens an attribute list
   *   ] - closes an attribute list
   *   = - after an attribute name: the attribute is kept only when it has a
   *       non-empty value
   *   comments - meta element: when listed, comments are kept, otherwise they
   *       are removed
   * An element name may be prefixed with "-" or "#"; the prefix is accepted
   * but has no effect.
   *
   * EXAMPLE:
   * $parser = new SimpleHtmlChecker('utf-8', '@[id=|class=|style=|title=],
   *   a[name=|href|target=],
   *   #p,br,strong,b,em,i,u,sub,sup,blockquote,
   *   ol[type=|compact],ul[type=|compact],li,
   *   img[src=|border=|alt=|hspace=|vspace=|width=|height=|align=],
   *   div,span,code,pre,hr[size=|noshade],
   *   font[face=|size=|color=]');
   */
  class SimpleHtmlChecker extends HtmlChecker{
      /** Map of allowed element name => list of allowed attribute specs ("name" or "name="). */
      private $elements;

      /**
       * @param string $encode encoding of the strings that will be parsed
       * @param string $s      rule string, see the class description
       */
      function __construct($encode='utf-8', $s=''){
          parent::__construct($encode);
          $this->elements = array();
          $global_attr = array();
          $elements_tockens = explode(',', str_replace(array("\n","\r"),'',$s));
          foreach ($elements_tockens as $e){
              $e = trim($e);
              if ($e === ''){
                  // Empty rule string or a trailing comma: nothing to register.
                  continue;
              }
              if (preg_match('~^@\[([^\]]*)\]$~i', $e, $regs)){
                  $global_attr = array_merge($global_attr, $this->split_attr_list($regs[1]));
              }elseif (preg_match('~^(-|#)?([a-z]+)(\[([^\]]*)\])?$~i', $e, $regs)){
                  // Group 4 is absent when the element has no attribute list.
                  $this->elements[$regs[2]] = $this->split_attr_list(isset($regs[4]) ? $regs[4] : '');
              }else{
                  trigger_error($e.' incorrect tocken');
              }
          }
          // Keyed assignment instead of a by-reference loop, which would leave a dangling reference behind.
          foreach ($this->elements as $name => $attrs){
              $this->elements[$name] = array_unique(array_merge($attrs, $global_attr));
          }
      }

      /**
       * Splits the "a|b=|c" part of a rule into individual attribute specs.
       *
       * @param string $list
       * @return array specs with surrounding whitespace removed and empty entries dropped
       */
      private function split_attr_list($list){
          $specs = array();
          foreach (explode('|', $list) as $spec){
              $spec = trim($spec);
              if ($spec !== ''){
                  $specs[] = $spec;
              }
          }
          return $specs;
      }

      /**
       * Replaces line breaks by spaces and collapses runs of spaces into one.
       *
       * @param string $text_tocken
       * @return string
       */
      protected function text($text_tocken){
          $text_tocken = str_replace(array("\n", "\r"), ' ',$text_tocken);
          // Searching for two spaces is what makes this loop terminate.
          while (strpos($text_tocken, '  ')!==false)
              $text_tocken = str_replace('  ', ' ',$text_tocken);
          return $text_tocken;
      }

      /**
       * Keeps a comment only when the rules list the "comments" meta element.
       *
       * @param string $text
       * @return string
       */
      protected function comment($text){
          if (isset($this->elements['comments']))
              return '<!--'.$text.'-->';
          return '';
      }

      /**
       * Emits a closing tag only for allowed elements; others are reported and dropped.
       *
       * @param string $tag_name
       * @return string
       */
      protected function close_tag($tag_name){
          // trim()'s second argument is a character list, not an encoding.
          $tag_name=trim($tag_name);
          if (!isset($this->elements[$tag_name])){
              $this->setError('deny close tag "'.$tag_name.'"');
              return '';
          }
          return '</'.$tag_name.'>';
      }

      /**
       * Emits an opening tag for allowed elements, keeping only allowed attributes.
       *
       * NOTE(review): only attribute names are checked. Values are not
       * inspected, so something like href="javascript:..." passes whenever
       * href is allowed.
       *
       * @param string $tag_name
       * @param array $attributes list of [0 => name, 1 => value (null = flag attribute), 2 => quote character]
       * @param bool $single_tag whether the tag is self-closing
       * @return string
       */
      protected function open_tag($tag_name, $attributes, $single_tag){
          $tag_name=trim($tag_name);
          if (!isset($this->elements[$tag_name])){
              $this->setError('deny open tag <'.$tag_name.'>');
              return '';
          }
          $allowed = $this->elements[$tag_name];
          $ans = '<'.$tag_name;
          foreach ($attributes as $a){
              // Explicit comparison instead of empty(), so the value "0" counts as non-empty.
              $has_value = !is_null($a[1]) && $a[1] !== '';
              if (in_array($a[0], $allowed, true) or ($has_value and in_array($a[0].'=', $allowed, true))){
                  $ans.=' '.trim($a[0]);
                  if (!is_null($a[1])){
                      // Values that were not quoted in the source are emitted in double quotes.
                      $quote = empty($a[2]) ? '"' : $a[2];
                      $ans.='='.$quote.$this->quote_attr_value($a[1], $quote).$quote;
                  }
              }else{
                  $this->setError('deny arttibute "'.$a[0].'" for tag <'.$tag_name.'>');
              }
          }
          $ans.=($single_tag?'/':'').'>';
          return $ans;
      }

      /**
       * Replaces the delimiter quote inside an attribute value by its HTML
       * entity, so that a value cannot end its attribute early and smuggle
       * attributes that are not on the whitelist into the tag.
       *
       * @param string $value
       * @param string $quote quote character the value will be wrapped in
       * @return string
       */
      private function quote_attr_value($value, $quote){
          if ($quote === '"'){
              $entity = '&quot;';
          }elseif ($quote === '\''){
              $entity = '&#39;';
          }else{
              return $value;
          }
          // The backslash-escaped form is replaced first so the backslash goes away with it.
          return str_replace(array('\\'.$quote, $quote), $entity, $value);
      }
  }
  322. /**
  323. EXAMPLE:
  324.  
  325.  
  326. $input = '<div /> > <div < HTML bags <p title="<div class=\"tips\"> some html code</div>"> test text <ol compact>
  327. <!-- pagebreak --> <bee><script>alert("I am hack you!");</script>
  328. <li> li text <br /></li></ol></p>
  329. <p onclick="alert(\"<br />!!!\")">text 2 <br incorrect attrib="some wrong">
  330. <img empty src="1x1.gif" width= 60px height = \'20 px\' style="border: 1px solid red">
  331. </p>';
  332. echo "BEFORE:\n $input\n\n AFTER:\n";
  333. $parser = new SimpleHtmlChecker('cp1251', '@[id=|class=|style=|title=],
  334. comments,
  335. a[name=|href|target=],
  336. #p,br,strong,b,em,i,u,sub,sup,blockquote,
  337. ol[type=|compact],ul[type=|compact],li,
  338. img[src=|border=|alt=|hspace=|vspace=|width=|height=|align=],
  339. div,span,code,pre,hr[size=|noshade],
  340. font[face=|size=|color=]'
  341. );
  342. echo $parser->check($input);
  343. echo "\n".$parser->getError();
  344.  
  345. */
  346. ?>
Add Comment
Please, Sign In to add comment