Advertisement
Guest User

Untitled

a guest
Mar 3rd, 2013
85
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 53.80 KB | None | 0 0
  1. <?php
  2. /**
  3. * Website: http://sourceforge.net/projects/simplehtmldom/
  4. * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
  5. * Contributions by:
  6. * Yousuke Kumakura (Attribute filters)
  7. * Vadim Voituk (Negative indexes supports of "find" method)
  8. * Antcs (Constructor with automatically load contents either text or file/url)
  9. *
  10. * all affected sections have comments starting with "PaperG"
  11. *
  12. * Paperg - Added case insensitive testing of the value of the selector.
  13. * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately.
  14. * This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noice calls so it will not reflect the REAL position of the tag in the source,
  15. * it will almost always be smaller by some amount.
  16. * We use this to determine how far into the file the tag in question is. This "percentage" will never be accurate, as $dom->size is the "real" number of bytes the dom was created from,
  17. * but for most purposes, it's a really good estimation.
  18. * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
  19. * Allow the user to tell us how much they trust the html.
  20. * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node.
  21. * This allows for us to find tags based on the text they contain.
  22. * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
  23. * Paperg: added parse_charset so that we know about the character set of the source document.
  24. * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
  25. * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
  26. *
  27. * Licensed under The MIT License
  28. * Redistributions of files must retain the above copyright notice.
  29. *
  30. * @author S.C. Chen <me578022@gmail.com>
  31. * @author John Schlick
  32. * @author Rus Carroll
  33. * @version 1.11 ($Rev: 184 $)
  34. * @package PlaceLocalInclude
  35. * @subpackage simple_html_dom
  36. */
  37.  
  38. /**
  39. * All of the Defines for the classes below.
  40. * @author S.C. Chen <me578022@gmail.com>
  41. */
  42. define('HDOM_TYPE_ELEMENT', 1);
  43. define('HDOM_TYPE_COMMENT', 2);
  44. define('HDOM_TYPE_TEXT', 3);
  45. define('HDOM_TYPE_ENDTAG', 4);
  46. define('HDOM_TYPE_ROOT', 5);
  47. define('HDOM_TYPE_UNKNOWN', 6);
  48. define('HDOM_QUOTE_DOUBLE', 0);
  49. define('HDOM_QUOTE_SINGLE', 1);
  50. define('HDOM_QUOTE_NO', 3);
  51. define('HDOM_INFO_BEGIN', 0);
  52. define('HDOM_INFO_END', 1);
  53. define('HDOM_INFO_QUOTE', 2);
  54. define('HDOM_INFO_SPACE', 3);
  55. define('HDOM_INFO_TEXT', 4);
  56. define('HDOM_INFO_INNER', 5);
  57. define('HDOM_INFO_OUTER', 6);
  58. define('HDOM_INFO_ENDSPACE',7);
  59. define('DEFAULT_TARGET_CHARSET', 'UTF-8');
  60. define('DEFAULT_BR_TEXT', "\r\n");
  61. // helper functions
  62. // -----------------------------------------------------------------------------
  63. // get html dom from file
  64. // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.
  65. function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT)
  66. {
  67. // We DO force the tags to be terminated.
  68. $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $defaultBRText);
  69. // For sourceforge users: uncomment the next line and comment the retreive_url_contents line 2 lines down if it is not already done.
  70. $contents = file_get_contents($url, $use_include_path, $context, $offset);
  71. // Paperg - use our own mechanism for getting the contents as we want to control the timeout.
  72. // $contents = retrieve_url_contents($url);
  73. if (empty($contents))
  74. {
  75. return false;
  76. }
  77. // The second parameter can force the selectors to all be lowercase.
  78. $dom->load($contents, $lowercase, $stripRN);
  79. return $dom;
  80. }
  81.  
  82. // get html dom from string
  83. function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT)
  84. {
  85. $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $defaultBRText);
  86. if (empty($str))
  87. {
  88. $dom->clear();
  89. return false;
  90. }
  91. $dom->load($str, $lowercase, $stripRN);
  92. return $dom;
  93. }
  94.  
  95. // dump html dom tree
  96. function dump_html_tree($node, $show_attr=true, $deep=0)
  97. {
  98. $node->dump($node);
  99. }
  100.  
  101. /**
  102. * simple html dom node
  103. * PaperG - added ability for "find" routine to lowercase the value of the selector.
  104. * PaperG - added $tag_start to track the start position of the tag in the total byte index
  105. *
  106. * @package PlaceLocalInclude
  107. */
  108. class simple_html_dom_node {
  109. public $nodetype = HDOM_TYPE_TEXT;
  110. public $tag = 'text';
  111. public $attr = array();
  112. public $children = array();
  113. public $nodes = array();
  114. public $parent = null;
  115. public $_ = array();
  116. public $tag_start = 0;
  117. private $dom = null;
  118.  
  119. function __construct($dom)
  120. {
  121. $this->dom = $dom;
  122. $dom->nodes[] = $this;
  123. }
  124.  
  125. function __destruct()
  126. {
  127. $this->clear();
  128. }
  129.  
  130. function __toString()
  131. {
  132. return $this->outertext();
  133. }
  134.  
  135. // clean up memory due to php5 circular references memory leak...
  136. function clear()
  137. {
  138. $this->dom = null;
  139. $this->nodes = null;
  140. $this->parent = null;
  141. $this->children = null;
  142. }
  143.  
  144. // dump node's tree
  145. function dump($show_attr=true, $deep=0)
  146. {
  147. $lead = str_repeat(' ', $deep);
  148.  
  149. echo $lead.$this->tag;
  150. if ($show_attr && count($this->attr)>0)
  151. {
  152. echo '(';
  153. foreach ($this->attr as $k=>$v)
  154. echo "[$k]=>\"".$this->$k.'", ';
  155. echo ')';
  156. }
  157. echo "\n";
  158.  
  159. foreach ($this->nodes as $c)
  160. $c->dump($show_attr, $deep+1);
  161. }
  162.  
  163.  
  164. // Debugging function to dump a single dom node with a bunch of information about it.
  165. function dump_node()
  166. {
  167. echo $this->tag;
  168. if (count($this->attr)>0)
  169. {
  170. echo '(';
  171. foreach ($this->attr as $k=>$v)
  172. {
  173. echo "[$k]=>\"".$this->$k.'", ';
  174. }
  175. echo ')';
  176. }
  177. if (count($this->attr)>0)
  178. {
  179. echo ' $_ (';
  180. foreach ($this->_ as $k=>$v)
  181. {
  182. if (is_array($v))
  183. {
  184. echo "[$k]=>(";
  185. foreach ($v as $k2=>$v2)
  186. {
  187. echo "[$k2]=>\"".$v2.'", ';
  188. }
  189. echo ")";
  190. } else {
  191. echo "[$k]=>\"".$v.'", ';
  192. }
  193. }
  194. echo ")";
  195. }
  196.  
  197. if (isset($this->text))
  198. {
  199. echo " text: (" . $this->text . ")";
  200. }
  201.  
  202. echo " children: " . count($this->children);
  203. echo " nodes: " . count($this->nodes);
  204. echo " tag_start: " . $this->tag_start;
  205. echo "\n";
  206.  
  207. }
  208.  
  209. // returns the parent of node
  210. function parent()
  211. {
  212. return $this->parent;
  213. }
  214.  
  215. // returns children of node
  216. function children($idx=-1)
  217. {
  218. if ($idx===-1) return $this->children;
  219. if (isset($this->children[$idx])) return $this->children[$idx];
  220. return null;
  221. }
  222.  
  223. // returns the first child of node
  224. function first_child()
  225. {
  226. if (count($this->children)>0) return $this->children[0];
  227. return null;
  228. }
  229.  
  230. // returns the last child of node
  231. function last_child()
  232. {
  233. if (($count=count($this->children))>0) return $this->children[$count-1];
  234. return null;
  235. }
  236.  
  237. // returns the next sibling of node
  238. function next_sibling()
  239. {
  240. if ($this->parent===null) return null;
  241. $idx = 0;
  242. $count = count($this->parent->children);
  243. while ($idx<$count && $this!==$this->parent->children[$idx])
  244. ++$idx;
  245. if (++$idx>=$count) return null;
  246. return $this->parent->children[$idx];
  247. }
  248.  
  249. // returns the previous sibling of node
  250. function prev_sibling()
  251. {
  252. if ($this->parent===null) return null;
  253. $idx = 0;
  254. $count = count($this->parent->children);
  255. while ($idx<$count && $this!==$this->parent->children[$idx])
  256. ++$idx;
  257. if (--$idx<0) return null;
  258. return $this->parent->children[$idx];
  259. }
  260.  
  261. // function to locate a specific ancestor tag in the path to the root.
  262. function find_ancestor_tag($tag)
  263. {
  264. global $debugObject;
  265. if (is_object($debugObject))
  266. {
  267. $debugObject->debugLogEntry(1);
  268. }
  269.  
  270. // Start by including ourselves in the comparison.
  271. $returnDom = $this;
  272.  
  273. while (!is_null($returnDom))
  274. {
  275. if (is_object($debugObject))
  276. {
  277. $debugObject->debugLog(2, "Current tag is: " . $returnDom->tag);
  278. }
  279.  
  280. if ($returnDom->tag == $tag)
  281. {
  282. break;
  283. }
  284. $returnDom = $returnDom->parent;
  285. }
  286. return $returnDom;
  287. }
  288.  
  289. // get dom node's inner html
  290. function innertext()
  291. {
  292. if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
  293. if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
  294.  
  295. $ret = '';
  296. foreach ($this->nodes as $n)
  297. $ret .= $n->outertext();
  298. return $ret;
  299. }
  300.  
  301. // get dom node's outer text (with tag)
  302. function outertext()
  303. {
  304. global $debugObject;
  305. if (is_object($debugObject))
  306. {
  307. $text = '';
  308. if ($this->tag == 'text')
  309. {
  310. if (!empty($this->text))
  311. {
  312. $text = " with text: " . $this->text;
  313. }
  314. }
  315. $debugObject->debugLog(1, 'Innertext of tag: ' . $this->tag . $text);
  316. }
  317.  
  318. if ($this->tag==='root') return $this->innertext();
  319.  
  320. // trigger callback
  321. if ($this->dom && $this->dom->callback!==null)
  322. {
  323. call_user_func_array($this->dom->callback, array($this));
  324. }
  325.  
  326. if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
  327. if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
  328.  
  329. // render begin tag
  330. if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])
  331. {
  332. $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
  333. } else {
  334. $ret = "";
  335. }
  336.  
  337. // render inner text
  338. if (isset($this->_[HDOM_INFO_INNER]))
  339. {
  340. // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added.
  341. if ($this->tag != "br")
  342. {
  343. $ret .= $this->_[HDOM_INFO_INNER];
  344. }
  345. } else {
  346. if ($this->nodes)
  347. {
  348. foreach ($this->nodes as $n)
  349. {
  350. $ret .= $this->convert_text($n->outertext());
  351. }
  352. }
  353. }
  354.  
  355. // render end tag
  356. if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
  357. $ret .= '</'.$this->tag.'>';
  358. return $ret;
  359. }
  360.  
  361. // get dom node's plain text
  362. function text()
  363. {
  364. if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
  365. switch ($this->nodetype)
  366. {
  367. case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
  368. case HDOM_TYPE_COMMENT: return '';
  369. case HDOM_TYPE_UNKNOWN: return '';
  370. }
  371. if (strcasecmp($this->tag, 'script')===0) return '';
  372. if (strcasecmp($this->tag, 'style')===0) return '';
  373.  
  374. $ret = '';
  375. // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL.
  376. // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening.
  377. // WHY is this happening?
  378. if (!is_null($this->nodes))
  379. {
  380. foreach ($this->nodes as $n)
  381. {
  382. $ret .= $this->convert_text($n->text());
  383. }
  384. }
  385. return $ret;
  386. }
  387.  
  388. function xmltext()
  389. {
  390. $ret = $this->innertext();
  391. $ret = str_ireplace('<![CDATA[', '', $ret);
  392. $ret = str_replace(']]>', '', $ret);
  393. return $ret;
  394. }
  395.  
  396. // build node's text with tag
  397. function makeup()
  398. {
  399. // text, comment, unknown
  400. if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
  401.  
  402. $ret = '<'.$this->tag;
  403. $i = -1;
  404.  
  405. foreach ($this->attr as $key=>$val)
  406. {
  407. ++$i;
  408.  
  409. // skip removed attribute
  410. if ($val===null || $val===false)
  411. continue;
  412.  
  413. $ret .= $this->_[HDOM_INFO_SPACE][$i][0];
  414. //no value attr: nowrap, checked selected...
  415. if ($val===true)
  416. $ret .= $key;
  417. else {
  418. switch ($this->_[HDOM_INFO_QUOTE][$i])
  419. {
  420. case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
  421. case HDOM_QUOTE_SINGLE: $quote = '\''; break;
  422. default: $quote = '';
  423. }
  424. $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote;
  425. }
  426. }
  427. $ret = $this->dom->restore_noise($ret);
  428. return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
  429. }
  430.  
  431. // find elements by css selector
  432. //PaperG - added ability for find to lowercase the value of the selector.
  433. function find($selector, $idx=null, $lowercase=false)
  434. {
  435. $selectors = $this->parse_selector($selector);
  436. if (($count=count($selectors))===0) return array();
  437. $found_keys = array();
  438.  
  439. // find each selector
  440. for ($c=0; $c<$count; ++$c)
  441. {
  442. // The change on the below line was documented on the sourceforge code tracker id 2788009
  443. // used to be: if (($levle=count($selectors[0]))===0) return array();
  444. if (($levle=count($selectors[$c]))===0) return array();
  445. if (!isset($this->_[HDOM_INFO_BEGIN])) return array();
  446.  
  447. $head = array($this->_[HDOM_INFO_BEGIN]=>1);
  448.  
  449. // handle descendant selectors, no recursive!
  450. for ($l=0; $l<$levle; ++$l)
  451. {
  452. $ret = array();
  453. foreach ($head as $k=>$v)
  454. {
  455. $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
  456. //PaperG - Pass this optional parameter on to the seek function.
  457. $n->seek($selectors[$c][$l], $ret, $lowercase);
  458. }
  459. $head = $ret;
  460. }
  461.  
  462. foreach ($head as $k=>$v)
  463. {
  464. if (!isset($found_keys[$k]))
  465. $found_keys[$k] = 1;
  466. }
  467. }
  468.  
  469. // sort keys
  470. ksort($found_keys);
  471.  
  472. $found = array();
  473. foreach ($found_keys as $k=>$v)
  474. $found[] = $this->dom->nodes[$k];
  475.  
  476. // return nth-element or array
  477. if (is_null($idx)) return $found;
  478. else if ($idx<0) $idx = count($found) + $idx;
  479. return (isset($found[$idx])) ? $found[$idx] : null;
  480. }
  481.  
  482. // seek for given conditions
  483. // PaperG - added parameter to allow for case insensitive testing of the value of a selector.
  484. protected function seek($selector, &$ret, $lowercase=false)
  485. {
  486. global $debugObject;
  487. if (is_object($debugObject))
  488. {
  489. $debugObject->debugLogEntry(1);
  490. }
  491.  
  492. list($tag, $key, $val, $exp, $no_key) = $selector;
  493.  
  494. // xpath index
  495. if ($tag && $key && is_numeric($key))
  496. {
  497. $count = 0;
  498. foreach ($this->children as $c)
  499. {
  500. if ($tag==='*' || $tag===$c->tag) {
  501. if (++$count==$key) {
  502. $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
  503. return;
  504. }
  505. }
  506. }
  507. return;
  508. }
  509.  
  510. $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
  511. if ($end==0) {
  512. $parent = $this->parent;
  513. while (!isset($parent->_[HDOM_INFO_END]) && $parent!==null) {
  514. $end -= 1;
  515. $parent = $parent->parent;
  516. }
  517. $end += $parent->_[HDOM_INFO_END];
  518. }
  519.  
  520. for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
  521. $node = $this->dom->nodes[$i];
  522.  
  523. $pass = true;
  524.  
  525. if ($tag==='*' && !$key) {
  526. if (in_array($node, $this->children, true))
  527. $ret[$i] = 1;
  528. continue;
  529. }
  530.  
  531. // compare tag
  532. if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
  533. // compare key
  534. if ($pass && $key) {
  535. if ($no_key) {
  536. if (isset($node->attr[$key])) $pass=false;
  537. } else {
  538. if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
  539. }
  540. }
  541. // compare value
  542. if ($pass && $key && $val && $val!=='*') {
  543. // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?
  544. if ($key == "plaintext") {
  545. // $node->plaintext actually returns $node->text();
  546. $nodeKeyValue = $node->text();
  547. } else {
  548. // this is a normal search, we want the value of that attribute of the tag.
  549. $nodeKeyValue = $node->attr[$key];
  550. }
  551. if (is_object($debugObject)) {$debugObject->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}
  552.  
  553. //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
  554. if ($lowercase) {
  555. $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
  556. } else {
  557. $check = $this->match($exp, $val, $nodeKeyValue);
  558. }
  559. if (is_object($debugObject)) {$debugObject->debugLog(2, "after match: " . ($check ? "true" : "false"));}
  560.  
  561. // handle multiple class
  562. if (!$check && strcasecmp($key, 'class')===0) {
  563. foreach (explode(' ',$node->attr[$key]) as $k) {
  564. // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
  565. if (!empty($k)) {
  566. if ($lowercase) {
  567. $check = $this->match($exp, strtolower($val), strtolower($k));
  568. } else {
  569. $check = $this->match($exp, $val, $k);
  570. }
  571. if ($check) break;
  572. }
  573. }
  574. }
  575. if (!$check) $pass = false;
  576. }
  577. if ($pass) $ret[$i] = 1;
  578. unset($node);
  579. }
  580. // It's passed by reference so this is actually what this function returns.
  581. if (is_object($debugObject)) {$debugObject->debugLog(1, "EXIT - ret: ", $ret);}
  582. }
  583.  
  584. protected function match($exp, $pattern, $value) {
  585. global $debugObject;
  586. if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}
  587.  
  588. switch ($exp) {
  589. case '=':
  590. return ($value===$pattern);
  591. case '!=':
  592. return ($value!==$pattern);
  593. case '^=':
  594. return preg_match("/^".preg_quote($pattern,'/')."/", $value);
  595. case '$=':
  596. return preg_match("/".preg_quote($pattern,'/')."$/", $value);
  597. case '*=':
  598. if ($pattern[0]=='/') {
  599. return preg_match($pattern, $value);
  600. }
  601. return preg_match("/".$pattern."/i", $value);
  602. }
  603. return false;
  604. }
  605.  
  606. protected function parse_selector($selector_string) {
  607. global $debugObject;
  608. if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}
  609.  
  610. // pattern of CSS selectors, modified from mootools
  611. // Paperg: Add the colon to the attrbute, so that it properly finds <tag attr:ibute="something" > like google does.
  612. // Note: if you try to look at this attribute, yo MUST use getAttribute since $dom->x:y will fail the php syntax check.
  613. // Notice the \[ starting the attbute? and the @? following? This implies that an attribute can begin with an @ sign that is not captured.
  614. // This implies that an html attribute specifier may start with an @ sign that is NOT captured by the expression.
  615. // farther study is required to determine of this should be documented or removed.
  616. // $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
  617. $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
  618. preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
  619. if (is_object($debugObject)) {$debugObject->debugLog(2, "Matches Array: ", $matches);}
  620.  
  621. $selectors = array();
  622. $result = array();
  623. //print_r($matches);
  624.  
  625. foreach ($matches as $m) {
  626. $m[0] = trim($m[0]);
  627. if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
  628. // for browser generated xpath
  629. if ($m[1]==='tbody') continue;
  630.  
  631. list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
  632. if (!empty($m[2])) {$key='id'; $val=$m[2];}
  633. if (!empty($m[3])) {$key='class'; $val=$m[3];}
  634. if (!empty($m[4])) {$key=$m[4];}
  635. if (!empty($m[5])) {$exp=$m[5];}
  636. if (!empty($m[6])) {$val=$m[6];}
  637.  
  638. // convert to lowercase
  639. if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower($key);}
  640. //elements that do NOT have the specified attribute
  641. if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}
  642.  
  643. $result[] = array($tag, $key, $val, $exp, $no_key);
  644. if (trim($m[7])===',') {
  645. $selectors[] = $result;
  646. $result = array();
  647. }
  648. }
  649. if (count($result)>0)
  650. $selectors[] = $result;
  651. return $selectors;
  652. }
  653.  
  654. function __get($name) {
  655. if (isset($this->attr[$name]))
  656. {
  657. return $this->convert_text($this->attr[$name]);
  658. }
  659. switch ($name) {
  660. case 'outertext': return $this->outertext();
  661. case 'innertext': return $this->innertext();
  662. case 'plaintext': return $this->text();
  663. case 'xmltext': return $this->xmltext();
  664. default: return array_key_exists($name, $this->attr);
  665. }
  666. }
  667.  
  668. function __set($name, $value) {
  669. switch ($name) {
  670. case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
  671. case 'innertext':
  672. if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
  673. return $this->_[HDOM_INFO_INNER] = $value;
  674. }
  675. if (!isset($this->attr[$name])) {
  676. $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
  677. $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
  678. }
  679. $this->attr[$name] = $value;
  680. }
  681.  
  682. function __isset($name) {
  683. switch ($name) {
  684. case 'outertext': return true;
  685. case 'innertext': return true;
  686. case 'plaintext': return true;
  687. }
  688. //no value attr: nowrap, checked selected...
  689. return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
  690. }
  691.  
  692. function __unset($name) {
  693. if (isset($this->attr[$name]))
  694. unset($this->attr[$name]);
  695. }
  696.  
  697. // PaperG - Function to convert the text from one character set to another if the two sets are not the same.
  698. function convert_text($text) {
  699. global $debugObject;
  700. if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}
  701.  
  702. $converted_text = $text;
  703.  
  704. $sourceCharset = "";
  705. $targetCharset = "";
  706. if ($this->dom) {
  707. $sourceCharset = strtoupper($this->dom->_charset);
  708. $targetCharset = strtoupper($this->dom->_target_charset);
  709. }
  710. if (is_object($debugObject)) {$debugObject->debugLog(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}
  711.  
  712. if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0))
  713. {
  714. // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
  715. if ((strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text)))
  716. {
  717. $converted_text = $text;
  718. }
  719. else
  720. {
  721. $converted_text = iconv($sourceCharset, $targetCharset, $text);
  722. }
  723. }
  724.  
  725. return $converted_text;
  726. }
  727.  
  728. function is_utf8($string)
  729. {
  730. return (utf8_encode(utf8_decode($string)) == $string);
  731. }
  732.  
  733. // camel naming conventions
  734. function getAllAttributes() {return $this->attr;}
  735. function getAttribute($name) {return $this->__get($name);}
  736. function setAttribute($name, $value) {$this->__set($name, $value);}
  737. function hasAttribute($name) {return $this->__isset($name);}
  738. function removeAttribute($name) {$this->__set($name, null);}
  739. function getElementById($id) {return $this->find("#$id", 0);}
  740. function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
  741. function getElementByTagName($name) {return $this->find($name, 0);}
  742. function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);}
  743. function parentNode() {return $this->parent();}
  744. function childNodes($idx=-1) {return $this->children($idx);}
  745. function firstChild() {return $this->first_child();}
  746. function lastChild() {return $this->last_child();}
  747. function nextSibling() {return $this->next_sibling();}
  748. function previousSibling() {return $this->prev_sibling();}
  749. }
  750.  
  751. /**
  752. * simple html dom parser
  753. * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
  754. * Paperg - change $size from protected to public so we can easily access it
  755. * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it.
  756. *
  757. * @package PlaceLocalInclude
  758. */
  759. class simple_html_dom {
  760. public $root = null;
  761. public $nodes = array();
  762. public $callback = null;
  763. public $lowercase = false;
  764. public $size;
  765. protected $pos;
  766. protected $doc;
  767. protected $char;
  768. protected $cursor;
  769. protected $parent;
  770. protected $noise = array();
  771. protected $token_blank = " \t\r\n";
  772. protected $token_equal = ' =/>';
  773. protected $token_slash = " />\r\n\t";
  774. protected $token_attr = ' >';
  775. protected $_charset = '';
  776. protected $_target_charset = '';
  777. protected $default_br_text = "";
  778.  
  779. // use isset instead of in_array, performance boost about 30%...
  780. protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1);
  781. protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1);
  782. // Known sourceforge issue #2977341
  783. // B tags that are not closed cause us to return everything to the end of the document.
  784. protected $optional_closing_tags = array(
  785. 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
  786. 'th'=>array('th'=>1),
  787. 'td'=>array('td'=>1),
  788. 'li'=>array('li'=>1),
  789. 'dt'=>array('dt'=>1, 'dd'=>1),
  790. 'dd'=>array('dd'=>1, 'dt'=>1),
  791. 'dl'=>array('dd'=>1, 'dt'=>1),
  792. 'p'=>array('p'=>1),
  793. 'nobr'=>array('nobr'=>1),
  794. 'b'=>array('b'=>1),
  795. );
  796.  
  797. function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT) {
  798. if ($str) {
  799. if (preg_match("/^http:\/\//i",$str) || is_file($str))
  800. $this->load_file($str);
  801. else
  802. $this->load($str, $lowercase, $stripRN, $defaultBRText);
  803. }
  804. // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
  805. if (!$forceTagsClosed) {
  806. $this->optional_closing_array=array();
  807. }
  808. $this->_target_charset = $target_charset;
  809. }
  810.  
  811. function __destruct() {
  812. $this->clear();
  813. }
  814.  
  815. // load html from string
  816. function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT) {
  817. global $debugObject;
  818.  
  819. // prepare
  820. $this->prepare($str, $lowercase, $stripRN, $defaultBRText);
  821. // strip out comments
  822. $this->remove_noise("'<!--(.*?)-->'is");
  823. // strip out cdata
  824. $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
  825. // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
  826. // Script tags removal now preceeds style tag removal.
  827. // strip out <script> tags
  828. $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
  829. $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
  830. // strip out <style> tags
  831. $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
  832. $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
  833. // strip out preformatted tags
  834. $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
  835. // strip out server side scripts
  836. $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);
  837. // strip smarty scripts
  838. $this->remove_noise("'(\{\w)(.*?)(\})'s", true);
  839.  
  840. // parsing
  841. while ($this->parse());
  842. // end
  843. $this->root->_[HDOM_INFO_END] = $this->cursor;
  844. $this->parse_charset();
  845. }
  846.  
  847. // load html from file
  848. function load_file() {
  849. $args = func_get_args();
  850.  
  851. // ADDED BY DANNY
  852. $opts = array(
  853. 'http'=>array(
  854. 'method'=>"GET",
  855. 'header'=>"Accept-language: en\r\n" .
  856. "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6\r\n".
  857. "Cookie: foo=bar\r\n"
  858. )
  859. );
  860. $context = stream_context_create($opts);
  861. // END DANNY
  862.  
  863. $this->load(call_user_func_array('file_get_contents', $args), true);
  864. // Per the simple_html_dom repositiry this is a planned upgrade to the codebase.
  865. // Throw an error if we can't properly load the dom.
  866. if (($error=error_get_last())!==null) {
  867. $this->clear();
  868. return false;
  869. }
  870. }
  871.  
  872. // set callback function
  873. function set_callback($function_name) {
  874. $this->callback = $function_name;
  875. }
  876.  
  877. // remove callback function
  878. function remove_callback() {
  879. $this->callback = null;
  880. }
  881.  
  882. // save dom as string
  883. function save($filepath='') {
  884. $ret = $this->root->innertext();
  885. if ($filepath!=='') file_put_contents($filepath, $ret, LOCK_EX);
  886. return $ret;
  887. }
  888.  
  889. // find dom node by css selector
  890. // Paperg - allow us to specify that we want case insensitive testing of the value of the selector.
  /**
   * Find nodes by CSS selector; delegates to the root node's find().
   *
   * @param string   $selector  Selector passed through unchanged.
   * @param int|null $idx       Index passed through unchanged (null by default).
   * @param bool     $lowercase Paperg: request case-insensitive testing of the selector value.
   * @return mixed Whatever the root node's find() returns.
   */
  function find($selector, $idx=null, $lowercase=false)
  {
      $root = $this->root;
      $found = $root->find($selector, $idx, $lowercase);
      return $found;
  }
  894.  
  895. // clean up memory due to php5 circular references memory leak...
  /**
   * Release the parsed tree: clear every node, then the children, parent and
   * root references (when set), and finally unset the document and noise table.
   */
  function clear()
  {
      foreach ($this->nodes as $node) {
          $node->clear();
          $node = null;
      }

      // This add next line is documented in the sourceforge repository. 2977248 as a fix for ongoing memory leaks that occur even with the use of clear.
      if (isset($this->children)) {
          foreach ($this->children as $child) {
              $child->clear();
              $child = null;
          }
      }

      if (isset($this->parent)) {
          $this->parent->clear();
          unset($this->parent);
      }

      if (isset($this->root)) {
          $this->root->clear();
          unset($this->root);
      }

      unset($this->doc, $this->noise);
  }
  905.  
  /**
   * Dump the tree by delegating to the root node's dump().
   *
   * @param bool $show_attr Passed through unchanged to the root node.
   */
  function dump($show_attr=true)
  {
      $root = $this->root;
      $root->dump($show_attr);
  }
  909.  
  910. // prepare HTML data and init everything
  /**
   * Reset the parser state and install $str as the document to parse.
   *
   * @param string $str           Raw HTML.
   * @param bool   $lowercase     Stored in $this->lowercase.
   * @param bool   $stripRN       When true, every "\r" and "\n" is replaced by a space.
   * @param string $defaultBRText Stored in $this->default_br_text.
   */
  protected function prepare($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT)
  {
      $this->clear();

      // Record the length of the content before anything is done to it.
      $this->size = strlen($str);

      // Strip the line breaks before the string is saved as the doc, if asked to.
      if ($stripRN) {
          $str = str_replace(array("\r", "\n"), " ", $str);
      }

      $this->doc = $str;
      $this->pos = 0;
      $this->cursor = 1;
      $this->noise = array();
      $this->nodes = array();
      $this->lowercase = $lowercase;
      $this->default_br_text = $defaultBRText;

      // Build the synthetic root node; it also becomes the current parent.
      $this->root = new simple_html_dom_node($this);
      $root = $this->root;
      $root->tag = 'root';
      $root->_[HDOM_INFO_BEGIN] = -1;
      $root->nodetype = HDOM_TYPE_ROOT;
      $this->parent = $root;

      if ($this->size > 0) {
          $this->char = $this->doc[0];
      }
  }
  937.  
  938. // parse html content
  /**
   * Parse the next piece of the document: either a run of text up to the next
   * '<', or (when there is no such text) a tag via read_tag().
   *
   * @return bool true when a text node was linked; otherwise read_tag()'s result.
   */
  protected function parse()
  {
      $text = $this->copy_until_char('<');

      if ($text === '') {
          return $this->read_tag();
      }

      // text
      $text_node = new simple_html_dom_node($this);
      ++$this->cursor;
      $text_node->_[HDOM_INFO_TEXT] = $text;
      $this->link_nodes($text_node, false);

      return true;
  }
  950.  
  951. // PAPERG - dkchou - added this to try to identify the character set of the page we have just parsed so we know better how to spit it out later.
  // NOTE: If you provide a routine called get_last_retrieve_url_contents_content_type which returns the CURLINFO_CONTENT_TYPE from the last curl_exec
  // (or the content_type header from the last transfer), we will parse THAT, and if a charset is specified, we will use it over any other mechanism.
  /**
   * Work out the character set of the parsed document and remember it in
   * $this->_charset.
   *
   * Sources are tried in this order:
   *   1. the content-type header returned by the optional user function
   *      get_last_retrieve_url_contents_content_type();
   *   2. the document's <meta http-equiv=Content-Type> tag (ISO-8859-1 when
   *      the tag has content but names no charset);
   *   3. mb_detect_encoding() over the plain text, when mbstring is available;
   *   4. a final default of UTF-8.
   * ISO-8859-1 / Latin1 / Latin-1 are then reported as CP1252.
   *
   * @return string The charset that was stored in $this->_charset.
   */
  protected function parse_charset()
  {
      global $debugObject;

      $charset = null;

      if (function_exists('get_last_retrieve_url_contents_content_type'))
      {
          $contentTypeHeader = get_last_retrieve_url_contents_content_type();
          $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches);
          if ($success)
          {
              $charset = $matches[1];
              if (is_object($debugObject)) {$debugObject->debugLog(2, 'header content-type found charset of: ' . $charset);}
          }
      }

      if (empty($charset))
      {
          $el = $this->root->find('meta[http-equiv=Content-Type]',0);
          if (!empty($el))
          {
              $fullvalue = $el->content;
              // Fixed: this log line used to read the undefined variable $fullValue.
              if (is_object($debugObject)) {$debugObject->debugLog(2, 'meta content-type tag found' . $fullvalue);}

              if (!empty($fullvalue))
              {
                  $success = preg_match('/charset=(.+)/', $fullvalue, $matches);
                  if ($success)
                  {
                      $charset = $matches[1];
                  }
                  else
                  {
                      // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
                      if (is_object($debugObject)) {$debugObject->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}
                      $charset = 'ISO-8859-1';
                  }
              }
          }
      }

      // If we couldn't find a charset above, then lets try to detect one based on the text we got...
      if (empty($charset))
      {
          // Have php try to detect the encoding from the text given to us.
          // mbstring is an optional extension: without it, skip detection and
          // fall through to the UTF-8 default instead of dying on an undefined function.
          $charset = false;
          if (function_exists('mb_detect_encoding'))
          {
              $charset = mb_detect_encoding($this->root->plaintext . "ascii", array( "UTF-8", "CP1252" ) );
              if (is_object($debugObject)) {$debugObject->debugLog(2, 'mb_detect found: ' . $charset);}
          }

          // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need...
          if ($charset === false)
          {
              if (is_object($debugObject)) {$debugObject->debugLog(2, 'since mb_detect failed - using default of utf-8');}
              $charset = 'UTF-8';
          }
      }

      // Since CP1252 is a superset, if we get one of it's subsets, we want it instead.
      $charset_lower = strtolower($charset);
      if (($charset_lower == strtolower('ISO-8859-1')) || ($charset_lower == strtolower('Latin1')) || ($charset_lower == strtolower('Latin-1')))
      {
          if (is_object($debugObject)) {$debugObject->debugLog(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}
          $charset = 'CP1252';
      }

      if (is_object($debugObject)) {$debugObject->debugLog(1, 'EXIT - ' . $charset);}

      return $this->_charset = $charset;
  }
  1023.  
  1024. // read tag info
  /**
   * Read one tag-like construct starting at the current '<' and link the
   * resulting node into the tree.
   *
   * Handles, in order: end tags, "<!...>" constructs (comments and other
   * declarations), stray '<' characters that turn out to be text, invalid tag
   * names (kept as text), and finally a regular begin tag with its attributes.
   *
   * @return bool false when the current character is not '<' (the root node's
   *              end is recorded and the caller's parse loop stops); true
   *              whenever something was consumed.
   */
  protected function read_tag() {
      if ($this->char!=='<') {
          $this->root->_[HDOM_INFO_END] = $this->cursor;
          return false;
      }
      $begin_tag_pos = $this->pos;
      $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

      // end tag
      if ($this->char==='/') {
          $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
          // This represents the change in the simple_html_dom trunk from revision 180 to 181.
          // $this->skip($this->token_blank_t);
          $this->skip($this->token_blank);
          $tag = $this->copy_until_char('>');

          // skip attributes in end tag
          if (($pos = strpos($tag, ' '))!==false)
              $tag = substr($tag, 0, $pos);

          $parent_lower = strtolower($this->parent->tag);
          $tag_lower = strtolower($tag);

          // The end tag does not close the current parent: look for the
          // ancestor it belongs to, or fall back to treating it as text.
          if ($parent_lower!==$tag_lower) {
              if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
                  $this->parent->_[HDOM_INFO_END] = 0;
                  $org_parent = $this->parent;

                  while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                      $this->parent = $this->parent->parent;

                  if (strtolower($this->parent->tag)!==$tag_lower) {
                      $this->parent = $org_parent; // restore original parent
                      if ($this->parent->parent) $this->parent = $this->parent->parent;
                      $this->parent->_[HDOM_INFO_END] = $this->cursor;
                      return $this->as_text_node($tag);
                  }
              }
              else if (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
                  $this->parent->_[HDOM_INFO_END] = 0;
                  $org_parent = $this->parent;

                  while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                      $this->parent = $this->parent->parent;

                  if (strtolower($this->parent->tag)!==$tag_lower) {
                      $this->parent = $org_parent; // restore original parent
                      $this->parent->_[HDOM_INFO_END] = $this->cursor;
                      return $this->as_text_node($tag);
                  }
              }
              else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower) {
                  $this->parent->_[HDOM_INFO_END] = 0;
                  $this->parent = $this->parent->parent;
              }
              else
                  return $this->as_text_node($tag);
          }

          // Close the matched element and step back up to its parent.
          $this->parent->_[HDOM_INFO_END] = $this->cursor;
          if ($this->parent->parent) $this->parent = $this->parent->parent;

          $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
          return true;
      }

      $node = new simple_html_dom_node($this);
      $node->_[HDOM_INFO_BEGIN] = $this->cursor;
      ++$this->cursor;
      $tag = $this->copy_until($this->token_slash);
      $node->tag_start = $begin_tag_pos;

      // doctype, cdata & comments...
      if (isset($tag[0]) && $tag[0]==='!') {
          $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

          if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') {
              $node->nodetype = HDOM_TYPE_COMMENT;
              $node->tag = 'comment';
          } else {
              $node->nodetype = HDOM_TYPE_UNKNOWN;
              $node->tag = 'unknown';
          }
          if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
          $this->link_nodes($node, true);
          $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
          return true;
      }

      // text
      // (The original wrote "$pos=strpos(...)!==false", which assigned the
      // boolean comparison result to an otherwise unused $pos.)
      if (strpos($tag, '<')!==false) {
          $tag = '<' . substr($tag, 0, -1);
          $node->_[HDOM_INFO_TEXT] = $tag;
          $this->link_nodes($node, false);
          $this->char = $this->doc[--$this->pos]; // prev
          return true;
      }

      // Anything that is not a plain tag name (word characters, ':' and '-')
      // is kept as text. The hyphen is placed last in the class: the former
      // "[\w-:]" is rejected by PCRE2 (PHP 7.3+) as an invalid range, which
      // made preg_match() fail and turned every tag into text.
      if (!preg_match('/^[\w:-]+$/', $tag)) {
          $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
          if ($this->char==='<') {
              $this->link_nodes($node, false);
              return true;
          }

          if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
          $this->link_nodes($node, false);
          $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
          return true;
      }

      // begin tag
      $node->nodetype = HDOM_TYPE_ELEMENT;
      $tag_lower = strtolower($tag);
      $node->tag = ($this->lowercase) ? $tag_lower : $tag;

      // handle optional closing tags
      if (isset($this->optional_closing_tags[$tag_lower]) ) {
          while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
              $this->parent->_[HDOM_INFO_END] = 0;
              $this->parent = $this->parent->parent;
          }
          $node->parent = $this->parent;
      }

      $guard = 0; // prevent infinity loop
      $space = array($this->copy_skip($this->token_blank), '', '');

      // attributes
      do
      {
          if ($this->char!==null && $space[0]==='') break;
          $name = $this->copy_until($this->token_equal);
          // No progress since the previous iteration: force the position forward.
          if ($guard===$this->pos) {
              $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
              continue;
          }
          $guard = $this->pos;

          // handle endless '<'
          if ($this->pos>=$this->size-1 && $this->char!=='>') {
              $node->nodetype = HDOM_TYPE_TEXT;
              $node->_[HDOM_INFO_END] = 0;
              $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
              $node->tag = 'text';
              $this->link_nodes($node, false);
              return true;
          }

          // handle mismatch '<'
          if ($this->doc[$this->pos-1]=='<') {
              $node->nodetype = HDOM_TYPE_TEXT;
              $node->tag = 'text';
              $node->attr = array();
              $node->_[HDOM_INFO_END] = 0;
              $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
              $this->pos -= 2;
              $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
              $this->link_nodes($node, false);
              return true;
          }

          if ($name!=='/' && $name!=='') {
              $space[1] = $this->copy_skip($this->token_blank);
              $name = $this->restore_noise($name);
              if ($this->lowercase) $name = strtolower($name);
              if ($this->char==='=') {
                  $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                  $this->parse_attr($node, $name, $space);
              }
              else {
                  //no value attr: nowrap, checked selected...
                  $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                  $node->attr[$name] = true;
                  if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev
              }
              $node->_[HDOM_INFO_SPACE][] = $space;
              $space = array($this->copy_skip($this->token_blank), '', '');
          }
          else
              break;
      } while ($this->char!=='>' && $this->char!=='/');

      $this->link_nodes($node, true);
      $node->_[HDOM_INFO_ENDSPACE] = $space[0];

      // check self closing
      if ($this->copy_until_char_escape('>')==='/') {
          $node->_[HDOM_INFO_ENDSPACE] .= '/';
          $node->_[HDOM_INFO_END] = 0;
      }
      else {
          // reset parent
          if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
      }
      $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

      // If it's a BR tag, we need to set it's text to the default text.
      // This way when we see it in plaintext, we can generate formatting that the user wants.
      if ($node->tag == "br") {
          $node->_[HDOM_INFO_INNER] = $this->default_br_text;
      }

      return true;
  }
  1230.  
  1231. // parse attributes
  /**
   * Read the value of attribute $name at the current position and store it
   * on $node.
   *
   * @param object $node  Node whose attr and quote-type lists are updated.
   * @param string $name  Attribute name.
   * @param array  $space Whitespace triple of the attribute; slot 2 receives
   *                      the blanks found in front of the value.
   */
  protected function parse_attr($node, $name, &$space)
  {
      // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
      // If the attribute is already defined inside a tag, only pay attention to the first one as opposed to the last one.
      if (isset($node->attr[$name])) {
          return;
      }

      $space[2] = $this->copy_skip($this->token_blank);

      $quote = $this->char;
      if ($quote === '"' || $quote === '\'') {
          // Quoted value: step over the opening quote, copy up to the
          // matching unescaped quote, then step over the closing quote.
          $node->_[HDOM_INFO_QUOTE][] = ($quote === '"') ? HDOM_QUOTE_DOUBLE : HDOM_QUOTE_SINGLE;
          $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
          $value = $this->restore_noise($this->copy_until_char_escape($quote));
          $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
      } else {
          // Unquoted value: copy up to the first attribute-terminating character.
          $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
          $value = $this->restore_noise($this->copy_until($this->token_attr));
      }

      // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
      $value = str_replace(array("\r", "\n"), "", $value);

      // PaperG: If this is a "class" selector, lets get rid of the preceding and trailing space since some people leave it in the multi class case.
      if ($name == "class") {
          $value = trim($value);
      }

      $node->attr[$name] = $value;
  }
  1266.  
  1267. // link node's parent
  /**
   * Attach $node to the current parent.
   *
   * @param object $node     Node to attach; its parent property is set.
   * @param bool   $is_child When true the node is also appended to the
   *                         parent's children list, not only its nodes list.
   */
  protected function link_nodes(&$node, $is_child)
  {
      $owner = $this->parent;

      $node->parent = $owner;
      $owner->nodes[] = $node;

      if ($is_child) {
          $owner->children[] = $node;
      }
  }
  1274.  
  1275. // as a text node
  /**
   * Keep an end tag as literal text: link a node whose text is "</tag>"
   * and move on to the next character.
   *
   * @param string $tag Tag name found in the end tag.
   * @return bool Always true.
   */
  protected function as_text_node($tag)
  {
      $text_node = new simple_html_dom_node($this);
      ++$this->cursor;
      $text_node->_[HDOM_INFO_TEXT] = "</{$tag}>";
      $this->link_nodes($text_node, false);

      // next
      ++$this->pos;
      $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null;

      return true;
  }
  1284.  
  /**
   * Advance past the run of characters from $chars at the current position
   * and refresh the current character (null at end of document).
   *
   * @param string $chars Set of characters to skip.
   */
  protected function skip($chars)
  {
      $run = strspn($this->doc, $chars, $this->pos);
      $this->pos += $run;

      if ($this->pos < $this->size) {
          $this->char = $this->doc[$this->pos];
      } else {
          $this->char = null;
      }
  }
  1289.  
  /**
   * Like skip(), but also returns the characters that were skipped.
   *
   * @param string $chars Set of characters to skip.
   * @return string The skipped run, or '' when nothing was skipped.
   */
  protected function copy_skip($chars)
  {
      $from = $this->pos;
      $run = strspn($this->doc, $chars, $from);

      $this->pos += $run;
      $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

      return ($run === 0) ? '' : substr($this->doc, $from, $run);
  }
  1298.  
  /**
   * Copy characters up to (not including) the first occurrence of any
   * character in $chars, advancing the position to that character.
   *
   * @param string $chars Set of stop characters.
   * @return string The copied run, as returned by substr().
   */
  protected function copy_until($chars)
  {
      $from = $this->pos;
      $run = strcspn($this->doc, $chars, $from);
      $copied = substr($this->doc, $from, $run);

      $this->pos += $run;
      $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

      return $copied;
  }
  1306.  
  /**
   * Copy characters up to (not including) the next occurrence of $char.
   *
   * When $char does not occur again, the rest of the document is returned
   * and the parser is moved to end-of-document (char null, pos = size).
   *
   * @param string $char Stop character.
   * @return string Copied text; '' at end of document or when $char is the current character.
   */
  protected function copy_until_char($char)
  {
      if ($this->char === null) {
          return '';
      }

      $from = $this->pos;
      $hit = strpos($this->doc, $char, $from);

      if ($hit === false) {
          $rest = substr($this->doc, $from, $this->size - $from);
          $this->char = null;
          $this->pos = $this->size;
          return $rest;
      }

      if ($hit === $from) {
          return '';
      }

      $this->char = $this->doc[$hit];
      $this->pos = $hit;
      return substr($this->doc, $from, $hit - $from);
  }
  1323.  
  /**
   * Same as copy_until_char(), except that an occurrence of $char directly
   * preceded by a backslash does not stop the copy.
   *
   * @param string $char Stop character.
   * @return string Copied text; '' at end of document or when $char is the current character.
   */
  protected function copy_until_char_escape($char)
  {
      if ($this->char === null) {
          return '';
      }

      $origin = $this->pos;
      $search_from = $origin;

      for (;;) {
          $hit = strpos($this->doc, $char, $search_from);

          if ($hit === false) {
              $rest = substr($this->doc, $origin, $this->size - $origin);
              $this->char = null;
              $this->pos = $this->size;
              return $rest;
          }

          if ($hit === $origin) {
              return '';
          }

          if ($this->doc[$hit - 1] !== '\\') {
              $this->char = $this->doc[$hit];
              $this->pos = $hit;
              return substr($this->doc, $origin, $hit - $origin);
          }

          // Backslash-escaped occurrence: keep searching after it.
          $search_from = $hit + 1;
      }
  }
  1349.  
  1350. // remove noise from html content
  /**
   * Replace every match of $pattern in the document with a placeholder key
   * and remember the removed text in $this->noise under that key.
   *
   * @param string $pattern    PCRE pattern to search the document for.
   * @param bool   $remove_tag When true the whole match is replaced;
   *                           otherwise only capture group 1.
   */
  protected function remove_noise($pattern, $remove_tag=false)
  {
      $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);
      $group = ($remove_tag) ? 0 : 1;

      // Walk the matches from last to first so the recorded offsets of the
      // earlier matches stay valid while the document is edited.
      $i = $count - 1;
      while ($i > -1) {
          $noise = $matches[$i][$group][0];
          $offset = $matches[$i][$group][1];

          $key = '___noise___'.sprintf('% 3d', count($this->noise)+100);
          $this->noise[$key] = $noise;
          $this->doc = substr_replace($this->doc, $key, $offset, strlen($noise));

          --$i;
      }

      // reset the length of content
      $this->size = strlen($this->doc);
      if ($this->size > 0) {
          $this->char = $this->doc[0];
      }
  }
  1365.  
  1366. // restore noise to html content
  /**
   * Put removed "noise" back into $text.
   *
   * Each placeholder is the 11-character marker "___noise___" followed by a
   * 3-character number; the pair is the key into $this->noise.
   *
   * Placeholders with no stored noise, or cut off at the end of the text,
   * are left in place and skipped. The previous implementation kept finding
   * the same unreplaced marker and never terminated in that case.
   *
   * @param string $text Text that may contain placeholders.
   * @return string Text with every known placeholder replaced.
   */
  function restore_noise($text) {
      $offset = 0;
      while (($pos=strpos($text, '___noise___', $offset))!==false) {
          // Only build a key when all three trailing characters exist.
          $key = isset($text[$pos+13]) ? substr($text, $pos, 14) : null;

          if ($key!==null && isset($this->noise[$key])) {
              $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+14);
              // Search again from the same spot: restored noise may itself
              // contain placeholders that were removed in an earlier pass.
              $offset = $pos;
          }
          else {
              // Unknown or truncated placeholder: step over the marker so the loop makes progress.
              $offset = $pos+11;
          }
      }
      return $text;
  }
  1375.  
  /**
   * String conversion yields the root node's innertext().
   */
  function __toString()
  {
      $root = $this->root;
      return $root->innertext();
  }
  1379.  
  /**
   * Magic read access for the virtual properties outertext, innertext,
   * plaintext, charset and target_charset. Any other name yields no value.
   *
   * @param string $name Property being read.
   * @return mixed
   */
  function __get($name)
  {
      // For the document object, outertext and innertext are the same thing.
      if ($name === 'outertext' || $name === 'innertext') {
          return $this->root->innertext();
      }
      if ($name === 'plaintext') {
          return $this->root->text();
      }
      if ($name === 'charset') {
          return $this->_charset;
      }
      if ($name === 'target_charset') {
          return $this->_target_charset;
      }
  }
  1394.  
  1395. // camel naming conventions
  /**
   * Camel-case alias: forwards to the root node's childNodes().
   *
   * @param int $idx Passed through unchanged (default -1).
   */
  function childNodes($idx=-1)
  {
      $root = $this->root;
      return $root->childNodes($idx);
  }
  /**
   * Camel-case alias: forwards to the root node's first_child().
   */
  function firstChild()
  {
      $root = $this->root;
      return $root->first_child();
  }
  /**
   * Camel-case alias: forwards to the root node's last_child().
   */
  function lastChild()
  {
      $root = $this->root;
      return $root->last_child();
  }
  /**
   * Camel-case helper: find() with the selector "#<id>" and index 0.
   *
   * @param string $id Element id, used verbatim in the selector.
   */
  function getElementById($id)
  {
      $selector = '#' . $id;
      return $this->find($selector, 0);
  }
  /**
   * Camel-case helper: find() with the selector "#<id>".
   *
   * @param string   $id  Element id, used verbatim in the selector.
   * @param int|null $idx Passed through unchanged to find() (null by default).
   */
  function getElementsById($id, $idx=null)
  {
      $selector = '#' . $id;
      return $this->find($selector, $idx);
  }
  /**
   * Camel-case helper: find() with the tag name as selector and index 0.
   *
   * @param string $name Tag name, used as the selector.
   */
  function getElementByTagName($name)
  {
      $selector = $name;
      return $this->find($selector, 0);
  }
  /**
   * Camel-case helper: find() with the tag name as selector.
   *
   * @param string $name Tag name, used as the selector.
   * @param int    $idx  Passed through unchanged to find(); defaults to -1.
   *                     NOTE(review): the file header mentions negative-index
   *                     support in find(), so the default presumably selects a
   *                     single element rather than all of them - confirm.
   */
  function getElementsByTagName($name, $idx=-1)
  {
      $selector = $name;
      return $this->find($selector, $idx);
  }
  /**
   * Camel-case alias of load_file().
   *
   * The arguments are spread onto load_file(). Previously the whole argument
   * array was passed as ONE argument, so file_get_contents() ended up
   * receiving an array instead of a filename. The result of load_file() is
   * now returned as well, so callers can see a failed load.
   *
   * @return mixed Whatever load_file() returns.
   */
  function loadFile()
  {
      $args = func_get_args();
      return call_user_func_array(array($this, 'load_file'), $args);
  }
  1404. }
  1405.  
  1406. ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement