Advertisement
pushpesh4u

Remove selected elements using PHP DOM

May 30th, 2012
303
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 2.12 KB | None | 0 0
  1. <?php
  2. $url = 'http://en.wikipedia.org/w/index.php?title=Elephant&action=render';
  3. $curl = curl_init();
  4. curl_setopt($curl, CURLOPT_URL, $url);
  5. curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
  6. curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13');
  7.  
  8. // Now, we have the response
  9. $html = '<html>' . curl_exec($curl) . '</html>';
  10. curl_close($curl);
  11.  
  12. $document = new DOMDocument('1.0');
  13. $document->loadHTML($html);
  14.  
  15. // these are the elements which are to be kept in the response, rest all are to be removed.
  16. $allowed_elements = array('a','b','i','p');
  17.  
  18. // this array stores all nodes present in the DOM initially.
  19. $elems = array();
  20.  
  21. $parent = $document->getElementsByTagName('html')->item(0);
  22. foreach ($parent->getElementsByTagName('*') as $element)
  23. {
  24.     $node = (string)$element->nodeName;
  25.     if(strtolower($node) == 'body'){
  26.         continue;
  27.     }
  28.  
  29.     // store this node in the array.
  30.     $elems[] = $node;
  31. }
  32.  
  33. // keep only unique node names, i.e remove multiple occurrences of same node
  34. $elems = array_values( array_unique( $elems ) );
  35.  
  36. // now, get a list of the elements, which are to be removed from the response
  37. // array_diff removes all the elements present in the second array from the first array
  38. // and returns a final array. So $elems now has all the elements, which need to be removed.
  39. $elems = array_diff( $elems, $allowed_elements );
  40.  
  41. // re-index and sort them
  42. $elems = array_values( array_unique( $elems ) );
  43. sort($elems);
  44.  
  45. // Now, remove all elements present in $elem as we don't need them.
  46. foreach( $elems as $elem ) {
  47.     $parent1 = $parent->getElementsByTagName($elem);
  48.  
  49.     // this is the number of times this element occurs in the response
  50.     $length = $parent->getElementsByTagName($elem)->length;
  51.  
  52.     // remove each one of them
  53.     for($i=0;$i<$length;$i++) {
  54.         // 0 will always be the index because after each `removeChild`, the next element shifts 1 position back.
  55.         $el = $parent1->item(0);
  56.         if( $el ) {
  57.             $el->parentNode->removeChild($el);
  58.         }
  59.     }
  60. }
  61.  
  62. echo $document->saveHTML();
  63. ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement