Advertisement
Guest User

Mino

a guest
Aug 16th, 2009
375
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 15.64 KB | None | 0 0
  1. <?php
  2.     /*=======================================================================+
  3.     |                       PHP Universal Feed Parser                        |  
  4.     +------------------------------------------------------------------------/
  5.  
  6.     Author          : Anis uddin Ahmad <admin@ajaxray.com>
  7.     Web             : http://www.ajaxray.com
  8.     Publish Date    : March 24, 2008
  9.  
  10. LICENSE
  11. ----------------------------------------------------------------------
  12. PHP Universal Feed Parser 1.0 - A PHP class to parse RSS 1.0, RSS 2.0 and ATOM 1.0 feed.
  13. Copyright (C) 2008  Anis uddin Ahmad <admin@ajaxray.com>
  14.    
  15. This program is free software; you can redistribute it and/or
  16. modify it under the terms of the GNU General Public License (GPL)
  17. as published by the Free Software Foundation; either version 2
  18. of the License, or (at your option) any later version.
  19.  
  20. This program is distributed in the hope that it will be useful,
  21. but WITHOUT ANY WARRANTY; without even the implied warranty of
  22. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  23. GNU General Public License for more details.
  24.  
  25. To read the license please visit http://www.gnu.org/copyleft/gpl.html
  26. =======================================================================
  27.  
  28. HOW TO USE
  29. -----------------------------------------------------------------------
  30. It's very easy to use. Just follow these 3 steps:
  31. 1. Include the file
  32.     include('FeedParser.php');
  33. 2. Create an object of FeedParser class
  34.     $Parser = new FeedParser();
  35. 3. Parse the URL you want to fetch
  36.     $Parser->parse('http://www.sitepoint.com/rss.php');
  37.    
  38. Done.
  39. Now you can use these functions to get various information about the parsed feed:
  40.     1. $Parser->getChannels()        - To get all channel elements as array
  41.     2. $Parser->getItems()           - To get all feed elements as array
  42.     3. $Parser->getChannel($name)    - To get a channel element by name
  43.     4. $Parser->getItem($index)      - To get a feed element as array by its index
  44.     5. $Parser->getTotalItems()      - To get the number of total feed elements
  45.     6. $Parser->getFeedVersion()     - To get the detected version of parsed feed
  46.     7. $Parser->getParsedUrl()       - To get the parsed feed URL  
  47.    
  48. =======================================================================
  49.  
  50. IMPORTANT NOTES
  51. -----------------------------------------------------------------------
  52. 1. All array keys are UPPERCASE
  53. 2. All dates are converted to timestamp
  54. 3. Attributes of a tag will be found under TAGNAME_ATTRS index
  55.     example: Attributes of $item['GUID'] will be found as $item['GUID_ATTRS']
  56. 4. Tags which have subtags will be an array, and the subtags will be found as its elements
  57.     example: IMAGE tag in RSS 2.0
  58. ========================================================================
  59.  
  60. EXAMPLES
  61. -----------------------------------------------------------------------
  62. To see more details and examples, please visit:
  63.     http://www.ajaxray.com/blog/2008/05/02/php-universal-feed-parser-lightweight-php-class-for-parsing-rss-and-atom-feeds/
  64. ========================================================================
  65. */
  66.  
  67. /**
  68. * PHP Univarsel Feed Parser class
  69. *
  70. * Parses RSS 1.0, RSS2.0 and ATOM Feed
  71. *
  72. * @license     GNU General Public License (GPL)                            
  73. * @author      Anis uddin Ahmad <admin@ajaxray.com>
  74. * @link        http://www.ajaxray.com/blog/2008/05/02/php-universal-feed-parser-lightweight-php-class-for-parsing-rss-and-atom-feeds/
  75. */
  76. class FeedParser{
  77.        
  78.     private $xmlParser      = null;
  79.     private $insideItem     = array();                  // Keep track of current position in tag tree
  80.     private $currentTag     = null;                     // Last entered tag name      
  81.     private $currentAttr    = null;                     // Attributes array of last entered tag
  82.    
  83.     private $namespaces     = array(
  84.                             'http://purl.org/rss/1.0/'                  => 'RSS 1.0',
  85.                             'http://purl.org/rss/1.0/modules/content/'  => 'RSS 2.0',
  86.                             'http://www.w3.org/2005/Atom'               => 'ATOM 1',
  87.                             );                          // Namespaces to detact feed version
  88.     private $itemTags       = array('ITEM','ENTRY');    // List of tag names which holds a feed item
  89.     private $channelTags    = array('CHANNEL','FEED');  // List of tag names which holds all channel elements
  90.     private $dateTags       = array('UPDATED','PUBDATE','DC:DATE');  
  91.     private $hasSubTags     = array('IMAGE','AUTHOR');  // List of tag names which have sub tags
  92.     private $channels       = array();                  
  93.     private $items          = array();
  94.     private $itemIndex      = 0;
  95.  
  96.     private $url            = null;                     // The parsed url
  97.     private $version        = null;                     // Detected feed version
  98.    
  99.        
  100.     /**
  101.     * Constructor - Initialize and set event handler functions to xmlParser
  102.     */    
  103.     function __construct()
  104.     {
  105.         $this->xmlParser = xml_parser_create();
  106.        
  107.         xml_set_object($this->xmlParser, $this);
  108.         xml_set_element_handler($this->xmlParser, "startElement", "endElement");
  109.         xml_set_character_data_handler($this->xmlParser, "characterData");
  110.     }  
  111.  
  112.     /*-----------------------------------------------------------------------+
  113.     |  Public functions. Use to parse feed and get informations.             |  
  114.     +-----------------------------------------------------------------------*/
  115.    
  116.     /**
  117.     * Get all channel elements  
  118.     *
  119.     * @access   public
  120.     * @return   array   - All chennels as associative array
  121.     */
  122.     public function getChannels()
  123.     {
  124.         return $this->channels;
  125.     }
  126.    
  127.     /**
  128.     * Get all feed items  
  129.     *
  130.     * @access   public
  131.     * @return   array   - All feed items as associative array
  132.     */
  133.     public function getItems()
  134.     {
  135.         return $this->items;
  136.     }
  137.  
  138.     /**
  139.     * Get total number of feed items
  140.     *
  141.     * @access   public
  142.     * @return   number  
  143.     */  
  144.     public function getTotalItems()
  145.     {
  146.         return count($this->items);
  147.     }
  148.  
  149.     /**
  150.     * Get a feed item by index
  151.     *
  152.     * @access   public
  153.     * @param    number  index of feed item
  154.     * @return   array   feed item as associative array of it's elements
  155.     */  
  156.     public function getItem($index)
  157.     {
  158.         if($index < $this->getTotalItems())
  159.         {
  160.             return $this->items[$index];
  161.         }
  162.         else
  163.         {
  164.             throw new Exception("Item index is learger then total items.");
  165.             return false;
  166.         }        
  167.     }
  168.    
  169.     /**
  170.     * Get a channel element by name
  171.     *
  172.     * @access   public
  173.     * @param    string  the name of channel tag
  174.     * @return   string
  175.     */  
  176.     public function getChannel($tagName)
  177.     {
  178.         if(array_key_exists(strtoupper($tagName), $this->channels))
  179.         {
  180.             return $this->channels[strtoupper($tagName)];
  181.         }
  182.         else
  183.         {
  184.             throw new Exception("Channel tag $tagName not found.");
  185.             return false;
  186.         }
  187.     }
  188.    
  189.     /**
  190.     * Get the parsed URL
  191.     *
  192.     * @access   public
  193.     * @return   string
  194.     */  
  195.     public function getParsedUrl()
  196.     {
  197.         if(empty($this->url))
  198.         {
  199.             throw new Exception("Feed URL is not set yet.");
  200.             return FALSE;
  201.         }
  202.         else
  203.         {
  204.             return $this->url;
  205.         }
  206.        
  207.        
  208.     }
  209.  
  210.     /**
  211.     * Get the detected Feed version
  212.     *
  213.     * @access   public
  214.     * @return   string
  215.     */  
  216.    public function getFeedVersion()
  217.    {
  218.         return $this->version;
  219.    }
  220.    
  221.     /**
  222.     * Parses a feed url
  223.     *
  224.     * @access   public
  225.     * @param    srting  teh feed url
  226.     * @return   void
  227.     */  
  228.     public function parse($url)
  229.     {
  230.         $this->url  = $url;
  231.         $URLContent = $this->getUrlContent();
  232.        
  233.         if($URLContent)
  234.         {  
  235.             $segments   = str_split($URLContent, 4096);
  236.             foreach($segments as $index=>$data)
  237.             {
  238.                 $lastPiese = ((count($segments)-1) == $index)? true : false;
  239.                 xml_parse($this->xmlParser, $data, $lastPiese)
  240.                    or die(sprintf("XML error: %s at line %d",  
  241.                    xml_error_string(xml_get_error_code($this->xmlParser)),  
  242.                    xml_get_current_line_number($this->xmlParser)));
  243.             }
  244.             xml_parser_free($this->xmlParser);  
  245.         }
  246.         else
  247.         {
  248.             die('Sorry! cannot load the feed url.');   
  249.         }
  250.        
  251.         if(empty($this->version))
  252.         {
  253.             die('Sorry! cannot detect the feed version.');
  254.         }
  255.     }  
  256.    
  257.    // End public functions -------------------------------------------------
  258.    
  259.    /*-----------------------------------------------------------------------+
  260.    | Private functions. Be careful to edit them.                            |  
  261.    +-----------------------------------------------------------------------*/
  262.  
  263.    /**
  264.     * Load the whole contents of a RSS/ATOM page
  265.     *
  266.     * @access   private
  267.     * @return   string
  268.     */
  269.     private function getUrlContent()
  270.     {
  271.         if(empty($this->url))
  272.         {
  273.             throw new Exception("URL to parse is empty!.");
  274.             return false;
  275.         }
  276.    
  277.         if($content = @file_get_contents($this->url))
  278.         {
  279.             return $content;
  280.         }
  281.         else
  282.         {
  283.             $ch         = curl_init();
  284.            
  285.             curl_setopt($ch, CURLOPT_URL, $this->url);
  286.             curl_setopt($ch, CURLOPT_HEADER, false);
  287.             curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
  288.  
  289.             $content    = curl_exec($ch);
  290.             $error      = curl_error($ch);
  291.            
  292.             curl_close($ch);
  293.            
  294.             if(empty($error))
  295.             {
  296.                 return $content;   
  297.             }
  298.             else
  299.             {
  300.                 throw new Exception("Erroe occured while loading url by cURL. <br />\n" . $error) ;
  301.                 return false;
  302.             }
  303.         }
  304.    
  305.     }
  306.    
  307.     /**
  308.     * Handle the start event of a tag while parsing
  309.     *
  310.     * @access   private
  311.     * @param    object  the xmlParser object
  312.     * @param    string  name of currently entering tag
  313.     * @param    array   array of attributes
  314.     * @return   void
  315.     */
  316.     private function startElement($parser, $tagName, $attrs)
  317.     {
  318.         if(!$this->version)
  319.         {
  320.             $this->findVersion($tagName, $attrs);
  321.         }      
  322.        
  323.         array_push($this->insideItem, $tagName);
  324.        
  325.         $this->currentTag  = $tagName;
  326.         $this->currentAttr = $attrs;
  327.     }  
  328.  
  329.     /**
  330.     * Handle the end event of a tag while parsing
  331.     *
  332.     * @access   private
  333.     * @param    object  the xmlParser object
  334.     * @param    string  name of currently ending tag
  335.     * @return   void
  336.     */    
  337.     private function endElement($parser, $tagName)
  338.     {
  339.         $this->characterDataAttributes($xmlParser);
  340.        
  341.         if (in_array($tagName, $this->itemTags))
  342.         {
  343.            $this->itemIndex++;
  344.         }
  345.        
  346.         array_pop($this->insideItem);
  347.         $this->currentTag = $this->insideItem[count($this->insideItem)-1];
  348.     }  
  349.  
  350.     /**
  351.     * Handle character data of a tag while parsing
  352.     *
  353.     * @access   private
  354.     * @param    object  the xmlParser object
  355.     * @param    string  tag value
  356.     * @return   void
  357.     */
  358.     private function characterData($parser, $data)
  359.     {
  360.         //Converting all date formats to timestamp
  361.         if(in_array($this->currentTag, $this->dateTags))
  362.         {
  363.             $data = strtotime($data);
  364.         }
  365.                  
  366.        if($this->inChannel())
  367.        {
  368.             // If has subtag, make current element an array and assign subtags as it's element
  369.             if(in_array($this->getParentTag(), $this->hasSubTags))  
  370.             {
  371.                 if(! is_array($this->channels[$this->getParentTag()]))
  372.                 {
  373.                     $this->channels[$this->getParentTag()] = array();
  374.                 }
  375.  
  376.                 $this->channels[$this->getParentTag()][$this->currentTag] .= strip_tags($this->unhtmlentities((trim($data))));
  377.                 return;
  378.             }
  379.             else
  380.             {
  381.                 if(! in_array($this->currentTag, $this->hasSubTags))  
  382.                 {
  383.                     $this->channels[$this->currentTag] .= strip_tags($this->unhtmlentities((trim($data))));
  384.                 }
  385.             }
  386.        }
  387.        elseif($this->inItem())
  388.        {
  389.            // If has subtag, make current element an array and assign subtags as it's elements
  390.            if(in_array($this->getParentTag(), $this->hasSubTags))  
  391.             {
  392.                 if(! is_array($this->items[$this->itemIndex][$this->getParentTag()]))
  393.                 {
  394.                     $this->items[$this->itemIndex][$this->getParentTag()] = array();
  395.                 }
  396.  
  397.                 $this->items[$this->itemIndex][$this->getParentTag()][$this->currentTag] .= strip_tags($this->unhtmlentities((trim($data))));
  398.                 return;
  399.             }
  400.             else
  401.             {
  402.                 if(! in_array($this->currentTag, $this->hasSubTags))  
  403.                 {
  404.                     $this->items[$this->itemIndex][$this->currentTag] .= strip_tags($this->unhtmlentities((trim($data))));
  405.                 }
  406.             }
  407.        }
  408.     }
  409.  
  410.     private function characterDataAttributes($parser)
  411.     {
  412.        if($this->inChannel())
  413.        {
  414.             if(!empty($this->currentAttr))
  415.             {
  416.                 $this->channels[$this->currentTag . '_ATTRS'] = $this->currentAttr;
  417.  
  418.                 //If the tag has no value
  419.                 if(strlen($this->channels[$this->currentTag]) < 2)
  420.                 {
  421.                     //If there is only one attribute, assign the attribute value as channel value
  422.                     if(count($this->currentAttr) == 1)
  423.                     {
  424.                         foreach($this->currentAttr as $attrVal)
  425.                         {
  426.                             $this->channels[$this->currentTag] = $attrVal;
  427.                         }
  428.                     }
  429.                     //If there are multiple attributes, assign the attributs array as channel value
  430.                     else
  431.                     {
  432.                         $this->channels[$this->currentTag] = $this->currentAttr;
  433.                     }
  434.                 }
  435.             }
  436.        }
  437.        elseif($this->inItem())
  438.        {
  439.             if(!empty($this->currentAttr))
  440.             {
  441.                 $this->items[$this->itemIndex][$this->currentTag . '_ATTRS'] = $this->currentAttr;
  442.  
  443.                 //If the tag has no value
  444.  
  445.                 if(strlen($this->items[$this->itemIndex][$this->currentTag]) < 2)
  446.                 {
  447.                     //If there is only one attribute, assign the attribute value as feed element's value
  448.                     if(count($this->currentAttr) == 1)
  449.                     {
  450.                         foreach($this->currentAttr as $attrVal)
  451.                         {
  452.                            $this->items[$this->itemIndex][$this->currentTag] = $attrVal;
  453.                         }
  454.                     }
  455.                     //If there are multiple attributes, assign the attribute array as feed element's value
  456.                     else
  457.                     {
  458.                        $this->items[$this->itemIndex][$this->currentTag] = $this->currentAttr;
  459.                     }
  460.                 }
  461.             }
  462.        }
  463.     }
  464.  
  465.     /**
  466.     * Find out the feed version
  467.     *
  468.     * @access   private
  469.     * @param    string  name of current tag
  470.     * @param    array   array of attributes
  471.     * @return   void
  472.     */  
  473.     private function findVersion($tagName, $attrs)
  474.     {
  475.         $namespace = array_values($attrs);
  476.         foreach($this->namespaces as $value =>$version)
  477.         {
  478.             if(in_array($value, $namespace))
  479.             {
  480.                 $this->version = $version;
  481.                 return;
  482.             }    
  483.         }
  484.     }
  485.    
  486.     private function getParentTag()
  487.     {
  488.         return $this->insideItem[count($this->insideItem) - 2];
  489.     }
  490.  
  491.     /**
  492.     * Detect if current position is in channel element
  493.     *
  494.     * @access   private
  495.     * @return   bool
  496.     */  
  497.     private function inChannel()
  498.     {
  499.         if($this->version == 'RSS 1.0')
  500.         {
  501.             if(in_array('CHANNEL', $this->insideItem) && $this->currentTag != 'CHANNEL')
  502.             return TRUE;
  503.         }
  504.         elseif($this->version == 'RSS 2.0')
  505.         {
  506.             if(in_array('CHANNEL', $this->insideItem) && !in_array('ITEM', $this->insideItem) && $this->currentTag != 'CHANNEL')
  507.             return TRUE;    
  508.         }
  509.         elseif($this->version == 'ATOM 1')
  510.         {
  511.             if(in_array('FEED', $this->insideItem) && !in_array('ENTRY', $this->insideItem) && $this->currentTag != 'FEED')
  512.             return TRUE;    
  513.         }
  514.        
  515.         return FALSE;
  516.     }
  517.  
  518.     /**
  519.     * Detect if current position is in Item element
  520.     *
  521.     * @access   private
  522.     * @return   bool
  523.     */    
  524.     private function inItem()
  525.     {
  526.         if($this->version == 'RSS 1.0' || $this->version == 'RSS 2.0')
  527.         {
  528.             if(in_array('ITEM', $this->insideItem) && $this->currentTag != 'ITEM')
  529.             return TRUE;
  530.         }
  531.         elseif($this->version == 'ATOM 1')
  532.         {
  533.             if(in_array('ENTRY', $this->insideItem) && $this->currentTag != 'ENTRY')
  534.             return TRUE;    
  535.         }
  536.        
  537.         return FALSE;
  538.     }  
  539.  
  540.     //This function is taken from lastRSS
  541.     /**
  542.     * Replace HTML entities &something; by real characters
  543.     *
  544.     *
  545.     * @access   private
  546.     * @author   Vojtech Semecky <webmaster@oslab.net>
  547.     * @link     http://lastrss.oslab.net/
  548.     * @param    string
  549.     * @return   string
  550.     */  
  551.     private function unhtmlentities($string)
  552.     {
  553.         // Get HTML entities table
  554.         $trans_tbl = get_html_translation_table (HTML_ENTITIES, ENT_QUOTES);
  555.         // Flip keys<==>values
  556.         $trans_tbl = array_flip ($trans_tbl);
  557.         // Add support for &apos; entity (missing in HTML_ENTITIES)
  558.         $trans_tbl += array('&apos;' => "'");
  559.         // Replace entities by values
  560.         return strtr ($string, $trans_tbl);
  561.     }
  562. } //End class FeedParser
  563. ?>
  564.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement