Guest
Public paste!

IceDragon

By: a guest | Jul 4th, 2009 | Syntax: PHP | Size: 3.83 KB | Hits: 50 | Expires: Never
Copy text to clipboard
  1. <?php
  2. /*
  3.  *** robots.txt processing class
  4.  * Author: IceDragon of QuickFox.org
  5.  *         http://www.icerealm.org/
  6.  *
  7.  * Feel free to modify/use for any purpose.
  8.  *
  9.  * Change Log:
  10.  *   1.0.0 [20090704] - Initial release.
  11.  */
  12.  
  13. class Robots {
  14.     /** Members **/
  15.     public $gUserAgent = NULL;
  16.     public $gRules     = array();
  17.  
  18.     /** Constructor **/
  19.     // Class constructor - optionally accepts a path to preload robots.txt
  20.     // exclusion rules from.
  21.     public function __construct( $path = NULL, $userAgent = FALSE )
  22.     {
  23.         if( $userAgent )
  24.             $this->SetUserAgent( $userAgent );
  25.         if( $path != NULL )
  26.             $this->Load( $path );
  27.     }
  28.  
  29.  
  30.     /** Methods **/
  31.     // This function loads a robots.txt file from a specific path and stores
  32.     // the exclusion rules in the $rules method. Use this to prime the class
  33.     // with data.
  34.     public function Load( $path )
  35.     {
  36.         $fd = fopen( $path, 'r' );
  37.         if( !$fd )
  38.             throw new Exception("Unable to open path `$path`");
  39.  
  40.         $user_agent   = "*";
  41.         $disallowed   = array();
  42.         $this->gRules = array();
  43.        
  44.         while( !feof($fd) )
  45.         {
  46.             // Read line and check if we've reached an EOF.
  47.             $line = fgets($fd);
  48.             if( feof($fd) )
  49.                 continue;
  50.  
  51.             $line       = trim($line);
  52.             $split_line = explode(' ', $line);
  53.  
  54.             // Disregard comments or empty lines.
  55.             if( $line == "" || $line[0] == "#" )
  56.                 continue;
  57.  
  58.             // UserAgent change.
  59.             if( strtolower($split_line[0]) == "user-agent:" )
  60.             {
  61.                 if( array_key_exists( $user_agent, $this->gRules ) )
  62.                 {
  63.                     $existing_rules = $this->gRules[$user_agent];
  64.                     $disallowed = array_merge( $existing_rules, $disallowed );
  65.                 }
  66.                
  67.                 $this->gRules[$user_agent] = $disallowed;
  68.                 $user_agent = strtolower( $split_line[1] );
  69.                 $disallowed = array();
  70.                 continue;
  71.             }
  72.  
  73.             // Disallow rule.
  74.             if( strtolower($split_line[0]) == "disallow:" )
  75.                 if( $split_line[1][0] != "#" )
  76.                     array_push( $disallowed, $split_line[1] );
  77.         }
  78.  
  79.         // Add the last entry.
  80.         if( array_key_exists( $user_agent, $this->gRules ) )
  81.         {
  82.             $existing_rules = $this->gRules[$user_agent];
  83.             $disallowed = array_merge( $existing_rules, $disallowed );
  84.         }
  85.         $this->gRules[$user_agent] = $disallowed;
  86.         fclose($fd);
  87.     } // Load()
  88.  
  89.  
  90.     // This function sets the UserAgent to match URLs against.
  91.     public function SetUserAgent( $userAgent )
  92.     {
  93.         $this->gUserAgent = $userAgent;
  94.     } // SetUserAgent()
  95.  
  96.    
  97.     // This function verifies if the current UserAgent is allowed to access the
  98.     // specific URL. Returns TRUE if allowed, FALSE otherwise.
  99.     // NOTE: Do not specify the full URL (http://...), only the part after the
  100.     //       domain! i.e.: IsAllowed("/robots.txt");
  101.     public function IsAllowed( $url )
  102.     {
  103.         // Locate a matching UserAgent string from the list.
  104.         foreach( $this->gRules as $user_agent => $rules )
  105.         {
  106.             if( $user_agent != "*" && strstr( strtolower($this->gUserAgent), $user_agent ) == FALSE )
  107.                 continue;
  108.  
  109.             foreach( $rules as $str )
  110.                 if( $this->_StartsWith( $url, $str ) )
  111.                     return FALSE;
  112.         }
  113.  
  114.         return TRUE;
  115.     } // IsAllowed()
  116.  
  117.  
  118.     // Check if a string starts with a substring.
  119.     private function _StartsWith( $str, $subStr )
  120.     {
  121.         return ( substr($str, 0, strlen($subStr)) == $subStr );
  122.     } // _StartsWith()
  123. } // class Robots
  124. ?>