Advertisement
To-Slalom

preg_match major bots

May 14th, 2017
234
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 4.03 KB | None | 0 0
  <?php
  /*
   * Grab major bots' name and version from a user-agent string.
   *
   * Beta-quality demo script: it is not 100% accurate, but it works for the
   * user agents listed in $bots below. Corrections and improvements are
   * welcome by email at toslalompt@gmail.com.
   *
   * Flow for every user agent:
   *   1. lower-case it and strip brackets plus common browser boilerplate;
   *   2. strip any URL (and a preceding " - " separator);
   *   3. if it contains a bot keyword, parse it with $majorbots and print
   *      the resulting match array.
   */

  /*
   * Pattern applied to the cleaned, lower-cased user agent (it is anchored
   * with ^ and $ where it is used). Capture groups:
   *   1 => bot name
   *   2 => bot type keyword (bot|robot|spider|crawler|curl)
   *   3 => bot version (optional)
   *   4 => rest of the user-agent string
   *
   * NOTE(review): the "|" inside group 1 splits it into two alternatives and
   * only the second one ends with the type keyword group. When the first
   * alternative matches (e.g. "adidxbot/v4.1.0"), entry [2] of the match is
   * an empty string. The pattern is kept as published; tighten it only
   * together with a re-test of the whole $bots list.
   */
  $majorbots  = '(';   // plain assignment: appending to an undefined variable raises a warning
  $majorbots .= '[A-Za-z]+[A-Za-z-.: ]+|[A-Za-z-_.: ]+';
  $majorbots .= '(bot|robot|spider|crawler|curl)';
  $majorbots .= ')';
  $majorbots .= '[\/ ]?';
  $majorbots .= '([v]?[0-9]*\.?[0-9a-z\.]+)?';
  // The rest of the user-agent string.
  $majorbots .= '(.*)';

  /* List of bots tested so far. */
  $bots = [
      'Mozilla/5.0 (compatible; DuckDuckGo-Favicons-Bot/1.0; +http://www.google.com/bot.html)',
      'Mozilla/5.0 (compatible; Sogou Orionbot 3.0',
      'Adidxbot/v4.1.0',
      'crawler for netopian (http://www.netopian.co.uk/)',
      'boitho.com-robot/1.1',
      'SEMrushBot',
      'eBot / v.1.0a (http://alfa.elchron.cz)',
      'envolk[ITS]spider/1.6 ( http://www.envolk.com/envolkspider.html)',
      'SpeedySpider - http://www.entireweb.com',
      'Bot mailto:craftbot@yahoo.com',
      'Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)',
      'Mozilla/5.0 (compatible; DuckDuckGo Favicons-Bot/1.0 +http://www.google.com/bot.html)',
      'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Exabot-Thumbnails)',
      'Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)',
      'Mozilla/5.0 (compatible; Mail.RU_Bot/2.0; +http://go.mail.ru/help/robots)',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0) AddSugarSpiderBot www.idealobserver.com',
      'Mozilla/4.0 (compatible; BlitzBot)',
      'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
      'FAST Enteprise Crawler/6 (www dot fastsearch dot com)',
      'TinEye-bot/0.51 (see http://www.tineye.com/crawler.html)',
      'Whoismindbot/1.0 (+http://www.whoismind.com/bot.html)',
      'yacybot (/global; amd64 Linux 4.4.0-31-generic; java 1.8.0_91; Europe/en) http://yacy.net/bot.html',
      'StatoolsBot (+http://www.statools.com/bot.html)',
      'adidxbot/2.0 (+http://search.msn.com/msnbot.htm)',
      'BlitzBOT@tricus.net (Mozilla compatible)',
      'BlitzBOT@tricus.com (Mozilla compatible)',
      'ConveraCrawler/0.9d (+http://www.authoritativeweb.com/crawl)',
      'EmeraldShield.com WebBot (http://www.emeraldshield.com/webbot.aspx)',
  ];

  // More samples: http://www.useragentstring.com/pages/useragentstring.php?typ=Crawler
  echo ' testing ' . count($bots) . ' bots <br/> ';

  // Matches the bracket characters ( ) [ ] so they can be removed.
  $pregRep  = '{\(|\)|]|\[}';
  // A version-like token made of letters, digits and dots.
  $numbDots = '[a-zA-Z0-9.]+';
  // Patterns are applied in array order: brackets are removed first, which is
  // why the boilerplate patterns that follow are written without parentheses.
  $clean = [
      $pregRep,
      '/mozilla\/' . $numbDots . ' compatible;/',
      '/windows nt ' . $numbDots . '/',
      '/like gecko /',
      '/linux khtml\/' . $numbDots . '/',
  ];

  // Loop-invariant: anchor the bot pattern once instead of on every iteration.
  $majorbotsRegex = '#^' . $majorbots . '$#';

  // 1-based position of the current user agent in $bots (advances for every
  // entry, matched or not).
  $count = 1;
  foreach ($bots as $bot) {
      // Work on a lower-cased copy; $bot keeps the original spelling for output.
      $useragent = strtolower($bot);
      // Remove brackets and the browser boilerplate listed in $clean.
      $useragent = preg_replace($clean, '', $useragent);
      // Collapse " / " to "/" (some user agents put spaces around the slash).
      $useragent = preg_replace('{ \/ }', '/', $useragent);
      // Drop the URL and everything after it, including a leading " - "
      // separator. https is accepted as well as http so that secure URLs do
      // not leak into the parsed name.
      $useragent = preg_replace('@https?://(.*)| - https?://(.*)$@', '', $useragent);

      // First a cheap keyword check, then the full parse. Note that this
      // keyword list (crawl, slurp) differs from the type keywords in group 2.
      if (preg_match('/(bot|crawl|slurp|spider)/', $useragent)
          && preg_match($majorbotsRegex, $useragent, $match)
      ) {
          // NOTE(review): output is not HTML-escaped. That is harmless for the
          // fixed list above; escape with htmlspecialchars() before feeding
          // real (untrusted) user agents through this script.
          echo ' this bot is ' . $bot . ' <br/> ';
          // Print the match array, which has 5 entries:
          /*
          (
              [0] => full cleaned user-agent string
              [1] => bot name
              [2] => bot type (bot|robot|spider|crawler|curl), may be empty
              [3] => bot version
              [4] => rest of the bot string
          )
          */
          echo ' print ' . $count . ' <br/> <pre>  ' . print_r($match, true) . '</pre>';
      }
      $count++;
  }
  // Closing tag kept on purpose: non-PHP text follows this script in the file.
  ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement