Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/php
<?php
# usage: php solr-parse.php <table_name> [LOGFILE...]
# Parses solr log files of a certain etsyish type.

# Lift the memory and run-time limits for the duration of the script.
ini_set("memory_limit","-1");
set_time_limit(0);
# Timestamps are produced with strtotime()/date(); pin the zone to GMT.
date_default_timezone_set("GMT");
# Send PHP diagnostics through phpStderr(), which writes them to stderr.
set_error_handler('phpStderr');
/**
 * Error handler that writes PHP diagnostics to stderr.
 *
 * Diagnostics whose level is excluded by the current error_reporting() mask
 * are dropped. Checking the mask (rather than comparing it to 0) also covers
 * the @ operator on PHP 8, where @ no longer sets error_reporting() to 0.
 *
 * @param int    $errno   Level of the error raised.
 * @param string $errstr  Error message.
 * @param string $errfile File in which the error was raised.
 * @param int    $errline Line at which the error was raised.
 * @return bool Always true, so PHP's built-in handler is not run as well.
 */
function phpStderr($errno, $errstr, $errfile, $errline) {
    if ((error_reporting() & $errno) === 0) {
        return true;
    }
    file_put_contents('php://stderr', $errno.': Error in '.$errfile.':'.$errline.' - '.$errstr."\n");
    return true;
}
/**
 * Loads every category path from the `categories` table.
 *
 * @return array Category paths as strings, sorted with PHP's sort().
 * @throws PDOException When the connection or the query fails.
 */
function grabCategories()
{
    // NOTE(review): the connection credentials are hard-coded here; consider
    // moving them to configuration or the environment.
    $db = new PDO("pgsql:host=10.101.192.20 port=6666 dbname=etsy_v2 user=etsy password=sekret");
    // Fail loudly: without this a failed query returns false on older PHP
    // and the caller would silently receive an empty category list.
    $db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);

    $paths = $db->query("select path from categories")->fetchAll(PDO::FETCH_COLUMN, 0);
    sort($paths);
    return $paths;
}
/**
 * Parses one solr request-log line into an associative array.
 *
 * Returned keys: line (the cleaned line), ip, sort, page, price_low,
 * price_high, uses_description, search_type, seller_id, datetime,
 * search_terms and category_chain.
 *
 * @param string $x Raw log line.
 * @param array  $c Known category paths (dot-separated strings). Leading
 *                  `tags:` values that spell out one of these paths are
 *                  reported as the category chain instead of search terms.
 * @return array Parsed fields, keyed as listed above.
 */
function line2array($x = '', $c = array())
{
    $r = array();

    // Clean up the line: decode the URL, drop backslash escapes and strip
    // the characters that would break the pipe-separated output.
    $x = urldecode($x);
    $x = str_replace('\-', '-', $x);
    $x = str_replace('\"', '"', $x);
    $x = str_replace("\\", '', $x); // Could alternatively replace \ with \\
    $x = str_replace(array('|', "\n", "\r"), '', $x); // prep for PSV
    $r['line'] = $x;

    // The IP is everything before the first space.
    $firstSpace = strpos($x, ' ');
    $r['ip'] = ($firstSpace === false) ? '' : substr($x, 0, $firstSpace);

    // Sort order: default unless one of the known orders is requested.
    $r['sort'] = 'creation_tsz asc';
    $sorts = array('creation_tsz desc', 'price asc', 'price desc', 'score asc', 'score desc');
    foreach ($sorts as $candidate) {
        if (stripos($x, "sort=".$candidate."&") !== false) {
            $r['sort'] = $candidate;
        }
    }

    // Page number, derived from the start offset at 21 rows per page.
    $r['page'] = 1;
    if (preg_match('/&start=([0-9]+)&rows=21/', $x, $m)) {
        $r['page'] = floor($m[1] / 21) + 1;
    }

    // Price range, e.g. &fq=price:[5 TO 60]& (website disallows: [1.23 TO *]).
    // Non-numeric bounds are left empty.
    $r['price_low'] = '';
    $r['price_high'] = '';
    if (preg_match('/price:\[([\S]+) TO ([^\]]+)\]&/', $x, $m)) {
        if (is_numeric($m[2])) {
            $r['price_high'] = $m[2];
        }
        if (is_numeric($m[1])) {
            $r['price_low'] = $m[1];
        }
    }

    $r['uses_description'] = (strpos($x, 'tags_title_desc') !== false) ? 1 : 0;

    // Search type: 'all' unless an is_<type>:true filter is present.
    $r['search_type'] = 'all';
    foreach (array('handmade', 'vintage', 'supplies') as $type) {
        if (strpos($x, "is_".$type.":true") !== false) {
            $r['search_type'] = $type;
        }
    }

    $r['seller_id'] = '';
    if (preg_match('/\(user_id:([0-9]+)\)/', $x, $m)) {
        $r['seller_id'] = (int)$m[1];
    }

    // Timestamp: the text between the first '[' and ']' minus its last six
    // characters, re-ordered into something strtotime() accepts.
    // NOTE(review): a line without brackets ends up with whatever
    // date() makes of a failed strtotime() - confirm that is acceptable.
    $bl = strpos($x, '[');
    $br = strpos($x, ']');
    $t = substr($x, $bl + 1, $br - $bl - 7);
    $r['datetime'] = date('Y-m-d H:i:s', strtotime(substr($t, 0, 2)." ".substr($t, 3, 3)." ".substr($t, 7, 4)." ".substr($t, 12)));

    // Search terms: tags_title* values first, each keeping its '-' prefix.
    $r['search_terms'] = '';
    $k = array();
    preg_match_all('/(-?)tags_title[^:]*:(\"[A-Za-z0-9\s]*\"|[^\)\s&]+)/', $x, $m);
    foreach ($m[1] as $i => $sign) {
        $k[] = $sign.$m[2][$i];
    }

    // When sorting by score, the words of the q= parameter are terms too.
    if (stripos($r['sort'], 'score') !== false && preg_match('/&q=([^&]*)&f/', $x, $m)) {
        $k = array_merge($k, explode(' ', $m[1]));
    }

    // Category chain from explicit (sub)category filters; may be replaced
    // below by a chain found at the start of the tags.
    preg_match_all('/(sub)?categor[^:]*:([^\s\)&]+)/', $x, $m);
    $r['category_chain'] = isset($m[2]) ? implode('.', $m[2]) : '';

    preg_match_all('/(-?)tags:(\"[A-Za-z0-9\s]*\"|[^\)\s&]+)/', $x, $m);
    $tags = array();
    foreach ($m[1] as $i => $sign) {
        $tags[] = $sign.$m[2][$i];
    }

    // Consume leading tags for as long as they spell out a known category
    // path; $u counts how many tags belong to the chain.
    $u = 0;
    if (isset($tags[0]) && $tags[0] === $r['category_chain']) {
        $u++;
    }
    while ($u < count($tags)
        && in_array(implode('.', array_slice($tags, 0, $u + 1)), $c, true)) {
        $u++;
    }
    if ($u > 0) {
        $r['category_chain'] = implode('.', array_slice($tags, 0, $u));
    }

    // The remaining tags are ordinary search terms.
    $k = array_merge($k, array_slice($tags, $u));
    $r['search_terms'] = implode(' ', $k);

    return $r;
}
/**
 * Prints, on stdout, a SQL transaction that loads one log file into $table.
 *
 * The output is BEGIN, a make_tracking_table() call, a COPY ... FROM STDIN
 * block with one pipe-separated row per log line that has search terms,
 * and COMMIT. Load-balancer and curl/7.18.0 requests are skipped.
 *
 * @param string $table      Name of the target table; inserted verbatim
 *                           into the generated SQL.
 * @param string $logfile    Path of the log file to read.
 * @param array  $categories Category paths, passed through to line2array().
 * @return void
 */
function solr($table, $logfile, $categories)
{
    // line2array() keys in the column order of the COPY statement below.
    $fields = explode(' ', 'datetime ip search_type seller_id uses_description category_chain search_terms price_low price_high sort page');

    print "BEGIN;\n\n";
    print "select make_tracking_table('$table');\n";
    print "COPY $table ";
    print " (date, ip, search_type, seller_id, uses_desc, category, terms, price_low, price_high, sort, page) ";
    print " FROM STDIN WITH DELIMITER '|' NULL AS ''; \n";

    // Read line by line so the whole log never has to sit in memory.
    // An unreadable file still yields an (empty) COPY block.
    $handle = fopen($logfile, 'r');
    if ($handle !== false) {
        while (($line = fgets($handle)) !== false) {
            //if(0 == $i % 10000) echo $i."\n";
            // Compare against false: a match at offset 0 must count too.
            if (strpos($line, 'tags:loadbalancer') !== false) {
                continue;
            }
            if (strpos($line, 'curl/7.18.0') !== false) {
                continue;
            }

            $r = line2array($line, $categories);

            // Skip rows with no search terms; a term of "0" is still a term.
            if ($r['search_terms'] === '') {
                continue;
            }

            $row = array();
            foreach ($fields as $field) {
                $row[] = $r[$field];
            }
            echo implode('|', $row)."\n";
        }
        fclose($handle);
    }

    // End-of-data marker for COPY ... FROM STDIN.
    print "\.\n";
    print "COMMIT; \n";
}
# Debug aid: uncomment one of these to run against a fixed table name and log file.
#$argv=array('','XTCXTC','2009_06_04.request.log');$argc=count($argv);
#$argv=array('','XTCXTC','score_2009_06_04.request.log');$argc=count($argv);

# A table name is mandatory. The usage text goes to stderr and the script
# exits non-zero, so it is never mistaken for generated SQL on stdout.
if ($argc < 2) {
    file_put_contents('php://stderr', "usage: <table_name> [<logfiles>...]\n\n");
    exit(1);
}

# Fetch the category paths once, then emit one transaction per log file.
$cats = grabCategories();
$table = $argv[1];
for ($i = 2; $i < $argc; $i++) {
    solr($table, $argv[$i], $cats);
}
Add Comment
Please, Sign In to add comment