Guest User

Untitled

a guest
Jun 20th, 2018
106
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.85 KB | None | 0 0
  1. #!/usr/bin/php
  2. <?
  3. #usage: php solr-parse.php [LOGFILE...]
  4. #parses a solr log file of a certain etsyish type
  5. ini_set("memory_limit","-1");
  6. set_time_limit(0);
  7. date_default_timezone_set("GMT");
  8.  
  9. set_error_handler('phpStderr');
  10.  
  11. function phpStderr($errno, $errstr, $errfile, $errline) {
  12. if(error_reporting() === 0){
  13. return;
  14. }
  15. file_put_contents('php://stderr' ,$errno.': Error in '.$errfile.':'.$errline.' - '.$errstr. "\n");
  16. }
  17.  
  18. function grabCategories()
  19. {
  20. $p=new PDO("pgsql:host=10.101.192.20 port=6666 dbname=etsy_v2 user=etsy password=sekret");
  21. $paths=array();
  22. foreach($p->query("select path from categories") as $r) $paths[]=$r[0];
  23. sort($paths);
  24. return $paths;
  25. }
  26.  
  27. function line2array($x='',$c)
  28. {
  29. $xx=$x;
  30. $r=array();
  31. $x=urldecode($x);
  32. $x=str_replace('\-','-',$x);
  33. $x=str_replace('\"','"',$x);
  34. $x=str_replace("\\",'',$x); //Could alternatively replace \ with \\
  35. $x=str_replace(array('|',"\n","\r"),'',$x);//prep for PSV
  36.  
  37. #echo"$x\n";
  38.  
  39. $r['line']=$x;
  40.  
  41. $r['ip']=substr($x,0,strpos($x,' '));
  42.  
  43. $r['sort']='creation_tsz asc';
  44. $sorts=array('creation_tsz desc','price asc','price desc', 'score asc', 'score desc');
  45. foreach($sorts as $y)if(stripos($x,"sort=".$y."&")!==false)$r['sort']=$y;
  46.  
  47. $r['page']=1;
  48. preg_match('/&start=([0-9]+)&rows=21/', $x, $m);
  49. if(count($m))$r['page']=floor($m[1]/21)+1;
  50.  
  51. $r['price_low']='';
  52. $r['price_high']='';
  53. //&fq=price:[5 TO 60]& //website disallows: [1.23 TO *]
  54. preg_match('/price:\[([\S]+) TO ([^\]]+)\]&/', $x, $m);
  55. if(count($m))
  56. { if(is_numeric($m[2]))$r['price_high']=$m[2];
  57. if(is_numeric($m[1]))$r['price_low']=$m[1];
  58. }
  59.  
  60. $r['uses_description']=strpos($x,'tags_title_desc')!==false?1:0;
  61.  
  62. $r['search_type']='all';
  63. $types=explode(' ','handmade vintage supplies');
  64. foreach($types as $y)if(strpos($x,"is_".$y.":true")!==false)$r['search_type']=$y;
  65.  
  66. $r['seller_id']='';
  67. preg_match('/\(user_id:([0-9]+)\)/', $x, $m);
  68. if(count($m)) $r['seller_id']=(int)$m[1];
  69.  
  70. $bl=strpos($x,'[');
  71. $br=strpos($x,']');
  72. $t=substr($x,$bl+1,$br-$bl-7);
  73. $r['datetime']=date('Y-m-d H:i:s',strtotime(substr($t,0,2)." ".substr($t,3,3)." ".substr($t,7,4)." ".substr($t,12)));
  74.  
  75. #minus terms first, then not-minus terms
  76. $r['search_terms']='';
  77.  
  78. $k=array();
  79. preg_match_all('/(-?)tags_title[^:]*:(\"[A-Za-z0-9\s]*\"|[^\)\s&]+)/',$x,$m);
  80. for($i=0;$i<count($m[1]);$i++) $m[1][$i] .= $m[2][$i];
  81. $k=array_merge($k,$m[1]);
  82.  
  83. if(stripos($r['sort'],'score') !== false)
  84. {
  85. preg_match('/&q=([^&]*)&f/',$x,$m);
  86. if(count($m))
  87. {
  88. #foreach(explode(' ',$m[1]) as $z) echo "$z ";
  89. #echo "\n";
  90. ###for($i=0;$i<count($m[1]);$i++) $m[1][$i] .= $m[2][$i];
  91. $k=array_merge($k,explode(' ',$m[1]));
  92. }
  93. }
  94.  
  95. $r['category_chain']='';
  96. preg_match_all('/(sub)?categor[^:]*:([^\s\)&]+)/', $x, $m);
  97. if(count($m)>2) $r['category_chain']=implode('.',$m[2]); //sometimes overwritten later on (or used to be)
  98.  
  99. preg_match_all('/(-?)tags:(\"[A-Za-z0-9\s]*\"|[^\)\s&]+)/',$x,$m);
  100. for($i=0;$i<count($m[1]);$i++) $m[1][$i] .= $m[2][$i];
  101. $tags=$m[1];
  102.  
  103. $u=0;
  104. if(isset($tags[0]) && $tags[0]==$r['category_chain'])$u++;
  105. while ($u < count($tags)
  106. && is_array(array_slice($tags,0,$u+1))
  107. && in_array(implode('.',array_slice($tags,0,$u+1)),$c)) {
  108. $u++;
  109. }
  110. if($u>0) $r['category_chain']=implode('.',array_slice($tags,0,$u));
  111.  
  112. $k=array_merge($k,array_slice($tags,$u));
  113. $r['search_terms']=implode(' ',$k);
  114.  
  115. return $r;
  116. }
  117.  
  118. function solr($table, $logfile, $categories)
  119. {
  120. print "BEGIN;\n\n";
  121. print "select make_tracking_table('$table');\n";
  122. print "COPY $table ";
  123. print " (date, ip, search_type, seller_id, uses_desc, category, terms, price_low, price_high, sort, page) ";
  124. print " FROM STDIN WITH DELIMITER '|' NULL AS ''; \n";
  125.  
  126. $a = file($logfile, FILE_IGNORE_NEW_LINES);
  127. if ($a) {
  128. for($i=0;$i<count($a);$i++)
  129. {
  130. //if(0 == $i % 10000) echo $i."\n";
  131. if (strpos($a[$i],'tags:loadbalancer')) continue;
  132. if (strpos($a[$i],'curl/7.18.0')) continue;
  133. $r=line2array($a[$i],$categories);
  134.  
  135. $g=explode(' ','datetime ip search_type seller_id uses_description category_chain search_terms price_low price_high sort page');
  136. $h=array();
  137. foreach($g as $z) $h[]=$r[$z];
  138. if($r['search_terms']) #Skip adding results with no search terms
  139. echo implode('|',$h)."\n";
  140. }
  141. }
  142. print "\.\n";
  143. print "COMMIT; \n";
  144. }
  145.  
  146.  
  147. #$argv=array('','XTCXTC','2009_06_04.request.log');$argc=count($argv);
  148. #$argv=array('','XTCXTC','score_2009_06_04.request.log');$argc=count($argv);
  149.  
  150. if ($argc < 2) {
  151. print "usage: <table_name> [<logfiles>...]\n\n";
  152. exit;
  153. }
  154.  
  155. $cats = grabCategories();
  156. $table = $argv[1];
  157.  
  158. for($i = 2; $i < $argc; $i++) {
  159. solr($table, $argv[$i], $cats);
  160. }
Add Comment
Please, Sign In to add comment