Advertisement
terorama

Yandex blogs parse

Nov 14th, 2012
119
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 3.94 KB | None | 0 0
  1. <?php
  2.  
  3. //------------------------------
  4. function parse_topic($topic) {
  5.  
  6.    $url='http://blogs.yandex.ru/search.xml';
  7.    $postdata = 'text='.urlencode($topic);
  8.  
  9.    
  10.     $ch = curl_init();
  11.    curl_setopt($ch, CURLOPT_HEADER,0);
  12.    curl_setopt($ch, CURLOPT_URL, $url);
  13.    curl_setopt($ch, CURLOPT_TIMEOUT, 10);
  14.    curl_setopt($ch, CURLOPT_FOLLOWLOCATION,1);
  15.    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)");
  16.  
  17.    curl_setopt($ch, CURLOPT_RETURNTRANSFER,1);
  18.     curl_setopt($ch, CURLOPT_POST, 1);
  19.   curl_setopt($ch, CURLOPT_POSTFIELDS, $postdata);
  20.    curl_setopt($ch, CURLOPT_COOKIEFILE, 'yacook.txt');
  21.    curl_setopt($ch, CURLOPT_COOKIEJAR, 'yacook.txt');
  22.  
  23.  
  24.    $result = curl_exec($ch);
  25. $result = mb_convert_encoding($result,'windows-1251');
  26. //echo $result;
  27.  
  28.    $uu='<td class="l-page-search-l">';
  29.  
  30.    $rez3=substr($result, strpos($result,$uu)+strlen($uu));
  31.    $rez3=substr($rez3,0, strpos($rez3,'<td class="l-page-search-r">'));
  32.  
  33. //echo $rez3;
  34.  
  35.    preg_match_all('/class="b-item.*?<a[^>]*?href=(["\'])([^\1>]*?)\1[^>]*?class=" Search[^>]*?>(.*?)<\/a>/ims',
  36.    $rez3,  $gets);
  37.    
  38.   return $gets;
  39.  
  40. }
  41.  
  42. //------------------------------
  43. function load_yablog_todays() {
  44.  
  45.      $url='http://blogs.yandex.ru';
  46.  
  47.    $ch = curl_init();
  48.    curl_setopt($ch, CURLOPT_HEADER,0);
  49.    curl_setopt($ch, CURLOPT_URL, $url);
  50.    curl_setopt($ch, CURLOPT_TIMEOUT, 10);
  51.    curl_setopt($ch, CURLOPT_FOLLOWLOCATION,1);
  52.    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)");
  53.  
  54.    curl_setopt($ch, CURLOPT_RETURNTRANSFER,1);
  55.    curl_setopt($ch, CURLOPT_COOKIEFILE, 'yacook.txt');
  56.    curl_setopt($ch, CURLOPT_COOKIEJAR, 'yacook.txt');
  57.  
  58.    $result = curl_exec($ch);
  59.  
  60.    $result = mb_convert_encoding($result,'windows-1251');
  61.    $uu='<table class="b-themes">';
  62.  
  63.    $rez3=substr($result, strpos($result,$uu)+strlen($uu));
  64.    $rez3=substr($rez3,0, strpos($rez3,'</table>'));
  65.    //echo $rez3;
  66.    preg_match_all('/<a[^>]*?href=(["\'])(.*?)\1[^>]*?>(.*?)<\/a>/ims',$rez3, $gets);
  67.  
  68.    return $gets;
  69.  
  70.  
  71. }
  72. //-------------------------------------------
  73.  $relstr= "
  74.  <script type='text/javascript'>
  75.   window.onload=function() {
  76.  
  77.  var el=document.createElement('DIV');
  78.  el.id='inf888';
  79.  
  80.  el.num=10;
  81.  document.body.appendChild(el);
  82.  
  83.  el.tm=setInterval( function () {
  84.    
  85.    el.innerHTML='restart after '+el.num+' s';
  86.    el.num--;
  87.    
  88.    if (el.num==0) {
  89.      clearInterval(el.tm);
  90.      location.replace(location.protocol+'//'+location.hostname+location.pathname+'?step=2');
  91.    }
  92.    
  93.  },1000);
  94.  
  95. }</script>";
  96. session_start();
  97. echo '<html><head><title>test888</title>'.
  98.     '<meta http-equiv="content-type" content="text/html;charset=windows-1251"/>'.
  99.     '</head><body>';
  100.  
  101. //-------------------------------------------
  102. if (!isset($_GET['step'])) {
  103.    echo '<a href="'.$_SERVER["PHP_SELF"].'?step=1">go</a>';
  104.    exit();
  105. }
  106.  
  107. //------------------------------
  108. if ($_GET['step']==1) {
  109.  
  110.    $todays = load_yablog_todays();
  111.    $els=array();
  112.  
  113.    $_SESSION['todays3']=$todays[3];
  114.    $_SESSION['els']=$els;
  115.    $_SESSION['step4']=0;
  116.  
  117.    echo '<pre>'.print_r($todays[3],true).'</pre>';
  118.    echo $relstr;
  119.    
  120. } else if ($_GET['step']==2) {
  121.  
  122.    
  123.    $todays3=$_SESSION['todays3'];
  124.    $els=$_SESSION['els'];
  125.    $step4=$_SESSION['step4'];
  126.  
  127.    $topic=$todays3[$step4];
  128.    $els_ftg = parse_topic($topic);
  129.  
  130.    $els_ftg4=array();
  131.    $els_ftg4['lnk']=$els_ftg[2];
  132.    $els_ftg4['txt']=$els_ftg[3];
  133.  
  134.    $els = array_merge_recursive($els,$els_ftg4);
  135.  
  136.    $_SESSION['els']=$els;
  137.    $_SESSION['step4']++;
  138.  
  139.    if ($_SESSION['step4']==count($todays3)) {
  140.       echo '<h2>final results</h2>';
  141.       echo zpr($els);  
  142.       exit();    
  143.    }
  144.  
  145.    echo '<h2>current results</h2>';
  146.    echo zpr($els);
  147.    echo $relstr;
  148. }
  149.  
  150. function zpr($s) {
  151.    return '<pre>'.strip_tags(print_r($s,true)).'</pre>';
  152. }
  153.  
  154.  
  155.  
  156. ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement