Advertisement
DJ_PhoeniX

VUZ parser

Dec 4th, 2011
2,841
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 3.12 KB | None | 0 0
  1. <?
  2.     Header('Content-type: text/html; charset=UTF-8');
  3.     mysql_pconnect('host','user','pass') or die(mysql_errno().": ".mysql_error());
  4.     mysql_select_db('vuz_db') or die(mysql_errno().": ".mysql_error());
  5.     mysql_query('set names \'utf8\'') or die(mysql_errno().": ".mysql_error());
  6.     set_time_limit(0);
  7.     function koi2utf($s){
  8.         return iconv('koi8-r','utf-8',$s);
  9.     }
  10.     function curl_get($url, $params = false){
  11.         if(strpos($url,'#')) $url = substr($url,0,strpos($url,'#'));
  12.         if($params){
  13.             $url .= strpos($url,'?') ? '&' : '?';
  14.             foreach($params as $k=>&$v) $v = URLEncode($k).'='.URLEncode($v);
  15.             $params = join('&',$params);
  16.             $url .= $params;
  17.         }
  18.         $ch = curl_init();
  19.         curl_setopt($ch, CURLOPT_URL, $url);
  20.         curl_setopt($ch, CURLOPT_HEADER, false);
  21.         curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
  22.         curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);
  23.         curl_setopt($ch, CURLOPT_USERAGENT, $_SERVER['HTTP_USER_AGENT']);
  24.         $data = curl_exec($ch);
  25.         curl_close($ch);
  26.         return $data;
  27.     }
  28.     $page = isset($_GET['page']) ? $_GET['page'] : 1;
  29.     $data = curl_get('http://abitur.nica.ru/new/www/search.php',array('page'=>$page));
  30.     preg_match_all('/vuz_detail\.php\?[^"]*/i',$data,$match);
  31.     $specs = array();
  32.     $vuzes = array();
  33.     $vuzes_specs = array();
  34.     foreach($match[0] as $v){
  35.         preg_match('/code=([0-9]*)/i',$v,$vuz_id);
  36.         $vuz_id = intval($vuz_id[1]);
  37.         $data = curl_get("http://abitur.nica.ru/new/www/vuz_detail.php",array('code'=>$vuz_id));
  38.         preg_match_all('/<h1>(.*?)<\/h1>/i',$data,$vuz);
  39.         $vuz_name = $vuz[1][0];
  40.         preg_match_all('/<div\s+class="contact"\s*>.*?<\/div>/is',$data,$vuz_contacts);
  41.         preg_match_all('/<p>(.*?)<\/p>/is',$vuz_contacts[0][0],$vuz_contacts);
  42.         $vuz_contacts = preg_replace('/\s+/',' ',$vuz_contacts[1]);
  43.         $vuz_addr = trim($vuz_contacts[0]);
  44.         $data = curl_get("http://abitur.nica.ru/new/www/vuz_specs.php",array('code'=>$vuz_id));
  45.         preg_match_all('/<span\s+class="bold"\s*>(.*?)<\/span>/is',$data,$vuz_specs);
  46.         foreach($vuz_specs[1] as &$s){
  47.             preg_match('/\s*(.*)\s+\((\d{6})\)/is',$s,$s_inf);
  48.             $specs[$s_inf[2]]=koi2utf($s_inf[1]);
  49.             $vuzes_specs[]='('.$vuz_id.','.intval($s_inf[2]).')';
  50.         }
  51.         $vuzes[]='('.intval($vuz_id).',\''.koi2utf($vuz_name).'\',\''.str_replace('\'','\\\'',koi2utf($vuz_addr)).'\')';
  52.     }
  53.     ksort($specs);
  54.     foreach($specs as $id=>&$name){
  55.         $name = '('.intval($id).',\''.$name.'\')';
  56.     }
  57.     if(count($vuzes)>0) mysql_query('insert ignore into vuzes values'.join(',',$vuzes)) or die(mysql_errno().": ".mysql_error());
  58.     if(count($specs)>0) mysql_query('insert ignore into specs values'.join(',',$specs)) or die(mysql_errno().": ".mysql_error());
  59.     if(count($vuzes_specs)>0) mysql_query('insert ignore into vuzes_specs values'.join(',',$vuzes_specs)) or die(mysql_errno().": ".mysql_error());
  60.     echo "Со страницы #$page в базу добавлено ".count($vuzes)." вузов, ".count($specs)." специализаций и ".count($vuzes_specs)." пар \"вуз-специализация\".<br/>";
  61.     if(count($vuzes)==0)  echo "Похоже, это всё. Ура :)";
  62.     else echo "Работаем дальше...<script>location.replace('?page=".(++$page)."');</script>";
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement