Advertisement
Guest User

crawle news, galaxycloud.vn tham khao

a guest
Nov 12th, 2019
181
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 5.61 KB | None | 0 0
  1. <?php
  2. function ol1($str){
  3. echo "\n $str";
  4. }
  5. ol1("--- Start crawle ".basename(__FILE__). " --- ");
  6.  
  7. $mLink = [
  8.     'http://abc.com/pc-console/page/'=>'zy370886',
  9.     'http://abc.com/cong-dong-game/page/'=>'zd047253',
  10.     'http://abc.com/game-online/page/'=>'va207791',
  11.     'http://abc.com/game-mobile/page/'=>'ue985114',
  12.     'http://abc.com/esports/page/'=>'gh466291',
  13.     ''=>'',
  14.     ''=>'',
  15.     ''=>'',
  16. ];
  17.  
  18. $ll = 0;
  19. while (1) {
  20.     BaoGame::$newListInsert = [];
  21.     echo "<br/>\n LOOP = $ll";
  22.     if($ll){
  23.         echo "<br/>\n Sleep 10p";
  24.         sleep(600);
  25.     }
  26.     $ll++;
  27.     foreach ($mLink AS $link => $pid) {
  28.  
  29.         if (!$link)
  30.             continue;
  31.  
  32.         for ($i = 1; $i < 10; $i++) {
  33.             $link1 = "$link$i";
  34.             echo "<br/>\n$link1";
  35.             sleep(1);
  36.             getListBaiBao($link1, qqgetIdFromRand($pid));
  37.         }
  38.     }
  39.     BaoGame::checkInsert(__FILE__);;
  40. }
  41.  
  42.  
  43. function getContentBai($linkBai){
  44.     sleep(1);
  45.  
  46.     BaoGame::$totalKeoVe++;
  47.  
  48.     $cont = ctool::postget1curl($linkBai);
  49.     if(!$cont)
  50.         return;
  51.  
  52.     $xx = str_get_html($cont);
  53.  
  54.     if(!$xx)
  55.         return;
  56.  
  57.     $x1 = $xx->find("div.entry-content", 0);
  58.     if(!$x1)
  59.         return;
  60.     //$cont = $x1->innertext;
  61.  
  62.     //Tim tat ca a co href= img, jpg
  63.     foreach ($x1->find("img") AS $elm){
  64.         $src = $elm->src;
  65.         $elm->parent()->outertext = "<img src='$src' class='glx_img'>";
  66.     }
  67.  
  68.     foreach ($x1->find("script") AS $img1) {
  69.         $img1->outertext = '';
  70.     }
  71.  
  72.     $cont = $x1->innertext;
  73.  
  74.     $cont = str_replace("text-align: justify;", '', $cont);
  75.  
  76.     //echo "<br/>\nCONT = $cont ";
  77.  
  78.     //Remove all atribute khac src, href, class
  79.     $xx = str_get_html($cont);
  80.     foreach ($xx->find("*") AS $elm){
  81.         foreach ($elm->getAllAttributes() as $attr => $val) {
  82.             //if($attr != 'src' && $attr != 'title'&& $attr != 'href')
  83.             if($attr != 'src' && $attr != 'title' && $attr != 'href' && $attr != 'class')
  84.                 $elm->removeAttribute($attr);
  85.         }
  86.     }
  87.     $sum = $xx->find("p", 0)->innertext;
  88.     $sum = strip_tags($sum);
  89.     $xx->find("p", 0)->outertext = '';
  90.  
  91.     //getch( "SUME = $sum");
  92.  
  93.     $cont = $xx->innertext;
  94.  
  95.     for($i = 1; $i< 20; $i++){
  96.         $cont = str_replace("<p>&nbsp;</p>\n<p>&nbsp;</p>", "<p>&nbsp;</p>", $cont);
  97.         $cont = str_replace("<p>&nbsp;</p>\r<p>&nbsp;</p>", "<p>&nbsp;</p>", $cont);
  98.         $cont = str_replace("<p>&nbsp;</p><p>&nbsp;</p>", "<p>&nbsp;</p>", $cont);
  99.         $cont = str_replace("<p> </p>\n<p> </p>", "<p> </p>", $cont);
  100.         $cont = str_replace("<p> </p>\r<p> </p>", "<p> </p>", $cont);
  101.         $cont = str_replace("<p>&nbsp;</p> <p>&nbsp;</p>", "<p>&nbsp;</p>", $cont);
  102.     }
  103.  
  104.     //return $cont;
  105.  
  106.     return [$sum, $cont];
  107. }
  108.  
  109. function getListBaiBao($link, $pid){
  110.  
  111.     $newList = [];
  112.  
  113.     $cont = ctool::postget1curl($link);
  114.     if(!$cont)
  115.         return;
  116.  
  117.     $xx = str_get_html($cont);
  118.  
  119.     foreach ($xx->find("div.blog-post.saxon-block.saxon-large-grid-post") AS $x){
  120.  
  121.         echo "<br/>\n ++++++++++++++ " . BaoGame::$totalKeoVe . " +++++++++++++++ ";
  122.  
  123.         echo "<br/>\n Total New = ".count(BaoGame::$newListInsert);
  124.  
  125.         $x1 = str_get_html($x->innertext);
  126.  
  127.         $link1 = trim($x1->find("a",0)->href);
  128.         $link1 = "$link1";
  129.         echo "<br/>\n LINK = $link1";
  130.  
  131.         $img = \Base\ClassString::getStringBetween2StringType2($x1->innertext, 'url\(', '\);');
  132.         $img = str_replace(['(', ')', ';','"','>'] ,'', $img);
  133.  
  134.         echo "<br/>\n IMG =  $img";
  135.  
  136.         //;
  137.  
  138.         $title1 = $x1->find("h3",0)->children(0)->innertext;
  139.  
  140.         $title1 = strip_tags($title1);
  141.  
  142.         $title1 = html_entity_decode($title1);
  143.  
  144.         echo "<br/>\n Title = $title1";
  145.  
  146.  
  147.         $sum1 = $x1->find("div.post-excerpt",0)->innertext;
  148.         $sum1 = html_entity_decode($sum1);
  149.         $sum1 = strip_tags($sum1);
  150.         echo "<br/>\n Sum = $sum1";
  151.  
  152.         $time = $x1->find("div.post-date",0)->innertext;
  153.         $time = trim($time);
  154.         echo "<br/>\n Time =" . $time ;
  155.  
  156.         $time0 = $time = trim($time);
  157.         list($d, $m, $y) = explode("/", substr($time, 0,10));
  158.         //echo "<br/>\n $y-$m-$d";
  159.         $dateOK = $time = "$y-$m-$d";
  160.  
  161.         if(strtotime($dateOK) < time() - 7 * _NSECOND_DAY)
  162.             return;
  163.  
  164.         echo "<br/>\n DateOK = $dateOK";
  165.  
  166.         $daco = 0;
  167.         $obj = new \Base\ModelNewsFile();
  168.         if($obj->getOneWhere_(" refer = '$link1'")){
  169.             echo "<br/>\n Da co , ID = $obj->id ";
  170.             //continue;
  171.             $daco = 1;
  172.         }
  173.         foreach (BaoGame::$newListInsert AS $obj1){
  174.             if($obj1->refer == $link1){
  175.                 continue;
  176.             }
  177.         }
  178.         if(isCli()){
  179.  
  180.             $title1 = str_replace(["“", "”"], "\"", $title1);
  181.             $obj->name = $title1;
  182.             $obj->refer = $link1;
  183.             $obj->image0 = $img;
  184.             $obj->createdAt = $dateOK;
  185.             $obj->name = $title1;
  186.             $obj->parent = $pid;
  187.             $obj->status = 1;
  188.             $obj->siteid = ClassSetting::$siteId;
  189.             //$obj->summary = $sum1;
  190.             $ttt = getContentBai($link1);
  191.             $cont = $ttt[1];
  192.             $obj->summary = $ttt[0];
  193.             $obj->content = $cont;
  194.  
  195.             if($daco) {
  196.                 //$obj->updateDbMe();
  197.                // echo "<br/>\n Update done!";
  198.             }
  199.             else {
  200.                 BaoGame::$newListInsert[] = $obj;
  201.                 //$obj->insertDbMe();
  202.                 //echo "<br/>\n Insert done!";
  203.             }
  204.  
  205.         }
  206.     }
  207. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement