Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- <?php
- include_once('simple_html_dom.php');
/**
 * Read one property of the first element matching a selector.
 *
 * Helper for scraping_slashdot(): a selector that matches nothing yields ''
 * instead of a property read on null.
 *
 * @param object $node     simple_html_dom node to search below.
 * @param string $selector Selector handed to the node's find() method.
 * @param string $property Property to read from the match (e.g. 'plaintext', 'src').
 * @return string Trimmed property value, or '' when nothing matched.
 */
function scraping_slashdot_first($node, $selector, $property) {
    $match = $node->find($selector, 0);
    if (!$match) {
        return '';
    }

    return trim((string) $match->$property);
}

/**
 * Scrape one article page and return the article blocks found on it.
 *
 * The page is first probed with fopen(); only when that succeeds is it
 * loaded through file_get_html() and searched for 'table[class^=bodyItem]'
 * blocks.
 *
 * @param int|string $art_id Article id appended to the ArticleID query parameter.
 * @return array|string List of items (keys: id, title, author, date, image,
 *                      mainimagesrc, image2, content, moreimages), possibly
 *                      empty; or the string
 *                      'URL Doesnt EXISTS for article <id></br>' when the URL
 *                      cannot be opened. Callers compare against that exact
 *                      string, so it must not change.
 */
function scraping_slashdot($art_id) {
    $url = 'http://someurl.com/?CategoryID=157&ArticleID='.$art_id;

    // Probe the URL quietly; file_get_html() would emit warnings on a dead link.
    $handle = @fopen($url, 'r');
    if ($handle === false) {
        return 'URL Doesnt EXISTS for article '.$art_id.'</br>';
    }
    // The handle is only an existence check - release it right away.
    fclose($handle);

    // Initialised up front so every path returns an array.
    $ret = array();

    $html = file_get_html($url);
    if (!$html) {
        return $ret;
    }

    foreach ($html->find('table[class^=bodyItem]') as $article) {
        $item = array(
            'id'           => $art_id,
            'title'        => scraping_slashdot_first($article, 'h1[class^=articletitle]', 'plaintext'),
            'author'       => scraping_slashdot_first($article, 'td[class^=ArticleAuthor]', 'plaintext'),
            'date'         => scraping_slashdot_first($article, 'td[class^=date]', 'plaintext'),
            'image'        => scraping_slashdot_first($article, 'td[class^=MainImage]', 'innertext'),
            'mainimagesrc' => scraping_slashdot_first($article, 'img[src*=_Uploads]', 'src'),
            'image2'       => scraping_slashdot_first($article, 'img[src*=_Uploads]', 'plaintext'),
            'content'      => scraping_slashdot_first($article, 'div[id^=ctlB]', 'plaintext'),
            // Always present so callers can read it without an isset() check.
            'moreimages'   => array(),
        );

        foreach ($article->find('img[src*=extra]') as $element) {
            $item['moreimages'][] = $element->src;
        }

        $ret[] = $item;
    }

    // clean up memory
    $html->clear();
    unset($html);

    return $ret;
}
- // -----------------------------------------------------------------------------
- // test it!
- // $art_id = '964';
- // $arts_id = array('965','964');
/**
 * Convert UTF-8 text to its HTML-entity form for the debug report.
 *
 * @param string $text UTF-8 text.
 * @return string The text as produced by mb_convert_encoding(..., 'HTML-ENTITIES', 'UTF-8').
 */
function scrap123_entities($text) {
    return mb_convert_encoding($text, 'HTML-ENTITIES', 'UTF-8');
}

/**
 * Sideload one remote image onto a post and describe a failure, if any.
 *
 * @param string $url         Remote image URL.
 * @param int    $post_id     Post the image is attached to.
 * @param string $description Description passed to media_sideload_image().
 * @return string '' on success, otherwise an HTML line for the debug report.
 */
function scrap123_sideload($url, $post_id, $description) {
    $result = media_sideload_image($url, $post_id, $description);
    if (is_wp_error($result)) {
        return '</br>IMAGE IMPORT FAILED : '
            .htmlspecialchars($url.' ('.$result->get_error_message().')', ENT_QUOTES, 'UTF-8');
    }

    return '';
}

/**
 * Report one scraped item and import it as a published WordPress post.
 *
 * @param int|string $art_id Source article id (shown in the report).
 * @param array      $v      Item as returned by scraping_slashdot(); missing keys
 *                           are treated as empty.
 * @return string HTML fragment describing the item and any import failure.
 */
function scrap123_import_item($art_id, $v) {
    $v = array_merge(
        array(
            'title'        => '',
            'author'       => '',
            'date'         => '',
            'image'        => '',
            'image2'       => '',
            'mainimagesrc' => '',
            'content'      => '',
            'moreimages'   => array(),
        ),
        is_array($v) ? $v : array()
    );
    $moreimg = is_array($v['moreimages']) ? $v['moreimages'] : array();

    // ---- debug report --------------------------------------------------
    $output  = '<span style="color:blue;font-size:1.3em"><br>###################################<br>';
    $output .= 'ITEM ID : '.$art_id.'<br>';
    $output .= '<br>###################################<br></span>';

    $clr_title = $v['title'];
    $title     = scrap123_entities($clr_title ? $clr_title : 'not found');
    $output   .= 'TITLE : '.$title.'<br>';

    $clr_author = $v['author'];
    $output    .= 'AUTHOR : '.scrap123_entities($clr_author).'<br>';

    $clr_date = $v['date'];
    $date     = scrap123_entities($clr_date);
    $output  .= 'DATE : '.$date.'<br>';

    // strtotime() reads slash-separated dates month-first and dash-separated
    // dates day-first, so the slashes are swapped for dashes before parsing.
    // The time of day is random.
    $timestamp = strtotime(str_replace('/', '-', $clr_date));
    if ($timestamp === false) {
        // Unparseable date: use "now" rather than silently dating the post 1970-01-01.
        $timestamp = time();
    }
    $clr_datenew = date('Y-m-d ', $timestamp).sprintf('%02d:%02d:00', mt_rand(0, 23), mt_rand(0, 59));

    // Report-only: the date parsed without the separator swap, for comparison
    // (epoch when it cannot be parsed).
    $raw_timestamp = strtotime($date);
    $datenew3      = date('Y-m-d H:i:s', $raw_timestamp === false ? 0 : $raw_timestamp);

    $output .= 'CLER_DATENEW : '.$clr_datenew.'<br>';
    $output .= 'CLER_DATENEW 2 : '.$datenew3.'<br>';

    $output .= 'IMAGE : '.scrap123_entities($v['image'] ? $v['image'] : 'not found').'<br>';
    $output .= 'IMAGE2 : '.scrap123_entities($v['image2']).'<br>';

    // Main image: only build (and later sideload) a URL when a source was found.
    $realimgpath = '';
    if ($v['mainimagesrc'] === '') {
        $output .= 'MAIN IMAGE SRC : not found<br>';
    } else {
        $realimgpath = 'http://someurl.com/'.str_replace('_cut/F0_0244_0000_', '', $v['mainimagesrc']);
        $output .= 'MAIN IMAGE SRC : '.scrap123_entities($v['mainimagesrc']).'<br>';
        $output .= 'REAL IMAGE PATH : '.$realimgpath.'<br>';
        $output .= '<img src="'.htmlspecialchars($realimgpath, ENT_QUOTES, 'UTF-8').'" width="100" height="100">';
    }

    $clr_content = $v['content'];
    $output     .= '</br>CONTENT : '.scrap123_entities($clr_content).'<br>';

    foreach ($moreimg as $imgadd) {
        $preview = 'http://someurl.com/'.$imgadd;
        $output .= '</br>ADDITIONAL IMAGES SRC :'.$imgadd;
        $output .= '</br>REAL ADDITIONAL IMAGE PATH : '.$preview.'<br>';
        $output .= '<img src="'.htmlspecialchars($preview, ENT_QUOTES, 'UTF-8').'" width="100" height="100">';
    }

    // END ITEM
    $output .= '<br><br><br>###################################<br><br><br>';

    // ---- create the post -----------------------------------------------
    // sanitize post content for wp
    $clr_content = wp_kses_post(balanceTags($clr_content, true));

    $post_id = wp_insert_post(array(
        'post_title'    => $clr_title,
        'post_content'  => $clr_content,
        'post_status'   => 'publish',
        'post_author'   => 1,
        'post_category' => array(2), // must be an array of category ids
        'post_excerpt'  => '',
        'post_date'     => $clr_datenew, // Y-m-d H:i:s
    ));

    // wp_insert_post() returns 0 (or a WP_Error) on failure; without a post
    // there is nothing to attach images or meta to.
    if (!$post_id || is_wp_error($post_id)) {
        $output .= '</br><span style="color:red"><b>POST INSERT FAILED for article '.$art_id.'</b></span></br>';
        return $output;
    }

    // ---- images --------------------------------------------------------
    $description = $title;
    if ($realimgpath !== '') {
        $output .= scrap123_sideload($realimgpath, $post_id, $description);
    }
    foreach ($moreimg as $imgadd) {
        // NOTE(review): this host differs from the 'http://someurl.com/' host
        // used for the previews above - confirm which one is correct.
        $addurl  = 'http://www.123.org.il/'.$imgadd;
        $output .= scrap123_sideload($addurl, $post_id, $description);
    }

    // ---- archive the original values as post meta ------------------------
    update_post_meta($post_id, 'orig_123_author', $clr_author);
    update_post_meta($post_id, 'orig_123_date', $date);
    update_post_meta($post_id, 'orig_123_clrdatenew_date', $clr_datenew);

    // media_sideload_image() is called here in its HTML-returning form, so the
    // attachment is looked up afterwards and the first one becomes the thumbnail.
    $attachments = get_posts(array(
        'numberposts'    => 1,
        'post_parent'    => $post_id,
        'post_type'      => 'attachment',
        'post_mime_type' => 'image',
        'order'          => 'ASC',
    ));
    if (!empty($attachments)) {
        set_post_thumbnail($post_id, $attachments[0]->ID);
    }

    return $output;
}

/**
 * Scrape articles 122-400 and import each one into WordPress.
 *
 * For every article id the page is scraped with scraping_slashdot(); each
 * returned item is reported and inserted as a published post. Progress and
 * the per-article report are echoed and flushed as the loop runs.
 *
 * @return void
 */
function scrap123() {
    require_once(ABSPATH . 'wp-admin/includes/media.php');
    require_once(ABSPATH . 'wp-admin/includes/file.php');
    require_once(ABSPATH . 'wp-admin/includes/image.php');

    // Long-running batch: raise the execution limit and show every error.
    set_time_limit(99999);
    ini_set('max_execution_time', '99999');
    ini_set('memory_limit', '128M');
    error_reporting(E_ALL);
    ini_set('display_errors', '1');

    echo 'running';
    flush();

    $range   = range(122, 400);
    $total   = count($range); // real number of steps for the progress display
    $current = 0;

    foreach ($range as $art_id) {
        $current++;
        outputProgress($current, $total);

        $ret = scraping_slashdot($art_id);

        if (is_string($ret)) {
            // scraping_slashdot() returns its error message as a string.
            $output = $ret . '</br> ARTICLE '.$art_id.'NOT EXISTS OR IS EMPTY OR 404';
        } else {
            $output = '';
            // (array) also covers a null result from a page that yielded no DOM.
            foreach ((array) $ret as $v) {
                $output .= scrap123_import_item($art_id, $v);
            }
            $output .= '</br><span style="color:green"><b>DONE' . $art_id .'</b></span></br>';
        }

        echo $output;
        flush();
    }
}
/**
 * Output a span with the current progress and flush it to the client.
 *
 * Shows both the step count and the percentage completed, then sleeps for
 * one second (presumably to throttle the calling loop - confirm before
 * removing).
 *
 * @param int $current Current progress out of total
 * @param int $total   Total steps required to complete
 */
function outputProgress($current, $total) {
    // Guard against a zero total so the percentage never divides by zero.
    $percent = $total > 0 ? round($current / $total * 100) : 0;

    echo "<span style='background:red;font-size:1.5em;color:#fff;'>"
        . $current . '/' . $total . ' (' . $percent . "%) </span>";
    myFlush();
    sleep(1);
}
/**
 * Push pending output towards the client.
 *
 * Emits 256 spaces of padding, then - if the current output buffer holds
 * anything - flushes and closes that buffer, and finally calls flush().
 */
function myFlush() {
    $padding = str_repeat(' ', 256);
    echo $padding;

    $buffered = @ob_get_contents();
    if ($buffered) {
        @ob_end_flush();
    }

    flush();
}
- // td[align=center]
- // h1[class^=articletitle]
- // table[class^=bodyItem]
- // <h1 class="pageTitleColor articletitle">הושלם בהצלחה פרויקט שיפוץ התשתיות ברחוב 'מלכי ישראל'</h1>
- ?>
Advertisement
Add Comment
Please, Sign In to add comment