Guest User

Untitled

a guest
Apr 2nd, 2013
210
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.40 KB | None | 0 0
  1. <?php
  2. include_once('simple_html_dom.php');
  3.  
  /**
   * Scrape one article page and collect the fields of every article table on it.
   *
   * The page URL is built from the article ID, probed with fopen(), and then
   * parsed with simple_html_dom's file_get_html().
   *
   * @param mixed $art_id Article ID appended to the ArticleID query parameter.
   *
   * @return array|string A list of item arrays (keys: id, title, author, date,
   *                      image, mainimagesrc, image2, content, moreimages), or
   *                      the string 'URL Doesnt EXISTS for article <id></br>'
   *                      when the URL cannot be opened. The list is empty when
   *                      the page could not be parsed or held no article table.
   */
  function scraping_slashdot($art_id) {
      $url = 'http://someurl.com/?CategoryID=157&ArticleID=' . $art_id;

      // Probe the URL first; callers compare against this exact error string.
      $handle = @fopen($url, 'r');
      if ($handle === false) {
          return 'URL Doesnt EXISTS for article ' . $art_id . '</br>';
      }
      // The handle was only needed for the existence check, so release it.
      fclose($handle);

      // Initialised up front so the function never returns an undefined variable.
      $ret = array();

      $html = file_get_html($url);
      if (!$html) {
          return $ret;
      }

      foreach ($html->find('table[class^=bodyItem]') as $article) {
          // Build each item from scratch without resetting the accumulated list.
          $item = array();
          $item['id']           = $art_id;
          $item['title']        = scraping_slashdot_field($article, 'h1[class^=articletitle]', 'plaintext');
          $item['author']       = scraping_slashdot_field($article, 'td[class^=ArticleAuthor]', 'plaintext');
          $item['date']         = scraping_slashdot_field($article, 'td[class^=date]', 'plaintext');
          $item['image']        = scraping_slashdot_field($article, 'td[class^=MainImage]', 'innertext');
          $item['mainimagesrc'] = scraping_slashdot_field($article, 'img[src*=_Uploads]', 'src');
          $item['image2']       = scraping_slashdot_field($article, 'img[src*=_Uploads]', 'plaintext');
          $item['content']      = scraping_slashdot_field($article, 'div[id^=ctlB]', 'plaintext');

          // Always present, so callers can read the key without a notice.
          $item['moreimages'] = array();
          foreach ($article->find('img[src*=extra]') as $element) {
              $item['moreimages'][] = $element->src;
          }

          $ret[] = $item;
      }

      // Release the parsed DOM explicitly.
      $html->clear();
      unset($html);

      return $ret;
  }

  /**
   * Read one property of the first node matching a selector, trimmed.
   *
   * @param object $node     Node to search in (must offer find($selector, $index)).
   * @param string $selector Selector passed to find().
   * @param string $property Property to read from the match (e.g. 'plaintext', 'src').
   *
   * @return string Trimmed property value, or '' when nothing matches.
   */
  function scraping_slashdot_field($node, $selector, $property) {
      $match = $node->find($selector, 0);
      if (!$match) {
          return '';
      }

      return trim((string) $match->$property);
  }
  53.  
  54. // -----------------------------------------------------------------------------
  55. // test it!
  56.  
  57. // $art_id = '964';
  58.  
  59. // $arts_id = array('965','964');
  60.  
  /**
   * Import a fixed range of scraped articles into WordPress as published posts.
   *
   * For every article ID in the range the page is scraped with
   * scraping_slashdot(), a debug report is echoed, a post is inserted, the
   * article images are side-loaded as attachments, three "orig_123_*" meta
   * values are stored, and the first image attachment becomes the thumbnail.
   *
   * Relies on ABSPATH and the wp-admin media/file/image includes, so it has to
   * run inside a loaded WordPress installation.
   *
   * @return void
   */
  function scrap123(){
      require_once(ABSPATH . 'wp-admin/includes/media.php');
      require_once(ABSPATH . 'wp-admin/includes/file.php');
      require_once(ABSPATH . 'wp-admin/includes/image.php');

      // Long-running batch job: raise the limits and show every error.
      set_time_limit(99999);
      ini_set('max_execution_time', 99999);
      ini_set('memory_limit','128M');
      error_reporting(E_ALL); ini_set('display_errors', '1');

      echo 'running';
      flush();

      $range   = range(122, 400);
      // Progress total follows the real number of articles in the range.
      $total   = count($range);
      $current = 0;

      foreach ($range as $art_id) {
          $output = '';
          $current++;
          outputProgress($current, $total);

          $ret = scraping_slashdot($art_id);

          // Anything other than a non-empty list (error string, null, empty
          // array) is reported as a missing/empty article.
          if (is_array($ret) && $ret) {

              foreach ($ret as $v) {
                  $output .= '<span style="color:blue;font-size:1.3em"><br>###################################<br>';
                  $output .= 'ITEM ID : '.$art_id.'<br>' ;
                  $output .= '<br>###################################<br></span>';

                  // The $clr_* copies keep the raw scraped text for WordPress;
                  // the plain variables are entity-encoded for the debug report.
                  $title = $clr_title = $v['title'];
                  if (!$title){
                      $title = 'not found';
                  }
                  $title = mb_convert_encoding($title, 'HTML-ENTITIES', 'UTF-8');
                  $output .= 'TITLE : '.$title.'<br>';

                  $author = $clr_author = $v['author'];
                  $author = mb_convert_encoding($author, 'HTML-ENTITIES', 'UTF-8');
                  $output .= 'AUTHOR : '.$author.'<br>';

                  $date = $clr_date = $v['date'];
                  $date = mb_convert_encoding($date, 'HTML-ENTITIES', 'UTF-8');
                  $output .= 'DATE : '.$date.'<br>';

                  // Post date: scraped day ('/' swapped for '-' before parsing)
                  // with a random hour and minute. An unparseable date falls
                  // back to the current time instead of the Unix epoch.
                  $timestamp = strtotime(str_replace('/','-',$clr_date));
                  if ($timestamp === false) {
                      $timestamp = time();
                  }
                  $clr_datenew = date('Y-m-d ' . sprintf('%02d:%02d:00', mt_rand(0, 23), mt_rand(0, 59)), $timestamp);

                  // Second rendering of the date, used only in the debug report.
                  $datenew3 = date('Y-m-d H:i:s', strtotime($date));

                  $output .= 'CLER_DATENEW : '.$clr_datenew.'<br>';
                  $output .= 'CLER_DATENEW 2 : '.$datenew3.'<br>';

                  $image = $v['image'];
                  if (!$image){
                      $image = 'not found';
                  }
                  $image = mb_convert_encoding($image, 'HTML-ENTITIES', 'UTF-8');
                  $output .= 'IMAGE : '.$image.'<br>';

                  $image2 = $v['image2'];
                  $image2 = mb_convert_encoding($image2, 'HTML-ENTITIES', 'UTF-8');
                  $output .= 'IMAGE2 : '.$image2.'<br>';

                  // Main image source; remember whether one was actually found
                  // so the 'not found' placeholder is never side-loaded.
                  $mainimagesrc = $v['mainimagesrc'];
                  $has_main_image = (bool) $mainimagesrc;
                  if (!$mainimagesrc){
                      $mainimagesrc = 'not found';
                  }
                  $mainimagesrc = mb_convert_encoding($mainimagesrc, 'HTML-ENTITIES', 'UTF-8');
                  $output .= 'MAIN IMAGE SRC : '.$mainimagesrc.'<br>';

                  // Strips the '_cut/F0_0244_0000_' part of the path; presumably
                  // this maps a thumbnail to the full-size image - TODO confirm.
                  $realimgpath = 'http://someurl.com/'.str_replace('_cut/F0_0244_0000_','',$mainimagesrc);
                  $output .= 'REAL IMAGE PATH : '.$realimgpath.'<br>';
                  $output .= '<img src='.$realimgpath.' width="100" height="100">';

                  $content = $clr_content = $v['content'];
                  $content = mb_convert_encoding($content, 'HTML-ENTITIES', 'UTF-8');
                  $output .= '</br>CONTENT : '.$content.'<br>';

                  // Additional images; the key may be missing when none exist.
                  $moreimg = (isset($v['moreimages']) && is_array($v['moreimages'])) ? $v['moreimages'] : array();
                  foreach ($moreimg as $imgadd) {
                      $output .= '</br>ADDITIONAL IMAGES SRC :' . $imgadd;
                      $output .= '</br>REAL ADDITIONAL IMAGE PATH : '.'http://someurl.com/' . $imgadd .'<br>';
                      $output .= '<img src='.'http://someurl.com/' . $imgadd .' width="100" height="100">';
                  }

                  // END ITEM
                  $output .= '<br><br><br>###################################<br><br><br>';

                  // Sanitize the post body for WordPress.
                  $clr_content = balanceTags($clr_content,true);
                  $clr_content = wp_kses_post($clr_content);

                  // CREATE THE POST OBJECT
                  $my_post = array(
                      'post_title'    => $clr_title,
                      'post_content'  => $clr_content,
                      'post_status'   => 'publish',
                      'post_author'   => 1,
                      'post_category' => array(2), // must be an array of category IDs
                      'post_excerpt'  => NULL,
                      'post_date'     => $clr_datenew // Y-m-d H:i:s
                  );

                  $post_id = wp_insert_post( $my_post );

                  // Without a post there is nothing to attach media or meta to.
                  if (!$post_id || is_wp_error($post_id)) {
                      $output .= '</br><span style="color:red"><b>POST NOT CREATED FOR ARTICLE ' . $art_id . '</b></span></br>';
                      continue;
                  }

                  $description = $title;
                  if ($has_main_image) {
                      media_sideload_image($realimgpath,$post_id,$description);
                  }

                  // NOTE(review): this host differs from the 'http://someurl.com/'
                  // host used for the same images in the report above - confirm
                  // which one is correct.
                  foreach ($moreimg as $imgadd) {
                      $addurl = 'http://www.123.org.il/' . $imgadd;
                      media_sideload_image($addurl,$post_id,$description);
                  }

                  // Keep the original author and dates as post meta for archiving.
                  update_post_meta( $post_id, 'orig_123_author', $clr_author );
                  update_post_meta( $post_id, 'orig_123_date', $date );
                  update_post_meta( $post_id, 'orig_123_clrdatenew_date', $clr_datenew );

                  // media_sideload_image() is used here without an attachment ID,
                  // so look up the first image attachment of the new post.
                  $attachments = get_posts(array(
                      'numberposts'    => '1',
                      'post_parent'    => $post_id,
                      'post_type'      => 'attachment',
                      'post_mime_type' => 'image',
                      'order'          => 'ASC'));

                  if (sizeof($attachments) > 0){
                      // set image as the post thumbnail
                      set_post_thumbnail($post_id, $attachments[0]->ID);
                  }
              } // end foreach($ret as $v)

              $output .= '</br><span style="color:green"><b>DONE' . $art_id .'</b></span></br>';

          } else {
              // Only a string result (the scraper's error message) is echoed back.
              $output .= (is_string($ret) ? $ret : '') . '</br> ARTICLE '.$art_id.'NOT EXISTS OR IS EMPTY OR 404' ;
          }

          echo $output ;
          flush();

      } // end for each art id
  }
  271.  
  272.  
  273.  
  /**
   * Echo a red progress badge and pause for one second.
   *
   * The badge text is "<current>/<total>% " wrapped in a styled span; the
   * output is pushed out through myFlush() before sleeping.
   *
   * @param int $current Number of steps done so far.
   * @param int $total   Number of steps expected overall.
   */
  function outputProgress($current, $total) {
      $badge_open  = "<span style='background:red;font-size:1.5em;color:#fff;'>";
      $badge_close = "% </span>";

      echo $badge_open . $current . '/' . $total . $badge_close;

      myFlush();
      sleep(1);
  }
  286.  
  /**
   * Push pending output towards the client.
   *
   * Emits 256 spaces of padding, then ends and flushes the active output
   * buffer when ob_get_contents() reports non-empty content, and finally
   * calls flush(). The padding presumably exists to get past client-side
   * buffering thresholds - TODO confirm.
   */
  function myFlush() {
      $padding = str_repeat(' ', 256);
      echo $padding;

      // Warnings from the buffer calls are suppressed deliberately.
      $pending = @ob_get_contents();
      if ($pending) {
          @ob_end_flush();
      }

      flush();
  }
  297. // td[align=center]
  298. // h1[class^=articletitle]
  299. // table[class^=bodyItem]
  300. // <h1 class="pageTitleColor articletitle">הושלם בהצלחה פרויקט שיפוץ התשתיות ברחוב 'מלכי ישראל'</h1>
  301. ?>
Advertisement
Add Comment
Please, Sign In to add comment