Advertisement
Guest User

Untitled

a guest
Feb 10th, 2012
1,694
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 18.10 KB | None | 0 0
  <?php
  /**
   * Sphider spider entry script.
   *
   * Loads configuration and helpers, reads the crawl parameters either from the
   * HTTP request variables or from the command line, opens the optional log
   * file and then indexes either every stored site (index_all) or a single
   * site (index_site).
   */
  set_time_limit (0);
  $include_dir = "../include";
  include "auth.php";
  require_once ("$include_dir/commonfuncs.php");
  $all = 0;
  // NOTE(review): extract() lets request variables overwrite any variable
  // defined so far (e.g. $all, $include_dir). It is kept because the crawl
  // parameters ($url, $soption, $maxlevel, ...) arrive this way in web mode;
  // consider reading the expected keys explicitly instead.
  extract (getHttpVars());
  $settings_dir = "../settings";
  require_once ("$settings_dir/conf.php");

  include "messages.php";
  include "spiderfuncs.php";
  error_reporting (E_ALL ^ E_NOTICE ^ E_WARNING);

  // Timestamp of the last page fetch; index_url() uses it to enforce $min_delay.
  $delay_time = 0;

  // URLs already queued in the current session. Initialised before any
  // indexing starts because index_url() reads it through "global".
  $tmp_urls = array();

  $command_line = 0;

  if (isset($_SERVER['argv']) && $_SERVER['argc'] >= 2) {
      $command_line = 1;
      $cli_args = $_SERVER['argv'];
      $cli_count = count($cli_args);
      $ac = 1; //argument counter
      while ($ac < $cli_count) {
          $arg = $cli_args[$ac];

          if ($arg == '-all') {
              $all = 1;
              break;
          }

          if ($arg == '-u' || $arg == '-d' || $arg == '-m' || $arg == '-n') {
              // These options consume the next argument as their value; bail
              // out with the usage text when that value is missing.
              if ($ac + 1 >= $cli_count) {
                  commandline_help();
                  die();
              }
              $value = $cli_args[$ac + 1];
              if ($arg == '-u') {
                  $url = $value;
              } else if ($arg == '-d') {
                  $soption = 'level';
                  // The depth is interpolated unquoted into SQL later on.
                  $maxlevel = (int)$value;
              } else if ($arg == '-m') {
                  $in = str_replace("\\n", chr(10), $value);
              } else {
                  $out = str_replace("\\n", chr(10), $value);
              }
              $ac = $ac + 2;
          } else if ($arg == '-f') {
              $soption = 'full';
              $ac++;
          } else if ($arg == '-l') {
              $domaincb = 1;
              $ac++;
          } else if ($arg == '-r') {
              $reindex = 1;
              $ac++;
          } else {
              commandline_help();
              die();
          }
      }
  }

  // Defaults for anything that neither the request nor the command line set.
  if (isset($soption) && $soption == 'full') {
      $maxlevel = -1;
  }

  if (!isset($soption)) {
      // Without a mode the crawl loop in index_site() never runs; fall back to
      // depth-limited crawling (depth 0 unless $maxlevel was supplied).
      $soption = 'level';
  }

  if (!isset($domaincb)) {
      $domaincb = 0;
  }

  if (!isset($reindex)) {
      $reindex = 0;
  }

  if (!isset($maxlevel)) {
      $maxlevel = 0;
  }

  // Stays false when logging is disabled so later checks read a defined value.
  $log_handle = false;

  if ($keep_log) {
      if ($log_format == "html") {
          $log_file = $log_dir."/".Date("ymdHi").".html";
      } else {
          $log_file = $log_dir."/".Date("ymdHi").".log";
      }

      $log_handle = fopen($log_file, 'w');
      if (!$log_handle) {
          die ("Logging option is set, but cannot open file for logging.");
      }
  }

  if ($all == 1) {
      index_all();
  } else {

      if ($command_line == 1 && (!isset($url) || $url == '')) {
          // Nothing to index: no -all and no -u <url>.
          commandline_help();
          die();
      }

      if ($reindex == 1 && $command_line == 1) {
          // Reindexing from the command line reuses the settings stored for
          // the site. The URL comes straight from argv, so escape it.
          $lookup_url = mysql_real_escape_string($url);
          $result = mysql_query("select url, spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites where url='$lookup_url'");
          echo mysql_error();
          if ($result && ($row = mysql_fetch_row($result))) {
              $url = $row[0];
              $maxlevel = $row[1];
              $in = $row[2];
              $out = $row[3];
              $domaincb = $row[4];
              if ($domaincb == '') {
                  $domaincb = 0;
              }
              if ($maxlevel == -1) {
                  $soption = 'full';
              } else {
                  $soption = 'level';
              }
          }
      }

      if (!isset($in)) {
          $in = "";
      }
      if (!isset($out)) {
          $out = "";
      }

      index_site($url, $reindex, $maxlevel, $soption, $in, $out, $domaincb);
  }
  128.  
  129.  
  /**
   * Returns the current Unix timestamp with microseconds as a float.
   *
   * microtime() without arguments yields the string "usec sec"; both parts
   * are converted to float and added together.
   */
  function microtime_float()
  {
      $parts = explode(" ", microtime());
      $micro_part = (float)$parts[0];
      $second_part = (float)$parts[1];

      return ($micro_part + $second_part);
  }
  134.  
  135.  
  /**
   * Fetches one URL, queues the links found on it and stores the page and its
   * keywords in the index.
   *
   * @param string $url              URL to fetch.
   * @param int    $level            Level written to the temp table for links
   *                                 found on this page; the page itself is
   *                                 stored with level $level - 1.
   * @param mixed  $site_id          Id of the site the page belongs to.
   * @param string $md5sum           Stored checksum of the page, '' when the
   *                                 page has not been indexed before.
   * @param string $domain           Not used by the current implementation.
   * @param string $indexdate        Not used by the current implementation.
   * @param string $sessid           Session id tagging rows in the temp table.
   * @param mixed  $can_leave_domain Passed on to url_purify() and get_links().
   * @param int    $reindex          1 when re-indexing an existing site.
   *
   * Produces its results through the database and the print*Report helpers;
   * nothing is returned.
   *
   * NOTE(review): values are interpolated into SQL as in the rest of this
   * file. Whether url_purify()/get_links()/clean_file() already escape their
   * output is not visible from here, so no escaping was added - confirm.
   */
  function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex) {
      global $min_delay;
      global $command_line;
      global $min_words_per_page;
      global $mysql_table_prefix, $user_agent, $tmp_urls, $delay_time, $domain_arr;

      $deletable = 0;

      $url_status = url_status($url);
      $thislevel = $level - 1;

      if (strstr($url_status['state'], "Relocation")) {
          // Follow the redirect by queueing its target for this session.
          $url = preg_replace("/ /", "", url_purify($url_status['path'], $url, $can_leave_domain));

          if ($url <> '') {
              $result = mysql_query("select link from ".$mysql_table_prefix."temp where link='$url' && id = '$sessid'");
              echo mysql_error();
              $rows = mysql_num_rows($result);
              if ($rows == 0) {
                  mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', '$level', '$sessid')");
                  echo mysql_error();
              }
          }

          // The original line used "==" here, which compared and discarded
          // the result; the state is meant to be replaced.
          $url_status['state'] = "redirected";

          // NOTE(review): from here on $url is the redirect target, so the
          // check_for_removal() call at the end receives the target rather
          // than the URL that redirected - confirm this is intended.
      }

      ini_set("user_agent", $user_agent);
      if ($url_status['state'] == 'ok') {
          $OKtoIndex = 1;
          $file_read_error = 0;

          // Keep at least $min_delay seconds between two fetches.
          $elapsed = time() - $delay_time;
          if ($elapsed < $min_delay) {
              sleep ($min_delay - $elapsed);
          }
          $delay_time = time();

          if (!fst_lt_snd(phpversion(), "4.3.0")) {
              $file = file_get_contents($url);
              if ($file === FALSE) {
                  $file_read_error = 1;
              }
          } else {
              // Fallback for interpreters without file_get_contents().
              $file = '';
              $fl = @fopen($url, "r");
              if ($fl) {
                  while (($buffer = @fgets($fl, 4096)) !== false) {
                      $file .= $buffer;
                  }
                  // Only close a handle that was actually opened.
                  fclose ($fl);
              } else {
                  $file_read_error = 1;
              }
          }

          if ($file_read_error) {
              $contents = getFileContents($url);
              $file = $contents['file'];
          }

          $pageSize = number_format(strlen($file)/1024, 2, ".", "");
          printPageSizeReport($pageSize);

          if ($url_status['content'] != 'text') {
              $file = extract_text($file, $url_status['content']);
          }

          printStandardReport('starting', $command_line);

          $newmd5sum = md5($file);

          if ($md5sum == $newmd5sum) {
              printStandardReport('md5notChanged',$command_line);
              $OKtoIndex = 0;
          } else if (isDuplicateMD5($newmd5sum)) {
              $OKtoIndex = 0;
              printStandardReport('duplicate',$command_line);
          }

          if (($md5sum != $newmd5sum || $reindex == 1) && $OKtoIndex == 1) {
              $data = clean_file($file, $url, $url_status['content']);

              if ($data['noindex'] == 1) {
                  $OKtoIndex = 0;
                  $deletable = 1;
                  printStandardReport('metaNoindex',$command_line);
              }

              $wordarray = unique_array(explode(" ", $data['content']));

              if ($data['nofollow'] != 1) {
                  $links = get_links($file, $url, $can_leave_domain, $data['base']);
                  $links = distinct_array($links);
                  $all_links = is_array($links) ? count($links) : 0;
                  $numoflinks = 0;
                  //if there are any, add to the temp table, but only if there isnt such url already
                  if (is_array($links)) {
                      foreach ($links as $link) {
                          if (!isset($tmp_urls[$link]) || $tmp_urls[$link] != 1) {
                              $tmp_urls[$link] = 1;
                              $numoflinks++;
                              mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$link', '$level', '$sessid')");
                              echo mysql_error();
                          }
                      }
                  }
              } else {
                  printStandardReport('noFollow',$command_line);
              }

              if ($OKtoIndex == 1) {

                  $title = $data['title'];
                  $host = $data['host'];
                  $path = $data['path'];
                  $fulltxt = $data['fulltext'];
                  $desc = substr($data['description'], 0,254);
                  $url_parts = parse_url($url);
                  $domain_for_db = $url_parts['host'];

                  // Reuse the cached domain id or register a new domain.
                  if (isset($domain_arr[$domain_for_db])) {
                      $dom_id = $domain_arr[$domain_for_db];
                  } else {
                      mysql_query("insert into ".$mysql_table_prefix."domains (domain) values ('$domain_for_db')");
                      $dom_id = mysql_insert_id();
                      $domain_arr[$domain_for_db] = $dom_id;
                  }

                  $wordarray = calc_weights ($wordarray, $title, $host, $path, $data['keywords']);

                  //if there are words to index, add the link to the database, get its id, and add the word + their relation
                  if (is_array($wordarray) && count($wordarray) > $min_words_per_page) {
                      if ($md5sum == '') {
                          mysql_query ("insert into ".$mysql_table_prefix."links (site_id, url, title, description, fulltxt, indexdate, size, md5sum, level) values ('$site_id', '$url', '$title', '$desc', '$fulltxt', curdate(), '$pageSize', '$newmd5sum', $thislevel)");
                          echo mysql_error();
                          $result = mysql_query("select link_id from ".$mysql_table_prefix."links where url='$url'");
                          echo mysql_error();
                          $row = mysql_fetch_row($result);
                          $link_id = $row[0];

                          save_keywords($wordarray, $link_id, $dom_id);

                          printStandardReport('indexed', $command_line);
                      } else if (($md5sum <> '') && ($md5sum <> $newmd5sum)) { //if page has changed, start updating

                          $result = mysql_query("select link_id from ".$mysql_table_prefix."links where url='$url'");
                          echo mysql_error();
                          $row = mysql_fetch_row($result);
                          $link_id = $row[0];
                          // Keyword relations live in 16 tables suffixed 0..f.
                          for ($i = 0; $i <= 15; $i++) {
                              $char = dechex($i);
                              mysql_query ("delete from ".$mysql_table_prefix."link_keyword$char where link_id=$link_id");
                              echo mysql_error();
                          }
                          save_keywords($wordarray, $link_id, $dom_id);
                          $query = "update ".$mysql_table_prefix."links set title='$title', description ='$desc', fulltxt = '$fulltxt', indexdate=now(), size = '$pageSize', md5sum='$newmd5sum', level=$thislevel where link_id=$link_id";
                          mysql_query($query);
                          echo mysql_error();
                          printStandardReport('re-indexed', $command_line);
                      }
                  } else {
                      printStandardReport('minWords', $command_line);
                  }
              }
          }
      } else {
          $deletable = 1;
          printUrlStatus($url_status['state'], $command_line);
      }

      if ($reindex == 1 && $deletable == 1) {
          check_for_removal($url);
      }

      if (!isset($all_links)) {
          $all_links = 0;
      }
      if (!isset($numoflinks)) {
          $numoflinks = 0;
      }
      printLinksReport($numoflinks, $all_links, $command_line);
  }
  348.  
  349.  
  /**
   * Crawls one site level by level and indexes every page that passes the
   * robots and include/exclude filters.
   *
   * @param string $url              Start URL of the site; "/" is appended when
   *                                 it has no path.
   * @param int    $reindex          1 to re-index pages that are already stored.
   * @param int    $maxlevel         Deepest level to crawl when $soption is
   *                                 'level'.
   * @param string $soption          'level' (depth limited) or 'full'; any
   *                                 other value skips the crawl loop.
   * @param string $url_inc          Stored as the site's "required" value and
   *                                 passed to check_include().
   * @param string $url_not_inc      Stored as the site's "disallowed" value and
   *                                 passed to check_include().
   * @param mixed  $can_leave_domain Stored with the site and passed to
   *                                 index_url().
   *
   * A crawl that was interrupted earlier is resumed from the row found in the
   * pending table. Nothing is returned; progress is reported through the
   * print* helpers.
   */
  function index_site($url, $reindex, $maxlevel, $soption, $url_inc, $url_not_inc, $can_leave_domain) {
      global $mysql_table_prefix, $command_line, $mainurl, $tmp_urls, $domain_arr, $all_keywords;

      // Load the keyword => id map once per script run.
      if (!isset($all_keywords)) {
          $result = mysql_query("select keyword_ID, keyword from ".$mysql_table_prefix."keywords");
          echo mysql_error();
          while ($row = mysql_fetch_array($result)) {
              $all_keywords[addslashes($row[1])] = $row[0];
          }
      }

      $compurl = parse_url($url);
      if (!isset($compurl['path']) || $compurl['path'] == '') {
          $url = $url . "/";
      }

      // Fresh session id for the rows this crawl puts into the temp table.
      $t = microtime();
      $a = getenv("REMOTE_ADDR");
      $sessid = md5 ($t.$a);

      $urlparts = parse_url($url);
      $domain = $urlparts['host'];

      $result = mysql_query("select site_id from ".$mysql_table_prefix."sites where url='$url'");
      echo mysql_error();
      $row = mysql_fetch_row($result);
      $site_id = $row[0];

      if ($site_id == '') {
          // Unknown site: register it and read back its id.
          mysql_query ("insert into ".$mysql_table_prefix."sites (url, indexdate, spider_depth, required, disallowed, can_leave_domain) " .
              "values ('$url', now(), $maxlevel, '$url_inc', '$url_not_inc', $can_leave_domain)");
          echo mysql_error();
          $result = mysql_query("select site_ID from ".$mysql_table_prefix."sites where url='$url'");
          $row = mysql_fetch_row($result);
          $site_id = $row[0];
      } else {
          if ($reindex == 1) {
              // Queue the start URL and every page already stored for the
              // site so that each of them is visited again.
              mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', 0, '$sessid')");
              echo mysql_error();
              $result = mysql_query("select url, level from ".$mysql_table_prefix."links where site_id = $site_id");
              while ($row = mysql_fetch_array($result)) {
                  $site_link = $row['url'];
                  $link_level = $row['level'];
                  if ($site_link != $url) {
                      mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$site_link', $link_level, '$sessid')");
                  }
              }
          }

          mysql_query ("update ".$mysql_table_prefix."sites set indexdate=now(), spider_depth = $maxlevel, required = '$url_inc'," .
              "disallowed = '$url_not_inc', can_leave_domain=$can_leave_domain where site_id=$site_id");
          echo mysql_error();
      }

      $result = mysql_query("select site_id, temp_id, level, count, num from ".$mysql_table_prefix."pending where site_id='$site_id'");
      echo mysql_error();
      $row = mysql_fetch_row($result);
      $pending = $row[0];
      $level = 0;
      $num = 0;
      $domain_arr = get_domains();

      if ($pending == '') {
          // New crawl: forget URLs queued for a previous site, because the
          // temp table is filtered by session id and links seen elsewhere
          // still have to be queued for this session.
          $pending = 0;
          $tmp_urls = array();
          mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', 0, '$sessid')");
          echo mysql_error();
      } else {
          // Resume a suspended crawl with its session id, level, position
          // within the level and running page counter.
          printStandardReport('continueSuspended',$command_line);
          $sessid = $row[1];
          $level = $row[2];
          $pend_count = $row[3] + 1;
          $num = (int)$row[4];
          $pending = 1;
          $tmp_urls = get_temp_urls($sessid);
      }

      // Record the crawl as pending; a resumed crawl already has its row.
      if ($reindex != 1 && $pending != 1) {
          mysql_query ("insert into ".$mysql_table_prefix."pending (site_id, temp_id, level, count) values ('$site_id', '$sessid', '0', '0')");
          echo mysql_error();
      }

      $omit = check_robot_txt($url);

      printHeader ($omit, $url, $command_line);

      $mainurl = $url;

      while (($level <= $maxlevel && $soption == 'level') || ($soption == 'full')) {
          // A resumed crawl continues after the last finished link of its
          // level; every later level starts from the first link.
          if ($pending == 1) {
              $count = $pend_count;
              $pending = 0;
          } else {
              $count = 0;
          }

          $result = mysql_query("select distinct link from ".$mysql_table_prefix."temp where level=$level && id='$sessid' order by link");
          echo mysql_error();
          $rows = mysql_num_rows($result);

          if ($rows == 0) {
              break;
          }

          $links = array();
          while ($row = mysql_fetch_array($result)) {
              $links[] = $row['link'];
          }

          while ($count < count($links)) {
              $num++;
              $thislink = $links[$count];
              $urlparts = parse_url($thislink);
              $forbidden = 0;

              foreach ($omit as $omiturl) {
                  $omiturl = trim($omiturl);

                  $omiturl_parts = parse_url($omiturl);
                  if ($omiturl_parts['scheme'] == '') {
                      $check_omit = $urlparts['host'] . $omiturl;
                  } else {
                      $check_omit = $omiturl;
                  }

                  // strpos() returns 0 for a match at the very start, so the
                  // result is compared against false explicitly.
                  if ($check_omit != '' && strpos($thislink, $check_omit) !== false) {
                      printRobotsReport($num, $thislink, $command_line);
                      check_for_removal($thislink);
                      $forbidden = 1;
                      break;
                  }
              }

              if (!check_include($thislink, $url_inc, $url_not_inc )) {
                  printUrlStringReport($num, $thislink, $command_line);
                  check_for_removal($thislink);
                  $forbidden = 1;
              }

              if ($forbidden == 0) {
                  printRetrieving($num, $thislink, $command_line);
                  $query = "select md5sum, indexdate from ".$mysql_table_prefix."links where url='$thislink'";
                  $result = mysql_query($query);
                  echo mysql_error();
                  $rows = mysql_num_rows($result);

                  if ($rows == 0) {
                      // Page not stored yet.
                      index_url($thislink, $level+1, $site_id, '', $domain, '', $sessid, $can_leave_domain, $reindex);

                      mysql_query("update ".$mysql_table_prefix."pending set level = $level, count=$count, num=$num where site_id=$site_id");
                      echo mysql_error();
                  } else if ($reindex == 1) {
                      // Page already stored: hand over its checksum so that
                      // unchanged content can be skipped.
                      $row = mysql_fetch_array($result);
                      $md5sum = $row['md5sum'];
                      $indexdate = $row['indexdate'];
                      index_url($thislink, $level+1, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex);

                      mysql_query("update ".$mysql_table_prefix."pending set level = $level, count=$count, num=$num where site_id=$site_id");
                      echo mysql_error();
                  } else {
                      printStandardReport('inDatabase',$command_line);
                  }
              }
              $count++;
          }
          $level++;
      }

      // Crawl finished: drop the session's queue and its pending marker.
      mysql_query ("delete from ".$mysql_table_prefix."temp where id = '$sessid'");
      echo mysql_error();
      mysql_query ("delete from ".$mysql_table_prefix."pending where site_id = '$site_id'");
      echo mysql_error();
      printStandardReport('completed',$command_line);
  }
  544.  
  /**
   * Re-indexes every site stored in the sites table, using the depth, filter
   * strings and domain setting saved with each site.
   */
  function index_all() {
      global $mysql_table_prefix;

      $sites = mysql_query("select url, spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites");
      echo mysql_error();

      while ($site = mysql_fetch_row($sites)) {
          list($site_url, $depth, $must_include, $must_not_include, $leave_domain) = $site;

          // An empty stored value means the spider stays on the domain.
          if ($leave_domain == '') {
              $leave_domain = 0;
          }

          // A stored depth of -1 stands for an unlimited crawl.
          $mode = ($depth == -1) ? 'full' : 'level';

          index_site($site_url, 1, $depth, $mode, $must_include, $must_not_include, $leave_domain);
      }
  }
  566.  
  /**
   * Collects the links queued in the temp table for one session.
   *
   * @param string $sessid Session id to look up.
   * @return array Map of link => 1 for every queued link.
   */
  function get_temp_urls ($sessid) {
      global $mysql_table_prefix;

      $queued = mysql_query("select link from ".$mysql_table_prefix."temp where id='$sessid'");
      echo mysql_error();

      $seen = Array();
      for ($entry = mysql_fetch_row($queued); $entry; $entry = mysql_fetch_row($queued)) {
          $link = $entry[0];
          $seen[$link] = 1;
      }

      return $seen;
  }
  578.  
  /**
   * Reads the domains table.
   *
   * @return array Map of domain name => domain id.
   */
  function get_domains () {
      global $mysql_table_prefix;

      $stored = mysql_query("select domain_id, domain from ".$mysql_table_prefix."domains");
      echo mysql_error();

      $id_by_domain = Array();
      while ($entry = mysql_fetch_row($stored)) {
          list($domain_id, $domain_name) = $entry;
          $id_by_domain[$domain_name] = $domain_id;
      }

      return $id_by_domain;
  }
  590.  
  /**
   * Prints the command-line usage text of the spider.
   */
  function commandline_help() {
      $usage_lines = array(
          "Usage: php spider.php <options>\n\n",
          "Options:\n",
          " -all\t\t Reindex everything in the database\n",
          " -u <url>\t Set url to index\n",
          " -f\t\t Set indexing depth to full (unlimited depth)\n",
          " -d <num>\t Set indexing depth to <num>\n",
          " -l\t\t Allow spider to leave the initial domain\n",
          " -r\t\t Set spider to reindex a site\n",
          " -m <string>\t Set the string(s) that an url must include (use \\n as a delimiter between multiple strings)\n",
          " -n <string>\t Set the string(s) that an url must not include (use \\n as a delimiter between multiple strings)\n",
      );

      foreach ($usage_lines as $usage_line) {
          print $usage_line;
      }
  }
  603.  
  // Final report, optional e-mail notification and log cleanup.
  printStandardReport('quit',$command_line);
  if ($email_log) {
      $indexed = ($all==1) ? 'ALL' : $url;
      $log_report = "";
      // $log_handle is only assigned when logging is enabled, so test it
      // without reading an undefined variable.
      if (!empty($log_handle)) {
          $log_report = "Log saved into $log_file";
      }
      mail($admin_email, "Sphider indexing report", "Sphider has finished indexing $indexed at ".date("y-m-d H:i:s").". ".$log_report);
  }
  if (!empty($log_handle)) {
      fclose($log_handle);
  }

  ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement