Advertisement
Guest User

Untitled

a guest
Feb 10th, 2012
124
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 23.12 KB | None | 0 0
  1. <?php
  /**
   * Download a URL over a raw socket with an HTTP/1.0 GET and return its body.
   *
   * @param string $url Absolute URL; URLs starting with "https" are opened
   *                    through the ssl:// transport.
   * @return array 'state' is "ok", "timeout", "NOHOST" or "Cannot send request";
   *               'file' holds the response body and is only set once a
   *               response has been read.
   */
  function getFileContents($url) {
      global $user_agent;

      $contents = array();
      $urlparts = parse_url($url);
      if (!is_array($urlparts)) {
          $urlparts = array();
      }

      $host = isset($urlparts['host']) ? $urlparts['host'] : "";
      $scheme = isset($urlparts['scheme']) ? $urlparts['scheme'] : "";
      // A URL such as "http://example.com" has no path component; ask for the root
      // instead of sending an empty request target.
      $path = (isset($urlparts['path']) && $urlparts['path'] != "") ? $urlparts['path'] : "/";
      if (isset($urlparts['query']) && $urlparts['query'] != "") {
          $path .= "?".$urlparts['query'];
      }

      if (isset($urlparts['port'])) {
          $port = (int) $urlparts['port'];
      } else if ($scheme == "https") {
          $port = 443;
      } else {
          // http, and the fallback for any other scheme so $port is always defined
          $port = 80;
      }

      // The default http port is left out of the Host header.
      $portq = ($port == 80) ? "" : ":$port";

      $all = "*/*";
      $request = "GET $path HTTP/1.0\r\nHost: $host$portq\r\nAccept: $all\r\nUser-Agent: $user_agent\r\n\r\n";

      $fsocket_timeout = 30;
      $target = (substr($url, 0, 5) == "https") ? "ssl://".$host : $host;

      $errno = 0;
      $errstr = "";
      // Warnings are suppressed here because the failure is reported below.
      $fp = @ fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);

      print $errstr;
      if (!$fp) {
          $contents['state'] = "NOHOST";
          printConnectErrorReport($errstr);
          return $contents;
      }

      if (!fputs($fp, $request)) {
          $contents['state'] = "Cannot send request";
          fclose($fp);
          return $contents;
      }

      $data = "";
      socket_set_timeout($fp, $fsocket_timeout);
      while (true) {
          $chunk = fgets($fp, 8192);
          // Sample the status after the read so a timeout on this very read is seen.
          $status = socket_get_status($fp);
          if ($chunk === false) {
              break; // end of stream, timeout or read error
          }
          $data .= $chunk;
          if (feof($fp) || $status['timed_out']) {
              break;
          }
      }
      fclose($fp);

      $contents['state'] = $status['timed_out'] ? "timeout" : "ok";

      // The body starts after the blank line that terminates the headers. Without
      // that separator there is no body to return.
      $header_end = strpos($data, "\r\n\r\n");
      $contents['file'] = ($header_end === false) ? "" : (string) substr($data, $header_end + 4);

      return $contents;
  }
  67.  
  /**
   * Check if a file is available and in a readable form.
   *
   * Sends an HTTP/1.1 HEAD request and inspects the status line and headers.
   *
   * @param string $url Absolute http:// or https:// URL.
   * @return array 'state' is "ok" or a description of the problem ("NOHOST",
   *               "Unreachable: http NNN", "Relocation: http NNN",
   *               "Not text or html", "Timed out (no reply from server)").
   *               Optional keys: 'content' (text|pdf|doc|xls|ppt),
   *               'date' (Last-Modified value), 'path' (redirect target).
   */
  function url_status($url) {
      global $user_agent, $index_pdf, $index_doc, $index_xls, $index_ppt;

      $status = array();
      $urlparts = parse_url($url);
      if (!is_array($urlparts)) {
          $urlparts = array();
      }

      $host = isset($urlparts['host']) ? $urlparts['host'] : "";
      $scheme = isset($urlparts['scheme']) ? $urlparts['scheme'] : "";
      // No path component (e.g. "http://example.com") means the site root.
      $path = (isset($urlparts['path']) && $urlparts['path'] != "") ? $urlparts['path'] : "/";
      if (isset($urlparts['query'])) {
          $path .= "?".$urlparts['query'];
      }

      if (isset($urlparts['port'])) {
          $port = (int) $urlparts['port'];
      } else if ($scheme == "https") {
          $port = 443;
      } else {
          // http, and the fallback for any other scheme so $port is always defined
          $port = 80;
      }
      $portq = ($port == 80) ? "" : ":$port";

      $all = "*/*"; //just to prevent "comment effect" in get accept
      $request = "HEAD $path HTTP/1.1\r\nHost: $host$portq\r\nAccept: $all\r\nUser-Agent: $user_agent\r\n\r\n";

      $target = (substr($url, 0, 5) == "https") ? "ssl://".$host : $host;

      $fsocket_timeout = 30;
      $errno = 0;
      $errstr = "";
      $fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
      print $errstr;
      if (!$fp) {
          // There is no socket to close on this path.
          $status['state'] = "NOHOST";
          return $status;
      }

      socket_set_timeout($fp, $fsocket_timeout);
      fputs($fp, $request);
      $answer = fgets($fp, 4096);

      // First digit of the status code, and the full three-digit code.
      $httpcode = 0;
      $full_httpcode = 0;
      $regs = array();
      // '#' is the delimiter because the pattern itself contains a slash.
      if (preg_match("#HTTP/[0-9.]+ (([0-9])[0-9]{2})#", (string) $answer, $regs)) {
          $httpcode = $regs[2];
          $full_httpcode = $regs[1];

          if ($httpcode != 2 && $httpcode != 3) {
              $status['state'] = "Unreachable: http $full_httpcode";
              fclose($fp);
              return $status;
          }
      }

      // Walk the response headers until Content-Type or the end of the header block.
      $content = "";
      while ($answer) {
          $answer = fgets($fp, 4096);
          if ($answer === false) {
              break;
          }

          // Redirects other than 302 are reported to the caller.
          if ($httpcode == 3 && $full_httpcode != 302 && preg_match("/Location: *([^\n\r ]+)/", $answer, $regs)) {
              $status['path'] = $regs[1];
              $status['state'] = "Relocation: http $full_httpcode";
              fclose($fp);
              return $status;
          }

          if (preg_match("/Last-Modified: *([a-z0-9,: ]+)/i", $answer, $regs)) {
              $status['date'] = $regs[1];
          }

          if (preg_match("/Content-Type:/i", $answer)) {
              $content = $answer;
              break;
          }

          // A blank line ends the headers; a HEAD response has nothing after it,
          // so reading on would only block until the socket times out.
          if (trim($answer) == "") {
              break;
          }
      }

      $socket_status = socket_get_status($fp);
      if (preg_match("/Content-Type: *([a-z\/.-]*)/i", $content, $regs)) {
          $mime = $regs[1];
          if ($mime == 'text/html' || $mime == 'text/' || $mime == 'text/plain') {
              $status['content'] = 'text';
              $status['state'] = 'ok';
          } else if ($mime == 'application/pdf' && $index_pdf == 1) {
              $status['content'] = 'pdf';
              $status['state'] = 'ok';
          } else if (($mime == 'application/msword' || $mime == 'application/vnd.ms-word') && $index_doc == 1) {
              $status['content'] = 'doc';
              $status['state'] = 'ok';
          } else if (($mime == 'application/excel' || $mime == 'application/vnd.ms-excel') && $index_xls == 1) {
              $status['content'] = 'xls';
              $status['state'] = 'ok';
          } else if (($mime == 'application/mspowerpoint' || $mime == 'application/vnd.ms-powerpoint') && $index_ppt == 1) {
              $status['content'] = 'ppt';
              $status['state'] = 'ok';
          } else {
              $status['state'] = "Not text or html";
          }
      } else if ($socket_status['timed_out'] == 1) {
          $status['state'] = "Timed out (no reply from server)";
      } else {
          $status['state'] = "Not text or html";
      }

      fclose($fp);
      return $status;
  }
  181.  
  /**
   * Read the robots.txt file of the server to find any disallowed files/folders.
   *
   * Only records whose User-agent is "*" or equals the global $user_agent are used.
   *
   * @param string $url Any URL on the site to check.
   * @return array|null List of disallowed path prefixes (empty when robots.txt is
   *                    not available), or null when an applicable record contains
   *                    an empty "Disallow:" line.
   */
  function check_robot_txt($url) {
      global $user_agent;

      $urlparts = parse_url($url);
      $host = (is_array($urlparts) && isset($urlparts['host'])) ? $urlparts['host'] : "";
      // Fetch robots.txt from the same scheme and port as the site itself.
      $scheme = (is_array($urlparts) && isset($urlparts['scheme']) && strtolower($urlparts['scheme']) == 'https') ? 'https' : 'http';
      $portq = (is_array($urlparts) && isset($urlparts['port'])) ? ":".$urlparts['port'] : "";
      $url = $scheme.'://'.$host.$portq."/robots.txt";

      $url_status = url_status($url);
      $omit = array ();

      if (!isset($url_status['state']) || $url_status['state'] != "ok") {
          return $omit;
      }

      $robot = file($url);
      if (!$robot) {
          // file() may be unavailable for URLs; fall back to the socket reader.
          $contents = getFileContents($url);
          $file = isset($contents['file']) ? $contents['file'] : "";
          $robot = explode("\n", $file);
      }

      $regs = array ();
      $check = 0; // 1 while the current record applies to this crawler
      foreach ($robot as $line) {
          // Field names are case-insensitive ("User-agent", "user-agent", ...).
          if (preg_match("/^user-agent: *([^#]+) */i", $line, $regs)) {
              $this_agent = trim($regs[1]);
              $check = ($this_agent == '*' || $this_agent == $user_agent) ? 1 : 0;
          }

          if ($check == 1 && preg_match("/disallow: *([^#]+)/i", $line, $regs)) {
              // Strip spaces and line endings, including the CR of CRLF files.
              $disallow_str = preg_replace("/[\r\n ]+/", "", $regs[1]);
              if (trim($disallow_str) != "") {
                  $omit[] = $disallow_str;
              } else {
                  // An empty Disallow in an applicable record: report null.
                  return null;
              }
          }
      }

      return $omit;
  }
  227.  
  /**
   * Remove the file part from a URL (to build a URL from a URL and a given relative path).
   *
   * Everything after the last "/" of the path is dropped; query and fragment are
   * not carried over. Port 80 is omitted from the result.
   *
   * @param string $url Absolute URL.
   * @return string scheme://host[:port]/directory/
   */
  function remove_file_from_url($url) {
      $url_parts = parse_url($url);
      if (!is_array($url_parts)) {
          $url_parts = array();
      }

      $path = isset($url_parts['path']) ? $url_parts['path'] : "";
      // Strip the trailing path segment directly. The file name is never placed
      // inside a pattern, so names containing regex metacharacters are safe.
      $path = preg_replace('/[^\/]+$/', "", $path);

      $portq = "";
      if (isset($url_parts['port']) && $url_parts['port'] != 80 && $url_parts['port'] != "") {
          $portq = ":".$url_parts['port'];
      }

      $scheme = isset($url_parts['scheme']) ? $url_parts['scheme'] : "";
      $host = isset($url_parts['host']) ? $url_parts['host'] : "";

      return $scheme."://".$host.$portq.$path;
  }
  251.  
  /**
   * Extract links from html.
   *
   * Looks at href attributes, frame src attributes, window.location assignments,
   * meta refresh targets and window.open calls. Each raw URL is passed through
   * url_purify() and kept when that returns a non-empty string.
   *
   * @param string $file             HTML source.
   * @param string $url              URL of the page, used to resolve relative links.
   * @param mixed  $can_leave_domain Passed through to url_purify().
   * @param string $base             <base href> of the page; replaces $url when not empty.
   * @return array List of purified URLs.
   */
  function get_links($file, $url, $can_leave_domain, $base) {
      // The base URL comes from either the meta tag or the current URL.
      if (!empty($base)) {
          $url = $base;
      }

      // Each entry: the pattern, and the number of the capture group holding the URL.
      // In the href pattern the URL is group 1 and group 4 is "nofollow"; in the
      // other patterns group 1 is only the prefix and the URL is group 2.
      $extractors = array (
          array ("/href\s*=\s*[\'\"]?([+:%\/\?~=&;\\\(\),._a-zA-Z0-9-]*)(#[.a-zA-Z0-9-]*)?[\'\" ]?(\s*rel\s*=\s*[\'\"]?(nofollow)[\'\"]?)?/i", 1),
          array ("/(frame[^>]*src[[:blank:]]*)=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", 2),
          array ("/(window[.]location)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", 2),
          array ("/(http-equiv=['\"]refresh['\"] *content=['\"][0-9]+;url)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", 2),
          array ("/(window[.]open[[:blank:]]*[(])[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", 2),
      );

      $links = array ();
      $checked_urls = array (); // raw URLs already handed to url_purify()

      foreach ($extractors as $extractor) {
          list ($pattern, $url_group) = $extractor;

          $regs = array ();
          preg_match_all($pattern, $file, $regs, PREG_SET_ORDER);
          if (!is_array($regs)) {
              continue;
          }

          foreach ($regs as $val) {
              $candidate = isset($val[$url_group]) ? $val[$url_group] : '';
              if (isset($checked_urls[$candidate])) {
                  continue;
              }
              // rel=nofollow exists only in the href pattern. Such a link is
              // skipped without being marked as checked, so a later followable
              // link to the same URL is still collected.
              if ($url_group == 1 && isset($val[4])) {
                  continue;
              }
              $checked_urls[$candidate] = 1;

              $a = url_purify($candidate, $url, $can_leave_domain);
              if ($a != '') {
                  $links[] = $a;
              }
          }
      }

      return $links;
  }
  316.  
  /**
   * Build a unique word array from the text of a webpage, together with the count of each word.
   *
   * Words are optionally stemmed with stem(), sorted, and grouped. A word is kept
   * when it is at least $min_word_length long, contains a letter (or a digit when
   * $index_numbers is 1) after remove_accents(), and is not flagged in $common.
   * A leading or trailing "-" or "\'" is stripped from kept words.
   *
   * @param array $arr List of words.
   * @return array List of entries; index 1 is the word, index 2 its number of
   *               occurrences, capped at $word_upper_bound.
   */
  function unique_array($arr) {
      global $min_word_length;
      global $common;
      global $word_upper_bound;
      global $index_numbers, $stem_words;

      if ($stem_words == 1) {
          $stemmed = array ();
          foreach ($arr as $val) {
              $stemmed[] = stem($val);
          }
          $arr = $stemmed;
      }
      sort($arr);

      if ($index_numbers == 1) {
          $pattern = "/[a-z0-9]+/";
      } else {
          $pattern = "/[a-z]+/";
      }

      $newarr = array ();
      $regs = array ();
      $total = count($arr);
      $i = 0;
      $pos = 0;

      // After sorting, equal words are adjacent: consume one run per iteration.
      while ($pos < $total) {
          $element = $arr[$pos];
          // The count restarts for every run, whether or not the previous word
          // was kept, so skipped words cannot inflate the next word's count.
          $counter = 1;
          $pos ++;
          while ($pos < $total && $arr[$pos] == $element) {
              if ($counter < $word_upper_bound) {
                  $counter ++;
              }
              $pos ++;
          }

          //check if word is long enough, contains alphabetic characters and is not a common word
          $is_common = isset($common[$element]) && $common[$element] == 1;
          if (strlen($element) >= $min_word_length && !$is_common && preg_match($pattern, remove_accents($element))) {
              if (preg_match("/^(-|\\\')(.*)/", $element, $regs)) {
                  $element = $regs[2];
              }
              if (preg_match("/(.*)(\\\'|-)$/", $element, $regs)) {
                  $element = $regs[1];
              }

              $newarr[$i][1] = $element;
              $newarr[$i][2] = $counter;
              $i ++;
          }
      }

      return $newarr;
  }
  376.  
  /**
   * Checks if url is legal, relative to the main url.
   *
   * Rejects links to other hosts (unless $can_leave_domain is 1), links whose
   * extension is listed in $ext, Apache index sort queries listed in
   * $apache_indexes, mailto:/javascript:/news: links and schemes other than
   * http/https. Accepted links are made absolute against $parent_url and have
   * their path normalised.
   *
   * @param string $url              Link as found in the page.
   * @param string $parent_url       URL of the page containing the link.
   * @param mixed  $can_leave_domain 1 to accept links outside the main URL.
   * @return string Absolute URL, or '' when the link must not be followed.
   */
  function url_purify($url, $parent_url, $can_leave_domain) {
      global $ext, $mainurl, $apache_indexes, $strip_sessids;

      $urlparts = parse_url($url);
      if (!is_array($urlparts)) {
          // parse_url() gives up on seriously malformed URLs.
          return '';
      }

      $main_url_parts = parse_url($mainurl);
      $url_host = isset($urlparts['host']) ? $urlparts['host'] : "";
      $main_host = (is_array($main_url_parts) && isset($main_url_parts['host'])) ? $main_url_parts['host'] : "";
      if ($url_host != "" && $url_host != $main_host && $can_leave_domain != 1) {
          return '';
      }

      if (is_array($ext)) {
          foreach ($ext as $excl) {
              if (preg_match("/\.$excl$/i", $url)) {
                  return '';
              }
          }
      }

      if (substr($url, -1) == '\\') {
          return '';
      }

      if (isset($urlparts['query']) && !empty($apache_indexes[$urlparts['query']])) {
          return '';
      }

      if (preg_match("/[\/]?mailto:|[\/]?javascript:|[\/]?news:/i", $url)) {
          return '';
      }

      $scheme = isset($urlparts['scheme']) ? $urlparts['scheme'] : "";

      //only http and https links are followed
      if (!($scheme == 'http' || $scheme == '' || $scheme == 'https')) {
          return '';
      }

      //parent url might be used to build an url from relative path
      $parent_url = remove_file_from_url($parent_url);
      $parent_url_parts = parse_url($parent_url);
      if (!is_array($parent_url_parts)) {
          $parent_url_parts = array();
      }
      $parent_scheme = isset($parent_url_parts['scheme']) ? $parent_url_parts['scheme'] : "";
      $parent_host = isset($parent_url_parts['host']) ? $parent_url_parts['host'] : "";
      $parent_portq = isset($parent_url_parts['port']) ? ":".$parent_url_parts['port'] : "";

      if (substr($url, 0, 2) == '//') {
          // protocol-relative link: only the scheme comes from the parent
          $url = $parent_scheme.":".$url;
      } else if (substr($url, 0, 1) == '/') {
          // root-relative link: keep the parent's scheme, host and port
          $url = $parent_scheme."://".$parent_host.$parent_portq.$url;
      } else if (!isset($urlparts['scheme'])) {
          $url = $parent_url.$url;
      }

      $url_parts = parse_url($url);
      if (!is_array($url_parts)) {
          return '';
      }

      $urlpath = isset($url_parts['path']) ? $url_parts['path'] : "";

      $regs = array ();
      while (preg_match("/[^\/]*\/[.]{2}\//", $urlpath, $regs)) {
          $urlpath = str_replace($regs[0], "", $urlpath);
      }

      //remove relative path instructions like ../ etc
      $urlpath = preg_replace("/\/+/", "/", $urlpath);
      $urlpath = preg_replace("/[^\/]*\/[.]{2}/", "", $urlpath);
      $urlpath = str_replace("./", "", $urlpath);

      $query = "";
      if (isset($url_parts['query'])) {
          $query = "?".$url_parts['query'];
      }

      // Emit the link's own port. Testing one URL's port and printing another's
      // can produce "host:" with an empty port.
      $portq = "";
      if (isset($url_parts['port']) && $url_parts['port'] != 80) {
          $portq = ":".$url_parts['port'];
      }

      $result_scheme = isset($url_parts['scheme']) ? $url_parts['scheme'] : "";
      $result_host = isset($url_parts['host']) ? $url_parts['host'] : "";
      $url = $result_scheme."://".$result_host.$portq.$urlpath.$query;

      //if we index sub-domains
      if ($can_leave_domain == 1) {
          return $url;
      }

      // NOTE(review): this rewrites the global $mainurl, as the code always did.
      // Kept because other code may rely on the trimmed value - confirm.
      $mainurl = remove_file_from_url($mainurl);

      if ($strip_sessids == 1) {
          $url = remove_sessid($url);
      }

      //only urls in staying in the starting domain/directory are followed
      $url = convert_url($url);
      if (strstr($url, $mainurl) == false) {
          return '';
      }
      return $url;
  }
  479.  
  /**
   * Store the keywords of a page and link them to the page.
   *
   * Unknown words are inserted into the keywords table. The link/keyword rows
   * go into one of sixteen link_keyword<hex> tables, chosen by the first hex
   * digit of the word's md5. Words longer than 30 characters are skipped.
   *
   * @param array $wordarray Entries with the word at index 1 and its weight at index 2.
   * @param mixed $link_id   Id of the page in the links table.
   * @param mixed $domain    Domain id stored with every row.
   * @return void
   */
  function save_keywords($wordarray, $link_id, $domain) {
      global $mysql_table_prefix, $all_keywords;

      // VALUES tuples per target table, keyed by hex digit.
      $inserts = array ();

      foreach ($wordarray as $entry) {
          $word = $entry[1];
          $weight = $entry[2];
          if (strlen($word) > 30) {
              continue;
          }

          $keyword_id = isset($all_keywords[$word]) ? $all_keywords[$word] : "";
          if ($keyword_id == "") {
              mysql_query("insert into ".$mysql_table_prefix."keywords (keyword) values ('$word')");
              if (mysql_errno() == 1062) {
                  // Duplicate key: the word already exists, so look up its id.
                  $result = mysql_query("select keyword_ID from ".$mysql_table_prefix."keywords where keyword='$word'");
                  echo mysql_error();
                  $row = $result ? mysql_fetch_row($result) : false;
                  $keyword_id = $row ? $row[0] : "";
              } else {
                  $keyword_id = mysql_insert_id();
                  echo mysql_error();
              }
              if ($keyword_id) {
                  // Remember the id so the next page does not query again.
                  $all_keywords[$word] = $keyword_id;
              }
          }

          if (!$keyword_id) {
              // Without an id the tuple would be malformed and the whole batch
              // insert for this table would fail.
              continue;
          }

          $bucket = substr(md5($word), 0, 1);
          if (!isset($inserts[$bucket])) {
              $inserts[$bucket] = "";
          }
          $inserts[$bucket] .= ",($link_id, $keyword_id, $weight, $domain)";
      }

      for ($i = 0; $i <= 15; $i++) {
          $char = dechex($i);
          if (empty($inserts[$char])) {
              continue;
          }
          $values = substr($inserts[$char], 1); // drop the leading comma
          $query = "insert into ".$mysql_table_prefix."link_keyword$char (link_id, keyword_id, weight, domain) values $values";
          mysql_query($query);
          echo mysql_error();
      }
  }
  518.  
  /**
   * Extract meta information from the <head> section of an HTML document.
   *
   * @param string $file HTML source.
   * @return array Always contains:
   *               'description' - meta description, passed through addslashes()
   *               'keywords'    - meta keywords with commas/spaces collapsed to
   *                               single spaces, passed through addslashes()
   *               'nofollow'    - 1 when the robots meta tag lists "nofollow", else 0
   *               'noindex'     - 1 when the robots meta tag lists "noindex", else 0
   *               'base'        - href of the <base> tag, or ""
   *               All values are empty/0 when the document has no head section.
   */
  function get_head_data($file) {
      $data = array (
          'description' => "",
          'keywords' => "",
          'nofollow' => 0,
          'noindex' => 0,
          'base' => "",
      );

      $regs = array ();
      if (!preg_match("@<head[^>]*>(.*?)<\/head>@si", $file, $regs) || $regs[1] == "") {
          return $data;
      }
      $headdata = $regs[1];

      // e.g. <base href="http://www.consil.co.uk/index.php" />
      $patterns = array (
          'robots' => "/<meta +name *=[\"']?robots[\"']? *content=[\"']?([^<>'\"]+)[\"']?/i",
          'description' => "/<meta +name *=[\"']?description[\"']? *content=[\"']?([^<>'\"]+)[\"']?/i",
          'keywords' => "/<meta +name *=[\"']?keywords[\"']? *content=[\"']?([^<>'\"]+)[\"']?/i",
          'base' => "/<base +href *= *[\"']?([^<>'\"]+)[\"']?/i",
      );

      // Use the captured value only when the pattern actually matched.
      $found = array ();
      foreach ($patterns as $name => $pattern) {
          $res = array ();
          $found[$name] = preg_match($pattern, $headdata, $res) ? $res[1] : "";
      }

      $keywords = preg_replace("/[, ]+/", " ", $found['keywords']);

      $nofollow = 0;
      $noindex = 0;
      foreach (explode(",", strtolower($found['robots'])) as $directive) {
          $directive = trim($directive);
          if ($directive == "noindex") {
              $noindex = 1;
          }
          if ($directive == "nofollow") {
              $nofollow = 1;
          }
      }

      $data['description'] = addslashes($found['description']);
      $data['keywords'] = addslashes($keywords);
      $data['nofollow'] = $nofollow;
      $data['noindex'] = $noindex;
      $data['base'] = $found['base'];

      return $data;
  }
  571.  
  /**
   * Strip markup from a document and prepare its text for indexing.
   *
   * Removes link tags, comments (including sphider_noindex sections), scripts,
   * styles and all remaining tags, decodes numeric character references and the
   * entities listed in the global $entities map, lower-cases the text and
   * replaces punctuation with spaces.
   *
   * @param string $file Document source.
   * @param string $url  URL of the document; its host and directory are returned
   *                     and, when $index_host is 1, appended to the indexed text.
   * @param string $type Document type; for 'pdf' and 'doc' without a <title> the
   *                     title is taken from the first words of the text.
   * @return array Keys: 'fulltext', 'content', 'title' (all passed through
   *               addslashes()), 'description', 'keywords', 'host', 'path',
   *               'nofollow', 'noindex', 'base'.
   */
  function clean_file($file, $url, $type) {
      global $entities, $index_host, $index_meta_keywords;

      $urlparts = parse_url($url);
      $host = (is_array($urlparts) && isset($urlparts['host'])) ? $urlparts['host'] : "";
      //remove filename from path
      $path = preg_replace('/([^\/]+)$/i', "", (is_array($urlparts) && isset($urlparts['path'])) ? $urlparts['path'] : "");

      $file = preg_replace("/<link rel[^<>]*>/i", " ", $file);
      $file = preg_replace("@<!--sphider_noindex-->.*?<!--\/sphider_noindex-->@si", " ",$file);
      $file = preg_replace("@<!--.*?-->@si", " ",$file);
      $file = preg_replace("@<script[^>]*?>.*?</script>@si", " ",$file);

      // Head data may be missing entirely; give every key a defined value.
      $headdata = get_head_data($file);
      if (!is_array($headdata)) {
          $headdata = array ();
      }
      $headdata += array ('description' => null, 'keywords' => null, 'nofollow' => null, 'noindex' => null, 'base' => null);

      $title = "";
      $regs = array ();
      if (preg_match("@<title *>(.*?)<\/title*>@si", $file, $regs)) {
          $title = trim($regs[1]);
          $file = str_replace($regs[0], "", $file);
      } else if ($type == 'pdf' || $type == 'doc') { //the title of a non-html file is its first few words
          $title = substr($file, 0, strrpos(substr($file, 0, 40), " "));
      }

      $file = preg_replace("@<style[^>]*>.*?<\/style>@si", " ", $file);

      //create spaces between tags, so that removing tags doesnt concatenate strings
      $file = preg_replace("/<[\w ]+>/", "\\0 ", $file);
      $file = preg_replace("/<\/[\w ]+>/", "\\0 ", $file);
      $file = strip_tags($file);
      $file = preg_replace("/&nbsp;/", " ", $file);

      $fulltext = $file;
      $file .= " ".$title;
      if ($index_host == 1) {
          $file = $file." ".$host." ".$path;
      }
      if ($index_meta_keywords == 1) {
          $file = $file." ".$headdata['keywords'];
      }

      //replace codes with ascii chars
      // Callbacks replace the /e modifier, which PHP 7 no longer supports.
      // chr() works on single bytes, so code points are reduced modulo 256.
      $file = preg_replace_callback('~&#x([0-9a-f]+);~i', function ($m) {
          return chr(((int) hexdec($m[1])) % 256);
      }, $file);
      $file = preg_replace_callback('~&#([0-9]+);~', function ($m) {
          return chr(((int) $m[1]) % 256);
      }, $file);
      $file = strtolower($file);

      // $entities maps an entity pattern to its replacement text.
      if (is_array($entities)) {
          foreach ($entities as $entity => $replacement) {
              $file = preg_replace("/".$entity."/i", $replacement, $file);
          }
      }
      $file = preg_replace("/&[a-z]{1,6};/", " ", $file);
      $file = preg_replace("/[\*\^\+\?\\\.\[\]\^\$\|\{\)\(\}~!\"\/@#£$%&=`´;><:,]+/", " ", $file);
      $file = preg_replace("/\s+/", " ", $file);

      $data = array ();
      $data['fulltext'] = addslashes($fulltext);
      $data['content'] = addslashes($file);
      $data['title'] = addslashes($title);
      $data['description'] = $headdata['description'];
      $data['keywords'] = $headdata['keywords'];
      $data['host'] = $host;
      $data['path'] = $path;
      $data['nofollow'] = $headdata['nofollow'];
      $data['noindex'] = $headdata['noindex'];
      $data['base'] = $headdata['base'];

      return $data;
  }
  635.  
  /**
   * Replace the raw occurrence count of every word with its weight.
   *
   * A word gets bonuses (applied by calc_weight()) when it also appears in the
   * page title, in the host or path of the URL (only when $index_host is 1) or
   * in the meta keywords (only when $index_meta_keywords is 1).
   *
   * @param array  $wordarray Entries with the word at index 1 and its count at index 2.
   * @param string $title     Page title.
   * @param string $host      Host part of the page URL.
   * @param string $path      Path part of the page URL.
   * @param string $keywords  Meta keywords of the page.
   * @return array $wordarray with index 2 of every entry replaced by an integer weight.
   */
  function calc_weights($wordarray, $title, $host, $path, $keywords) {
      global $index_host, $index_meta_keywords;

      // Word lists for each source, built the same way as the page's word array.
      $sources = array ('host' => $host, 'path' => $path, 'title' => $title, 'keywords' => $keywords);
      $words_in = array ();
      foreach ($sources as $name => $text) {
          $words_in[$name] = array ();
          $entries = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($text))));
          foreach ($entries as $entry) {
              $words_in[$name][] = $entry[1];
          }
      }
      $path_depth = countSubstrs($path, "/");

      foreach ($wordarray as $wid => $word) {
          $word_in_path = 0;
          $word_in_domain = 0;
          $meta_keyword = 0;

          if ($index_host == 1) {
              $word_in_path = in_array($word[1], $words_in['path']) ? 1 : 0;
              $word_in_domain = in_array($word[1], $words_in['host']) ? 1 : 0;
          }
          if ($index_meta_keywords == 1) {
              $meta_keyword = in_array($word[1], $words_in['keywords']) ? 1 : 0;
          }
          $word_in_title = in_array($word[1], $words_in['title']) ? 1 : 0;

          $wordarray[$wid][2] = (int) (calc_weight($word[2], $word_in_title, $word_in_domain, $word_in_path, $path_depth, $meta_keyword));
      }
      reset($wordarray);
      return $wordarray;
  }
  689.  
  /**
   * Tell whether a page with the given md5 checksum is already in the links table.
   *
   * @param string $md5sum Checksum to look up.
   * @return bool true when at least one row has this checksum.
   */
  function isDuplicateMD5($md5sum) {
      global $mysql_table_prefix;

      $lookup = "select link_id from ".$mysql_table_prefix."links where md5sum='$md5sum'";
      $rows = mysql_query($lookup);
      echo mysql_error();

      return mysql_num_rows($rows) > 0 ? true : false;
  }
  699.  
  /**
   * Decide whether a link passes the "must include" / "must not include" rules.
   *
   * Both rule lists hold one rule per line. A rule starting with "*" is a
   * regular expression (delimiters included, e.g. "*/\.php$/"); any other rule
   * is a plain substring. Blank lines are ignored.
   *
   * @param string $link    URL to test.
   * @param string $inc     Rules of which at least one must match; "" disables the check.
   * @param string $not_inc Rules of which none may match; "" disables the check.
   * @return bool true when the link may be indexed.
   */
  function check_include($link, $inc, $not_inc) {
      // True when $rule (already trimmed, not empty) matches the link.
      $rule_matches = function ($rule) use ($link) {
          if (substr($rule, 0, 1) == '*') {
              return (bool) preg_match(substr($rule, 1), $link);
          }
          return strpos($link, $rule) !== false;
      };

      if ($not_inc != "") {
          foreach (explode("\n", $not_inc) as $str) {
              $str = trim($str);
              if ($str != "" && $rule_matches($str)) {
                  return false;
              }
          }
      }

      if ($inc == "") {
          return true;
      }

      foreach (explode("\n", $inc) as $str) {
          $str = trim($str);
          if ($str != "" && $rule_matches($str)) {
              return true;
          }
      }
      return false;
  }
  749.  
  /**
   * Count down the visibility of a stored URL and delete it once it reaches zero.
   *
   * A link whose "visible" value is above zero only has it decremented.
   * Otherwise the link row and its rows in all sixteen link_keyword tables are
   * deleted and a 'pageRemoved' report is printed.
   *
   * NOTE(review): $url goes into the query as given - presumably the caller
   * escapes it; confirm.
   *
   * @param string $url URL as stored in the links table.
   * @return void
   */
  function check_for_removal($url) {
      global $mysql_table_prefix;
      global $command_line;

      $links_table = $mysql_table_prefix."links";
      $found = mysql_query("select link_id, visible from ".$links_table." where url='$url'");
      echo mysql_error();
      if (!(mysql_num_rows($found) > 0)) {
          return;
      }

      $record = mysql_fetch_row($found);
      $link_id = $record[0];
      $visible = $record[1];

      if ($visible > 0) {
          $visible --;
          mysql_query("update ".$links_table." set visible=$visible where link_id=$link_id");
          echo mysql_error();
          return;
      }

      mysql_query("delete from ".$links_table." where link_id=$link_id");
      echo mysql_error();
      // one keyword table per hex digit
      foreach (str_split("0123456789abcdef") as $char) {
          mysql_query("delete from ".$mysql_table_prefix."link_keyword$char where link_id=$link_id");
          echo mysql_error();
      }
      printStandardReport('pageRemoved',$command_line);
  }
  775.  
  /**
   * Normalise a URL taken from HTML: decode "&amp;" and percent-encode spaces.
   *
   * @param string $url URL to convert.
   * @return string Converted URL.
   */
  function convert_url($url) {
      // Pairs are applied in order: first the entity, then the space.
      $search = array ("&amp;", " ");
      $replace = array ("&", "%20");
      return str_replace($search, $replace, $url);
  }
  781.  
  /**
   * Convert a binary document to plain text with an external converter.
   *
   * The contents are written to a temporary file in $tmp_dir and handed to the
   * program configured for the type: $pdftotext_path (pdf), $catdoc_path (doc),
   * $xls2csv_path (xls) or $catppt_path (ppt). The script dies when the
   * temporary file cannot be created or written.
   *
   * @param string $contents    Raw document bytes.
   * @param string $source_type 'pdf', 'doc', 'xls' or 'ppt'.
   * @return string Output lines of the converter joined with spaces; "" for an
   *                unknown type.
   */
  function extract_text($contents, $source_type) {
      global $tmp_dir, $pdftotext_path, $catdoc_path, $xls2csv_path, $catppt_path;

      // A unique name per call, so parallel runs do not overwrite each other's file.
      $filename = tempnam($tmp_dir, "sph");
      if ($filename === false) {
          die ("Cannot create temporary file in $tmp_dir");
      }
      if (!$handle = fopen($filename, 'w')) {
          die ("Cannot open file $filename");
      }

      if (fwrite($handle, $contents) === FALSE) {
          fclose($handle);
          unlink($filename);
          die ("Cannot write to file $filename");
      }
      fclose($handle);

      // Quote the file name so spaces or shell metacharacters in $tmp_dir are harmless.
      $quoted = escapeshellarg($filename);
      $result = array ();
      $retval = 0;
      if ($source_type == 'pdf') {
          exec($pdftotext_path." ".$quoted." -", $result, $retval);
      } else if ($source_type == 'doc') {
          exec($catdoc_path." ".$quoted, $result, $retval);
      } else if ($source_type == 'xls') {
          exec($xls2csv_path." ".$quoted, $result, $retval);
      } else if ($source_type == 'ppt') {
          exec($catppt_path." ".$quoted, $result, $retval);
      }

      unlink($filename);
      return implode(' ', $result);
  }
  814.  
  /**
   * Calculate the weight of a word on a page.
   *
   * The occurrence count is increased by the configured bonuses for a word that
   * appears in the title, domain, path or meta keywords; the sum is multiplied
   * by 10 and divided by a factor that grows with the depth of the page path.
   *
   * @param mixed $words_in_page  Number of occurrences in the page.
   * @param mixed $word_in_title  1 when the word is in the title, else 0.
   * @param mixed $word_in_domain 1 when the word is in the host name, else 0.
   * @param mixed $word_in_path   1 when the word is in the path, else 0.
   * @param mixed $path_depth     Depth of the page path.
   * @param mixed $meta_keyword   1 when the word is a meta keyword, else 0.
   * @return mixed Weight of the word.
   */
  function calc_weight ($words_in_page, $word_in_title, $word_in_domain, $word_in_path, $path_depth, $meta_keyword) {
      global $title_weight, $domain_weight, $path_weight,$meta_weight;

      $title_bonus = $word_in_title * $title_weight;
      $domain_bonus = $word_in_domain * $domain_weight;
      $path_bonus = $word_in_path * $path_weight;
      $meta_bonus = $meta_keyword * $meta_weight;

      $raw_score = $words_in_page + $title_bonus + $domain_bonus + $path_bonus + $meta_bonus;
      $depth_divisor = 0.8 + 0.2 * $path_depth;

      return $raw_score * 10 / $depth_divisor;
  }
  824.  
  /**
   * Strip a session id parameter from the end of a URL.
   *
   * Only a PHPSESSID, JSESSIONID, ASPSESSIONID or sid parameter that is the
   * last thing in the URL is removed, together with the "?" or "&" before it.
   *
   * @param string $url URL to clean.
   * @return string URL without the trailing session id parameter.
   */
  function remove_sessid($url) {
      $trailing_session_param = "/(\?|&)(PHPSESSID|JSESSIONID|ASPSESSIONID|sid)=[0-9a-zA-Z]+$/";
      $cleaned = preg_replace($trailing_session_param, "", $url);
      return $cleaned;
  }
  828. ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement