Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/**
 * Checks whether the rendered text of two URLs is more than $min_similarity percent similar.
 *
 * Both pages are dumped to plain text with `w3m -dump`, compared word by word with
 * `wdiff -nis`, and the "common words" percentages that wdiff reports for each of the
 * two files are averaged. Requires shell access plus the `w3m`, `wdiff`, `tail` and
 * `awk` command-line tools.
 *
 * @param string    $url            first url
 * @param string    $other_url      other url
 * @param int|float $min_similarity a number between 0 and 100, as in percentage
 *
 * @return bool true when the averaged similarity is strictly greater than $min_similarity;
 *              false otherwise, including when the comparison could not be carried out
 *              (temp directory not creatable, tools missing, unparsable output)
 */
function urlEqualsUrl($url, $other_url, $min_similarity = 80) {
    // More entropy than the original 7-character suffix, to avoid directory collisions.
    $dir = "/tmp/" . uniqid("equ_", true);
    if (!mkdir($dir, 0700)) {
        return false;
    }

    $first = $dir . "/1.html";
    $second = $dir . "/2.html";

    // Every dynamic value is passed through escapeshellarg(): the URLs may come from
    // untrusted input and must never be interpreted by the shell.
    $cmd = "w3m -dump " . escapeshellarg($url) . " 2>/dev/null > " . escapeshellarg($first) . "; "
        . "w3m -dump " . escapeshellarg($other_url) . " 2>/dev/null > " . escapeshellarg($second) . "; "
        . "wdiff -nis " . escapeshellarg($first) . " " . escapeshellarg($second)
        // The last two lines of wdiff's statistics describe one file each; the fifth
        // field is the "NN%" share of words common to both files.
        . " | tail -2 | awk '{print \$5}'";

    $output = shell_exec($cmd);

    // Remove the scratch files so repeated calls do not fill up /tmp.
    foreach ([$first, $second] as $file) {
        if (is_file($file)) {
            unlink($file);
        }
    }
    if (is_dir($dir)) {
        rmdir($dir);
    }

    if (!is_string($output)) {
        return false;
    }

    // Keep only lines that really are percentages ("81%" -> 81.0).
    $percentages = [];
    foreach (explode("\n", $output) as $line) {
        $value = rtrim(trim($line), "%");
        if (is_numeric($value)) {
            $percentages[] = (float) $value;
        }
    }

    // wdiff must have reported one figure per file; anything less means the tools failed.
    if (count($percentages) < 2) {
        return false;
    }

    $percentage = ($percentages[0] + $percentages[1]) / 2;

    return $percentage > $min_similarity;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement