Advertisement
Guest User

urevic

a guest
Feb 16th, 2009
1,226
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 3.88 KB | None | 0 0
  1. <?php
  2.  
  /**
  * @class Document_Hash
  * Base class for document hashing.
  *
  * Normalises a document (strips tags, lower-cases, collapses whitespace and
  * punctuation), keeps the longest distinct words as tokens and exposes an MD5
  * of the normalised text plus CRC32 checksums of the tokens.
  *
  * @author Stanislav Perederiy
  */
  class Document_Hash
  {
      /// Minimum number of characters a word needs to be kept as a token
      const MIN_TOKEN_LENGTH = 4;
      /// Maximum number of tokens kept per document
      const MAX_TOKENS = 15;

      /// Normalised content of the document being hashed
      public $_doc_content = null;
      /// Document charset
      private $_charset = null;
      /// Tokens array (longest distinct words, at most MAX_TOKENS of them)
      public $_tokens = array();
      /// CRC32 hashes array (one entry per token, built lazily)
      private $_crc32 = array();
      /// Number of tokens kept (not the real document length; never greater than MAX_TOKENS)
      public $length = null;
      /// MD5 of the normalised document content
      public $docMD5 = null;

      /**
      * Normalises and tokenises the document, then stores its MD5.
      * @param string $doc_content Raw document text (may contain HTML)
      * @param string $charset     Charset of $doc_content, as understood by mbstring
      * @throws RuntimeException if one of the normalisation regexes fails
      */
      public function __construct($doc_content, $charset = "UTF-8")
      {
          // Cast so that null is treated as an empty document by the string functions
          $this->_doc_content = (string) $doc_content;
          $this->_charset = $charset;

          // Prepare document for hashing
          $this->_prepare();

          // The MD5 is needed every time the class is used, so compute it once here.
          // Note that it is the MD5 of the NORMALISED content.
          $this->docMD5 = md5($this->_doc_content);
      }

      /**
      * Runs the normalisation and tokenisation steps in order.
      */
      private function _prepare ()
      {
          $this->_genericReplacements()
               ->_splitToTokens();
      }

      /**
      * Rebuilds the CRC32 list from the current tokens.
      * @return Document_Hash $this, for chaining
      */
      private function _makeCrc32array ()
      {
          $this->_crc32 = array();
          foreach ($this->_tokens as $token) {
              $this->_crc32[] = crc32($token);
          }
          return $this;
      }

      /**
      * Returns the CRC32 checksum of every token, computing them on first use.
      * @return array list of integers, in the same order as $_tokens
      */
      public function getCrc32array ()
      {
          if (count($this->_crc32) === 0) {
              $this->_makeCrc32array();
          }
          return $this->_crc32;
      }

      /**
      * Splits the normalised content on spaces and dots and keeps the
      * MAX_TOKENS longest distinct words of at least MIN_TOKEN_LENGTH characters.
      * Fills $_tokens and $length.
      * @return Document_Hash $this, for chaining
      */
      private function _splitToTokens ()
      {
          $words = array();
          $lengths = array();
          $seen = array();

          foreach (explode(" ", str_replace(".", " ", $this->_doc_content)) as $word) {
              // Length is measured in characters, not bytes, so that multi-byte
              // words are filtered and ranked on the same scale as ASCII ones
              $chars = mb_strlen($word, $this->_charset);
              if ($chars < self::MIN_TOKEN_LENGTH || isset($seen[$word])) {
                  continue;
              }
              $seen[$word] = true;
              $words[] = $word;
              $lengths[] = $chars;
          }

          if (count($words) > 1) {
              // Longest first; ties are broken by plain string comparison so that
              // numeric-looking words ("1000" vs "1e03") get a deterministic order
              array_multisort($lengths, SORT_DESC, SORT_NUMERIC, $words, SORT_ASC, SORT_STRING);
          }

          $this->_tokens = array_slice($words, 0, self::MAX_TOKENS);
          $this->length = count($this->_tokens);
          return $this;
      }

      /**
      * Normalises $_doc_content in place: strips tags, trims, lower-cases,
      * decodes HTML entities and reduces punctuation/whitespace.
      * The order of the steps matters; each one works on the previous result.
      * @return Document_Hash $this, for chaining
      * @throws RuntimeException if a regex replacement fails
      */
      private function _genericReplacements ()
      {
          $content = strip_tags($this->_doc_content);
          $content = trim($content);
          $content = mb_strtolower($content, $this->_charset);

          // Remove dots between chars (for things like urls). Lookarounds are used
          // so the neighbouring letters are not consumed and "a.b.c" is fully joined.
          $content = $this->_my_preg_replace('/(?<=[a-z])\.+(?=[a-z])/', '', $content);
          // Decode all html entities
          $content = html_entity_decode($content, ENT_COMPAT, $this->_charset);
          // Replace runs of whitespace with one space. The pipe is deliberately
          // treated as a separator too, as the original pattern did.
          $content = $this->_my_preg_replace('/[\s|]+/', ' ', $content);
          // Remove dots, dashes and spaces between digits (lookarounds, so that
          // "192.168.1.1" collapses completely instead of every other separator)
          $content = $this->_my_preg_replace('/(?<=[0-9])[.\s\-]+(?=[0-9])/', '', $content);
          // Remove spaces after sentences and replace multiple dots with just one dot
          $content = $this->_my_preg_replace('/\.+ /', '.', $content);
          // The same for sentences ending with question marks
          $content = $this->_my_preg_replace('/\?+ /', '.', $content);
          // The same for "!"
          $content = $this->_my_preg_replace('/!+ /', '.', $content);
          // Remove everything except latin letters, cyrillic U+0430..U+044F (and
          // their upper-case forms via /i), digits, whitespace, dots and "@".
          // NOTE(review): U+0451 is outside this range, as in the original a-ya range.
          $content = $this->_my_preg_replace('/[^a-z\x{0430}-\x{044f}.\d\s@]+/i', '', $content);

          $this->_doc_content = $content;
          return $this;
      }

      /**
      * Tells whether the document charset is UTF-8 (compared case-insensitively).
      * @return bool
      */
      private function _isUtf8Charset ()
      {
          return strcasecmp($this->_charset, "UTF-8") === 0
              || strcasecmp($this->_charset, "UTF8") === 0;
      }

      /**
      * Wrapper for preg_replace that works for unicode and non-unicode charsets.
      * The pattern is always run in UTF-8 mode; a subject in another charset is
      * converted to UTF-8 first and the result converted back, so patterns can
      * use \x{...} code points regardless of the document charset.
      * @param string $regex   Delimited pattern, optionally followed by modifiers (without "u")
      * @param string $replace Replacement string
      * @param string $subject Text in the document charset
      * @return string Result in the document charset
      * @throws RuntimeException if preg_replace reports an error
      **/
      private function _my_preg_replace ($regex, $replace, $subject)
      {
          $utf8 = $this->_isUtf8Charset();
          if (!$utf8) {
              $subject = mb_convert_encoding($subject, "UTF-8", $this->_charset);
          }

          $result = preg_replace($regex . "u", $replace, $subject);
          if ($result === null) {
              // Do not let a failed replacement silently turn the document into null
              throw new RuntimeException(
                  "preg_replace failed for pattern " . $regex . " (PCRE error code " . preg_last_error() . ")"
              );
          }

          if (!$utf8) {
              $result = mb_convert_encoding($result, $this->_charset, "UTF-8");
          }
          return $result;
      }

  }
  127.  
  128. ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement