Advertisement
Guest User

urevic

a guest
Feb 16th, 2009
843
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 5.98 KB | None | 0 0
  1. <?php if (! defined('BASE_PATH')) exit('No direct script access allowed');
  2.  
  3. require_once (BASE_PATH . "/lib/Document_Hash.php");
  4. require_once (BASE_PATH . "/lib/Model.php");
  5.  
  6. /**
  7. * @class Itemhashes_Model
  8. * Model for news duplicates detection
  9. *
  10. * Usage:
  11. * 1. Check if the news item already exists with isDup()
  12. * 2. If it doesn't exist insert it to the database
  13. * 3. Get id of inserted item with mysql_insert_id()
  14. * 4. Tell this id to the instance of DupNewsModel with setDocId
  15. * 5. Save the news item hashes with saveHashes()
  16. *
  17. * Notes:
  18. * You need only text of the item for this class (without title, date etc).
  19. * The text should be in UTF-8.
  20. *
  21. * @author Stanislav Perederiy
  22. */
  23. class Itemhashes_Model extends Model
  24. {
  25.     protected $_table = "items_hashes_summary";
  26.     protected $_table2 = "items_hashes";
  27.     /// Instance of hashing class
  28.     protected $_hash = null;
  29.     /// Document id for saving hashes in database
  30.     protected $_docId = null;
  31.  
  32.     public function __construct($text = null, $docId = null)
  33.     {
  34.         parent::__construct();
  35.         if ( !empty($text) ) {
  36.             $this->_hash = new Document_Hash($text);
  37.         }
  38.  
  39.         $this->_docId = $docId;
  40.         $this->initDB();
  41.         $this->_table2 = COUNTRY_DB_TABLE_PREFIX . $this->_table2;
  42.     }
  43.  
  44.     public function setDocId ($docId)
  45.   {
  46.         $this->_docId = $docId;
  47.         return $this;
  48.     }
  49.  
  50.     public function isDup()
  51.     {
  52.         if (!$this->_isHashValid()) {
  53.             throw new Exception("Error: It seems that something went wrong during hashing!", 500);
  54.             return 0;
  55.         }
  56.  
  57.         $conditions = "";
  58.         if ( $this->_docId != null ) {
  59.             $conditions = "doc_id != " . $this->_docId;
  60.         }
  61.         else {
  62.             $conditions = "1";
  63.         }
  64.  
  65.         $result = $this->_db->query("SELECT doc_id FROM {$this->_table} WHERE $conditions AND full_hash='{$this->_hash->docMD5}' AND
  66.                                                                                length={$this->_hash->length}");
  67.         if ( $id = $result->fetchColumn() ) {
  68. /*          while (list ($id) = mysql_fetch_array($result)) {
  69.                 echo "Doc is equal to $id\n";
  70.             }*/
  71.             return $id;
  72.         }
  73.  
  74.         $crc32 = $this->_hash->getCrc32array();
  75.         $hashClause = "0";
  76.         foreach ($crc32 as $token_hash) {
  77.             $hashClause .= " OR word_hash=$token_hash";
  78.         }
  79.  
  80.         $result = $this->_db->query("SELECT doc_id, COUNT(id) as inters FROM {$this->_table2} WHERE $conditions AND ($hashClause) GROUP BY doc_id HAVING inters>1");
  81.         while (list($id, $intersecs) = $result->fetch()) {
  82.             $result2 = $this->_db->query("SELECT length FROM {$this->_table} WHERE doc_id=$id");
  83.             $length = $result2->fetchColumn();
  84.             $length = min($length, $this->_hash->length);
  85.             $similarity = ($intersecs/$length)*100;  // Similarity between 2 docs in percents
  86.             //echo "Detected $intersecs of $length intersections with $id ($similarity %)\n";
  87.             if ($similarity > 80) {
  88.                 return $id;
  89.             }
  90.  
  91.         }
  92.  
  93.         return 0;
  94.     }
  95.  
  96.     public function findSimilar()
  97.     {
  98.         $crc32 = $this->_hash->getCrc32array();
  99.         $hashClause = "0";
  100.         foreach ($crc32 as $token_hash) {
  101.             $hashClause .= " OR word_hash=$token_hash";
  102.         }
  103.  
  104.         if ( isset($this->_docId) ) {
  105.             $doc_id_clause = "doc_id != " . $this->_docId;
  106.         }
  107.         else {
  108.             $doc_id_clause = " 1 ";
  109.         }
  110.  
  111.         echo "SELECT doc_id, COUNT(id) as inters FROM {$this->_table2} WHERE $doc_id_clause AND ($hashClause) GROUP BY doc_id HAVING inters>1";
  112.         $result = $this->_db->query("SELECT doc_id, COUNT(id) as inters FROM {$this->_table2} WHERE $doc_id_clause AND ($hashClause) GROUP BY doc_id HAVING inters>1");
  113.         while (list($id, $intersects) = $result->fetch()) {
  114.             $result2 = $this->_db->query("SELECT length FROM {$this->_table} WHERE doc_id=$id");
  115.             $length = $result2->fetchColumn();
  116.             $length = max($length, $this->_hash->length);
  117.             $similarity = ($intersects/$length)*100;  // Similarity between 2 docs in percents
  118.             if ($similarity > 75) {
  119.                 echo "Detected $intersects of $length intersections with $id ($similarity %)<br>\n";
  120.             }
  121.  
  122.         }
  123.  
  124.     }
  125.  
  126.     public function save()
  127.     {
  128.         // Some checks:
  129.  
  130.         if ($this->_docId == null) {
  131.             throw new Exception("Error: _docId should be defined in order to save!", 500);
  132.             return 0;
  133.         }
  134.         // Check if we have more or less correct results from hash class instace
  135.         if (!$this->_isHashValid()) {
  136.             throw new Exception("Error: It seems that something went wrong during hashing!", 500);
  137.             return 0;
  138.         }
  139.         // We have to check if we have this hashes of this document already
  140.         $result = $this->_db->query("SELECT doc_id FROM {$this->_table} WHERE doc_id={$this->_docId}");
  141.         $result2 = $this->_db->query("SELECT doc_id FROM {$this->_table2} WHERE doc_id={$this->_docId} LIMIT 0,1");
  142.         if ( $result->fetchColumn() || $result2->fetchColumn() ) {
  143.             // And delete them if we do
  144.             $this->delete();
  145.         }
  146.  
  147.  
  148.         // Saving process itself:
  149.         $this->_db->query("INSERT INTO {$this->_table} SET doc_id={$this->_docId},
  150.                                                                 full_hash='{$this->_hash->docMD5}',
  151.                                                                 length='{$this->_hash->length}'");
  152.  
  153.         $crc32 = $this->_hash->getCrc32array();
  154.         foreach ($crc32 as $word_hash) {
  155.             $this->_db->query("INSERT INTO {$this->_table2} SET doc_id={$this->_docId}, word_hash={$word_hash}");
  156.         }
  157.  
  158.     }
  159.  
  160.     public function delete()
  161.     {
  162.         $this->_db->query("DELETE FROM {$this->_table} WHERE doc_id=".$this->_docId);
  163.         $this->_db->query("DELETE FROM {$this->_table2} WHERE doc_id=".$this->_docId);
  164.     }
  165.  
  166.     protected function _isHashValid()
  167.     {
  168.         if ($this->_hash->length < 1 && $this->_hash->docMD5 != '') {
  169.             return 0;
  170.         }
  171.         return 1;
  172.     }
  173.  
  174. }
  175.  
  176. /*
  177.  
  178. Useful queries:
  179. SELECT doc_id, full_hash, COUNT(*) as qty FROM `terdo_items_hashes_summary` GROUP BY full_hash HAVING qty>1
  180.  
  181. SELECT doc_id, word_hash, COUNT(*) as qty FROM `terdo_items_hashes` GROUP BY word_hash HAVING qty>1
  182.  
  183. Table structure:
  184.  
  185. CREATE TABLE `items_hashes_summary` (
  186.   `doc_id` int(11) NOT NULL,
  187.   `full_hash` char(32) collate utf8_unicode_ci NOT NULL,
  188.   `length` smallint(11) NOT NULL,
  189.   PRIMARY KEY  (`doc_id`)
  190. )
  191.  
  192.  
  193. CREATE TABLE `items_hashes` (
  194.   `id` int(11) NOT NULL auto_increment,
  195.   `doc_id` int(11) NOT NULL,
  196.   `word_hash` int(11) NOT NULL,
  197.   PRIMARY KEY  (`id`)
  198. )
  199.  
  200.  
  201.  */
  202. ?>
  203.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement