Advertisement
Guest User

Untitled

a guest
Apr 26th, 2017
1,531
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.92 KB | None | 0 0
  1. <?php
  2. /*
  3. ***************************************************************************
  4. * Copyright (C) 2008 by Felipe Ribeiro *
  5. * felipernb@gmail.com *
  6. * http://www.feliperibeiro.com *
  7. * *
  8. * Permission is hereby granted, free of charge, to any person obtaining *
  9. * a copy of this software and associated documentation files (the *
  10. * "Software"), to deal in the Software without restriction, including *
  11. * without limitation the rights to use, copy, modify, merge, publish, *
  12. * distribute, sublicense, and/or sell copies of the Software, and to *
  13. * permit persons to whom the Software is furnished to do so, subject to *
  14. * the following conditions: *
  15. * *
  16. * The above copyright notice and this permission notice shall be *
  17. * included in all copies or substantial portions of the Software. *
  18. * *
  19. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, *
  20. * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF *
  21. * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*
  22. * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR *
  23. * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, *
  24. * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR *
  25. * OTHER DEALINGS IN THE SOFTWARE. *
  26. ***************************************************************************
  27. */
  28.  
  29.  
  /**
   * Implements spell correction, useful for "Did you mean" functionality in a
   * search engine. Corrections are chosen from a dictionary of known words and
   * their frequencies, loaded from the file "serialized_dictionary.txt".
   *
   * Based on the concepts of Peter Norvig: http://norvig.com/spell-correct.html
   *
   * All string handling is byte-wise and the edit alphabet is a-z only.
   *
   * @author Felipe Ribeiro <felipernb@gmail.com>
   * @date September 18th, 2008
   * @package catalog
   */
  class SpellCorrector {
      /**
       * Dictionary: word => relevance (number of occurrences). Stays null until
       * the first call to correct() tries to load it.
       *
       * @var array|null
       */
      private static $NWORDS;

      /**
       * Reads a text and extracts the list of lowercase a-z words it contains.
       *
       * @param string $text
       * @return array The list of words
       */
      private static function words($text) {
          $matches = array();
          preg_match_all("/[a-z]+/", strtolower($text), $matches);
          return $matches[0];
      }

      /**
       * Builds the dictionary table from "word=count" lines: the word is the key
       * and the value is its relevance (the number of times it appears).
       *
       * Lines that are blank, have no "=" separator, or have an empty word are
       * skipped instead of raising an undefined-index notice.
       *
       * @param array $features Lines in "word=count" form (trailing newlines allowed)
       * @return array word => integer count
       */
      private static function train(array $features) {
          $model = array();
          foreach ($features as $feature) {
              $parts = explode("=", trim($feature));
              if (count($parts) < 2) {
                  continue; // blank or malformed line: no count to read
              }
              $key = trim($parts[0]);
              if ($key === '') {
                  continue;
              }
              $model[$key] = intval($parts[1]);
          }
          return $model;
      }

      /**
       * Generates every string at edit distance one from the given word:
       * deletions, substitutions, adjacent transpositions and insertions.
       *
       * The generation order is significant: correct() keeps the first candidate
       * among those sharing the highest relevance.
       *
       * @param string $word
       * @return array List of edited strings (may contain duplicates)
       */
      private static function edits1($word) {
          $alphabet = str_split('abcdefghijklmnopqrstuvwxyz');
          $n = strlen($word);
          $edits = array();

          for ($i = 0; $i < $n; $i++) {
              $head = substr($word, 0, $i);
              $tail = substr($word, $i + 1);
              $edits[] = $head . $tail; // deleting one char
              foreach ($alphabet as $c) {
                  $edits[] = $head . $c . $tail; // substituting one char
              }
          }
          for ($i = 0; $i < $n - 1; $i++) {
              // swapping the order of two adjacent chars
              $edits[] = substr($word, 0, $i) . $word[$i + 1] . $word[$i] . substr($word, $i + 2);
          }
          for ($i = 0; $i < $n + 1; $i++) {
              $head = substr($word, 0, $i);
              $tail = substr($word, $i);
              foreach ($alphabet as $c) {
                  $edits[] = $head . $c . $tail; // inserting one char
              }
          }

          return $edits;
      }

      /**
       * Generates the strings at edit distance two from the given word and keeps
       * only those present in the dictionary.
       *
       * @param string $word
       * @return array List of dictionary words (may contain duplicates)
       */
      private static function known_edits2($word) {
          $known = array();
          foreach (self::edits1($word) as $e1) {
              foreach (self::edits1($e1) as $e2) {
                  if (array_key_exists($e2, self::$NWORDS)) {
                      $known[] = $e2;
                  }
              }
          }
          return $known;
      }

      /**
       * Given a list of words, returns the subset that is present in the dictionary.
       *
       * @param array $words
       * @return array
       */
      private static function known(array $words) {
          $known = array();
          foreach ($words as $w) {
              if (array_key_exists($w, self::$NWORDS)) {
                  $known[] = $w;
              }
          }
          return $known;
      }

      /**
       * Loads the dictionary from "serialized_dictionary.txt" (resolved against
       * the current working directory) into self::$NWORDS.
       *
       * When the file is missing or unreadable the error message is printed and
       * the dictionary is left as an empty array, so lookups stay well-defined
       * and the next call to correct() retries the load.
       *
       * @return void
       */
      private static function loadDictionary() {
          self::$NWORDS = array();

          $lines = false;
          if (file_exists('serialized_dictionary.txt')) {
              $lines = file('serialized_dictionary.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
          }
          if ($lines === false) {
              echo "ERROR!! Please keep serialized_dictionary.txt in the same directory!!";
              return;
          }

          self::$NWORDS = self::train($lines);
      }

      /**
       * Returns the dictionary word that is the most similar (and the most
       * relevant) to the word passed as parameter.
       *
       * Lookup order: the word itself, then known words at edit distance one,
       * then known words at edit distance two. Among the candidates of the first
       * non-empty level, the one with the highest relevance wins. If nothing
       * matches, or no dictionary could be loaded, the lowercased, trimmed input
       * is returned unchanged.
       *
       * @param string $word
       * @return string|null The corrected word, or null when the input is empty
       */
      public static function correct($word) {
          $word = trim($word);
          // NOTE: empty() also treats the string "0" as empty, as the original did.
          if (empty($word)) {
              return null;
          }

          $word = strtolower($word);

          // The dictionary is parsed once and cached for subsequent calls.
          if (empty(self::$NWORDS)) {
              self::loadDictionary();
          }
          if (empty(self::$NWORDS)) {
              // Nothing to correct against; skip the costly edit enumeration.
              return $word;
          }

          if (array_key_exists($word, self::$NWORDS)) {
              return $word;
          }

          $candidates = self::known(self::edits1($word));
          if (!$candidates) {
              $candidates = self::known_edits2($word);
          }
          if (!$candidates) {
              return $word;
          }

          // Pick the most relevant candidate; the first one wins on ties.
          $max = 0;
          foreach ($candidates as $c) {
              $value = self::$NWORDS[$c];
              if ($value > $max) {
                  $max = $value;
                  $word = $c;
              }
          }
          return $word;
      }
  }
  189.  
  190. ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement