Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- <?php
- /******************************************************************************
- * Copyright (c) 1994-2002
- * Andrew Kovalenko aka Keva. All rights reserved.
- * http://www.keva.ru/
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Redistributions in any form must be accompanied by information on
- * how to obtain complete source code for the stemming software and any
- * accompanying software that uses the stemming software. The source code
- * must either be included in the distribution or be available for no
- * more than the cost of distribution plus a nominal fee, and must be
- * freely redistributable under reasonable conditions. For an
- * executable file, complete source code means the source code for all
- * modules it contains. It does not include source code for modules or
- * files that typically accompany the major components of the operating
- * system on which the executable file runs.
- *
- * THIS SOFTWARE IS PROVIDED BY ANDREW KOVALENKO ``AS IS'' AND ANY EXPRESS
- * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR
- * NON-INFRINGEMENT, ARE DISCLAIMED. IN NO EVENT SHALL ANDREW KOVALENKO
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- * THE POSSIBILITY OF SUCH DAMAGE.
- *****************************************************************************/
- /******************************************************************************
- * Porting on PHP by Sedlyar Alexandr aka Seth 27.08.2009 All rights reserved.
- * http://seteh.in.ua
- *****************************************************************************/
/**
 * Dictionary-driven stemmer: finds candidate stem lengths for a word by
 * walking the word backwards (last byte first) through a binary table
 * loaded from "<path>fuzzy<lang>.dat".
 *
 * The whole algorithm is byte-oriented: words are indexed byte by byte and
 * compared with single bytes of the table.
 * NOTE(review): this implies the word, $vowels and the .dat file share one
 * single-byte encoding; if this source file is saved as UTF-8 the Cyrillic
 * $vowels literal becomes multi-byte - confirm the intended encoding.
 */
class stemka {
    /** Maximum number of stem variants collected for one word. */
    const MAX_STEMS = 5;

    /** Bytes treated as vowels by GetMinStem(). */
    public $vowels = "аеиоуыэюя";

    /**
     * Language for which GetStem()/GetStemCrop() reuse the already loaded
     * table without touching the disk. Never written by this class.
     */
    public $fuzzyLang = '';

    /** Raw contents of the loaded table ('' when nothing is loaded). */
    public $fuzzy = '';

    /** Lower bound used by GetStemLen(); set from GetMinStem() per word. */
    public $minstem = 0;

    /** How many more stem variants GetStemLen() may still record. */
    public $mcount = 0;

    /** Stem lengths recorded by GetStemLen() for the current word. */
    public $lplist = array();

    /** Full name of the file $fuzzy was read from, or null. */
    private $fuzzyFile = null;

    /**
     * Splits a word into its stem and the detected ending variants.
     *
     * @param string $word word to analyse
     * @param string $lang language suffix of the table file name
     * @param string $path directory prefix of the table file (with trailing separator)
     * @return string the stem followed by "|"-prefixed pieces, one per
     *                recorded stem length; the word itself when nothing
     *                was found or no table could be loaded
     */
    public function GetStem($word, $lang = '', $path = '') {
        $cuts = $this->collectStemLengths($word, $lang, $path);
        if (!$cuts) {
            return $word;
        }
        $tail = '';
        // Cut from the last recorded length back to the first, so each piece
        // is taken from what is left of the word after the previous cut.
        foreach (array_reverse($cuts) as $cut) {
            $tail = '|' . substr($word, $cut, strlen($word) - $cut) . $tail;
            $word = substr($word, 0, $cut);
        }
        return $word . $tail;
    }

    /**
     * Returns the word cropped to the first stem length that was recorded.
     *
     * @param string $word word to analyse
     * @param string $lang language suffix of the table file name
     * @param string $path directory prefix of the table file (with trailing separator)
     * @return string the cropped word, or the word itself when nothing was
     *                found or no table could be loaded
     */
    public function GetStemCrop($word, $lang = '', $path = '') {
        $cuts = $this->collectStemLengths($word, $lang, $path);
        return $cuts ? substr($word, 0, $cuts[0]) : $word;
    }

    /**
     * Computes the value GetStemLen() uses as its lower bound: the index
     * just past the first run of vowels, plus one.
     *
     * @param string $word
     * @return int strlen($word) when the word has no vowel; otherwise the
     *             value described above (it exceeds strlen($word) when the
     *             first vowel run reaches the end of the word)
     */
    public function GetMinStem($word) {
        $length = strlen($word);
        for ($i = 0; $i < $length; $i++) {
            if (strpos($this->vowels, $word[$i]) === false) {
                continue;
            }
            // Skip the rest of the vowel run that starts here.
            do {
                $i++;
            } while ($i < $length && strpos($this->vowels, $word[$i]) !== false);
            return $i + 1;
        }
        return $length;
    }

    /**
     * Recursively matches the word, from byte $lptail backwards, against the
     * table node at $offset and appends every accepted stem length to
     * $this->lplist (at most $this->mcount of them).
     *
     * Node layout as read by this code: one count byte, then that many
     * letter bytes, then one 16-bit little-endian value per letter; the
     * value shifted left by 3 is the offset of the child node. A zero letter
     * byte marks a terminal entry.
     * NOTE(review): the "+ 2" / "- 2" adjustments suggest the table stores
     * each ending together with two preceding letters - confirm against the
     * table format. The table is assumed to be well formed; reads inside a
     * node are not bounds-checked.
     *
     * @param string $word   word being analysed
     * @param int    $offset offset of the node inside $this->fuzzy
     * @param int    $lptail index of the word byte to match at this node
     * @return int number of stem lengths recorded by this call and its children
     */
    public function GetStemLen($word, $offset, $lptail) {
        // No table, or an offset outside it: nothing can match.
        if (!isset($this->fuzzy[$offset])) {
            return 0;
        }
        $lpbase = $offset;
        $cchars = ord($this->fuzzy[$lpbase]);
        $pchars = $lpbase + $cchars;            // last letter of the node
        $lpoffs = ($pchars - 1) + $cchars * 2;  // child value of that letter
        // Before the start of the word there is no letter. Indexing with a
        // negative offset would wrap to the end of the word on PHP >= 7.1.
        $letter = $lptail >= 0 ? $word[$lptail] : '';
        $stmlen = $lptail + 1;
        $result = 0;
        while ($pchars > $lpbase) {
            // Walk down to the next letter that is either the terminal
            // marker or equal to the current word letter.
            while ($pchars > $lpbase
                && ord($this->fuzzy[$pchars]) != 0
                && $this->fuzzy[$pchars] !== $letter
            ) {
                $pchars--;
                $lpoffs -= 2;
            }
            if ($pchars <= $lpbase) {
                break;
            }
            if ($this->fuzzy[$pchars] === $letter && $stmlen > $this->minstem - 2) {
                $child = (ord($this->fuzzy[$lpoffs + 1]) << 8) + ord($this->fuzzy[$lpoffs]);
                $result += $this->GetStemLen($word, $child << 3, $lptail - 1);
            } elseif (ord($this->fuzzy[$pchars]) == 0
                && $stmlen >= $this->minstem - 2
                && $this->mcount != 0
            ) {
                $this->lplist[] = $stmlen + 2;
                $this->mcount--;
                return $result + 1;
            }
            $pchars--;
            $lpoffs -= 2;
        }
        return $result;
    }

    /**
     * Reads "<path>fuzzy<lang>.dat" into $this->fuzzy.
     *
     * @param string $lang language suffix of the file name
     * @param string $path directory prefix (with trailing separator)
     * @return bool true when the file was read; false otherwise, in which
     *              case $this->fuzzy is reset to ''
     */
    public function LoadFuzzy($lang = '', $path = '') {
        $file = $path . 'fuzzy' . $lang . '.dat';
        $data = (is_file($file) && is_readable($file)) ? file_get_contents($file) : false;
        if ($data === false) {
            $this->fuzzy = '';
            $this->fuzzyFile = null;
            return false;
        }
        $this->fuzzy = $data;
        $this->fuzzyFile = $file;
        return true;
    }

    /**
     * Resets the per-word state, makes sure a table is available and runs
     * the table walk for the whole word.
     *
     * @return array recorded stem lengths in the order they were found;
     *               empty for an empty word or when no table could be loaded
     */
    private function collectStemLengths($word, $lang, $path) {
        $this->minstem = $this->GetMinStem($word);
        $this->mcount = self::MAX_STEMS;
        $this->lplist = array();
        if ((string) $word === '' || !$this->ensureDictionary($lang, $path)) {
            return array();
        }
        $this->GetStemLen($word, 0, strlen($word) - 1);
        return $this->lplist;
    }

    /**
     * Loads the table for $lang/$path unless a usable one is already in memory.
     *
     * A loaded table is reused when $lang equals $this->fuzzyLang (the case
     * in which this class never reloaded) or when it was read from exactly
     * the requested file. When nothing is loaded yet the file is always
     * read, so the default language works without an explicit LoadFuzzy().
     *
     * @return bool true when a table is available
     */
    private function ensureDictionary($lang, $path) {
        $file = $path . 'fuzzy' . $lang . '.dat';
        $loaded = is_string($this->fuzzy) && $this->fuzzy !== '';
        if ($loaded && ($this->fuzzyLang == $lang || $this->fuzzyFile === $file)) {
            return true;
        }
        return $this->LoadFuzzy($lang, $path);
    }
}
- ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement