Advertisement
Guest User

Eng/Rus/Ukr Stemmer by Andrew Kovalenko

a guest
Aug 18th, 2010
899
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 6.38 KB | None | 0 0
  1. <?php
  2.     /******************************************************************************
  3.     * Copyright (c) 1994-2002
  4.     * Andrew Kovalenko aka Keva.  All rights reserved.
  5.     *      http://www.keva.ru/
  6.     *
  7.     * Redistribution and use in source and binary forms, with or without
  8.     * modification, are permitted provided that the following conditions
  9.     * are met:
  10.     * 1. Redistributions of source code must retain the above copyright
  11.     *    notice, this list of conditions and the following disclaimer.
  12.     * 2. Redistributions in binary form must reproduce the above copyright
  13.     *    notice, this list of conditions and the following disclaimer in the
  14.     *    documentation and/or other materials provided with the distribution.
  15.     * 3. Redistributions in any form must be accompanied by information on
  16.     *    how to obtain complete source code for the stemming software and any
  17.     *    accompanying software that uses the stemming software.  The source code
  18.     *    must either be included in the distribution or be available for no
  19.     *    more than the cost of distribution plus a nominal fee, and must be
  20.     *    freely redistributable under reasonable conditions.  For an
  21.     *    executable file, complete source code means the source code for all
  22.     *    modules it contains.  It does not include source code for modules or
  23.     *    files that typically accompany the major components of the operating
  24.     *    system on which the executable file runs.
  25.     *
  26.     * THIS SOFTWARE IS PROVIDED BY ANDREW KOVALENKO ``AS IS'' AND ANY EXPRESS
  27.     * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  28.     * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR
  29.     * NON-INFRINGEMENT, ARE DISCLAIMED.  IN NO EVENT SHALL ANDREW KOVALENKO
  30.     * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31.     * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32.     * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33.     * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34.     * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35.     * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  36.     * THE POSSIBILITY OF SUCH DAMAGE.
  37.     *****************************************************************************/
  38.     /******************************************************************************
  39.     * Ported to PHP by Sedlyar Alexandr aka Seth, 27.08.2009. All rights reserved.
  40.     * http://seteh.in.ua
  41.     *****************************************************************************/
  /**
   * Table-driven suffix stemmer.
   *
   * A word is matched from its last byte backwards against a precompiled
   * binary suffix table (file "fuzzy<lang>.dat"). Every table path that ends
   * in a terminal entry yields one candidate stem length; at most five
   * candidates are collected per word.
   *
   * All string handling is byte-wise (strlen/substr/single-byte offsets).
   * NOTE(review): presumably words, $vowels and the .dat tables are expected
   * to share one single-byte encoding (e.g. Windows-1251) -- confirm against
   * the table files before feeding UTF-8 text.
   */
  class stemka {
      /** Bytes that GetMinStem() treats as vowels. */
      public $vowels = "аеиоуыэюя";

      /**
       * Language code meaning "use the table that is already in memory".
       * A call whose $lang equals this value never replaces a loaded table.
       */
      public $fuzzyLang = '';

      /** Raw bytes of the loaded suffix table; null until a load is attempted, '' after a failed load. */
      public $fuzzy = null;

      /** Minimal stem length of the word being processed (set by GetStem/GetStemCrop). */
      public $minstem = 0;

      /** Number of candidates that may still be recorded for the current word. */
      public $mcount = 0;

      /** Candidate stem lengths found for the current word, or null when none were recorded. */
      public $lplist = null;

      /** Language of the table last read by LoadFuzzy(), or null if the last load failed / none happened. */
      private $loadedLang = null;

      /** Path prefix of the table last read by LoadFuzzy(), or null. */
      private $loadedPath = null;

      /**
       * Splits a word into its shortest candidate stem and the remaining
       * suffix segments, separated by '|' (e.g. "stem|a").
       *
       * @param string $word word to stem (processed byte-wise)
       * @param string $lang language code; selects the file "fuzzy<lang>.dat"
       * @param string $path prefix prepended to the table file name
       * @return string the segmented word, or $word unchanged when no table
       *                is available or nothing matched
       */
      public function GetStem($word, $lang = '', $path = '') {
          if ($this->CollectStems($word, $lang, $path) === false) {
              return $word;
          }
          $result = '';
          if (is_array($this->lplist)) {
              // Cut from the longest candidate down to the shortest so each
              // substr() removes exactly one segment from the tail.
              foreach (array_reverse($this->lplist) as $cut) {
                  $result = '|' . substr($word, $cut, strlen($word) - $cut) . $result;
                  $word = (string) substr($word, 0, $cut);
              }
          }
          return $word . $result;
      }

      /**
       * Returns the word cropped to the first recorded candidate stem.
       *
       * @param string $word word to stem (processed byte-wise)
       * @param string $lang language code; selects the file "fuzzy<lang>.dat"
       * @param string $path prefix prepended to the table file name
       * @return string the cropped word, or $word unchanged when no table is
       *                available or nothing matched
       */
      public function GetStemCrop($word, $lang = '', $path = '') {
          $found = $this->CollectStems($word, $lang, $path);
          if ($found === false || $found <= 0) {
              return $word;
          }
          return (string) substr($word, 0, $this->lplist[0]);
      }

      /**
       * Computes the minimal stem length: the position just past the first
       * run of vowel bytes, plus one.
       *
       * @param string $word
       * @return int strlen($word) when the word contains no vowel byte; the
       *             value may exceed strlen($word) when the word ends inside
       *             or right after its first vowel run
       */
      public function GetMinStem($word) {
          $length = strlen($word);
          for ($i = 0; $i < $length; $i++) {
              if (strpos($this->vowels, $word[$i]) === false) {
                  continue;
              }
              // Skip the rest of this vowel run.
              do {
                  $i++;
              } while ($i < $length && strpos($this->vowels, $word[$i]) !== false);
              return $i + 1;
          }
          return $length;
      }

      /**
       * Walks one node of the suffix table and records candidate stem lengths.
       *
       * Node layout as read by this method: the byte at $offset holds the
       * entry count n; the next n bytes are the entry characters (a zero byte
       * marks a terminal entry); they are followed by n little-endian 16-bit
       * values, one per entry, giving the child node offset divided by 8.
       * Entries are scanned from the last one to the first.
       *
       * Candidates are appended to $this->lplist as ($lptail + 1) + 2 and
       * each one decrements $this->mcount; recording stops when it reaches 0.
       *
       * @param string $word   word being stemmed
       * @param int    $offset byte offset of the node inside $this->fuzzy
       * @param int    $lptail index of the word byte to compare at this node
       * @return int number of candidates recorded by this node and its children
       */
      public function GetStemLen($word, $offset, $lptail) {
          $table = $this->fuzzy;
          // No table, or a node offset outside of it: nothing can match.
          if (!is_string($table) || $offset < 0 || !isset($table[$offset])) {
              return 0;
          }
          $count = ord($table[$offset]);
          $pchars = $offset + $count;                 // position of the last entry character
          $lpoffs = ($pchars - 1) + $count * 2;       // position of that entry's 16-bit child link
          $stmlen = $lptail + 1;
          // A position before the start of the word matches no table byte.
          // (Since PHP 7.1 a negative string offset would index from the end.)
          $wordByte = ($lptail >= 0 && isset($word[$lptail])) ? ord($word[$lptail]) : -1;
          $result = 0;
          while ($pchars > $offset) {
              $tableByte = isset($table[$pchars]) ? ord($table[$pchars]) : 0;
              if ($tableByte !== 0 && $tableByte !== $wordByte) {
                  // Neither a terminal nor a match: try the previous entry.
                  $pchars--;
                  $lpoffs -= 2;
                  continue;
              }
              if ($tableByte === $wordByte && $stmlen > $this->minstem - 2) {
                  $low = isset($table[$lpoffs]) ? ord($table[$lpoffs]) : 0;
                  $high = isset($table[$lpoffs + 1]) ? ord($table[$lpoffs + 1]) : 0;
                  $result += $this->GetStemLen($word, (($high << 8) + $low) << 3, $lptail - 1);
              } elseif ($tableByte === 0 && $stmlen >= $this->minstem - 2 && $this->mcount != 0) {
                  $this->lplist[] = $stmlen + 2;
                  $this->mcount--;
                  return $result + 1;
              }
              $pchars--;
              $lpoffs -= 2;
          }
          return $result;
      }

      /**
       * Reads the table file "<path>fuzzy<lang>.dat" into $this->fuzzy.
       *
       * @param string $lang language code inserted into the file name
       * @param string $path prefix prepended to the file name
       * @return bool true when the file was read; false otherwise, in which
       *              case $this->fuzzy is reset to ''
       */
      public function LoadFuzzy($lang = '', $path = '') {
          $file = $path . 'fuzzy' . $lang . '.dat';
          $data = (is_file($file) && is_readable($file)) ? file_get_contents($file) : false;
          if ($data === false) {
              $this->fuzzy = '';
              $this->loadedLang = null;
              $this->loadedPath = null;
              return false;
          }
          $this->fuzzy = $data;
          $this->loadedLang = (string) $lang;
          $this->loadedPath = (string) $path;
          return true;
      }

      /**
       * Resets the per-word state, makes sure a table is available and runs
       * the table walk from the root node.
       *
       * @return int|false number of candidates recorded, or false when no
       *                   table could be made available
       */
      private function CollectStems($word, $lang, $path) {
          $this->minstem = $this->GetMinStem($word);
          $this->mcount = 5; // upper bound on candidates recorded per word
          $this->lplist = null;
          if (!$this->EnsureFuzzy($lang, $path)) {
              return false;
          }
          return $this->GetStemLen($word, 0, strlen($word) - 1);
      }

      /**
       * Makes the table for ($lang, $path) available, reading the file only
       * when necessary.
       *
       * - $lang equal to $fuzzyLang: keep whatever table is in memory; if
       *   there is none, try to load the file for $lang.
       * - any other $lang: reuse the table if LoadFuzzy() last read exactly
       *   this language and path, otherwise load it.
       *
       * @return bool true when a table is available
       */
      private function EnsureFuzzy($lang, $path) {
          $hasTable = is_string($this->fuzzy) && $this->fuzzy !== '';
          if ((string) $this->fuzzyLang === (string) $lang) {
              return $hasTable || $this->LoadFuzzy($lang, $path);
          }
          if ($hasTable
              && $this->loadedLang === (string) $lang
              && $this->loadedPath === (string) $path) {
              return true;
          }
          return $this->LoadFuzzy($lang, $path);
      }
  }
  135. ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement