lemmatizer

<?php

class Lemmatizer {

    /*
    ********************************************************************************
    ***     ATTRIBUTES
    ********************************************************************************
    */

    /*
        Regular expression for vowels

        @const
        @var string
    */
    const VOWEL = "[aiueo]";

    /*
        Regular expression for consonants

        @const
        @var string
    */
    const CONSONANT = "[bcdfghjklmnpqrstvwxyz]";

    /*
        Regular expression for a lowercase word, optionally containing one
        hyphen in between for pluralized or repetitive forms (e.g. 'buku-buku')

        @const
        @var string
    */
    const ALPHA = "[a-z]+-?[a-z]*";

    /*
        Holds the removed suffixes/prefixes for backtracking procedure

        @var array list of strings
    */
    protected $removed = array(
        'particle' => '',
        'possessive_pronoun' => '',
        'derivational_suffix' => '',
        'derivational_prefix' => ''
    );

    /*
        Serves as a container for successful dictionary lookup

        @var string
    */
    protected $found = null;

    /*
        Tracks all the changes made to the word; The array is indexed by
        the general prefix form, such as di,ke,se,be,me,pe,te.

        for example, the word 'menapak' undergoes transformation men-tapak.
        the variable's structure would be:
        ["me"] => (
                ["men"] => "t"
            )

        @var array
    */
    public $complex_prefix_tracker = array();

    /*
        Saves recoding path for corresponding rules; the array is indexed by
        the general prefix form.

        same as prefix tracker variable; this variable is structured as:
        ["me"] => (
                ["me"] => "n"
            )

        @var array
    */
    public $recoding_tracker = array();

    /*
        Serves as the error indicator if a termination condition occurs.
        The conditions are:
            > 'disallowed_pairs':   the identified prefix forms a disallowed
                                    affix combination with suffix that was
                                    removed in previous steps.
            > 'lemma_not_found'     the lemmatizer fails to detect input word.

        @var string
    */
    public $error = null;

    /*
        Records how many dictionary lookups have been performed

        @var integer
    */
    public $total_lookup = 0;


    /*
        Holds the PDO connection to the MySQL database.
        host: localhost
        user: root
        pass: <none>

        @var PDO Connection
    */
    protected $database;


    private $time;

    /*
    ********************************************************************************
    ***     METHODS
    ********************************************************************************
    */

    /*

        Opens new database connection on instance construction

    */
    public function __construct() {

        $this->database = new PDO("mysql:host=localhost;dbname=lemmatizer", "root", "");

        $this->time = microtime(true);

    }

    /*
        Checks the input word against the dictionary; returns the word if found,
        or returns false if not found

        @param string $word
        @return mixed
    */
    protected function lookup($word) {

        // If the input word's length is smaller then 3, don't bother.
        if(strlen($word)<3) return false;


        /*
            Saves input word for further processing

            @var string
        */
        $check = $word;

        $check2 = "";

        /*
            Saves query result from PDO Query

            @var string
        */
        $query_string;

        /*

            Checks for repeated form that represents pluralized form;
            for example 'buku-buku'

        */
        if(preg_match("/^([a-z]+)-([a-z]+)$/", $check, $match)) {

            if($match[1] == $match[2]) {

                $check = $match[1];
                $check2 = $word;

            }

        }

        /*

                Attempts to programatically split joined words, in order to produce lemmas with
                more than one word. The split method only works when the first word contains 2 syllables.

        */
        if(strlen($word) <= 6) {

            // executes lemma from database
            $query_string = "'$check'";

        }
        else {

            // regex string for a valid Indonesian syllable
            $syllable = "([bcdfghjklmnpqrstvwxyz]|sy)?([aiueo])(?U)([bcdfghjklmnpqrstvwxyz]|ng)?";

            // regex string for identifying the two words.
            $reg = "/^(?<first>aneka|({$syllable}{$syllable}))(?<second>{$syllable}{$syllable}(?U)({$syllable})*)$/";

            if(preg_match($reg, $word, $match)) {

                // Performs query via PDO
                $query_string = "'".$match['first']." ".$match['second']."' OR lemma LIKE '$check'";

            } else {

                $query_string = "'$check'";

            }
        }

        if($check2!="") {

            $query_string .= " OR lemma LIKE '$check2'";

        }

        /*

            If the checked word is ended with a vowel and the removed derivational suffix is -kan,
            there is a likely chance of overstemming; that is why the algorithm will check for
            both possibilities; with -k or without -k, sorted by its PART OF SPEECH (verb prioritized)

        */
        if(preg_match('/[aiueo]$/', $word) && $this->removed['derivational_suffix']=='kan' && strlen($word)>3) {

            $query_string .= " OR lemma LIKE '{$check}k' ORDER BY pos DESC";
        }

        /*
            Executes lookup to database.

            @var PDO Object
        */
        $query = $this->database->query("SELECT * FROM dictionary WHERE lemma LIKE $query_string LIMIT 1");

        // updates total dictionary lookup counter
        $this->total_lookup++;

        if($row = $query->fetch()) {

            // updates class property
            $this->found = $row['lemma'];

            // returns result to function caller
            return $this->found;

        }

    }


    /*
        Checks input word for rule precedence; If the input word has a confix:
        be - lah, be - an, me - i, di - i, pe - i, te - i
        Then, derivational prefix removal will be performed first

        @param string $word
        @return boolean
    */
    protected function check_rule_precedence($word) {

        /*
            Loads normalized alphabet regex (including stripes) from class' [constant];
            for shorthand purposes.

            @var string
        */
        $alpha = self::ALPHA;

        /*
            Regular expression for affix pairs:
            ber - lah
            ber - an
            me - i
            di - i
            pe - i
            ter - i

            @var array list of strings
        */
        $patterns = array(
                0 => "/^be(?<word>{$alpha})([^k]an|lah)$/",
                1 => "/^(me|di|pe|te)(?<word>{$alpha})(i)$/",
                2 => "/^(k|s)e(?<word>{$alpha})(i|kan)$/",
                3 => "/^(me|di|te|pe)(?<word>{$alpha})(an)$/",
                4 => "/^pe(?<word>{$alpha}(tah|[^k]an))/"
            );

        /*

            Checks whether the input word matches the affix pairs above;
            returns true if pattern is found, and false if not found

        */
        foreach($patterns as $pattern) {

            if(preg_match($pattern, $word, $match)
                && $match['word'] != 'ngalam') return true;

        }

        return false;

    }


    /*
        Checks whether the input word contains disallowed affix pairs/confixes;
        returns true if the word has disallowed pair

        @return boolean
    */
    protected function has_disallowed_pairs() {

        /*
            Loads normalized alphabet regex (including stripes) from class' [constant];
            for shorthand purposes.

            @var string
        */
        $alpha = self::ALPHA;

        /*
            Regular expression for disallowed affix pairs:
            be - i
            ke - i and kan
            se - i and kan
            di - an
            te - an

            @var array list of strings
        */
        $patterns = array(
            0 => "/^be[^r]i$/",
            1 => "/^(k|s)e(i|kan)$/",
            2 => "/^(di|me|te)[^krwylp]an$/"
        );

        /*

            Checks whether the identified derivational prefix and suffix matches the
            affix pairs above; returns true if pattern is found, and false if not found

        */
        if($this->removed["derivational_prefix"]!="" && $this->removed["derivational_suffix"]!="") {

            $prefix = reset($this->removed["derivational_prefix"]);

            foreach($patterns as $pattern) {

                if(preg_match($pattern, $prefix . $this->removed["derivational_suffix"])) {

                    return true;
                }
            }
        }

        // no disallowed pairs found, good to go
        return false;

    }


    /*
        Attempts to remove inflectional suffixes:
        (particles) -kah, -lah, -tah, -pun and (possessive pronoun) -ku, -mu, -nya
        from input word; Returns original value if no inflectional suffix found

        @param string $word
        @return string
    */
    protected function delete_inflectional_suffix($word) {

        /*
            Holds the value after suffix removal process

            @var string
        */
        $result = $word;

        /*
            Regular expression for Particle suffixes: (-kah, -lah, -tah, -pun)
            and Possessive Pronoun suffixes (-ku, -mu, -nya)

            @var array
        */
        $patterns = array(
                'particle' => "/([klt]ah|pun)$/",
                'possessive_pronoun' => "/([km]u|nya)$/"
            );

        /*

            Checks whether the input word contains inflectional suffix, with
            additional handling for Particle endings; because inflectional suffix
            can be stacked, e.g. "mobilnyapun"

        */
        foreach($patterns as $key => $pattern) {

            if(preg_match($pattern, $result, $match)) {

                $result = preg_replace($pattern, '', $result);

                // Updates the removed value holder
                $this->removed[$key] = $match[0];

                // Perform database lookup
                $check = $this->lookup($result);

                // If a lemma is successfully found, return it.
                if($check) return $check;

            }

        }

        // returns the suffix removal result
        return $result;

    }


    /*
        Attempts to remove derivational suffixes -i, -kan, -an from input word;
        Returns original value if no derivational suffix found

        @param string $word
        @return string
    */
    protected function delete_derivational_suffix($word) {

        /*
            Holds the value after suffix removal process

            @var string
        */
        $result = $word;

        /*
            Regular expression for derivational suffixes: -i, -kan, an

            @var string
        */
        $derivational_suffix = "/(i|k?an)$/";

        /*

            Checks whether input word contains derivational suffix; before
            stripping the suffix, an additional check for disallowed affix pair
            is performed

        */
        if(preg_match($derivational_suffix, $result, $match)) {

            // Removes the derivational suffix from given word
            $result = preg_replace($derivational_suffix, '', $result);

            // Updates the removed value holder
            $this->removed['derivational_suffix'] = $match[0];

            // Perform database lookup
            $check = $this->lookup($result);

            // If a lemma is successfully found, return it.
            if($check) return $check;

        }

        return $result;

    }


    /*
        Attempts to remove derivational prefixes di-, ke-, se-, be-, me-,
        pe-, te- from input word. Generally, derivational prefixes are divided into
        2 different groups:
            plain (di-, ke-, se-) and
            complex (be-,me-,pe-,te-)

        Complex prefixes need transformation rules for certain cases in order to
        correctly lemmatize the input word.

        @param string $word
        @return mixed
    */
    protected function delete_derivational_prefix($word) {

        /*
            Loads normalized vowel regex from class' [constant]; for shorthand purposes.

            @var string
        */
        $vowel = self::VOWEL;

        /*
            Loads normalized consonant regex from class' [constant]; for shorthand purposes.

            @var string
        */
        $consonant = self::CONSONANT;

        /*
            Loads normalized alphabet regex (including stripes) from class' [constant];
            for shorthand purposes.

            @var string
        */
        $alpha = self::ALPHA;

        /*
            Holds the value after prefix removal process

            @var string
        */
        $result = $word;

        /*
            Records what type of prefix is removed; plain or complex,
            in boolean form with [TRUE for plain]

            @var boolean
        */
        $type;

        /*
            Records what the matching prefix is for later use

            @var string
        */
        $prefix;

        /*
            Regular expressions for plain and complex derivational prefixes

            @var array list of strings
        */
        $patterns = array(
                'plain' => "/^(di|(k|s)e)/",
                'complex' => "/^(b|m|p|t)e/"
            );

        /*

            A check is performed; if the input word has less than four characters,
            then the prefix removal process will be skipped.

        */
        if(strlen($result)< 4) {
            return $result;
        }


        foreach($patterns as $key => $pattern) {

            if(preg_match($pattern, $result, $match)) {

                // saves the detected prefix's type
                $type = ($key=='plain') ? true : false;

                // saves matching prefix for later usage
                $prefix = $match[0];


                /*

                    Performs check whether identified prefix is identical with the
                    previously removed prefixes; the prefix removal process will be
                    terminated here if duplicate prefix detected.

                */
                if($this->removed["derivational_prefix"]!="" && in_array($prefix, $this->removed["derivational_prefix"])) {

                    return $result;

                }


                /*

                    Initializes recoding variable for found prefix; if the corresponding
                    rule does not have recoding path, then the value will be empty string

                */
                $this->recoding_tracker[$match[0]] = "";

                /*

                    If the prefix belongs to the 'plain' group, then immediate removal is done;
                    However, if the prefix belongs to the complex group, transformation rules must be applied

                */
                if($type) {

                    $array = $this->removed['derivational_prefix'];

                    if($prefix=='ke' && $array!="" && ($array[0]=="di" && !preg_match('/(tawa|tahu)/', $result)) && $array[0]!="be") return $result;

                    $result = preg_replace($pattern, '', $result);

                    // save modification changes to prefix tracker
                    $this->complex_prefix_tracker[$prefix] = array($prefix => "");

                } else {

                    /*
                        Temporary single-member array, used to hold complex prefix transformations
                        to be pushed to the tracker.

                        @var array
                    */
                    $modification = null;

                    /*************************************************************************
                    **  "be-" PREFIX RULES
                    **   total rule: 5
                    *************************************************************************/

                    if($prefix == "be") {

                        /*

                            If a prefix has been removed before, these rules check for
                            combination, if it is an allowed type of combination or not.

                        */
                        if($this->removed['derivational_prefix']!="") {

                            // Get the array value of first index
                            $array = reset($this->complex_prefix_tracker);

                            // Get the first index of modification value
                            $added = reset($array);

                            // pp: Previous Prefix; Get the key (removed part) of modification value
                            $pp = key($array);

                            /*

                                Allowed combinations:
                                diber-,
                                keber-,
                                member-,
                                pember

                            */
                            if($pp!='mem' && $pp!='pem' && $pp!= 'di' && $pp!='ke') return $result;

                        }

                        /*
                            RULE 1
                            input: berV...
                            output: berV... | be - rV...
                        */
                        if(preg_match("/^ber$vowel/", $result)) {

                            $result = preg_replace("/^ber/", "", $result);

                            // save prefix changes
                            $modification = array("ber" => "");

                            // save recoding path
                            $this->recoding_tracker[$prefix] = array("be" => "");

                        }

                        /*
                            RULE 2
                            input: berCAP... where C!='r' and P!='er'
                            output: ber-CAP...
                        */
                        else if(preg_match("/^ber[bcdfghjklmnpqstvwxyz][a-z](?!er)/", $result)) {

                            $result = preg_replace("/^ber/", "", $result);

                            // save prefix changes
                            $modification = array("ber" => "");
                        }

                        /*
                            RULE 3
                            input: berCAerV... where C!= 'r'
                            output: ber-CAerV
                        */
                        else if(preg_match("/^ber[bcdfghjklmnpqstvwxyz][a-z]er$vowel/", $result)) {

                            $result = preg_replace("/^ber/", "", $result);

                            //save prefix changes
                            $modification = array("ber" => "");

                        }

                        /*
                            RULE 4
                            input: belajar
                            output: bel - ajar
                        */
                        else if(preg_match("/^belajar$/", $result)) {

                            $result = preg_replace("/^bel/", "", $result);

                            // save prefix changes
                            $modification = array("bel" => "");

                        }

                        /*
                            RULE 5
                            input: beC1erC2... where C1!= 'r' or 'l'
                            output: be-C1erC2
                        */
                        else if(preg_match("/^be[bcdfghjkmnpqstvwxyz]er$consonant/", $result)) {

                            $result = preg_replace("/^be/", "", $result);

                            // save prefix changes
                            $modification = array("be" => "");

                        }

                        /*

                            In this case, the rule is unsuccessful, therefore the
                            original input word will be returned. The previously
                            initialized recoding chars will also be unset.

                        */
                        else {

                            unset($this->recoding_tracker[$prefix]);

                            return $word;
                        }

                    }


                    /*************************************************************************
                    **  "te-" PREFIX RULES
                    **  total rule: 5
                    *************************************************************************/

                    else if($prefix == "te") {

                        /*

                            If a prefix has been removed before, these rules check for
                            combination, if it is an allowed type of combination or not.

                        */
                        if($this->removed['derivational_prefix']!="") {

                            // Get the array value of first index
                            $array = reset($this->complex_prefix_tracker);

                            // Get the first index of modification value
                            $added = reset($array);

                            // pp: Previous Prefix; Get the key (removed part) of modification value
                            $pp = key($array);

                            /*

                                Allowed combinations:
                                ke-,
                                men- (special for tawa),
                                pen- (special for tawa)

                            */
                            if($pp!='ke' && (($pp=='me' || $pp=='men' || $pp=='pen') && !preg_match('/tawa/', $result))) {

                                return $result;
                            }// menerbangkan
                            //
                        }

                        /*
                            RULE 6
                            input: terV...
                            output: ter-V... | te-rV...
                        */
                        if(preg_match("/^ter$vowel/", $result)) {

                            $result = preg_replace("/^ter/", "", $result);

                            // save prefix changes
                            $modification = array("ter" => "");

                            // save recoding path
                            $this->recoding_tracker[$prefix] = array("te" => "");

                        }

                        /*
                            RULE 7
                            input: terCerV...
                            output: ter-CerV... where C!='r'
                        */
                        else if(preg_match("/^ter[bcdfghjklmnpqstvwxyz]er$vowel/", $result)) {

                            $result = preg_replace("/^ter/", "", $result);

                            // save prefix changes
                            $modification = array("ter" => "");

                        }

                        /*
                            RULE 8
                            input: terCP...
                            output: ter-CP...
                        */
                        else if(preg_match("/^ter$consonant(?!er)/", $result)) {

                            $result = preg_replace("/^ter/", "", $result);

                            // save prefix changes
                            $modification = array("ter" => "");

                        }

                        /*
                            RULE 9
                            input: teC1erC2...
                            output: te-C1erC2... where C1!='r'
                        */
                        else if(preg_match("/^ter[bcdfghjklmnpqstvwxyz]er$consonant/", $result)) {

                            $result = preg_replace("/^te/", "", $result);

                            // save prefix changes
                            $modification = array("te" => "");

                        }

                        /*
                            RULE 10
                            input: terC1erC2...
                            output: ter-C1erC2... where C1!='r'
                        */
                        else if(preg_match("/^ter[bcdfghjklmnpqstvwxyz]er$consonant/", $result)) {

                            $result = preg_replace("/^ter/", "", $result);

                            // save prefix changes
                            $modification = array("ter", "");

                        }

                        /*

                            In this case, the rule is unsuccessful, therefore the
                            original input word will be returned. The previously
                            initialized recoding chars will also be unset.

                        */
                        else {

                            unset($this->recoding_tracker[$prefix]);

                            return $word;
                        }

                    }


                    /*************************************************************************
                    **  "me-" PREFIX RULES
                    **  total rule: 10
                    *************************************************************************/

                    else if($prefix == "me") {

                        /*

                            This prefix cannot be a second-level prefix. If there is
                            already a removed prefix, immediately return input word.

                        */
                        if($this->removed['derivational_prefix']!="") return $result;

                        /*
                            RULE 11
                            input: me{l|r|w|y}V...
                            output: me-{l|r|w|y}V...
                        */
                        if(preg_match("/^me[lrwy]$vowel/", $result)) {

                            $result = preg_replace("/^me/", "", $result);

                            // save prefix changes
                            $modification = array("me" => "");

                        }

                        /*
                            RULE 12
                            input: mem{b|f|v}...
                            output: mem-{b|f|v}...
                        */
                        else if(preg_match("/^mem[bfv]/", $result)) {

                            $result = preg_replace("/^mem/", "", $result);

                            // save prefix changes
                            $modification = array("mem" => "");

                        }

                        /*
                            RULE 13
                            input: mempe...
                            output: mem-pe..
                        */
                        else if(preg_match("/^mempe/", $result)) {

                            $result = preg_replace("/^mem/", "", $result);

                            // save prefix changes
                            $modification = array("mem" => "");

                        }

                        /*
                            RULE 14
                            input: mem{rV|V}...
                            output:me-m{rV|V}... | me-p{rV|V}...
                        */
                        else if(preg_match("/^mem(r?)$vowel/", $result, $match)) {

                            $result = preg_replace("/^me/", "", $result);

                            // save prefix changes
                            $modification = array("me$match[1]" => "");

                            // save recoding path
                            $this->recoding_tracker[$prefix] = array("mem" => "p");

                        }

                        /*
                            RULE 15
                            input: men{c|d|j|s|z}...
                            output:men-{c|dj|s|z}...
                        */
                        else if(preg_match("/^men[cdsjz]/", $result)) {

                            $result = preg_replace("/^men/", "", $result);

                            // save prefix changes
                            $modification = array("men" => "");

                        }

                        /*
                            RULE 16
                            input: menV...
                            output:me-tV... | me-nV...
                        */
                        else if(preg_match("/^men$vowel/", $result)) {

                            $result = preg_replace("/^men/", "t", $result);

                            // save prefix changes
                            $modification = array("men" => "t");

                            // save recoding path
                            $this->recoding_tracker[$prefix] = array("me" => "");

                        }

                        /*
                            RULE 17
                            input: meng{g|h|q|k}...
                            output: meng-{g|h|q|k}...
                        */
                        else if(preg_match("/^meng[ghqk]/", $result)) {

                            $result = preg_replace("/^meng/", "", $result);

                            // save prefix changes
                            $modification = array("meng" => "");

                        }

                        /*
                            RULE 18
                            input: mengV...
                            output: meng-V... | meng-kV... | mengV-... if V='e'
                        */
                        else if(preg_match("/^meng($vowel)/", $result, $match)) {

                            $result = preg_replace("/^meng/", "", $result);

                            // save prefix changes
                            $modification = array("meng" => "");

                            // save recoding path
                            $this->recoding_tracker[$prefix] = array("meng1" => "k");
                            $this->recoding_tracker[$prefix]["menge"] = "";

                        }

                        /*
                            RULE 19
                            input: menyV...
                            output: meny-sV... | me-nyV...
                        */
                        else if(preg_match("/^meny$vowel/", $result)) {

                            $result = preg_replace("/^me/", "", $result);

                            // save prefix changes
                            $modification = array("me" => "");

                            // save recoding path
                            $this->recoding_tracker[$prefix] = array("meny" => "s");

                        }

                        /*
                            RULE 20
                            input: mempA...
                            output: mem-pA... where A!='e'
                        */
                        else if(preg_match("/^memp[abcdfghijklmnopqrstuvwxyz]/", $result)) {

                            $result = preg_replace("/^mem/", "", $result);

                            // save prefix changes
                            $modification = array("mem" => "");

                        }

                        /*

                            In this case, the rule is unsuccessful, therefore the
                            original input word will be returned. The previously
                            initialized recoding chars will also be unset.

                        */
                        else {

                            unset($this->recoding_tracker[$prefix]);

                            return $word;
                        }

                    }


                    /*************************************************************************
                    **  "pe-" PREFIX RULES
                    **  total rule: 15
                    *************************************************************************/

                    else if($prefix == "pe") {

                        /*

                            If a prefix has been removed before, these rules check for
                            combination, if it is an allowed type of combination or not.

                        */
                        if($this->removed['derivational_prefix']!="") {

                            // Get the array value of first index
                            $array = reset($this->complex_prefix_tracker);

                            // Get the first index of modification value
                            $added = reset($array);

                            // pp: Previous Prefix; Get the key (removed part) of modification value
                            $pp = key($array);

                            /*

                                Allowed combinations:
                                di-,
                                peN-,
                                mem-.

                            */
                            if($pp!='di' && $pp!='ber' && $pp!= 'mem' && $pp!='se' && $pp!='ke') return $result;

                        }

                        /*
                            RULE 21
                            input: pe{w|y}V...
                            output: pe-{w|y}V...
                        */
                        if(preg_match("/^pe[wy]$vowel/", $result)) {

                            $result = preg_replace("/^pe/", "", $result);

                            // save prefix changes
                            $modification = array("pe" => "");

                        }

                        /*
                            RULE 22
                            input: perV...
                            output: per-V... | pe-rV...
                        */
                        else if(preg_match("/^per$vowel/", $result)) {

                            $result = preg_replace("/^per/", "", $result);

                            // save prefix changes
                            $modification = array("per" => "");

                            // save recoding path
                            $this->recoding_tracker[$prefix] = array("pe" => "");

                        }

                        /*
                            RULE 23
                            input: perCAP...
                            output: per-CAP... where C!='r' and P!='er'
                        */
                        else if(preg_match("/^per[bcdfghjklmnpqstvwxyz][a-z](?!er)/", $result)) {

                            $result = preg_replace("/^per/", "", $result);

                            // save prefix changes
                            $modification = array("per" => "");

                        }

                        /*
                            RULE 24
                            input: perCAerV...
                            output: per-CAerV... where C!= 'r'
                        */
                        else if(preg_match("/^per[bcdfghjklmnpqstvwxyz][a-z]er$vowel/", $result)) {

                            $result = preg_replace("/^per/", "", $result);

                            // save prefix changes
                            $modification = array("per" => "");

                        }

                        /*
                            RULE 25
                            input: pem{b|f|v}...
                            output: pem-{b|f|v}...
                        */
                        else if(preg_match("/^pem[bfv]/", $result)) {

                            $result = preg_replace("/^pem/", "", $result);

                            // save prefix changes
                            $modification = array("pem" => "");

                        }

                        /*
                            RULE 26
                            input: pem{rV|V}...
                            output: pe-m{rV|V}... | pe-p{rV|V}...
                        */
                        else if(preg_match("/^pem(r?)$vowel/", $result)) {

                            $result = preg_replace("/^pe/", "", $result);

                            // save prefix changes
                            $modification = array("pe" => "");

                            // save recoding path
                            $this->recoding_tracker[$prefix] = array("pem" => "p");

                        }

                        /*
                            RULE 27
                            input: pen{c|d|j|z}...
                            output: pen-{c|d|j|z}...
                        */
                        else if(preg_match("/^pen[cdjz]/", $result)) {

                            $result = preg_replace("/^pen/", "", $result);

                            // save prefix changes
                            $modification = array("pen" => "");

                        }

                        /*
                            RULE 28
                            input: penV...
                            output: pe-tV... | pe-nV...
                        */
                        else if(preg_match("/^pen$vowel/", $result)) {

                            $result = preg_replace("/^pen/", "t", $result);

                            // save prefix changes
                            $modification = array("pen" => "t");

                            // save recoding path
                            $this->recoding_tracker[$prefix] = array("pe" => "");

                        }

                        /*
                            RULE 29
                            input: pengC...
                            output: peng-C...
                        */
                        else if(preg_match("/^peng$consonant/", $result)) {

                            $result = preg_replace("/^peng/", "", $result);

                            // save prefix changes
                            $modification = array("peng" => "");

                        }

                        /*
                            RULE 30
                            input: pengV...
                            output: peng-V | peng-kV... | pengV-... if V='e'
                        */
                        else if(preg_match("/^peng($vowel)/", $result, $match)) {

                            $result = preg_replace("/^peng/", "", $result);

                            // save prefix changes
                            $modification = array("peng" => "");

                            // save recoding path
                            $this->recoding_tracker[$prefix] = array("peng1" => "k");
                            $this->recoding_tracker[$prefix]["penge"] = "";


                            // if($match[1] == 'e') {

                            //     $result = preg_replace("/^penge/", "", $result);

                            //     // save prefix changes
                            //     $modification = array("penge" => "");

                            //     $this->recoding_tracker[$prefix] = array("peng1" => "");
                            //     $this->recoding_tracker[$prefix]["peng2"] = "k";

                            // } else {

                            //     $result = preg_replace("/^peng/", "", $result);

                            //     // save prefix changes
                            //     $modification = array("peng" => "");

                            //     // save recoding path
                            //     $this->recoding_tracker[$prefix] = array("peng" => "k");
                            // }

                        }

                        /*
                            RULE 31
                            input: penyV...
                            output: peny-sV... | pe-nyV...
                        */
                        else if(preg_match("/^peny$vowel/", $result)) {

                            $result = preg_replace("/^peny/", "s", $result);

                            // save prefix changes
                            $modification = array("peny" => "s");

                            // save recoding path
                            $this->recoding_tracker[$prefix] = array("pe" => "");

                        }

                        /*
                            RULE 32
                            input: pelV...
                            output: pe-lV... | pel-V if 'pelajar'
                        */
                        else if(preg_match("/^pel$vowel/", $result)) {

                            if($result == "pelajar") {

                                $result = preg_replace("/^pel/", "", $result);

                                // save prefix changes
                                $modification = array("pel" => "");

                            } else {

                                $result = preg_replace("/^pe/", "", $result);

                                // save prefix changes
                                $modification = array("pe" => "");

                            }

                        }

                        /*
                            RULE 33
                            input: peCerV...
                            output: per-CerV... where C!={r|w|y|l|m|n}
                        */
                        else if(preg_match("/^pe[bcdfghjkpqstvxz]er$vowel/", $result)) {

                            $result = preg_replace("/^pe/", "", $result);

                            // save prefix changes
                            $modification = array("pe" => "");

                        }

                        /*
                            RULE 34
                            input: peCP...
                            output: pe-CP... where C!={r|w|y|l|m|n} and P!='er'
                        */
                        else if(preg_match("/^pe[bcdfghjkpqstvxz](?!er)/", $result)) {

                            $result = preg_replace("/^pe/", "", $result);

                            // save prefix changes
                            $modification = array("pe" => "");

                        }

                        /*
                            RULE 35
                            input: peC1erC2...
                            output: pe-C1erC2... where C1!={r|w|y|l|m|n}
                        */
                        else if(preg_match("/^pe[bcdfghjkpqstvxz]er$consonant/", $result)) {

                            $result = preg_replace("/^pe/", "", $result);

                            // save prefix changes
                            $modification = array("pe", "");

                        }

                        /*

                            In this case, the rule is unsuccessful, therefore the
                            original input word will be returned. The previously
                            initialized recoding chars will also be unset.

                        */
                        else {

                            unset($this->recoding_tracker[$prefix]);

                            return $word;
                        }


                    }

                    /*

                        Moves the temporarily saved modification to the prefix tracker
                        attribute (provided it's not null); if no modification was
                        detected, this process is terminated.

                    */
                    if($modification!=null) {

                        // saves modification changes to prefix tracker
                        $this->complex_prefix_tracker[$prefix] = $modification;

                    } else {

                        // If there is no changes made, return original word.
                        return $result;

                    }

                }

                /*

                    Updates the removed value holder. Since derivational prefix
                    is stackable (up to 2), the value is kept in an array fashion

                */
                if($this->removed['derivational_prefix']=='') {

                    $this->removed['derivational_prefix'] = array();

                }

                // Adds the detected prefix type to the removed affix tracker.
                array_push($this->removed['derivational_prefix'], $prefix);

                // Performs dictionary lookup
                $this->lookup($result);

                // once the prefix is removed, we need to enter next iteration.
                return $result;

            }

        }

        // if no prefix found, return original word instead
        return $result;

    }

    /*
        Performs recoding on input word
        (provided there are recoding paths available)

        @param string $word
        @return mixed
    */
    protected function recode($word) {

        /*
            Holds the value after suffix removal process

            @var string
        */
        $result = $word;

        /*
            Holds the reversed version of prefix tracker; because it is used
            to return previously removed prefixes.

            @var array
        */
        $prefixes = array_reverse($this->complex_prefix_tracker);


        /*

            For each iteration, check whether the prefix has recoding path(s).
            If recoding path is found, then it will be applied

        */
        foreach($prefixes as $prefix => $changes) {

            /*

                Checks whether the current prefix has available recoding path,
                stored in a variable

                @var array
            */
            $recode = $this->recoding_tracker[$prefix];

            /*
                fetch the added value when removing this prefix

                @var string
            */
            $prefix_added = reset($changes);

            /*
                fetch the removed value when removing this prefix

                @var string
            */
            $prefix_removed = key($changes);

            /*

                If something was added in the process of current prefix's removal,
                then it will be removed; and replaced with the removed value.

            */
            if($prefix_added!="") {

                // replace the added value with the removed value
                $result = preg_replace("/^$prefix_added/", $prefix_removed, $result);

            }
            else {

                // prepend the removed value to current word
                $result = $prefix_removed . $result;
            }

            /*

                If a recoding path is available, then it will be checked whether
                there are more than one path. For every path, the word is configured
                with the recoding path, and checked against the database.

            */
            if($recode!="") {

                /*
                    Temporary variable for storing word changes; used for checking
                    and lookup

                    @var string
                */
                $temp;

                foreach($recode as $raw_removed => $added) {

                    /*
                        There are some cases where the recoding path is more than
                        one, and both have identical removed value; because this
                        can cause duplicate array keys (which will lead to overwriting),
                        some rules are appended with numbers. Before the removed value
                        is stored, it removes any number appended in the value

                        @var string
                    */
                    $removed = preg_replace("/[0-9]+/", "", $raw_removed);

                    // Attempts to apply recoding path.
                    $temp = preg_replace("/^$removed/", ($added) ? $added : "", $result);

                    /*

                        Performs dictionary lookup. If found, this will return the lookup result,
                        and updates class' property: $found

                    */
                    if($this->lookup($temp)) {

                        // updates the prefix tracker value
                        $this->complex_prefix_tracker[$prefix] = array($removed => $added);

                        // returns the result
                        return $temp;

                    }

                    $previous = "";

                    // records to variable to $record for continued processing
                    $record = $temp;

                    $before = count($this->complex_prefix_tracker);

                    // the iteration is done for maximum three times
                    for($i=0; $i<3; $i++) {

                        /*
                            Temporary variable; holds the value before the word
                            undergoes derivation prefix removal. Used for comparison,
                            whether

                            @var string
                        */
                        $previous = $record;

                        // delete derivational prefix
                        $record = $this->delete_derivational_prefix($record);

                        /*

                            Checks for disallowed affix combination,
                            Checks if the lemma is already found,
                            Checks if the no prefix was removed, or the amount of prefixes removed are already 2.

                        */
                        if(($i==0 && $this->has_disallowed_pairs())
                            || $record == $previous
                            || count($this->removed['derivational_prefix'])>3)
                        {
                            break;
                        }
                        else if($this->found) return $record;
                    }

                    if(count($this->complex_prefix_tracker) > $before) {

                        $count = 0;
                        foreach($this->complex_prefix_tracker as $key => $value) {
                            $count++;
                            if($count <= $before) continue;

                            unset($this->complex_prefix_tracker[$key]);
                            unset($this->removed['derivational_prefix'][$count-1]);
                        }
                    }

                }

                // updates result variable for next iteration
                $result = $temp;

            }

        }

        // If recoding is unsuccessful or does not exist, return initial word
        return $word;

    }


    /**

        @todo   LOW - description will be available later! (once most of the things are up.)
                name is still a jest, of course. we'll come up with something better!

        @todo   LOW - implementation documentation for BACKTRACKING procedure (case 7)

    */
    public function eat($word, $backtrack_step = false) {

        /*
            Serves as the container for prefix/suffix removal results

            @var string
        */
        $result = $word;

        /*
            Serves as the temporary variable; holds string if process works
            without error and holds FALSE if there is an detected error.

            @var mixed
        */
        $temp = $this->lookup($word);

        /*

            STEP 1: perform dictionary lookup on input word

        */
        if($temp) {

            return $temp;

        } else {

            /*
                Checks the rule precedence; contains TRUE if derivational prefix
                is performed first and false for otherwise

                @var mixed
            */
            $steps = $this->check_rule_precedence($word);

            /*

                STEP 2: function ordering based on rule precedence result
                identifies whether this is a backtrack step or not;
                if this is main step then perform rule precedence check

            */
            if($backtrack_step) {

                $steps = array(5,6);

            } else {

                if($steps) {

                    $steps = array(5,6,3,4,7);

                } else {

                    $steps = array(3,4,5,6,7);

                }

            }

            foreach($steps as $step) {

                switch($step) {

                    // STEP 3: delete inflectional suffix
                    case 3:
                        $temp = $this->delete_inflectional_suffix($result);
                        break;

                    // STEP 4: delete derivational suffix
                    case 4:
                        $temp = $this->delete_derivational_suffix($result);
                        break;

                    // STEP 5: delete derivational prefix
                    case 5:
                        // records to variable to $temp for continued processing
                        $temp = $result;

                        // the iteration is done for maximum three times
                        for($i=0; $i<3; $i++) {

                            /*
                                Temporary variable; holds the value before the word
                                undergoes derivation prefix removal. Used for comparison,
                                whether

                                @var string
                            */
                            $previous = $temp;

                            // delete derivational prefix
                            $temp = $this->delete_derivational_prefix($temp);

                            /*

                                Checks for disallowed affix combination,
                                Checks if the lemma is already found,
                                Checks if the no prefix was removed, or the amount of prefixes removed are already 2.

                            */
                            if(($i==0 && $this->has_disallowed_pairs())
                                || $this->found
                                || $temp == $previous
                                || count($this->removed['derivational_prefix'])>3)
                            {
                                break;
                            }
                        }
                        break;

                    // STEP 6: perform recoding
                    case 6:
                        $temp = $this->recode($result);
                        break;

                    /**

                        @todo implementation docs for backtracking

                    */
                    // STEP 7: perform suffix backtracking
                    case 7:

                        $prefixes = array_reverse($this->complex_prefix_tracker);

                        foreach($prefixes as $prefix => $changes) {

                            $prefix_added = reset($changes);
                            $prefix_removed = key($changes);

                            if($prefix_added!="") {

                                $temp = preg_replace("/^$prefix_added/", $prefix_removed, $temp);

                            }
                            else {

                                $temp = $prefix_removed . $temp;
                            }
                        }

                        $this->removed["derivational_prefix"] = "";
                        $this->complex_prefix_tracker = array();
                        $backtrack = $this->eat($temp, true);

                        if($this->found) break;

                        // return derivational suffix
                        if(!$this->found && $this->removed['derivational_suffix']!="") {

                            if($this->removed['derivational_suffix'] == "kan") {

                                $temp = $temp . "k";
                                $this->removed["derivational_prefix"] = "";
                                $this->complex_prefix_tracker = array();
                                $backtrack = $this->eat($temp, true);

                                if($this->found) break;

                                $temp = $temp . "an";

                            }
                            else {

                                $temp = $temp . $this->removed["derivational_suffix"];

                            }

                            $this->removed["derivational_prefix"] = "";
                            $this->complex_prefix_tracker = array();
                            $backtrack = $this->eat($temp, true);

                        }

                        // return possessive pronoun
                        if(!$this->found && $this->removed["possessive_pronoun"]!="") {

                            $temp = $temp . $this->removed["possessive_pronoun"];
                            $this->removed["derivational_prefix"] = "";
                            $this->complex_prefix_tracker = array();
                            $backtrack = $this->eat($temp, true);

                            if($this->found) break;

                        }

                        // return particle
                        if(!$this->found && $this->removed["particle"]!="") {

                            $temp = $temp . $this->removed["particle"];
                            $this->removed["derivational_prefix"] = "";
                            $this->complex_prefix_tracker = array();
                            $backtrack = $this->eat($temp, true);

                            if($this->found) break;

                        }

                }

                /*

                     If the lookup already succeeded from previous result,
                     then directly return the result

                */
                if($this->found) return $this->found;

                // if the removal is success, proceed to next step
                $result = $temp;

            }

            /*

                STEP 8: if the dictionary lookup still fails, return original word.
                since the word was returned to its original form, removal histories
                are considered 'undone'; for better semantics

            */
            if(!$backtrack_step) if(!$this->error) $this->error = "lemma_not_found";
            return $word;

        }

    }


    /**

        @debug  basically this gives access to the world about what prefixes/suffixes
                have been removed. Will be removed later!

    **/
    public function getRemoved() {

        return $this->removed;

    }

    /*

        Closes database connection on instance destruction.

    */
    public function __destruct() {

        // Drops this instance's reference to whatever $database holds.
        // NOTE(review): presumably a connection object that closes once it is
        // no longer referenced; confirm against where $database is assigned.
        $this->database = null;

    }
}