Untitled

// ---------------------------------------------------------------------------------------------------------------------
// This file is part of CodeBox, an open-source toolkit for D.
//
// Copyright (c) 2009, Sean Kerr.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
// following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice, this list of conditions and the following
//   disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
//   disclaimer in the documentation and/or other materials provided with the distribution.
// * Neither the name CodeBox nor the names of its contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
// INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANYWAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author:  Sean Kerr <sean@code-box.org>
// Version: $Id$
// ---------------------------------------------------------------------------------------------------------------------

module codebox.text.Regex;

// +-------------------------------------------------------------------------------------------------------------------+
// | IMPORTS                                                                                                           |
// +-------------------------------------------------------------------------------------------------------------------+

private import codebox.capi.PCRE;
private import codebox.core.Exception;
private import tango.stdc.stringz;

debug (UnitTest) {

    private import tango.io.Stdout;

}

// +-------------------------------------------------------------------------------------------------------------------+
// | ENUMS                                                                                                             |
// +-------------------------------------------------------------------------------------------------------------------+

/**
 * Pattern matching modifiers.
 *
 * Each member is an alias for the PCRE compile option of the same meaning. The values are meant to be combined with
 * the bitwise OR operator and passed as the /modifiers/ argument of Regex.compile(), e.g. Modifier.I | Modifier.M.
 */
public enum Modifier : int {

    /** Enable case-insensitive matching (PCRE_CASELESS). */
    I = PCRE_CASELESS,

    /** Enable multi-line mode (PCRE_MULTILINE). */
    M = PCRE_MULTILINE,

    /** Force a period (.) to match newline sequences (PCRE_DOTALL). */
    S = PCRE_DOTALL,

    /** Enable UTF-8 matching (PCRE_UTF8). */
    U = PCRE_UTF8

}

// +-------------------------------------------------------------------------------------------------------------------+
// | CLASSES                                                                                                           |
// +-------------------------------------------------------------------------------------------------------------------+

/**
 * Regex represents a single compiled regular expression pattern upon which all matching, searching, replacing and
 * splitting of textual data occurs.
 *
 * Instances are created through the static compile() method. The compiled PCRE data is owned by the instance and is
 * released when the instance is destroyed.
 */
public class Regex {

    /** The count of captured groups. */
    private int _groups;

    /** The modifiers used when compiling the regular expression pattern. */
    private int _modifiers;

    /** The group indexes and their associated group names. */
    private int[char[]] _names;

    /** The regular expression pattern. */
    private char[] _pattern;

    /** The internal pcre struct instance. */
    private pcre* _pcreIn;

    /** The internal pcre_extra struct instance. */
    private pcre_extra* _pcreExtraIn;

    // +---------------------------------------------------------------------------------------------------------------+
    // | CONSTRUCTORS / DESTRUCTOR                                                                                     |
    // +---------------------------------------------------------------------------------------------------------------+

    /**
     * Create a new Regex instance.
     *
     * Params:
     *   pcreIn      = The internal pcre struct instance.
     *   pcreExtraIn = The internal pcre_extra struct instance.
     *   groups      = The count of captured groups.
     *   names       = The group indexes and their associated group names.
     *   pattern     = The regular expression pattern.
     *   modifiers   = The modifiers used when compiling /pattern/.
     */
    private this (pcre* pcreIn, pcre_extra* pcreExtraIn, int groups, int[char[]] names, char[] pattern, int modifiers) {

        _groups      = groups;
        _modifiers   = modifiers;
        _names       = names;
        _pattern     = pattern;
        _pcreIn      = pcreIn;
        _pcreExtraIn = pcreExtraIn;

    }

    /**
     * Destroy this Regex instance and release the compiled pattern data.
     */
    public ~this () {

        // clear each handle after freeing it so a second destruction can never free the same memory twice
        if (_pcreExtraIn) {

            (*pcre_free)(_pcreExtraIn);

            _pcreExtraIn = null;

        }

        if (_pcreIn) {

            (*pcre_free)(_pcreIn);

            _pcreIn = null;

        }

    }

    // +---------------------------------------------------------------------------------------------------------------+
    // | OPERATORS                                                                                                     |
    // +---------------------------------------------------------------------------------------------------------------+

    /**
     * Handle an in statement that checks for existence of /group/ within the named groups.
     *
     * Params:
     *   group = The group name.
     *
     * Returns: true if the pattern declares a named group called /group/, otherwise false.
     */
    public bool opIn_r (char[] group) {

        return (group in _names) !is null;

    }

    // +---------------------------------------------------------------------------------------------------------------+
    // | METHODS                                                                                                       |
    // +---------------------------------------------------------------------------------------------------------------+

    /**
     * Retrieve the text captured by /group/ for the current state of /match/.
     *
     * A group that did not participate in the match (for example the unused side of an alternation) is reported by
     * PCRE with negative offsets; slicing with those would be invalid, so null is returned instead.
     *
     * Params:
     *   subject = The subject that was matched.
     *   match   = The Match instance holding the current match offsets.
     *   group   = The captured group index.
     */
    private static char[] captured (char[] subject, Match match, int group) {

        int start = match._vector[group << 1];
        int end   = match._vector[(group << 1) + 1];

        if (start < 0 || end < start) {

            return null;

        }

        return subject[start .. end];

    }

    /**
     * Compile a regular expression pattern.
     *
     * Params:
     *   pattern   = The regular expression pattern.
     *   modifiers = The modifiers to use when compiling the regular expression pattern.
     *
     * Returns: A new Regex instance that owns the compiled pattern.
     *
     * Throws: RegexException If the pattern cannot be compiled, studied or inspected.
     */
    public static Regex compile (char[] pattern, int modifiers = 0) {

        char*       error;
        int         errorOffset;
        int         groups;
        int         nameCount;
        int         nameSize;
        char*       nameTable;
        int[char[]] names;
        pcre*       pcreIn;
        pcre_extra* pcreExtraIn;

        // compile the pattern; a null handle is the authoritative failure indicator
        pcreIn = pcre_compile(toStringz(pattern), modifiers, &error, &errorOffset, null);

        if (pcreIn is null) {

            throw new RegexException("Failed to compile pattern at offset {}: {}", errorOffset, fromStringz(error));

        }

        // get pattern optimizations; a null result without an error simply means there is nothing extra to use
        error       = null;
        pcreExtraIn = pcre_study(pcreIn, 0, &error);

        if (error) {

            (*pcre_free)(pcreIn);

            throw new RegexException("Failed to get pattern optimizations: {}", fromStringz(error));

        }

        // get pattern details
        if (pcre_fullinfo(pcreIn, pcreExtraIn, PCRE_INFO_CAPTURECOUNT,  &groups)    != 0 ||
            pcre_fullinfo(pcreIn, pcreExtraIn, PCRE_INFO_NAMECOUNT,     &nameCount) != 0 ||
            pcre_fullinfo(pcreIn, pcreExtraIn, PCRE_INFO_NAMEENTRYSIZE, &nameSize)  != 0 ||
            pcre_fullinfo(pcreIn, pcreExtraIn, PCRE_INFO_NAMETABLE,     &nameTable) != 0) {

            if (pcreExtraIn) {

                (*pcre_free)(pcreExtraIn);

            }

            (*pcre_free)(pcreIn);

            throw new RegexException("Failed to retrieve pattern details");

        }

        // get all named groups and their related indexes; each table entry is a big-endian 16-bit group index followed
        // by the zero-terminated group name
        while (nameCount-- > 0) {

            // copy the name, otherwise the key would point into memory that is released together with the pattern
            names[fromStringz(nameTable + 2).dup] = (nameTable[0] << 8) | nameTable[1];

            nameTable += nameSize;

        }

        return new Regex(pcreIn, pcreExtraIn, groups, names, pattern, modifiers);

    }

    /**
     * Retrieve the count of groups that will be captured when executing match(), search(), replace(), or split().
     */
    public int groups () {

        return _groups;

    }

    /**
     * Execute a match at the beginning of /subject/ and retrieve the first match.
     *
     * Params:
     *   subject = The subject upon which the match will take place.
     *
     * Returns: A Match instance on which find() has already been called once; use its matches() method to determine
     *          whether the pattern matched.
     */
    public Match match (char[] subject) {

        Match match = new Match(this, subject, PCRE_ANCHORED);

        match.find();

        return match;

    }

    /**
     * Retrieve the modifiers used within this Regex instance.
     */
    public int modifiers () {

        return _modifiers;

    }

    /**
     * Retrieve all named groups.
     */
    public char[][] names () {

        return _names.keys;

    }

    /**
     * Execute a replacement on /subject/ where all pattern matches are replaced with /value/. If /max/ is unspecified
     * or 0, all matches will be replaced.
     *
     * Params:
     *   subject = The subject upon which the replacement will take place.
     *   value   = The value that will replace each pattern match.
     *   max     = The maximum number of times the replacement will take place.
     *
     * Returns: A copy of /subject/ with replaced values.
     */
    public char[] replace (char[] subject, char[] value, int max = 0) {

        int    iteration;
        int    position;
        char[] result;

        foreach (match; search(subject)) {

            iteration++;

            // keep the text between the previous match and this one, then substitute the match itself
            result   ~= subject[position .. match._vector[0]];
            result   ~= value;
            position  = match._vector[1];

            if (max && iteration >= max) {

                break;

            }

        }

        result ~= subject[position .. $];

        return result;

    }

    /**
     * Execute a replacement on /subject/ where every pattern match is replaced by the concatenation of the first
     * groups() entries of /values/, in index order. If /max/ is unspecified or 0, all matches will be replaced.
     *
     * NOTE: The whole match is replaced, including any text that was matched outside of the captured groups.
     *
     * Params:
     *   subject = The subject upon which the replacement will take place.
     *   values  = The array of values that will replace each match. This must contain at least as many values as
     *             there are captured groups.
     *   max     = The maximum number of times the replacement will take place.
     *
     * Returns: A copy of /subject/ with replaced values.
     *
     * Throws: ArrayBoundsException If there are more captured groups than there are indexes inside /values/.
     */
    public char[] replace (char[] subject, char[][] values, int max = 0) {

        int    iteration;
        int    position;
        char[] result;

        foreach (match; search(subject)) {

            iteration++;

            result   ~= subject[position .. match._vector[0]];
            position  = match._vector[1];

            for (int group = 0; group < _groups; group++) {

                result ~= values[group];

            }

            if (max && iteration >= max) {

                break;

            }

        }

        result ~= subject[position .. $];

        return result;

    }

    /**
     * Execute a replacement on /subject/ where each match iteration, group index and group value are passed to /dg/
     * for replacement. Every match is replaced by the concatenation of the values returned by /dg/ for its captured
     * groups. If /max/ is unspecified or 0, all matches will be replaced.
     *
     * NOTE: This calls /dg/ for each captured group, not uncaptured match data. A group that did not participate in
     *       the match is passed to /dg/ as null.
     *
     * Params:
     *   subject = The subject upon which the replacement will take place.
     *   dg      = The delegate that is called for each replacement. It takes three arguments:
     *             1. iteration = The match iteration.
     *             2. group     = The group index.
     *             3. value     = The value.
     *   max     = The maximum number of times the replacement will take place.
     *
     * Returns: A copy of /subject/ with replaced values.
     */
    public char[] replace (char[] subject, char[] delegate (int iteration, int group, char[] value) dg, int max = 0) {

        int    iteration;
        int    position;
        char[] result;

        foreach (match; search(subject)) {

            iteration++;

            result   ~= subject[position .. match._vector[0]];
            position  = match._vector[1];

            for (int group = 1; group <= _groups; group++) {

                result ~= dg(iteration, group, captured(subject, match, group));

            }

            if (max && iteration >= max) {

                break;

            }

        }

        result ~= subject[position .. $];

        return result;

    }

    /**
     * Retrieve the regular expression pattern used within this Regex instance.
     */
    public char[] pattern () {

        return _pattern;

    }

    /**
     * Execute a search within /subject/ and retrieve the Match instance associated. Because /subject/ is searched,
     * you must iterate over the Match instance, or use its find() method to find each match.
     *
     * Params:
     *   subject = The subject upon which the search will take place.
     *
     * Returns: This always returns a Match instance.
     */
    public Match search (char[] subject) {

        return new Match(this, subject);

    }

    /**
     * Execute a split on /subject/. If /max/ is unspecified or 0, all matches will be split.
     *
     * The text of every captured group is included in the result, directly after the piece that precedes its match.
     * A group that did not participate in the match contributes a null entry.
     *
     * Params:
     *   subject = The subject upon which the split will take place.
     *   max     = The maximum number of times the split will take place.
     *
     * Returns: The array of split /subject/ data.
     */
    public char[][] split (char[] subject, int max = 0) {

        int      iteration;
        int      position;
        char[][] result;

        foreach (match; search(subject)) {

            iteration++;

            result   ~= subject[position .. match._vector[0]];
            position  = match._vector[1];

            for (int group = 1; group <= _groups; group++) {

                result ~= captured(subject, match, group);

            }

            if (max && iteration >= max) {

                break;

            }

        }

        // whatever follows the last processed match is the final piece
        result ~= subject[position .. $];

        return result;

    }

    /**
     * Execute a split on /subject/. If /max/ is unspecified or 0, all matches will be split.
     *
     * NOTE: If the /group/ value passed to /dg/ is 0, it means the value is not part of the match pattern. A group
     *       that did not participate in the match is passed to /dg/ as null.
     *
     * Params:
     *   subject = The subject upon which the split will take place.
     *   dg      = The delegate that is called for each split. It takes three arguments:
     *             1. iteration = The match iteration.
     *             2. group     = The group index.
     *             3. value     = The value.
     *   max     = The maximum number of times the split will take place.
     */
    public void split (char[] subject, void delegate (int iteration, int group, char[] value) dg, int max = 0) {

        int iteration;
        int position;

        foreach (match; search(subject)) {

            iteration++;

            dg(iteration, 0, subject[position .. match._vector[0]]);

            position = match._vector[1];

            for (int group = 1; group <= _groups; group++) {

                dg(iteration, group, captured(subject, match, group));

            }

            if (max && iteration >= max) {

                break;

            }

        }

        // the trailing piece is reported as one further iteration
        dg(++iteration, 0, subject[position .. $]);

    }

}

/**
 * Match represents a single match. When this object is iterated or find() is called, it will then represent the next
 * available match.
 */
public class Match {

    /** Indicates that the most recent iteration or find() call completed successfully and a new match is available. */
    private bool _matches;

    /** The modifiers used when matching the subject. */
    private int _modifiers = PCRE_NEWLINE_ANYCRLF;

    /** The subject offset at which the next find() call starts, or -1 once the subject has been exhausted. */
    private int _offset;

    /** The parent Regex instance under which this Match instance is operating. */
    private Regex _regex;

    /** The subject upon which all matching will take place. **/
    private char[] _subject;

    /**
     * The internal vector of match positions. The start and end offsets of group n are stored at indexes 2n and
     * 2n + 1; the final third of the vector is scratch space handed to pcre_exec().
     */
    private int[] _vector;

    // +---------------------------------------------------------------------------------------------------------------+
    // | CONSTRUCTORS / DESTRUCTOR                                                                                     |
    // +---------------------------------------------------------------------------------------------------------------+

    /**
     * Create a new Match instance.
     *
     * Params:
     *   regex     = The parent Regex instance under which this Match instance will operate.
     *   subject   = The subject upon which all matching will take place.
     *   modifiers = The modifiers used when matching /subject/.
     */
    private this (Regex regex, char[] subject, int modifiers = 0) {

        _modifiers     |= modifiers;
        _regex          = regex;
        _subject        = subject;
        _vector.length  = (_regex.groups + 1) * 3;

    }

    // +---------------------------------------------------------------------------------------------------------------+
    // | OPERATORS                                                                                                     |
    // +---------------------------------------------------------------------------------------------------------------+

    /**
     * Handle a foreach statement that expects /iteration/ and /match/.
     *
     * NOTE: This calls reset() prior to matching.
     *
     * Params:
     *   dg = The delegate that is called for each match.
     */
    public int opApply (int delegate (ref int iteration, ref Match match) dg) {

        int iteration;
        int result;

        reset();

        while (find()) {

            iteration++;

            result = dg(iteration, this);

            if (result) {

                break;

            }

        }

        return result;

    }

    /**
     * Handle a foreach statement that expects /match/.
     *
     * NOTE: This calls reset() prior to matching.
     *
     * Params:
     *   dg = The delegate that is called for each match.
     */
    public int opApply (int delegate (ref Match match) dg) {

        int result;

        reset();

        while (find()) {

            result = dg(this);

            if (result) {

                break;

            }

        }

        return result;

    }

    /**
     * Handle an index expression that retrieves the captured group value that is associated with /group/.
     *
     * Params:
     *   group = The captured group name.
     *
     * Returns: The captured text, or null if the group did not participate in the match.
     *
     * Throws: ArrayBoundsException If the group name is nonexistent.
     */
    public char[] opIndex (char[] group) {

        return capture(_regex._names[group]);

    }

    /**
     * Handle an index expression that retrieves the captured group value that is associated with /group/.
     *
     * Params:
     *   group = The captured group index.
     *
     * Returns: The captured text, or null if the group did not participate in the match.
     *
     * Throws: ArrayBoundsException If the group index is nonexistent.
     */
    public char[] opIndex (int group) {

        return capture(group);

    }

    // +---------------------------------------------------------------------------------------------------------------+
    // | METHODS                                                                                                       |
    // +---------------------------------------------------------------------------------------------------------------+

    /**
     * Retrieve the text captured by the group at index /group/.
     *
     * Params:
     *   group = The captured group index, where 0 is the whole match.
     *
     * Returns: The captured text, or null if the group did not participate in the match.
     */
    private char[] capture (int group) {

        // only the first two thirds of the vector hold offsets; indexing through this slice makes an out-of-range
        // group fail the bounds check instead of silently reading the scratch space that follows
        int[] offsets = _vector[0 .. (_regex._groups + 1) * 2];

        int start = offsets[group << 1];
        int end   = offsets[(group << 1) + 1];

        // unset groups are reported with negative offsets
        if (start < 0 || end < start) {

            return null;

        }

        return _subject[start .. end];

    }

    /**
     * Execute the next pattern match and reflect the match information in this Match instance.
     *
     * After an empty match the search resumes one character further on, so a pattern that can match the empty string
     * never yields the same position twice. Once a search has failed, further calls return false until reset() is
     * called.
     *
     * NOTE: matches() must be called to determine if a match was found before you access any match data.
     *
     * Returns: If a match was found, this will return true, otherwise false.
     *
     * Throws: RegexException If the underlying matcher reports an error.
     */
    public bool find () {

        _matches = false;

        // a negative offset marks an exhausted search, and nothing can match beyond the end of the subject
        if (_offset < 0 || _offset > cast(int) _subject.length) {

            return false;

        }

        int exec = pcre_exec(_regex._pcreIn, _regex._pcreExtraIn, _subject.ptr, _subject.length, _offset,
                             _modifiers, _vector.ptr, _vector.length);

        if (exec > PCRE_ERROR_NOMATCH) {

            int next = _vector[1];

            if (_vector[1] <= _vector[0]) {

                // empty match: restarting at the same offset would return it forever, so step past one character
                next = _vector[1] + 1;

                if (next <= _offset) {

                    next = _offset + 1;

                }

                // in UTF-8 mode the next start offset must not land inside a multi-byte sequence
                if (_regex._modifiers & PCRE_UTF8) {

                    while (next < cast(int) _subject.length && (_subject[next] & 0xC0) == 0x80) {

                        next++;

                    }

                }

            }

            _offset  = next;
            _matches = true;

        } else if (exec < PCRE_ERROR_NOMATCH) {

            throw new RegexException("Failed to retrieve next pattern match");

        } else {

            // no further match exists; remember that rather than relying on the vector contents after a failure
            _offset = -1;

        }

        return _matches;

    }

    /**
     * Retrieve the portion of the subject that matched the pattern. This is identical to accessing group index 0.
     */
    public char[] match () {

        return capture(0);

    }

    /**
     * Indicates whether or not the most recent find() call matched a new portion of the subject.
     */
    public bool matches () {

        return _matches;

    }

    /**
     * Retrieve the position of the subject at which the most recent iteration or find() call matched.
     */
    public int position () {

        return _vector[0];

    }

    /**
     * Retrieve the parent Regex instance under which this Match instance is operating.
     */
    public Regex regex () {

        return _regex;

    }

    /**
     * Reset all iteration and find() results so the next iteration or find() call will start from the beginning of
     * the subject.
     */
    public void reset () {

        _matches   = false;
        _offset    = 0;
        _vector[0] = 0;
        _vector[1] = 0;

    }

    /**
     * Retrieve the subject upon which this Match instance is operating.
     */
    public char[] subject () {

        return _subject;

    }

}

// +-------------------------------------------------------------------------------------------------------------------+
// | UNIT TESTS                                                                                                        |
// +-------------------------------------------------------------------------------------------------------------------+

debug (UnitTest) {

    // Exercises named groups, iteration, every replace() overload and both split() overloads. The assertions that live
    // inside loop bodies and delegates are backed by counters, so the test cannot pass when nothing was matched.
    unittest {

        Regex  r = Regex.compile(r"^(?P<start>[a-z]{2}\d) (?P<data>[^\s]+) end$", Modifier.I | Modifier.M);
        char[] s = "aB5 regexfun end\nYz9 juststuff end\nmn1 fancycode end";

        assert("start" in r);
        assert("data" in r);
        assert(r.groups() == 2);

        int found;

        foreach (i, m; r.search(s)) {

            found = i;

            if (i == 1) {

                assert(m[0] == m.match());
                assert(m[0] == "aB5 regexfun end");
                assert(m[1] == "aB5");
                assert(m[2] == "regexfun");
                assert(m["start"] == "aB5");
                assert(m["data"] == "regexfun");

            } else if (i == 2) {

                assert(m[0] == m.match());
                assert(m[0] == "Yz9 juststuff end");
                assert(m[1] == "Yz9");
                assert(m[2] == "juststuff");
                assert(m["start"] == "Yz9");
                assert(m["data"] == "juststuff");

            } else if (i == 3) {

                assert(m[0] == m.match());
                assert(m[0] == "mn1 fancycode end");
                assert(m[1] == "mn1");
                assert(m[2] == "fancycode");
                assert(m["start"] == "mn1");
                assert(m["data"] == "fancycode");

            }

        }

        // one match per line of the subject
        assert(found == 3);

        assert(r.replace(s, "xxx") == "xxx\nxxx\nxxx");
        assert(r.replace(s, "xxx", 2) == "xxx\nxxx\nmn1 fancycode end");
        assert(r.replace(s, ["123", "456"]) == "123456\n123456\n123456");
        assert(r.replace(s, ["123", "456"], 1) == "123456\nYz9 juststuff end\nmn1 fancycode end");

        r = Regex.compile(r"start (\d+)(\w+) end", Modifier.I);
        s = "start 42abc end start 38176qwerty end start 1337bugs end";

        int replaceCalls;

        assert(r.replace(s,

            (int iteration, int group, char[] value) {

                replaceCalls++;

                if (iteration == 1) {

                    if (group == 1) {

                        assert(value == "42");

                    } else {

                        assert(value == "abc");

                    }

                } else if (iteration == 2) {

                    if (group == 1) {

                        assert(value == "38176");

                    } else {

                        assert(value == "qwerty");

                    }

                } else {

                    if (group == 1) {

                        assert(value == "1337");

                    } else {

                        assert(value == "bugs");

                    }

                }

                return value;

            }

        ) == "42abc 38176qwerty 1337bugs");

        // three matches with two captured groups each
        assert(replaceCalls == 6);

        r = Regex.compile(r"\W+", Modifier.I);
        s = "Hello, World, In, D";

        assert(r.split(s) == ["Hello", "World", "In", "D"]);
        assert(r.split(s, 2) == ["Hello", "World", "In, D"]);

        r = Regex.compile(r"([a-z]+) (\d+)", Modifier.I);
        s = "#?#?Hello 123!!##D 456&&##";

        int splitCalls;

        r.split(s,

            (int iteration, int group, char[] value) {

                splitCalls++;

                if (iteration == 1) {

                    if (group == 0) {

                        assert(value == "#?#?");

                    } else if (group == 1) {

                        assert(value == "Hello");

                    } else if (group == 2) {

                        assert(value == "123");

                    }

                } else if (iteration == 2) {

                    if (group == 0) {

                        assert(value == "!!##");

                    } else if (group == 1) {

                        assert(value == "D");

                    } else if (group == 2) {

                        assert(value == "456");

                    }

                } else if (iteration == 3) {

                    assert(value == "&&##");

                }

            }

        );

        // two matches reporting the preceding text plus two groups each, followed by the trailing text
        assert(splitCalls == 7);

    }

}