Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // ---------------------------------------------------------------------------------------------------------------------
- // This file is part of CodeBox, an open-source toolkit for D.
- //
- // Copyright (c) 2009, Sean Kerr.
- // All rights reserved.
- //
- // Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
- // following conditions are met:
- //
- // * Redistributions of source code must retain the above copyright notice, this list of conditions and the following
- // disclaimer.
- // * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
- // disclaimer in the documentation and/or other materials provided with the distribution.
- // * Neither the name CodeBox nor the names of its contributors may be used to endorse or promote products derived from
- // this software without specific prior written permission.
- //
- // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
- // INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANYWAY OUT OF THE USE
- // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- //
- // Author: Sean Kerr <sean@code-box.org>
- // Version: $Id$
- // ---------------------------------------------------------------------------------------------------------------------
- module codebox.text.Regex;
- // +-------------------------------------------------------------------------------------------------------------------+
- // | IMPORTS |
- // +-------------------------------------------------------------------------------------------------------------------+
- private import codebox.capi.PCRE;
- private import codebox.core.Exception;
- private import tango.stdc.stringz;
- debug (UnitTest) {
- private import tango.io.Stdout;
- }
- // +-------------------------------------------------------------------------------------------------------------------+
- // | ENUMS |
- // +-------------------------------------------------------------------------------------------------------------------+
/**
 * Modifiers that influence how a pattern is compiled. Every member is an alias for the PCRE option named beside it,
 * so several members can be combined with a bitwise OR (for example Modifier.I | Modifier.M) and handed to
 * Regex.compile().
 */
public enum Modifier : int {

    /** Case-insensitive matching (PCRE_CASELESS). */
    I = PCRE_CASELESS,

    /** Multi-line mode (PCRE_MULTILINE). */
    M = PCRE_MULTILINE,

    /** Let a period (.) match newline sequences as well (PCRE_DOTALL). */
    S = PCRE_DOTALL,

    /** UTF-8 matching (PCRE_UTF8). */
    U = PCRE_UTF8

}
- // +-------------------------------------------------------------------------------------------------------------------+
- // | CLASSES |
- // +-------------------------------------------------------------------------------------------------------------------+
/**
 * Regex represents a single compiled regular expression pattern upon which all matching, searching, replacing and
 * splitting of textual data occurs.
 *
 * Instances are obtained from compile(). The compiled PCRE data is owned by the instance and is released by its
 * destructor.
 */
public class Regex {

    /** The count of captured groups. */
    private int _groups;

    /** The modifiers used when compiling the regular expression pattern. */
    private int _modifiers;

    /** The group names and the group index each of them refers to. */
    private int[char[]] _names;

    /** The regular expression pattern. */
    private char[] _pattern;

    /** The internal pcre struct instance. */
    private pcre* _pcreIn;

    /** The internal pcre_extra struct instance (may be null when studying produced no data). */
    private pcre_extra* _pcreExtraIn;

    // +---------------------------------------------------------------------------------------------------------------+
    // | CONSTRUCTORS / DESTRUCTOR                                                                                     |
    // +---------------------------------------------------------------------------------------------------------------+

    /**
     * Create a new Regex instance.
     *
     * Params:
     *     pcreIn      = The internal pcre struct instance.
     *     pcreExtraIn = The internal pcre_extra struct instance.
     *     groups      = The count of captured groups.
     *     names       = The group names and their associated group indexes.
     *     pattern     = The regular expression pattern.
     *     modifiers   = The modifiers used when compiling /pattern/.
     */
    private this (pcre* pcreIn, pcre_extra* pcreExtraIn, int groups, int[char[]] names, char[] pattern, int modifiers) {
        _groups      = groups;
        _modifiers   = modifiers;
        _names       = names;
        _pattern     = pattern;
        _pcreIn      = pcreIn;
        _pcreExtraIn = pcreExtraIn;
    }

    /**
     * Destroy this Regex instance and release the compiled PCRE data.
     */
    public ~this () {
        release(_pcreIn, _pcreExtraIn);

        _pcreIn      = null;
        _pcreExtraIn = null;
    }

    // +---------------------------------------------------------------------------------------------------------------+
    // | OPERATORS                                                                                                     |
    // +---------------------------------------------------------------------------------------------------------------+

    /**
     * Handle an in statement that checks for existence of /group/ within the named groups.
     *
     * Params:
     *     group = The group name.
     *
     * Returns: true if the pattern declares a group named /group/, otherwise false.
     */
    public bool opIn_r (char[] group) {
        return (group in _names) !is null;
    }

    // +---------------------------------------------------------------------------------------------------------------+
    // | METHODS                                                                                                       |
    // +---------------------------------------------------------------------------------------------------------------+

    /**
     * Compile a regular expression pattern.
     *
     * Params:
     *     pattern   = The regular expression pattern.
     *     modifiers = The modifiers to use when compiling the regular expression pattern.
     *
     * Returns: A new Regex instance for /pattern/.
     *
     * Throws: RegexException If the pattern cannot be compiled, studied or inspected.
     */
    public static Regex compile (char[] pattern, int modifiers = 0) {
        char*       error;
        int         errorOffset;
        int         groups;
        int         nameCount;
        int         nameSize;
        char*       nameTable;
        int[char[]] names;
        pcre*       pcreIn;
        pcre_extra* pcreExtraIn;

        // compile the pattern
        pcreIn = pcre_compile(toStringz(pattern), modifiers, &error, &errorOffset, null);

        if (pcreIn is null || error) {
            release(pcreIn, null);

            throw new RegexException("Failed to compile pattern at offset {}: {}", errorOffset, fromStringz(error));
        }

        // get pattern optimizations
        pcreExtraIn = pcre_study(pcreIn, 0, &error);

        if (error) {
            release(pcreIn, pcreExtraIn);

            throw new RegexException("Failed to get pattern optimizations: {}", fromStringz(error));
        }

        // get pattern details; a non-zero return code means the requested value was not written
        if (pcre_fullinfo(pcreIn, pcreExtraIn, PCRE_INFO_CAPTURECOUNT, &groups) != 0 ||
            pcre_fullinfo(pcreIn, pcreExtraIn, PCRE_INFO_NAMECOUNT, &nameCount) != 0 ||
            pcre_fullinfo(pcreIn, pcreExtraIn, PCRE_INFO_NAMEENTRYSIZE, &nameSize) != 0 ||
            pcre_fullinfo(pcreIn, pcreExtraIn, PCRE_INFO_NAMETABLE, &nameTable) != 0) {
            release(pcreIn, pcreExtraIn);

            throw new RegexException("Failed to get pattern details");
        }

        // get all named groups and their related indexes; every table entry starts with the group index stored in
        // two bytes (most significant byte first) followed by the zero-terminated group name
        for (int entry = 0; entry < nameCount; entry++) {
            char* record = nameTable + (entry * nameSize);

            // the table lives inside the compiled pattern, which the destructor frees, so the name is copied to keep
            // the keys handed out by names() valid independently of this instance
            names[fromStringz(record + 2).dup] = (record[0] << 8) | record[1];
        }

        return new Regex(pcreIn, pcreExtraIn, groups, names, pattern, modifiers);
    }

    /**
     * Retrieve the count of groups that will be captured when executing match(), search(), replace(), or split().
     */
    public int groups () {
        return _groups;
    }

    /**
     * Execute a match at the beginning of /subject/ and retrieve the first match.
     *
     * Params:
     *     subject = The subject upon which the match will take place.
     *
     * Returns: A Match instance on which find() has been called once; use its matches() method to determine whether
     *          the pattern matched.
     */
    public Match match (char[] subject) {
        Match anchored = new Match(this, subject, PCRE_ANCHORED);

        anchored.find();

        return anchored;
    }

    /**
     * Retrieve the modifiers used within this Regex instance.
     */
    public int modifiers () {
        return _modifiers;
    }

    /**
     * Retrieve all named groups.
     */
    public char[][] names () {
        return _names.keys;
    }

    /**
     * Execute a replacement on /subject/ where all pattern matches are replaced with /value/. If /max/ is unspecified
     * or 0, all matches will be replaced.
     *
     * Params:
     *     subject = The subject upon which the replacement will take place.
     *     value   = The value that will replace each pattern match.
     *     max     = The maximum number of times the replacement will take place.
     *
     * Returns: A copy of /subject/ with replaced values.
     */
    public char[] replace (char[] subject, char[] value, int max = 0) {
        int    iteration;
        int    position;
        char[] result;

        foreach (found; search(subject)) {
            iteration++;

            // keep the text between the previous match and this one, then substitute the match itself
            result  ~= subject[position .. found._vector[0]];
            result  ~= value;
            position = found._vector[1];

            if (max && iteration >= max) {
                break;
            }
        }

        result ~= subject[position .. $];

        return result;
    }

    /**
     * Execute a replacement on /subject/ where every pattern match is replaced by the concatenation of the first
     * groups() entries of /values/, in order. If /max/ is unspecified or 0, all matches will be replaced.
     *
     * NOTE: The entire match is replaced, so matched text that lies outside of the captured groups is dropped.
     *
     * Params:
     *     subject = The subject upon which the replacement will take place.
     *     values  = The array of values, one per captured group. This must contain at least as many values as there
     *               are captured groups.
     *     max     = The maximum number of times the replacement will take place.
     *
     * Returns: A copy of /subject/ with replaced values.
     *
     * Throws: ArrayBoundsException If a match is found and there are more captured groups than there are indexes
     *         inside /values/.
     */
    public char[] replace (char[] subject, char[][] values, int max = 0) {
        int    iteration;
        int    position;
        char[] result;

        foreach (found; search(subject)) {
            iteration++;

            result  ~= subject[position .. found._vector[0]];
            position = found._vector[1];

            for (int group = 0; group < _groups; group++) {
                result ~= values[group];
            }

            if (max && iteration >= max) {
                break;
            }
        }

        result ~= subject[position .. $];

        return result;
    }

    /**
     * Execute a replacement on /subject/ where every pattern match is replaced by the concatenation of the values
     * returned by /dg/, which is called once for each captured group of the match. If /max/ is unspecified or 0, all
     * matches will be replaced.
     *
     * NOTE: The entire match is replaced, so matched text that lies outside of the captured groups is dropped. A
     *       group that did not take part in the match is passed to /dg/ as an empty value.
     *
     * Params:
     *     subject = The subject upon which the replacement will take place.
     *     dg      = The delegate that is called for each replacement. It takes three arguments:
     *               1. iteration = The match iteration (starting at 1).
     *               2. group     = The group index (starting at 1).
     *               3. value     = The value captured by the group.
     *     max     = The maximum number of times the replacement will take place.
     *
     * Returns: A copy of /subject/ with replaced values.
     */
    public char[] replace (char[] subject, char[] delegate (int iteration, int group, char[] value) dg, int max = 0) {
        int    iteration;
        int    position;
        char[] result;

        foreach (found; search(subject)) {
            iteration++;

            result  ~= subject[position .. found._vector[0]];
            position = found._vector[1];

            for (int group = 1; group <= _groups; group++) {
                result ~= dg(iteration, group, capture(subject, found._vector, group));
            }

            if (max && iteration >= max) {
                break;
            }
        }

        result ~= subject[position .. $];

        return result;
    }

    /**
     * Retrieve the regular expression pattern used within this Regex instance.
     */
    public char[] pattern () {
        return _pattern;
    }

    /**
     * Execute a search within /subject/ and retrieve the Match instance associated. Because /subject/ is searched,
     * you must iterate over the Match instance, or use its find() method to find each match.
     *
     * Params:
     *     subject = The subject upon which the search will take place.
     *
     * Returns: This always returns a Match instance.
     */
    public Match search (char[] subject) {
        return new Match(this, subject);
    }

    /**
     * Execute a split on /subject/. If /max/ is unspecified or 0, all matches will be split.
     *
     * The result contains the text found between the matches and, after each of those pieces, the values of the
     * captured groups of the match that ended it. A group that did not take part in the match contributes an empty
     * value.
     *
     * Params:
     *     subject = The subject upon which the split will take place.
     *     max     = The maximum number of times the split will take place.
     *
     * Returns: The array of split /subject/ data.
     */
    public char[][] split (char[] subject, int max = 0) {
        int      iteration;
        int      position;
        char[][] result;

        foreach (found; search(subject)) {
            iteration++;

            result  ~= subject[position .. found._vector[0]];
            position = found._vector[1];

            for (int group = 1; group <= _groups; group++) {
                result ~= capture(subject, found._vector, group);
            }

            if (max && iteration >= max) {
                break;
            }
        }

        result ~= subject[position .. $];

        return result;
    }

    /**
     * Execute a split on /subject/. If /max/ is unspecified or 0, all matches will be split.
     *
     * NOTE: If the /group/ value passed to /dg/ is 0, it means the value is not part of the match pattern. The text
     *       that follows the last match is reported with the next iteration number and group 0.
     *
     * Params:
     *     subject = The subject upon which the split will take place.
     *     dg      = The delegate that is called for each split. It takes three arguments:
     *               1. iteration = The match iteration (starting at 1).
     *               2. group     = The group index.
     *               3. value     = The value.
     *     max     = The maximum number of times the split will take place.
     */
    public void split (char[] subject, void delegate (int iteration, int group, char[] value) dg, int max = 0) {
        int iteration;
        int position;

        foreach (found; search(subject)) {
            iteration++;

            dg(iteration, 0, subject[position .. found._vector[0]]);

            position = found._vector[1];

            for (int group = 1; group <= _groups; group++) {
                dg(iteration, group, capture(subject, found._vector, group));
            }

            if (max && iteration >= max) {
                break;
            }
        }

        dg(++iteration, 0, subject[position .. $]);
    }

    /**
     * Retrieve the portion of /subject/ that was captured by /group/.
     *
     * Params:
     *     subject = The subject that was matched.
     *     vector  = The vector of match positions filled in for the current match.
     *     group   = The group index.
     *
     * Returns: The captured text, or null if the group did not take part in the match (both of its positions are
     *          reported as -1 in that case, which must not be used to slice /subject/).
     */
    private static char[] capture (char[] subject, int[] vector, int group) {
        int start = vector[group << 1];
        int end   = vector[(group << 1) + 1];

        if (start < 0 || end < 0) {
            return null;
        }

        return subject[start .. end];
    }

    /**
     * Release compiled PCRE data. Null pointers are skipped.
     *
     * Params:
     *     pcreIn      = The pcre struct instance to free.
     *     pcreExtraIn = The pcre_extra struct instance to free.
     */
    private static void release (pcre* pcreIn, pcre_extra* pcreExtraIn) {
        if (pcreExtraIn) {
            (*pcre_free)(pcreExtraIn);
        }

        if (pcreIn) {
            (*pcre_free)(pcreIn);
        }
    }

}
/**
 * Match represents a single match. When this object is iterated or find() is called, it will then represent the next
 * available match.
 */
public class Match {

    /** Indicates that no further match can be found until reset() is called. */
    private bool _exhausted;

    /** Indicates that the most recent iteration or find() call completed successfully and a new match is available. */
    private bool _matches;

    /** The modifiers used when matching the subject. */
    private int _modifiers = PCRE_NEWLINE_ANYCRLF;

    /** The parent Regex instance under which this Match instance is operating. */
    private Regex _regex;

    /** The subject upon which all matching will take place. */
    private char[] _subject;

    /** The internal vector of match positions. */
    private int[] _vector;

    // +---------------------------------------------------------------------------------------------------------------+
    // | CONSTRUCTORS / DESTRUCTOR                                                                                     |
    // +---------------------------------------------------------------------------------------------------------------+

    /**
     * Create a new Match instance.
     *
     * Params:
     *     regex     = The parent Regex instance under which this Match instance will operate.
     *     subject   = The subject upon which all matching will take place.
     *     modifiers = The modifiers used when matching /subject/, in addition to PCRE_NEWLINE_ANYCRLF.
     */
    private this (Regex regex, char[] subject, int modifiers = 0) {
        _modifiers |= modifiers;
        _regex      = regex;
        _subject    = subject;

        // three slots per group (plus the whole match): two for the start/end positions, one for PCRE's own use
        _vector.length = (_regex.groups + 1) * 3;
    }

    // +---------------------------------------------------------------------------------------------------------------+
    // | OPERATORS                                                                                                     |
    // +---------------------------------------------------------------------------------------------------------------+

    /**
     * Handle a foreach statement that expects /iteration/ and /match/.
     *
     * NOTE: This calls reset() prior to matching.
     *
     * Params:
     *     dg = The delegate that is called for each match. /iteration/ starts at 1.
     */
    public int opApply (int delegate (ref int iteration, ref Match match) dg) {
        int iteration;
        int result;

        reset();

        while (find()) {
            iteration++;

            result = dg(iteration, this);

            if (result) {
                break;
            }
        }

        return result;
    }

    /**
     * Handle a foreach statement that expects /match/.
     *
     * NOTE: This calls reset() prior to matching.
     *
     * Params:
     *     dg = The delegate that is called for each match.
     */
    public int opApply (int delegate (ref Match match) dg) {
        int result;

        reset();

        while (find()) {
            result = dg(this);

            if (result) {
                break;
            }
        }

        return result;
    }

    /**
     * Handle an index expression that retrieves the captured group value that is associated with /group/.
     *
     * Params:
     *     group = The captured group name.
     *
     * Returns: The captured text, or null if the group did not take part in the match.
     *
     * Throws: ArrayBoundsException If the group name is nonexistent.
     */
    public char[] opIndex (char[] group) {
        return opIndex(_regex._names[group]);
    }

    /**
     * Handle an index expression that retrieves the captured group value that is associated with /group/.
     *
     * Params:
     *     group = The captured group index. Index 0 is the entire match.
     *
     * Returns: The captured text, or null if the group did not take part in the match.
     *
     * Throws: ArrayBoundsException If the group index is nonexistent.
     */
    public char[] opIndex (int group) {
        // only the first (groups + 1) position pairs carry match data, the remainder of the vector is PCRE workspace;
        // narrowing the view makes an index beyond the last group fail the bounds check instead of yielding garbage
        int[] spans = _vector[0 .. (_regex._groups + 1) << 1];
        int   start = spans[group << 1];
        int   end   = spans[(group << 1) + 1];

        // a group that did not take part in the match is reported as -1/-1, which cannot be used as a slice
        if (start < 0 || end < 0) {
            return null;
        }

        return _subject[start .. end];
    }

    // +---------------------------------------------------------------------------------------------------------------+
    // | METHODS                                                                                                       |
    // +---------------------------------------------------------------------------------------------------------------+

    /**
     * Execute the next pattern match and reflect the match information in this Match instance.
     *
     * The search continues where the previous match ended. If the previous match was empty, the search resumes one
     * character further on so that the same empty match is not reported over and over again.
     *
     * NOTE: matches() must be called to determine if a match was found before you access any match data.
     *
     * Returns: If a match was found, this will return true, otherwise false.
     *
     * Throws: RegexException If PCRE reports an error other than "no match".
     */
    public bool find () {
        bool previousEmpty = _matches && _vector[0] == _vector[1];

        _matches = false;

        if (_exhausted || _vector[1] < 0) {
            return false;
        }

        int length = cast(int) _subject.length;
        int start  = _vector[1];

        if (previousEmpty) {
            if (start + 1 < length && _subject[start] == '\r' && _subject[start + 1] == '\n') {
                // never resume in the middle of a CRLF pair, it is a single newline under PCRE_NEWLINE_ANYCRLF
                start += 2;
            } else {
                start++;

                if (_regex._modifiers & PCRE_UTF8) {
                    // skip UTF-8 continuation bytes so the offset stays on a character boundary
                    while (start < length && (_subject[start] & 0xC0) == 0x80) {
                        start++;
                    }
                }
            }
        }

        if (start > length) {
            _exhausted = true;

            return false;
        }

        // pcre_exec reports an error for a null subject pointer, which is what an empty D array may carry
        char  terminator = 0;
        char* text       = _subject.ptr ? _subject.ptr : &terminator;

        int exec = pcre_exec(_regex._pcreIn, _regex._pcreExtraIn, text, length, start, _modifiers, _vector.ptr,
                             _vector.length);

        if (exec > PCRE_ERROR_NOMATCH) {
            _matches = true;
        } else if (exec == PCRE_ERROR_NOMATCH) {
            _exhausted = true;
        } else {
            throw new RegexException("Failed to retrieve next pattern match");
        }

        return _matches;
    }

    /**
     * Retrieve the portion of the subject that matched the pattern. This is identical to accessing group index 0.
     */
    public char[] match () {
        return _subject[_vector[0] .. _vector[1]];
    }

    /**
     * Indicates whether or not the most recent find() call matched a new portion of the subject.
     */
    public bool matches () {
        return _matches;
    }

    /**
     * Retrieve the position of the subject at which the most recent iteration or find() call matched.
     */
    public int position () {
        return _vector[0];
    }

    /**
     * Retrieve the parent Regex instance under which this Match instance is operating.
     */
    public Regex regex () {
        return _regex;
    }

    /**
     * Reset all iteration and find() results so the next iteration or find() call will start from the beginning of
     * the subject. After this call matches() reports false until find() succeeds again.
     */
    public void reset () {
        _exhausted = false;
        _matches   = false;
        _vector[0] = 0;
        _vector[1] = 0;
    }

    /**
     * Retrieve the subject upon which this Match instance is operating.
     */
    public char[] subject () {
        return _subject;
    }

}
- // +-------------------------------------------------------------------------------------------------------------------+
- // | UNIT TESTS |
- // +-------------------------------------------------------------------------------------------------------------------+
debug (UnitTest) {
    // Exercises compile(), named groups, iteration, match(), every replace() overload and both split() overloads.
    // Counters make sure the loop bodies and delegates really ran; without them the value assertions would pass
    // silently if no match was ever found.
    unittest {
        Regex  r = Regex.compile(r"^(?P<start>[a-z]{2}\d) (?P<data>[^\s]+) end$", Modifier.I | Modifier.M);
        char[] s = "aB5 regexfun end\nYz9 juststuff end\nmn1 fancycode end";

        // pattern details
        assert("start" in r);
        assert("data" in r);
        assert(!("missing" in r));
        assert(r.groups() == 2);
        assert(r.names().length == 2);

        // anchored matching
        assert(r.match(s).matches());
        assert(r.match(s)[1] == "aB5");
        assert(!r.match("no match here").matches());

        // iteration over all matches
        int found;

        foreach (i, m; r.search(s)) {
            found = i;

            assert(m.matches());
            assert(m[0] == m.match());

            switch (i) {
                case 1:
                    assert(m[0] == "aB5 regexfun end");
                    assert(m[1] == "aB5");
                    assert(m[2] == "regexfun");
                    assert(m["start"] == "aB5");
                    assert(m["data"] == "regexfun");
                    break;

                case 2:
                    assert(m[0] == "Yz9 juststuff end");
                    assert(m[1] == "Yz9");
                    assert(m[2] == "juststuff");
                    assert(m["start"] == "Yz9");
                    assert(m["data"] == "juststuff");
                    break;

                case 3:
                    assert(m[0] == "mn1 fancycode end");
                    assert(m[1] == "mn1");
                    assert(m[2] == "fancycode");
                    assert(m["start"] == "mn1");
                    assert(m["data"] == "fancycode");
                    break;

                default:
                    // the subject holds exactly three matching lines
                    assert(false);
            }
        }

        assert(found == 3);

        // replacement with a single value and with one value per group
        assert(r.replace(s, "xxx") == "xxx\nxxx\nxxx");
        assert(r.replace(s, "xxx", 2) == "xxx\nxxx\nmn1 fancycode end");
        assert(r.replace(s, ["123", "456"]) == "123456\n123456\n123456");
        assert(r.replace(s, ["123", "456"], 1) == "123456\nYz9 juststuff end\nmn1 fancycode end");

        // replacement through a delegate
        r = Regex.compile(r"start (\d+)(\w+) end", Modifier.I);
        s = "start 42abc end start 38176qwerty end start 1337bugs end";

        int replacements;

        assert(r.replace(s,
            (int iteration, int group, char[] value) {
                replacements++;

                if (iteration == 1) {
                    if (group == 1) {
                        assert(value == "42");
                    } else {
                        assert(value == "abc");
                    }
                } else if (iteration == 2) {
                    if (group == 1) {
                        assert(value == "38176");
                    } else {
                        assert(value == "qwerty");
                    }
                } else {
                    if (group == 1) {
                        assert(value == "1337");
                    } else {
                        assert(value == "bugs");
                    }
                }

                return value;
            }
        ) == "42abc 38176qwerty 1337bugs");

        // three matches with two groups each
        assert(replacements == 6);

        // splitting into an array
        r = Regex.compile(r"\W+", Modifier.I);
        s = "Hello, World, In, D";

        assert(r.split(s) == ["Hello", "World", "In", "D"]);
        assert(r.split(s, 2) == ["Hello", "World", "In, D"]);

        // splitting through a delegate
        r = Regex.compile(r"([a-z]+) (\d+)", Modifier.I);
        s = "#?#?Hello 123!!##D 456&&##";

        int pieces;

        r.split(s,
            (int iteration, int group, char[] value) {
                pieces++;

                if (iteration == 1) {
                    if (group == 0) {
                        assert(value == "#?#?");
                    } else if (group == 1) {
                        assert(value == "Hello");
                    } else if (group == 2) {
                        assert(value == "123");
                    }
                } else if (iteration == 2) {
                    if (group == 0) {
                        assert(value == "!!##");
                    } else if (group == 1) {
                        assert(value == "D");
                    } else if (group == 2) {
                        assert(value == "456");
                    }
                } else if (iteration == 3) {
                    assert(group == 0);
                    assert(value == "&&##");
                }
            }
        );

        // two matches reporting three values each, plus the text that trails the last match
        assert(pieces == 7);
    }
}
Add Comment
Please, Sign In to add comment