Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /**
- Unicode Character Types
- */
- module std.uni;
- import core.stdc.stdint;
- import core.stdc.stdio;
- import std.ctype; // this really needs pure functions that return bools
/**
 * Exception thrown when a code point cannot be looked up.
 *
 * The message is `msg`, a colon, and the offending value rendered as two
 * 16-bit hexadecimal halves (low 16 bits first, then the high 16 bits).
 */
class UCharException : Exception {
    /**
     * Params:
     *   msg = description of the failure
     *   c   = the offending code point value
     */
    this(string msg, dchar c) {
        char[255] buf = void;
        // A D string is a (length, pointer) pair and is not NUL-terminated,
        // so it must be printed with an explicit precision ("%.*s").
        const int written = snprintf(buf.ptr, buf.length, "%.*s: \\u%04X\\u%04X",
            cast(int) msg.length, msg.ptr,
            cast(uint)(c & 0xFFFF), cast(uint)((c >> 16) & 0xFFFF));
        // snprintf returns the length it wanted to write (or a negative
        // value on error); clamp so the slice never leaves the buffer.
        size_t len = 0;
        if(written > 0)
            len = written < buf.length ? written : buf.length - 1;
        super(buf[0 .. len].idup);
    }
}
/**
 * Tests whether a code point is a letter (Lu, Ll, Lt, Lm or Lo).
 *
 * Code points up to 0x7F are answered by `isalpha`; all others are
 * classified through `getType`.
 */
/*pure*/ bool isUniAlpha(dchar code) {
    if(code <= 0x7F) return isalpha(code) != 0;
    const CharType kind = getType(code);
    return kind == CharType.UPPERCASE_LETTER
        || kind == CharType.LOWERCASE_LETTER
        || kind == CharType.TITLECASE_LETTER
        || kind == CharType.MODIFIER_LETTER
        || kind == CharType.OTHER_LETTER;
}
/**
 * Tests whether a code point is an uppercase letter (Lu).
 *
 * Code points up to 0x7F are answered by `isupper`.
 */
/*pure*/ bool isUniUpper(dchar code) {
    if(code > 0x7F)
        return getType(code) == CharType.UPPERCASE_LETTER;
    return isupper(code) != 0;
}
/**
 * Tests whether a code point is a lowercase letter (Ll).
 *
 * Code points up to 0x7F are answered by `islower`.
 */
/*pure*/ bool isUniLower(dchar code) {
    if(code > 0x7F)
        return getType(code) == CharType.LOWERCASE_LETTER;
    return islower(code) != 0;
}
/**
 * Tests whether a code point is a titlecase letter (Lt).
 */
/*pure*/ bool isUniTitle(dchar code) {
    const CharType kind = getType(code);
    return kind == CharType.TITLECASE_LETTER;
}
/**
 * Tests whether a code point is a letter or a decimal digit
 * (Lu, Ll, Lt, Lm, Lo or Nd).
 *
 * Code points up to 0x7F are answered by `isalnum`.
 */
/*pure*/ bool isUniAlphaNum(dchar code) {
    if(code <= 0x7F) return isalnum(code) != 0;
    const CharType kind = getType(code);
    return kind == CharType.UPPERCASE_LETTER
        || kind == CharType.LOWERCASE_LETTER
        || kind == CharType.TITLECASE_LETTER
        || kind == CharType.MODIFIER_LETTER
        || kind == CharType.OTHER_LETTER
        || kind == CharType.DECIMAL_DIGIT_NUMBER;
}
/**
 * Tests whether a code point is a number (Nd, Nl or No).
 *
 * Code points up to 0x7F are answered by `isdigit`.
 */
/*pure*/ bool isUniNumber(dchar code) {
    if(code <= 0x7F) return isdigit(code) != 0;
    const CharType kind = getType(code);
    return kind == CharType.DECIMAL_DIGIT_NUMBER
        || kind == CharType.LETTER_NUMBER
        || kind == CharType.OTHER_NUMBER;
}
/**
 * Tests whether a code point is a decimal digit (Nd).
 *
 * Code points up to 0x7F are answered by `isdigit`.
 */
/*pure*/ bool isUniDigit(dchar code) {
    if(code > 0x7F)
        return getType(code) == CharType.DECIMAL_DIGIT_NUMBER;
    return isdigit(code) != 0;
}
/**
 * Tests whether a code point is a separator (Zs, Zl or Zp).
 *
 * Code points up to 0x7F are answered by `isspace` rather than by the
 * category tables.
 */
/*pure*/ bool isUniSeparator(dchar code) {
    if(code <= 0x7F) return isspace(code) != 0;
    const CharType kind = getType(code);
    return kind == CharType.SPACE_SEPARATOR
        || kind == CharType.LINE_SEPARATOR
        || kind == CharType.PARAGRAPH_SEPARATOR;
}
/**
 * Tests whether a code point is a space separator (Zs).
 *
 * Code points up to 0x7F are answered by `isspace`.
 */
/*pure*/ bool isUniSpace(dchar code) {
    if(code > 0x7F)
        return getType(code) == CharType.SPACE_SEPARATOR;
    return isspace(code) != 0;
}
/**
 * Tests whether a code point is a line separator (Zl).
 */
/*pure*/ bool isUniLine(dchar code) {
    const CharType kind = getType(code);
    return kind == CharType.LINE_SEPARATOR;
}
/**
 * Tests whether a code point is a paragraph separator (Zp).
 */
/*pure*/ bool isUniParagraph(dchar code) {
    const CharType kind = getType(code);
    return kind == CharType.PARAGRAPH_SEPARATOR;
}
/**
 * Tests whether a code point is a mark (Mn, Mc or Me).
 */
/*pure*/ bool isUniMark(dchar code) {
    const CharType kind = getType(code);
    return kind == CharType.NONSPACING_MARK
        || kind == CharType.COMBINING_SPACING_MARK
        || kind == CharType.ENCLOSING_MARK;
}
/**
 * Tests whether a code point is a nonspacing mark (Mn).
 */
/*pure*/ bool isUniNonspacing(dchar code) {
    const CharType kind = getType(code);
    return kind == CharType.NONSPACING_MARK;
}
/**
 * Tests whether a code point is punctuation (Pc, Pd, Ps, Pe, Pi, Pf or Po).
 *
 * Code points up to 0x7F are answered by `ispunct`.
 */
/*pure*/ bool isUniPunctuation(dchar code) {
    if(code <= 0x7F) return ispunct(code) != 0;
    switch(getType(code)) {
        case CharType.CONNECTOR_PUNCTUATION, CharType.DASH_PUNCTUATION,
             CharType.OPEN_PUNCTUATION, CharType.CLOSE_PUNCTUATION,
             CharType.INITIAL_QUOTE_PUNCTUATION, CharType.FINAL_QUOTE_PUNCTUATION,
             CharType.OTHER_PUNCTUATION:
            return true;
        default:
            return false;
    }
}
/**
 * Tests whether a code point is a symbol (Sm, Sc, Sk or So).
 */
/*pure*/ bool isUniSymbol(dchar code) {
    const CharType kind = getType(code);
    return kind == CharType.MATH_SYMBOL
        || kind == CharType.CURRENCY_SYMBOL
        || kind == CharType.MODIFIER_SYMBOL
        || kind == CharType.OTHER_SYMBOL;
}
/**
 * Tests whether a code point belongs to the "other" categories
 * (Cc, Cf, Cs, Co or Cn).
 *
 * `getType` reports unassigned code points as `CharType.UNASSIGNED`
 * (value 0, also what an all-zero packed entry decodes to), so that
 * value has to be accepted for Cn. `CharType.NOT_ASSIGNED` is still
 * accepted as well so existing behaviour is a strict subset of the new one.
 */
/*pure*/ bool isUniOther(dchar code) {
    switch(getType(code)) {
        case CharType.CONTROL:
        case CharType.FORMAT:
        case CharType.SURROGATE:
        case CharType.PRIVATE_USE:
        case CharType.UNASSIGNED:   // Cn as actually produced by getType
        case CharType.NOT_ASSIGNED:
            return true;
        default:
            return false;
    }
}
/**
 * Tests whether a code point is a control character (Cc).
 *
 * Code points up to 0x7F are answered by `iscntrl`.
 */
/*pure*/ bool isUniControl(dchar code) {
    if(code > 0x7F)
        return getType(code) == CharType.CONTROL;
    return iscntrl(code) != 0;
}
/**
 * Tests whether a code point is a format character (Cf).
 */
/*pure*/ bool isUniFormat(dchar code) {
    const CharType kind = getType(code);
    return kind == CharType.FORMAT;
}
/**
 * Tests whether a code point is a surrogate (Cs).
 */
/*pure*/ bool isUniSurrogate(dchar code) {
    const CharType kind = getType(code);
    return kind == CharType.SURROGATE;
}
/**
 * Tests whether a code point is a private-use character (Co).
 */
/*pure*/ bool isUniPrivateUse(dchar code) {
    const CharType kind = getType(code);
    return kind == CharType.PRIVATE_USE;
}
/**
 * Tests whether a code point is graphic: any letter, number, mark,
 * punctuation or symbol category (L*, N*, M*, P*, S*).
 *
 * Code points up to 0x7F are answered by `isgraph`.
 */
/*pure*/ bool isUniGraph(dchar code) {
    if(code <= 0x7F) return isgraph(code) != 0;
    switch(getType(code)) {
        // letters
        case CharType.UPPERCASE_LETTER, CharType.LOWERCASE_LETTER,
             CharType.TITLECASE_LETTER, CharType.MODIFIER_LETTER,
             CharType.OTHER_LETTER:
        // numbers
        case CharType.DECIMAL_DIGIT_NUMBER, CharType.LETTER_NUMBER,
             CharType.OTHER_NUMBER:
        // marks
        case CharType.NONSPACING_MARK, CharType.COMBINING_SPACING_MARK,
             CharType.ENCLOSING_MARK:
        // punctuation
        case CharType.CONNECTOR_PUNCTUATION, CharType.DASH_PUNCTUATION,
             CharType.OPEN_PUNCTUATION, CharType.CLOSE_PUNCTUATION,
             CharType.INITIAL_QUOTE_PUNCTUATION, CharType.FINAL_QUOTE_PUNCTUATION,
             CharType.OTHER_PUNCTUATION:
        // symbols
        case CharType.MATH_SYMBOL, CharType.CURRENCY_SYMBOL,
             CharType.MODIFIER_SYMBOL, CharType.OTHER_SYMBOL:
            return true;
        default:
            return false;
    }
}
/**
 * Tests whether a code point is printable: every category accepted by
 * the graphic test (L*, N*, M*, P*, S*) plus the space separator (Zs).
 *
 * Code points up to 0x7F are answered by `isprint`.
 */
/*pure*/ bool isUniPrint(dchar code) {
    if(code <= 0x7F) return isprint(code) != 0;
    switch(getType(code)) {
        // letters
        case CharType.UPPERCASE_LETTER, CharType.LOWERCASE_LETTER,
             CharType.TITLECASE_LETTER, CharType.MODIFIER_LETTER,
             CharType.OTHER_LETTER:
        // numbers
        case CharType.DECIMAL_DIGIT_NUMBER, CharType.LETTER_NUMBER,
             CharType.OTHER_NUMBER:
        // space
        case CharType.SPACE_SEPARATOR:
        // marks
        case CharType.NONSPACING_MARK, CharType.COMBINING_SPACING_MARK,
             CharType.ENCLOSING_MARK:
        // punctuation
        case CharType.CONNECTOR_PUNCTUATION, CharType.DASH_PUNCTUATION,
             CharType.OPEN_PUNCTUATION, CharType.CLOSE_PUNCTUATION,
             CharType.INITIAL_QUOTE_PUNCTUATION, CharType.FINAL_QUOTE_PUNCTUATION,
             CharType.OTHER_PUNCTUATION:
        // symbols
        case CharType.MATH_SYMBOL, CharType.CURRENCY_SYMBOL,
             CharType.MODIFIER_SYMBOL, CharType.OTHER_SYMBOL:
            return true;
        default:
            return false;
    }
}
/**
 * Tests whether a code point's directionality is WHITESPACE (WS).
 */
/*pure*/ bool isUniDirWhiteSpace(dchar code) {
    const Direction dir = getDirectionality(code);
    return dir == Direction.WHITESPACE;
}
/**
 * Tests whether a code point's directionality is LEFT_TO_RIGHT (L).
 *
 * NOTE(review): the name reads "LTL"; it is kept as-is because callers
 * may depend on it.
 */
/*pure*/ bool isUniDirLTL(dchar code) {
    const Direction dir = getDirectionality(code);
    return dir == Direction.LEFT_TO_RIGHT;
}
/**
 * Tests whether a code point's directionality is RIGHT_TO_LEFT (R).
 *
 * Only `Direction.RIGHT_TO_LEFT` is matched; `RIGHT_TO_LEFT_ARABIC`
 * is not.
 */
/*pure*/ bool isUniDirRTL(dchar code) {
    const Direction dir = getDirectionality(code);
    return dir == Direction.RIGHT_TO_LEFT;
}
/**
 * Tests whether a code point has a strong directionality, i.e.
 * LEFT_TO_RIGHT (L) or RIGHT_TO_LEFT (R).
 */
/*pure*/ bool isUniDirStrong(dchar code) {
    const Direction dir = getDirectionality(code);
    return dir == Direction.LEFT_TO_RIGHT
        || dir == Direction.RIGHT_TO_LEFT;
}
/**
 * Tests whether a code point has a weak directionality
 * (EN, ES, ET, AN or CS).
 */
/*pure*/ bool isUniDirWeak(dchar code) {
    const Direction dir = getDirectionality(code);
    return dir == Direction.EUROPEAN_NUMBER
        || dir == Direction.EUROPEAN_NUMBER_SEPARATOR
        || dir == Direction.EUROPEAN_NUMBER_TERMINATOR
        || dir == Direction.ARABIC_NUMBER
        || dir == Direction.COMMON_NUMBER_SEPARATOR;
}
/**
 * Tests whether a code point has a neutral directionality
 * (B, S, WS or ON).
 */
/*pure*/ bool isUniDirNeutral(dchar code) {
    const Direction dir = getDirectionality(code);
    return dir == Direction.BLOCK_SEPARATOR
        || dir == Direction.SEGMENT_SEPARATOR
        || dir == Direction.WHITESPACE
        || dir == Direction.OTHER_NEUTRALS;
}
/**
 * Tests whether a code point's directionality is a separator
 * (block separator B or segment separator S).
 */
/*pure*/ bool isUniDirSeparator(dchar code) {
    const Direction dir = getDirectionality(code);
    return dir == Direction.BLOCK_SEPARATOR
        || dir == Direction.SEGMENT_SEPARATOR;
}
/**
 * Tests whether a code point's decomposition type is NOBREAK.
 */
/*pure*/ bool isUniNonBreaking(dchar code) {
    const DecompositionType decomp = getDecompositionType(code);
    return decomp == DecompositionType.NOBREAK;
}
/**
 * Tests whether the "mirrored" flag is set in the code point's packed data.
 */
/*pure*/ bool isUniMirroring(dchar code) {
    const flag = (getPackedData(code) >> MIRRORED_SHIFT) & MIRRORED_MASK;
    return flag != 0;
}
/**
 * Maps a code point to its lowercase counterpart (Lu -> Ll).
 *
 * The TOLOWER field of the packed data selects an offset from `LCDIFF`,
 * which is added to the code point; entry 0 of that table is 0, so code
 * points whose field is 0 are returned unchanged.
 */
/*pure*/ dchar toUniLower(dchar code) {
    const slot = (getPackedData(code) >> TOLOWER_SHIFT) & TOLOWER_MASK;
    return code + LCDIFF[slot];
}
/**
 * Maps a code point to its uppercase counterpart (Ll -> Lu).
 *
 * The TOUPPER field of the packed data selects an offset from `UCDIFF`,
 * which is added to the code point; entry 0 of that table is 0, so code
 * points whose field is 0 are returned unchanged.
 */
/*pure*/ dchar toUniUpper(dchar code) {
    const slot = (getPackedData(code) >> TOUPPER_SHIFT) & TOUPPER_MASK;
    return code + UCDIFF[slot];
}
/**
 * Maps a code point to its titlecase counterpart (L& -> Lt).
 *
 * The TOTITLE field of the packed data selects an offset from `TCDIFF`.
 * An offset equal to `TOTITLE_MASK` is a marker meaning "use the
 * uppercase mapping"; any other offset is added to the code point.
 */
/*pure*/ dchar toUniTitle(dchar c) {
    const slot = (getPackedData(c) >> TOTITLE_SHIFT) & TOTITLE_MASK;
    int32_t diff = TCDIFF[slot];
    if(diff == TOTITLE_MASK)
        return toUniUpper(c);
    return c + diff;
}
/**
 * Maps a mirrored code point to its mirror image.
 *
 * Code points without the mirrored flag are returned unchanged; for the
 * others the MIRROR field of the packed data selects an offset from
 * `MIRROR_DIFF`, which is added to the code point.
 */
/*pure*/ dchar toUniMirror(dchar c) {
    if(isUniMirroring(c)) {
        const slot = (getPackedData(c) >> MIRROR_SHIFT) & MIRROR_MASK;
        return c + MIRROR_DIFF[slot];
    }
    return c;
}
- // The following is taken from AndroidUnicode.h, AndroidUnicode.cpp and
- // characterData.h from the Android project. After hunting high and low for
- // documentation about Unicode on Google, and not wanting to go for heavyweight
- // implementations like ICU, I found this gem and ported it to D to implement
- // the public Unicode API. Other implementations of interest are in V8 and
- // Mozilla; some benchmarks would be useful to determine which one performs
- // best.
- /*
- * Copyright (C) 2008 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- private:
/*
 * Directionality classes from the Unicode standard. The numeric values
 * map directly to java.lang.Character; they are spelled out here because
 * they are decoded from the DIRECTION bit field of the packed data.
 */
enum Direction {
    UNDEFINED                  = -1,
    LEFT_TO_RIGHT              = 0,
    RIGHT_TO_LEFT              = 1,
    RIGHT_TO_LEFT_ARABIC       = 2,
    EUROPEAN_NUMBER            = 3,
    EUROPEAN_NUMBER_SEPARATOR  = 4,
    EUROPEAN_NUMBER_TERMINATOR = 5,
    ARABIC_NUMBER              = 6,
    COMMON_NUMBER_SEPARATOR    = 7,
    NONSPACING_MARK            = 8,
    BOUNDARY_NEUTRAL           = 9,
    BLOCK_SEPARATOR            = 10,
    SEGMENT_SEPARATOR          = 11,
    WHITESPACE                 = 12,
    OTHER_NEUTRALS             = 13,
    LEFT_TO_RIGHT_EMBEDDING    = 14,
    LEFT_TO_RIGHT_OVERRIDE     = 15,
    RIGHT_TO_LEFT_EMBEDDING    = 16,
    RIGHT_TO_LEFT_OVERRIDE     = 17,
    POP_DIRECTIONAL_FORMAT     = 18
}
/*
 * General categories from the Unicode standard. The numeric values map
 * directly to java.lang.Character; they are spelled out here because they
 * are decoded from the TYPE bit field of the packed data.
 */
enum CharType {
    UNASSIGNED                = 0,
    UPPERCASE_LETTER          = 1,
    LOWERCASE_LETTER          = 2,
    TITLECASE_LETTER          = 3,
    MODIFIER_LETTER           = 4,
    OTHER_LETTER              = 5,
    NONSPACING_MARK           = 6,
    ENCLOSING_MARK            = 7,
    COMBINING_SPACING_MARK    = 8,
    DECIMAL_DIGIT_NUMBER      = 9,
    LETTER_NUMBER             = 10,
    OTHER_NUMBER              = 11,
    SPACE_SEPARATOR           = 12,
    LINE_SEPARATOR            = 13,
    PARAGRAPH_SEPARATOR       = 14,
    CONTROL                   = 15,
    FORMAT                    = 16,
    NOT_ASSIGNED              = 17,
    PRIVATE_USE               = 18,
    SURROGATE                 = 19,
    DASH_PUNCTUATION          = 20,
    OPEN_PUNCTUATION          = 21,
    CLOSE_PUNCTUATION         = 22,
    CONNECTOR_PUNCTUATION     = 23,
    OTHER_PUNCTUATION         = 24,
    MATH_SYMBOL               = 25,
    CURRENCY_SYMBOL           = 26,
    MODIFIER_SYMBOL           = 27,
    OTHER_SYMBOL              = 28,
    INITIAL_QUOTE_PUNCTUATION = 29,
    FINAL_QUOTE_PUNCTUATION   = 30
}
/*
 * Decomposition types as described by the Unicode standard. The numeric
 * values match the decomposition-type constants in ICU's uchar.h; they are
 * spelled out here because they are decoded from the top bits of the
 * value returned by findCharacterValue.
 */
enum DecompositionType {
    NONE      = 0,
    CANONICAL = 1,
    COMPAT    = 2,
    CIRCLE    = 3,
    FINAL     = 4,
    FONT      = 5,
    FRACTION  = 6,
    INITIAL   = 7,
    ISOLATED  = 8,
    MEDIAL    = 9,
    NARROW    = 10,
    NOBREAK   = 11,
    SMALL     = 12,
    SQUARE    = 13,
    SUB       = 14,
    SUPER     = 15,
    VERTICAL  = 16,
    WIDE      = 17
}
/*
 * Radix limits and the bit-field layout of a PACKED_DATA entry.
 * Each field is extracted as (packed >> X_SHIFT) & X_MASK.
 * MIRROR and NUMERIC deliberately start at the same bit.
 */
enum {
    // Radix bounds accepted by getDigitValue.
    MIN_RADIX = 2,
    MAX_RADIX = 36,

    // bits 0-4: CharType
    TYPE_SHIFT = 0,
    TYPE_MASK = 0x1F,
    // bits 5-9: Direction
    DIRECTION_SHIFT = TYPE_SHIFT + 5,
    DIRECTION_MASK = 0x1F,
    // bit 10: mirrored flag
    MIRRORED_SHIFT = DIRECTION_SHIFT + 5,
    MIRRORED_MASK = 0x01,
    // bits 11-16: index into UCDIFF
    TOUPPER_SHIFT = MIRRORED_SHIFT + 1,
    TOUPPER_MASK = 0x3F,
    // bits 17-22: index into LCDIFF
    TOLOWER_SHIFT = TOUPPER_SHIFT + 6,
    TOLOWER_MASK = 0x3F,
    // bits 23-24: index into TCDIFF
    TOTITLE_SHIFT = TOLOWER_SHIFT + 6,
    TOTITLE_MASK = 0x03,
    // bits 25-29: index into MIRROR_DIFF
    MIRROR_SHIFT = TOTITLE_SHIFT + 2,
    MIRROR_MASK = 0x1F,
    // bits 25-31: index into NUMERICS
    NUMERIC_SHIFT = TOTITLE_SHIFT + 2,
    NUMERIC_MASK = 0x7F,

    // Applied to the 16-bit findCharacterValue result, not to packed data.
    DECOMPOSITION_SHIFT = 11,
    DECOMPOSITION_MASK = 0x1F
}
/**
 * Looks up the packed property word for a code point.
 *
 * Bit layout of the returned value (low bit first), as defined by the
 * *_SHIFT / *_MASK constants:
 *   bits  0-4   general category (CharType)
 *   bits  5-9   directionality (Direction)
 *   bit   10    mirrored flag
 *   bits 11-16  index of the to-upper difference
 *   bits 17-22  index of the to-lower difference
 *   bits 23-24  index of the to-title difference
 *   bits 25-31  index of the numeric value, or (bits 25-29) of the
 *               mirror-character difference
 *
 * Params:
 *   c = the Unicode code point
 * Returns: the PACKED_DATA entry for `c`.
 */
/*pure*/ dchar getPackedData(dchar c) {
    // The low 11 bits of the 16-bit table value are the PACKED_DATA index;
    // the top 5 bits carry the decomposition type and are masked away.
    const ushort entry = findCharacterValue(c);
    return PACKED_DATA[entry & 0x7FF];
}
/*
 * Returns the general category of a code point.
 *
 * Params:
 *   c = the Unicode code point
 * Returns: the category stored in the TYPE field of the packed data, or
 *          CharType.UNASSIGNED when `c` is 0x10FFFF or greater (no table
 *          lookup is performed in that case).
 */
/*pure*/ CharType getType(dchar c) {
    if(c < 0x10FFFF) {
        const bits = (getPackedData(c) >> TYPE_SHIFT) & TYPE_MASK;
        return cast(CharType)bits;
    }
    return CharType.UNASSIGNED;
}
/**
 * Returns the decomposition type of a code point.
 *
 * Params:
 *   c = the Unicode code point
 * Returns: the value held in the top 5 bits of the 16-bit result of
 *          findCharacterValue, converted to DecompositionType.
 */
/*pure*/ DecompositionType getDecompositionType(dchar c) {
    // The remaining low bits are the PACKED_DATA index and are discarded.
    const ushort entry = findCharacterValue(c);
    return cast(DecompositionType)((entry >> DECOMPOSITION_SHIFT) & DECOMPOSITION_MASK);
}
/**
 * Returns the value of an ASCII digit or letter in the given radix.
 *
 * '0'-'9' count as 0-9 and 'a'-'z' / 'A'-'Z' as 10-35.
 *
 * Params:
 *   c     = the character to convert
 *   radix = the radix, which must lie in MIN_RADIX .. MAX_RADIX
 * Returns: the digit value, or -1 when the radix is out of range, the
 *          character is not an ASCII digit/letter, or its value is not
 *          below `radix`.
 */
/*pure*/ int getDigitValue(dchar c, int radix) {
    if(radix < MIN_RADIX || radix > MAX_RADIX) return -1;

    int digit;
    if('0' <= c && c <= '9')
        digit = c - '0';
    else if('a' <= c && c <= 'z')
        digit = c - 'a' + 10;
    else if('A' <= c && c <= 'Z')
        digit = c - 'A' + 10;
    else
        return -1;

    if(digit < radix) return digit;
    return -1;
}
/**
 * Returns the numeric value associated with a code point.
 *
 * The NUMERIC and MIRROR fields share the same bits of the packed data,
 * so mirrored code points are reported as -1 without consulting NUMERICS.
 *
 * Params:
 *   c = the Unicode code point
 * Returns: -1 for mirrored code points, otherwise the NUMERICS entry
 *          selected by the NUMERIC field.
 */
/*pure*/ int getNumericValue(dchar c) {
    if(!isUniMirroring(c)) {
        const slot = (getPackedData(c) >> NUMERIC_SHIFT) & NUMERIC_MASK;
        return NUMERICS[slot];
    }
    return -1;
}
/**
 * Returns the directionality of a code point.
 *
 * Params:
 *   c = the Unicode code point
 * Returns: the value of the DIRECTION field of the packed data, or
 *          Direction.UNDEFINED when the packed data is 0 or the field has
 *          all bits set (equals DIRECTION_MASK).
 */
/*pure*/ Direction getDirectionality(dchar c) {
    const uint32_t packed = getPackedData(c);
    if(packed == 0) return Direction.UNDEFINED;

    const Direction dir = cast(Direction)((packed >> DIRECTION_SHIFT) & DIRECTION_MASK);
    if(dir == DIRECTION_MASK) return Direction.UNDEFINED;
    return dir;
}
/**
 * Finds the 16-bit table value for a code point: the top 5 bits are the
 * decomposition type, the remaining bits an index into PACKED_DATA.
 *
 * Params:
 *   c = the Unicode code point
 * Returns: the table value, or 0 when the bucket for `c` is empty.
 * Throws: UCharException when `c` is above 0x10FFFF or when the entry
 *         the search settles on is greater than the search key.
 */
/*pure*/ ushort findCharacterValue(dchar c) {
    if(c > 0x10FFFF) throw new UCharException("invalid Unicode codepoint", c);
    // Latin-1 is answered by a direct table lookup.
    if(c <= 0xFF) return LATIN1_DATA[c];

    // Rotate the lowest bit up to bit 20: the tables keep even and odd
    // code points apart.
    dchar key = (c >> 1) | ((c & 1) << 20);
    const Range bucket = FULL_DATA[key >> 16];
    const uint[] entries = bucket.array;

    // Entries hold the code point in their high 16 bits and the payload in
    // the low 16. Moving the key up and filling the low half with ones lets
    // the loop compare whole words without shifting every entry down.
    key <<= 16;
    key |= 0xFFFF;

    int lo = 0;
    int hi = bucket.length - 1;
    if(hi < 0) return 0;

    // Narrow [lo, hi] until the two bounds are adjacent (or equal).
    while(hi - lo > 1) {
        const int mid = (lo + hi) >> 1;
        if(entries[mid] > key)
            hi = mid;
        else
            lo = mid;
    }

    if(entries[lo] > key) throw new UCharException("a suitable range was not found", c);
    return cast(ushort)(entries[lo] & 0xFFFF);
}
/**
 * One bucket of the range tables searched by findCharacterValue.
 */
struct Range {
    /// Number of entries of `array` that take part in the search.
    int length;
    /// Entries: code point bits in the high 16 bits, payload in the low 16.
    uint[] array;
}
- /* Lookup table for the Latin-1 range: findCharacterValue indexes it
-  * directly with the code point when the code point is <= 0xFF.
-  * Each entry has the same format as the values stored in the range
-  * tables: the top 5 bits are the decomposition type
-  * (DECOMPOSITION_SHIFT / DECOMPOSITION_MASK) and the low 11 bits are the
-  * index into PACKED_DATA (see getPackedData).
-  */
- immutable ushort[] LATIN1_DATA = [
- 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001,
- 0x0001, 0x0002, 0x0003, 0x0002, 0x0004, 0x0003, 0x0001, 0x0001,
- 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001,
- 0x0001, 0x0001, 0x0001, 0x0001, 0x0003, 0x0003, 0x0003, 0x0002,
- 0x0005, 0x0006, 0x0006, 0x0007, 0x0008, 0x0007, 0x0006, 0x0006,
- 0x0009, 0x000A, 0x0006, 0x000B, 0x000C, 0x000D, 0x000C, 0x000C,
- 0x000E, 0x000F, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015,
- 0x0016, 0x0017, 0x000C, 0x0006, 0x0018, 0x0019, 0x001A, 0x0006,
- 0x0006, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, 0x0020, 0x0021,
- 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,
- 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, 0x0030, 0x0031,
- 0x0032, 0x0033, 0x0034, 0x0035, 0x0006, 0x0036, 0x0037, 0x0038,
- 0x0037, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
- 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
- 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
- 0x0050, 0x0051, 0x0052, 0x0035, 0x0019, 0x0036, 0x0019, 0x0001,
- 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0003, 0x0001, 0x0001,
- 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001,
- 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001,
- 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001,
- 0x5853, 0x0006, 0x0008, 0x0008, 0x0008, 0x0008, 0x0054, 0x0054,
- 0x1037, 0x0054, 0x7855, 0x0056, 0x0019, 0x0057, 0x0054, 0x1037,
- 0x0058, 0x0059, 0x785A, 0x785B, 0x1037, 0x105C, 0x0054, 0x0006,
- 0x1037, 0x785D, 0x7855, 0x005E, 0x305F, 0x305F, 0x305F, 0x0006,
- 0x0860, 0x0860, 0x0860, 0x0860, 0x0860, 0x0860, 0x0060, 0x0860,
- 0x0860, 0x0860, 0x0860, 0x0860, 0x0860, 0x0860, 0x0860, 0x0860,
- 0x0060, 0x0860, 0x0860, 0x0860, 0x0860, 0x0860, 0x0860, 0x0019,
- 0x0060, 0x0860, 0x0860, 0x0860, 0x0860, 0x0860, 0x0060, 0x0055,
- 0x0861, 0x0861, 0x0861, 0x0861, 0x0861, 0x0861, 0x0061, 0x0861,
- 0x0861, 0x0861, 0x0861, 0x0861, 0x0861, 0x0861, 0x0861, 0x0861,
- 0x0061, 0x0861, 0x0861, 0x0861, 0x0861, 0x0861, 0x0861, 0x0019,
- 0x0061, 0x0861, 0x0861, 0x0861, 0x0861, 0x0861, 0x0061, 0x0862
- ];
- // Each of these arrays is stripped into ranges. In order to build the arrays, each
- // codepoint was bit-shifted so that even and odd characters were separated into different
- // arrays. The identifier of each array is the top byte after bit-shifting.
- // The numbers stored in the array are the bit-shifted codepoint, the decomposition, and an
- // index into another array of all possible packed data values. The top 16 bits are the
- // codepoint and the bottom 16 are the decomposition and index. The top 5 bits for the decomposition
- // and the rest for the index.
// The full set of all arrays to be searched. findCharacterValue picks the
// bucket with the top bits of the bit-rotated code point (key >> 16), so the
// position of every entry matters; buckets without data are Range(0, null).
//
// Every count is taken from the array's own `.length`. The first entry used
// to be `a0.length / uint.sizeof`, which yields only a quarter of the
// element count and hid most of that table from the search.
// NOTE(review): a0 ... a24 are declared outside this excerpt; `.length` is
// the element count whether they are static or dynamic arrays - confirm.
immutable Range[] FULL_DATA = [
    Range(cast(int) a0.length, a0),
    Range(cast(int) a1.length, a1),
    Range(0, null),
    Range(0, null),
    Range(0, null),
    Range(0, null),
    Range(0, null),
    Range(cast(int) a7.length, a7),
    Range(cast(int) a8.length, a8),
    Range(0, null),
    Range(0, null),
    Range(0, null),
    Range(0, null),
    Range(0, null),
    Range(0, null),
    Range(0, null),
    Range(cast(int) a16.length, a16),
    Range(cast(int) a17.length, a17),
    Range(0, null),
    Range(0, null),
    Range(0, null),
    Range(0, null),
    Range(0, null),
    Range(cast(int) a23.length, a23),
    Range(cast(int) a24.length, a24),
    Range(0, null),
    Range(0, null),
    Range(0, null),
    Range(0, null),
    Range(0, null),
    Range(0, null),
    Range(0, null)
];
- /* Array of uppercase differences.
-  * toUniUpper indexes it with the TOUPPER field of the packed data and adds
-  * the selected value to the code point; entry 0 is 0 (no change).
-  */
- immutable short[] UCDIFF = [
- 0, -32, 743, 121, -1, -232, -300, 97,
- 163, 130, 56, -2, -79, -210, -206, -205,
- -202, -203, -207, -209, -211, -213, -214, -218,
- -217, -219, -83, 84, -38, -37, -31, -64,
- -63, -62, -57, -47, -54, -86, -80, 7,
- -96, -48, -59, 8, 74, 86, 100, 128,
- 112, 126, 9, -7205, -16, -26, -7264, -40
- ];
- /* Array of lowercase differences.
-  * toUniLower indexes it with the TOLOWER field of the packed data and adds
-  * the selected value to the code point; entry 0 is 0 (no change).
-  */
- immutable short[] LCDIFF = [
- 0, 32, 1, -199, -121, 210, 206, 205,
- 79, 202, 203, 207, 211, 209, 213, 214,
- 218, 217, 219, 2, -97, -56, -130, -163,
- 83, 38, 37, 64, 63, -60, -7, 80,
- 48, 7264, -8, -74, -9, -86, -100, -112,
- -128, -126, -7517, -8383, -8262, 16, 26, 40
- ];
- /* Array of titlecase differences.
-  * toUniTitle indexes it with the 2-bit TOTITLE field of the packed data.
-  * A value equal to TOTITLE_MASK (3) is not an offset: it tells toUniTitle
-  * to fall back to toUniUpper. Any other value is added to the code point.
-  */
- immutable short[] TCDIFF = [
- 3, 1, 0, -1
- ];
- /* Array of mirrored character differences.
-  * toUniMirror indexes it with the MIRROR field of the packed data (only for
-  * code points whose mirrored flag is set) and adds the selected value to
-  * the code point.
-  */
- immutable short[] MIRROR_DIFF = [
- 0, 1, -1, 2, -2, 16, -16, 3,
- -3, 2016, 138, 1824, 2104, 2108, 2106, -138,
- 8, 7, -8, -7, -1824, -2016, -2104, -2106,
- -2108
- ];
- /* Array of all possible numeric values.
-  * getNumericValue indexes it with the NUMERIC field of the packed data and
-  * returns the selected entry unchanged, including the negative entries.
-  */
- immutable int[] NUMERICS = [
- -1, 0, 1, 2, 3, 4, 5, 6,
- 7, 8, 9, 10, 11, 12, 13, 14,
- 15, 16, 17, 18, 19, 20, 21, 22,
- 23, 24, 25, 26, 27, 28, 29, 30,
- 31, 32, 33, 34, 35, -2, 100, 1000,
- 40, 50, 60, 70, 80, 90, 10000, 500,
- 5000, 36, 37, 38, 39, 41, 42, 43,
- 44, 45, 46, 47, 48, 49, 200, 300,
- 400, 600, 700, 800, 900, 2000, 3000, 4000,
- 6000, 7000, 8000, 9000, 20000, 30000, 40000, 50000,
- 60000, 70000, 80000, 90000
- ];
- /* All possible packed data values, no duplicates.
-  * getPackedData indexes this table with the low 11 bits of the value
-  * returned by findCharacterValue. Each entry is a bit field decoded with
-  * the *_SHIFT / *_MASK constants: type (bits 0-4), directionality
-  * (bits 5-9), mirrored flag (bit 10), to-upper index (bits 11-16),
-  * to-lower index (bits 17-22), to-title index (bits 23-24) and, from
-  * bit 25 upwards, the numeric-value index or mirror-difference index.
-  */
- immutable uint[] PACKED_DATA = [
- 0x00000000, 0x0000012F, 0x0000016F, 0x0000014F, 0x0000018F, 0x0000018C, 0x000001B8, 0x000000B8,
- 0x000000BA, 0x020005B5, 0x040005B6, 0x00000099, 0x000000F8, 0x00000094, 0x02000069, 0x04000069,
- 0x06000069, 0x08000069, 0x0A000069, 0x0C000069, 0x0E000069, 0x10000069, 0x12000069, 0x14000069,
- 0x060005B9, 0x000001B9, 0x080005B9, 0x16020001, 0x18020001, 0x1A020001, 0x1C020001, 0x1E020001,
- 0x20020001, 0x22020001, 0x24020001, 0x26020001, 0x28020001, 0x2A020001, 0x2C020001, 0x2E020001,
- 0x30020001, 0x32020001, 0x34020001, 0x36020001, 0x38020001, 0x3A020001, 0x3C020001, 0x3E020001,
- 0x40020001, 0x42020001, 0x44020001, 0x46020001, 0x48020001, 0x060005B5, 0x080005B6, 0x000001BB,
- 0x000001B7, 0x16000802, 0x18000802, 0x1A000802, 0x1C000802, 0x1E000802, 0x20000802, 0x22000802,
- 0x24000802, 0x26000802, 0x28000802, 0x2A000802, 0x2C000802, 0x2E000802, 0x30000802, 0x32000802,
- 0x34000802, 0x36000802, 0x38000802, 0x3A000802, 0x3C000802, 0x3E000802, 0x40000802, 0x42000802,
- 0x44000802, 0x46000802, 0x48000802, 0x000000EC, 0x000001BC, 0x00000002, 0x0A0005BD, 0x00000130,
- 0x000000BC, 0x000000B9, 0x0600006B, 0x0800006B, 0x00001002, 0x0400006B, 0x0C0005BE, 0x4A0001AB,
- 0x00020001, 0x00000802, 0x00001802, 0x00040001, 0x00060001, 0x00002002, 0x00080001, 0x000C0001,
- 0x000E0001, 0x00100001, 0x00140001, 0x00160001, 0x00180001, 0x00004002, 0x00004802, 0x00200001,
- 0x00220001, 0x00000005, 0x00A60001, 0x01805802, 0x01042003, 0x00280001, 0x002C0001, 0x00000001,
- 0x00000000, 0x00007002, 0x00007802, 0x00009802, 0x0000A802, 0x0000B802, 0x0000C002, 0x0000C802,
- 0x0000D002, 0x00000004, 0x000001A4, 0x00000106, 0x00320001, 0x00340001, 0x00360001, 0x00380001,
- 0x0000E002, 0x0000E802, 0x0000F002, 0x0000F802, 0x00010002, 0x00010802, 0x00012002, 0x00012802,
- 0x00013802, 0x003A0001, 0x003E0001, 0x00013002, 0x0000001C, 0x00000107, 0x00400001, 0x00000018,
- 0x00014802, 0x000001B4, 0x00000038, 0x00000025, 0x00000050, 0x00000058, 0x00000045, 0x00000044,
- 0x020000C9, 0x060000C9, 0x0A0000C9, 0x0E0000C9, 0x120000C9, 0x000000D8, 0x0000005C, 0x00000008,
- 0x02000009, 0x06000009, 0x0A000009, 0x0E000009, 0x12000009, 0x0400000B, 0x0800000B, 0x0000000B,
- 0x1600000B, 0x4E00000B, 0x00000006, 0x4A00000B, 0x000001B5, 0x00420001, 0x0600000B, 0x0A00000B,
- 0x0E00000B, 0x1200000B, 0x3E00000B, 0x5200000B, 0x5600000B, 0x5A00000B, 0x5C00000B, 0x000001B6,
- 0x2400000A, 0x2800000A, 0x00000010, 0x020001AB, 0x060001AB, 0x0A0001AB, 0x0E0001AB, 0x120001AB,
- 0x00000108, 0x00015802, 0x00440001, 0x00016002, 0x00016802, 0x00017002, 0x00017802, 0x00018002,
- 0x00018802, 0x00440003, 0x00460001, 0x00480003, 0x00019802, 0x004A0001, 0x004C0001, 0x004E0001,
- 0x003C0001, 0x00500001, 0x00520001, 0x000001BD, 0x0000018D, 0x000001D0, 0x00000250, 0x00000230,
- 0x040005BE, 0x000000F9, 0x0200006B, 0x0A00006B, 0x0E00006B, 0x1200006B, 0x00540001, 0x00560001,
- 0x000005B9, 0x045A000A, 0x085A000A, 0x0C5A000A, 0x105A000A, 0x145A000A, 0x185A000A, 0x525A000A,
- 0x5E5A000A, 0x0401A00A, 0x0801A00A, 0x0C01A00A, 0x1001A00A, 0x1401A00A, 0x1801A00A, 0x5201A00A,
- 0x5E01A00A, 0x4E00000A, 0x5C00000A, 0x0E0005B9, 0x100005B9, 0x020005B9, 0x040005B9, 0x160005B9,
- 0x180005B9, 0x1A0005B9, 0x200005B9, 0x220005B9, 0x240005B9, 0x260005B9, 0x040001AB, 0x080001AB,
- 0x0C0001AB, 0x100001AB, 0x140001AB, 0x180001AB, 0x1C0001AB, 0x200001AB, 0x240001AB, 0x280001AB,
- 0x0C00006B, 0x1000006B, 0x1400006B, 0x1800006B, 0x1C00006B, 0x2000006B, 0x2400006B, 0x2800006B,
- 0x005C001C, 0x0001A81C, 0x1A0001AB, 0x1E0001AB, 0x220001AB, 0x260001AB, 0x2A0001AB, 0x160001AB,
- 0x020005B6, 0x100005B6, 0x280005B9, 0x2C0005B9, 0x300005B9, 0x0001B002, 0x020005BD, 0x0600000A,
- 0x0A00000A, 0x0E00000A, 0x1200000A, 0x1600000A, 0x3E00000A, 0x0C00000B, 0x1000000B, 0x1400000B,
- 0x2E0001AB, 0x320001AB, 0x360001AB, 0x3A0001AB, 0x3E0001AB, 0x420001AB, 0x460001AB, 0x640001AB,
- 0x680001AB, 0x6A0001AB, 0x6E0001AB, 0x720001AB, 0x760001AB, 0x7A0001AB, 0x00000013, 0x00000012,
- 0x0000005A, 0x000001B0, 0x7C00000B, 0x8000000B, 0x8200000B, 0x8600000B, 0x8C00000B, 0x6000000B,
- 0x9200000B, 0x9600000B, 0x9800000B, 0x9C00000B, 0xA000000B, 0xA400000B, 0x4A0001AA, 0x040001AA,
- 0x520001AA, 0x600001AA, 0x0C0001AA, 0x5E0001AA, 0x160001AA, 0x4C0001AA, 0x4E0001AA, 0x9E0001AA,
- 0x060001AA, 0x8800000A, 0x2A0001AA, 0x005E0001, 0x0001B802, 0x0400002B, 0x0800002B, 0x1600002B,
- 0x4C00002B, 0x00002802, 0x00003002, 0x000A0001, 0x00120001, 0x00003802, 0x001A0001, 0x001C0001,
- 0x001E0001, 0x00240001, 0x00005002, 0x00006002, 0x002A0001, 0x002E0001, 0x00300001, 0x00006802,
- 0x00008002, 0x00008802, 0x00009002, 0x0000A002, 0x0000B002, 0x0000D906, 0x00011002, 0x00011802,
- 0x00014002, 0x040000C9, 0x080000C9, 0x0C0000C9, 0x100000C9, 0x140000C9, 0x04000009, 0x08000009,
- 0x0C000009, 0x10000009, 0x14000009, 0x2200000B, 0x4C00000B, 0x2A00000B, 0x5000000B, 0x5400000B,
- 0x5800000B, 0x2600000A, 0x00015002, 0x00019002, 0x00000030, 0x000001BE, 0x0000014E, 0x00000210,
- 0x000001F0, 0x00580001, 0x065A000A, 0x0A5A000A, 0x0E5A000A, 0x125A000A, 0x165A000A, 0x1A5A000A,
- 0x4C5A000A, 0x4E5A000A, 0x0601A00A, 0x0A01A00A, 0x0E01A00A, 0x1201A00A, 0x1601A00A, 0x1A01A00A,
- 0x4C01A00A, 0x4E01A00A, 0x6000000A, 0x0000000A, 0x120005B9, 0x140005B9, 0x1C0005B9, 0x1E0005B9,
- 0x1600006B, 0x1A00006B, 0x1E00006B, 0x2200006B, 0x2600006B, 0x2A00006B, 0x0E0005B5, 0x040005B5,
- 0x2A0005B9, 0x2E0005B9, 0x0200000A, 0x0400000A, 0x0800000A, 0x0C00000A, 0x1000000A, 0x1400000A,
- 0x2A00000A, 0x2C0001AB, 0x300001AB, 0x340001AB, 0x380001AB, 0x3C0001AB, 0x400001AB, 0x440001AB,
- 0x480001AB, 0x620001AB, 0x660001AB, 0x500001AB, 0x6C0001AB, 0x700001AB, 0x740001AB, 0x780001AB,
- 0x520001AB, 0x7E00000B, 0x5E00000B, 0x8400000B, 0x8800000B, 0x8A00000B, 0x8E00000B, 0x9000000B,
- 0x9400000B, 0x9A00000B, 0x9E00000B, 0xA200000B, 0xA600000B, 0x5C0001AA, 0x3E0001AA, 0x7E0001AA,
- 0x0600002B, 0x0A00002B, 0x2A00002B, 0x4E00002B, 0x00000019
- ];
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement