Advertisement
Guest User

Jeremie Pelletier

a guest
Oct 11th, 2009
729
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
D 73.69 KB | None | 0 0
  1. /**
  2.  Unicode Character Types
  3. */
  4. module std.uni;
  5.  
  6. import core.stdc.stdint;
  7. import core.stdc.stdio;
  8.  
  9. import std.ctype; // this really needs pure functions that return bools
  10.  
  /**
   * Exception carrying a description and the offending code point.
   *
   * The message has the form "<msg>: \uLLLL\uHHHH", where LLLL is the low
   * 16 bits of the code point and HHHH the high 16 bits, both in hex.
   */
  class UCharException : Exception {
      /**
       * Params:
       *  msg = description of the failure
       *  c   = the code point the failure refers to
       */
      this(string msg, dchar c) {
          char[255] buf = void;

          // A D string is not NUL-terminated, so it is passed to the C
          // formatter as an explicit (length, pointer) pair via "%.*s".
          int written = snprintf(buf.ptr, buf.length, "%.*s: \\u%04X\\u%04X",
              cast(int)msg.length, msg.ptr,
              cast(uint)(c & 0xFFFF), cast(uint)((c >> 16) & 0xFFFF));

          // snprintf returns the length it wanted to write (possibly larger
          // than the buffer) or a negative value on error; clamp the slice.
          size_t len = 0;
          if(written > 0) {
              size_t wanted = cast(size_t)written;
              len = wanted < buf.length ? wanted : buf.length - 1;
          }

          super(buf[0 .. len].idup);
      }
  }
  19.  
  /**
   * Tests for a letter: general categories Lu, Ll, Lt, Lm, Lo.
   * Code points up to 0x7F are answered by `isalpha` instead of the tables.
   */
  /*pure*/ bool isUniAlpha(dchar code) {
      if(code <= 0x7F) return isalpha(code) != 0;

      switch(getType(code)) {
      case CharType.UPPERCASE_LETTER, CharType.LOWERCASE_LETTER,
           CharType.TITLECASE_LETTER, CharType.MODIFIER_LETTER,
           CharType.OTHER_LETTER:
          return true;

      default:
          return false;
      }
  }
  36.  
  /**
   * Tests for an uppercase letter (Lu).
   * Code points up to 0x7F are answered by `isupper`.
   */
  /*pure*/ bool isUniUpper(dchar code) {
      return code <= 0x7F
          ? isupper(code) != 0
          : getType(code) == CharType.UPPERCASE_LETTER;
  }
  42.  
  /**
   * Tests for a lowercase letter (Ll).
   * Code points up to 0x7F are answered by `islower`.
   */
  /*pure*/ bool isUniLower(dchar code) {
      return code <= 0x7F
          ? islower(code) != 0
          : getType(code) == CharType.LOWERCASE_LETTER;
  }
  48.  
  /// Tests for a titlecase letter (Lt).
  /*pure*/ bool isUniTitle(dchar code) {
      const CharType kind = getType(code);
      return kind == CharType.TITLECASE_LETTER;
  }
  53.  
  /**
   * Tests for a letter or decimal digit: Lu, Ll, Lt, Lm, Lo, Nd.
   * Code points up to 0x7F are answered by `isalnum`.
   */
  /*pure*/ bool isUniAlphaNum(dchar code) {
      if(code <= 0x7F) return isalnum(code) != 0;

      switch(getType(code)) {
      case CharType.UPPERCASE_LETTER, CharType.LOWERCASE_LETTER,
           CharType.TITLECASE_LETTER, CharType.MODIFIER_LETTER,
           CharType.OTHER_LETTER,
           CharType.DECIMAL_DIGIT_NUMBER:
          return true;

      default:
          return false;
      }
  }
  70.  
  /**
   * Tests for any numeric character: Nd, Nl, No.
   * Code points up to 0x7F are answered by `isdigit`.
   */
  /*pure*/ bool isUniNumber(dchar code) {
      if(code <= 0x7F) return isdigit(code) != 0;

      const CharType kind = getType(code);
      return kind == CharType.DECIMAL_DIGIT_NUMBER
          || kind == CharType.LETTER_NUMBER
          || kind == CharType.OTHER_NUMBER;
  }
  84.  
  /**
   * Tests for a decimal digit (Nd).
   * Code points up to 0x7F are answered by `isdigit`.
   */
  /*pure*/ bool isUniDigit(dchar code) {
      return code <= 0x7F
          ? isdigit(code) != 0
          : getType(code) == CharType.DECIMAL_DIGIT_NUMBER;
  }
  90.  
  /**
   * Tests for a separator: Zs, Zl, Zp.
   * Code points up to 0x7F are answered by `isspace` rather than by category.
   */
  /*pure*/ bool isUniSeparator(dchar code) {
      if(code <= 0x7F) return isspace(code) != 0;

      const CharType kind = getType(code);
      return kind == CharType.SPACE_SEPARATOR
          || kind == CharType.LINE_SEPARATOR
          || kind == CharType.PARAGRAPH_SEPARATOR;
  }
  104.  
  /**
   * Tests for a space separator (Zs).
   * Code points up to 0x7F are answered by `isspace` rather than by category.
   */
  /*pure*/ bool isUniSpace(dchar code) {
      return code <= 0x7F
          ? isspace(code) != 0
          : getType(code) == CharType.SPACE_SEPARATOR;
  }
  110.  
  /// Tests for a line separator (Zl).
  /*pure*/ bool isUniLine(dchar code) {
      const CharType kind = getType(code);
      return kind == CharType.LINE_SEPARATOR;
  }
  115.  
  /// Tests for a paragraph separator (Zp).
  /*pure*/ bool isUniParagraph(dchar code) {
      const CharType kind = getType(code);
      return kind == CharType.PARAGRAPH_SEPARATOR;
  }
  120.  
  /// Tests for a mark: Mn, Mc, Me.
  /*pure*/ bool isUniMark(dchar code) {
      const CharType kind = getType(code);
      return kind == CharType.NONSPACING_MARK
          || kind == CharType.COMBINING_SPACING_MARK
          || kind == CharType.ENCLOSING_MARK;
  }
  133.  
  /// Tests whether the character type of `code` is NONSPACING_MARK.
  /*pure*/ bool isUniNonspacing(dchar code) {
      const CharType kind = getType(code);
      return kind == CharType.NONSPACING_MARK;
  }
  137.  
  /**
   * Tests for punctuation: Pc, Pd, Ps, Pe, Pi, Pf, Po.
   * Code points up to 0x7F are answered by `ispunct` rather than by category.
   */
  /*pure*/ bool isUniPunctuation(dchar code) {
      if(code <= 0x7F) return ispunct(code) != 0;

      switch(getType(code)) {
      case CharType.CONNECTOR_PUNCTUATION, CharType.DASH_PUNCTUATION,
           CharType.OPEN_PUNCTUATION, CharType.CLOSE_PUNCTUATION,
           CharType.INITIAL_QUOTE_PUNCTUATION,
           CharType.FINAL_QUOTE_PUNCTUATION,
           CharType.OTHER_PUNCTUATION:
          return true;

      default:
          return false;
      }
  }
  155.  
  /// Tests for a symbol: Sm, Sc, Sk, So.
  /*pure*/ bool isUniSymbol(dchar code) {
      const CharType kind = getType(code);
      return kind == CharType.MATH_SYMBOL
          || kind == CharType.CURRENCY_SYMBOL
          || kind == CharType.MODIFIER_SYMBOL
          || kind == CharType.OTHER_SYMBOL;
  }
  169.  
  /**
   * Tests for the "other" categories: Cc, Cf, Cs, Co, Cn.
   *
   * Unassigned code points (Cn) are reported by getType as
   * CharType.UNASSIGNED, so that value is matched here in addition to
   * CharType.NOT_ASSIGNED, which is kept for compatibility.
   */
  /*pure*/ bool isUniOther(dchar code) {
      switch(getType(code)) {
      case CharType.CONTROL:
      case CharType.FORMAT:
      case CharType.SURROGATE:
      case CharType.PRIVATE_USE:
      case CharType.UNASSIGNED:   // what getType actually returns for Cn
      case CharType.NOT_ASSIGNED:
          return true;

      default:
          return false;
      }
  }
  184.  
  /**
   * Tests for a control character (Cc).
   * Code points up to 0x7F are answered by `iscntrl`.
   */
  /*pure*/ bool isUniControl(dchar code) {
      return code <= 0x7F
          ? iscntrl(code) != 0
          : getType(code) == CharType.CONTROL;
  }
  190.  
  /// Tests for a format character (Cf).
  /*pure*/ bool isUniFormat(dchar code) {
      const CharType kind = getType(code);
      return kind == CharType.FORMAT;
  }
  195.  
  /// Tests for a surrogate code point (Cs).
  /*pure*/ bool isUniSurrogate(dchar code) {
      const CharType kind = getType(code);
      return kind == CharType.SURROGATE;
  }
  200.  
  /// Tests for a private-use code point (Co).
  /*pure*/ bool isUniPrivateUse(dchar code) {
      const CharType kind = getType(code);
      return kind == CharType.PRIVATE_USE;
  }
  205.  
  /**
   * Tests for a graphic character: any letter (Lu, Ll, Lt, Lm, Lo), number
   * (Nd, Nl, No), mark (Mn, Mc, Me), punctuation (Pc, Pd, Ps, Pe, Pi, Pf, Po)
   * or symbol (Sm, Sc, Sk, So).
   * Code points up to 0x7F are answered by `isgraph`.
   */
  /*pure*/ bool isUniGraph(dchar code) {
      if(code <= 0x7F) return isgraph(code) != 0;

      switch(getType(code)) {
      // letters
      case CharType.UPPERCASE_LETTER, CharType.LOWERCASE_LETTER,
           CharType.TITLECASE_LETTER, CharType.MODIFIER_LETTER,
           CharType.OTHER_LETTER:
      // numbers
      case CharType.DECIMAL_DIGIT_NUMBER, CharType.LETTER_NUMBER,
           CharType.OTHER_NUMBER:
      // marks
      case CharType.NONSPACING_MARK, CharType.COMBINING_SPACING_MARK,
           CharType.ENCLOSING_MARK:
      // punctuation
      case CharType.CONNECTOR_PUNCTUATION, CharType.DASH_PUNCTUATION,
           CharType.OPEN_PUNCTUATION, CharType.CLOSE_PUNCTUATION,
           CharType.INITIAL_QUOTE_PUNCTUATION,
           CharType.FINAL_QUOTE_PUNCTUATION,
           CharType.OTHER_PUNCTUATION:
      // symbols
      case CharType.MATH_SYMBOL, CharType.CURRENCY_SYMBOL,
           CharType.MODIFIER_SYMBOL, CharType.OTHER_SYMBOL:
          return true;

      default:
          return false;
      }
  }
  239.  
  /**
   * Tests for a printable character: everything isUniGraph's table path
   * accepts (letters, numbers, marks, punctuation, symbols) plus the space
   * separator category Zs.
   * Code points up to 0x7F are answered by `isprint`.
   */
  /*pure*/ bool isUniPrint(dchar code) {
      if(code <= 0x7F) return isprint(code) != 0;

      switch(getType(code)) {
      // letters
      case CharType.UPPERCASE_LETTER, CharType.LOWERCASE_LETTER,
           CharType.TITLECASE_LETTER, CharType.MODIFIER_LETTER,
           CharType.OTHER_LETTER:
      // numbers
      case CharType.DECIMAL_DIGIT_NUMBER, CharType.LETTER_NUMBER,
           CharType.OTHER_NUMBER:
      // the only separator that counts as printable
      case CharType.SPACE_SEPARATOR:
      // marks
      case CharType.NONSPACING_MARK, CharType.COMBINING_SPACING_MARK,
           CharType.ENCLOSING_MARK:
      // punctuation
      case CharType.CONNECTOR_PUNCTUATION, CharType.DASH_PUNCTUATION,
           CharType.OPEN_PUNCTUATION, CharType.CLOSE_PUNCTUATION,
           CharType.INITIAL_QUOTE_PUNCTUATION,
           CharType.FINAL_QUOTE_PUNCTUATION,
           CharType.OTHER_PUNCTUATION:
      // symbols
      case CharType.MATH_SYMBOL, CharType.CURRENCY_SYMBOL,
           CharType.MODIFIER_SYMBOL, CharType.OTHER_SYMBOL:
          return true;

      default:
          return false;
      }
  }
  274.  
  /// Tests whether the directionality of `code` is WHITESPACE (WS).
  /*pure*/ bool isUniDirWhiteSpace(dchar code) {
      const Direction dir = getDirectionality(code);
      return dir == Direction.WHITESPACE;
  }
  279.  
  /// Tests whether the directionality of `code` is LEFT_TO_RIGHT (L).
  /// Despite the "LTL" in the name, this is the left-to-right test.
  /*pure*/ bool isUniDirLTL(dchar code) {
      const Direction dir = getDirectionality(code);
      return dir == Direction.LEFT_TO_RIGHT;
  }
  284.  
  /// Tests whether the directionality of `code` is RIGHT_TO_LEFT (R).
  /// RIGHT_TO_LEFT_ARABIC is not matched by this test.
  /*pure*/ bool isUniDirRTL(dchar code) {
      const Direction dir = getDirectionality(code);
      return dir == Direction.RIGHT_TO_LEFT;
  }
  289.  
  /**
   * Tests for a strong bidirectional type: L, R or AL.
   *
   * RIGHT_TO_LEFT_ARABIC (AL) is included because the Unicode Bidirectional
   * Algorithm classifies it as strong alongside L and R.
   */
  /*pure*/ bool isUniDirStrong(dchar code) {
      switch(getDirectionality(code)) {
      case Direction.LEFT_TO_RIGHT:
      case Direction.RIGHT_TO_LEFT:
      case Direction.RIGHT_TO_LEFT_ARABIC:
          return true;

      default:
          return false;
      }
  }
  301.  
  /**
   * Tests for a weak bidirectional type: EN, ES, ET, AN, CS, NSM or BN.
   *
   * NONSPACING_MARK (NSM) and BOUNDARY_NEUTRAL (BN) are included because
   * the Unicode Bidirectional Algorithm lists them with the weak types.
   */
  /*pure*/ bool isUniDirWeak(dchar code) {
      switch(getDirectionality(code)) {
      case Direction.EUROPEAN_NUMBER:
      case Direction.EUROPEAN_NUMBER_SEPARATOR:
      case Direction.EUROPEAN_NUMBER_TERMINATOR:
      case Direction.ARABIC_NUMBER:
      case Direction.COMMON_NUMBER_SEPARATOR:
      case Direction.NONSPACING_MARK:
      case Direction.BOUNDARY_NEUTRAL:
          return true;

      default:
          return false;
      }
  }
  316.  
  /// Tests for a neutral bidirectional type: B, S, WS or ON.
  /*pure*/ bool isUniDirNeutral(dchar code) {
      const Direction dir = getDirectionality(code);
      return dir == Direction.BLOCK_SEPARATOR
          || dir == Direction.SEGMENT_SEPARATOR
          || dir == Direction.WHITESPACE
          || dir == Direction.OTHER_NEUTRALS;
  }
  330.  
  /// Tests for a bidirectional separator type: B or S.
  /*pure*/ bool isUniDirSeparator(dchar code) {
      const Direction dir = getDirectionality(code);
      return dir == Direction.BLOCK_SEPARATOR
          || dir == Direction.SEGMENT_SEPARATOR;
  }
  342.  
  /// Tests whether the decomposition type of `code` is NOBREAK.
  /*pure*/ bool isUniNonBreaking(dchar code) {
      const DecompositionType kind = getDecompositionType(code);
      return kind == DecompositionType.NOBREAK;
  }
  347.  
  /// Tests the "mirrored" flag bit in the packed data of `code`.
  /*pure*/ bool isUniMirroring(dchar code) {
      const flag = (getPackedData(code) >> MIRRORED_SHIFT) & MIRRORED_MASK;
      return flag != 0;
  }
  351.  
  /**
   * Maps `code` towards lowercase (Lu -> Ll).
   *
   * The packed data holds an index into LCDIFF; the selected difference is
   * added to the code point. Index 0 selects a difference of 0.
   */
  /*pure*/ dchar toUniLower(dchar code) {
      const slot = (getPackedData(code) >> TOLOWER_SHIFT) & TOLOWER_MASK;
      return code + LCDIFF[slot];
  }
  356.  
  /**
   * Maps `code` towards uppercase (Ll -> Lu).
   *
   * The packed data holds an index into UCDIFF; the selected difference is
   * added to the code point. Index 0 selects a difference of 0.
   */
  /*pure*/ dchar toUniUpper(dchar code) {
      const slot = (getPackedData(code) >> TOUPPER_SHIFT) & TOUPPER_MASK;
      return code + UCDIFF[slot];
  }
  361.  
  /**
   * Maps `c` towards titlecase (L& -> Lt).
   *
   * The packed data holds an index into TCDIFF. A table value equal to
   * TOTITLE_MASK acts as a marker meaning "use the uppercase mapping";
   * any other value is added to the code point.
   */
  /*pure*/ dchar toUniTitle(dchar c) {
      const slot = (getPackedData(c) >> TOTITLE_SHIFT) & TOTITLE_MASK;
      int32_t delta = TCDIFF[slot];

      if(delta == TOTITLE_MASK) return toUniUpper(c);

      return c + delta;
  }
  368.  
  /**
   * Returns the mirrored counterpart of `c`.
   *
   * Characters without the mirrored flag are returned unchanged; otherwise
   * the packed data selects a difference from MIRROR_DIFF to add.
   */
  /*pure*/ dchar toUniMirror(dchar c) {
      if(isUniMirroring(c)) {
          const slot = (getPackedData(c) >> MIRROR_SHIFT) & MIRROR_MASK;
          return c + MIRROR_DIFF[slot];
      }

      return c;
  }
  374.  
  375.  
  376. // The following is taken from AndroidUnicode.h, AndroidUnicode.cpp and
  377. // characterData.h from the Android project; after hunting high and low for
  378. // documentation about Unicode on Google and not wanting to go for heavyweight
  379. // implementations like ICU I found this gem. I ported it to D to implement
  380. // the public Unicode api. Other implementations of interest are in V8 and
  381. // Mozilla, some benchmarks would be useful to determine the one which performs
  382. // best.
  383.  
  384. /*
  385.  * Copyright (C) 2008 The Android Open Source Project
  386.  *
  387.  * Licensed under the Apache License, Version 2.0 (the "License");
  388.  * you may not use this file except in compliance with the License.
  389.  * You may obtain a copy of the License at
  390.  *
  391.  *      http://www.apache.org/licenses/LICENSE-2.0
  392.  *
  393.  * Unless required by applicable law or agreed to in writing, software
  394.  * distributed under the License is distributed on an "AS IS" BASIS,
  395.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  396.  * See the License for the specific language governing permissions and
  397.  * limitations under the License.
  398.  */
  399. private:
  400.  
  /*
   * Bidirectional character types from the Unicode standard. The numeric
   * values map directly to java.lang.Character, so they are spelled out.
   */
  enum Direction {
      UNDEFINED                  = -1,
      LEFT_TO_RIGHT              = 0,
      RIGHT_TO_LEFT              = 1,
      RIGHT_TO_LEFT_ARABIC       = 2,
      EUROPEAN_NUMBER            = 3,
      EUROPEAN_NUMBER_SEPARATOR  = 4,
      EUROPEAN_NUMBER_TERMINATOR = 5,
      ARABIC_NUMBER              = 6,
      COMMON_NUMBER_SEPARATOR    = 7,
      NONSPACING_MARK            = 8,
      BOUNDARY_NEUTRAL           = 9,
      BLOCK_SEPARATOR            = 10,
      SEGMENT_SEPARATOR          = 11,
      WHITESPACE                 = 12,
      OTHER_NEUTRALS             = 13,
      LEFT_TO_RIGHT_EMBEDDING    = 14,
      LEFT_TO_RIGHT_OVERRIDE     = 15,
      RIGHT_TO_LEFT_EMBEDDING    = 16,
      RIGHT_TO_LEFT_OVERRIDE     = 17,
      POP_DIRECTIONAL_FORMAT     = 18
  }
  427.  
  /*
   * General-category character types from the Unicode standard. The numeric
   * values map directly to java.lang.Character, so they are spelled out.
   */
  enum CharType {
      UNASSIGNED                = 0,
      UPPERCASE_LETTER          = 1,
      LOWERCASE_LETTER          = 2,
      TITLECASE_LETTER          = 3,
      MODIFIER_LETTER           = 4,
      OTHER_LETTER              = 5,
      NONSPACING_MARK           = 6,
      ENCLOSING_MARK            = 7,
      COMBINING_SPACING_MARK    = 8,
      DECIMAL_DIGIT_NUMBER      = 9,
      LETTER_NUMBER             = 10,
      OTHER_NUMBER              = 11,
      SPACE_SEPARATOR           = 12,
      LINE_SEPARATOR            = 13,
      PARAGRAPH_SEPARATOR       = 14,
      CONTROL                   = 15,
      FORMAT                    = 16,
      NOT_ASSIGNED              = 17,
      PRIVATE_USE               = 18,
      SURROGATE                 = 19,
      DASH_PUNCTUATION          = 20,
      OPEN_PUNCTUATION          = 21,
      CLOSE_PUNCTUATION         = 22,
      CONNECTOR_PUNCTUATION     = 23,
      OTHER_PUNCTUATION         = 24,
      MATH_SYMBOL               = 25,
      CURRENCY_SYMBOL           = 26,
      MODIFIER_SYMBOL           = 27,
      OTHER_SYMBOL              = 28,
      INITIAL_QUOTE_PUNCTUATION = 29,
      FINAL_QUOTE_PUNCTUATION   = 30
  }
  465.  
  /*
   * Decomposition types as described by the Unicode standard. The numeric
   * values map to the same values in dchar.h in ICU, so they are spelled out.
   */
  enum DecompositionType {
      NONE      = 0,
      CANONICAL = 1,
      COMPAT    = 2,
      CIRCLE    = 3,
      FINAL     = 4,
      FONT      = 5,
      FRACTION  = 6,
      INITIAL   = 7,
      ISOLATED  = 8,
      MEDIAL    = 9,
      NARROW    = 10,
      NOBREAK   = 11,
      SMALL     = 12,
      SQUARE    = 13,
      SUB       = 14,
      SUPER     = 15,
      VERTICAL  = 16,
      WIDE      = 17
  }
  490.  
  // Radix limits for digit conversion, and the position (shift) and width
  // (mask) of each field in a packed-data word or a character-value word.
  enum {
      MIN_RADIX           = 2,
      MAX_RADIX           = 36,

      // packed data: 5-bit character type in bits 0-4
      TYPE_SHIFT          = 0,
      TYPE_MASK           = 0x1F,

      // packed data: 5-bit directionality in bits 5-9
      DIRECTION_SHIFT     = 5,
      DIRECTION_MASK      = 0x1F,

      // packed data: 1-bit mirrored flag in bit 10
      MIRRORED_SHIFT      = 10,
      MIRRORED_MASK       = 0x01,

      // packed data: 6-bit uppercase-difference index in bits 11-16
      TOUPPER_SHIFT       = 11,
      TOUPPER_MASK        = 0x3F,

      // packed data: 6-bit lowercase-difference index in bits 17-22
      TOLOWER_SHIFT       = 17,
      TOLOWER_MASK        = 0x3F,

      // packed data: 2-bit titlecase-difference index in bits 23-24
      TOTITLE_SHIFT       = 23,
      TOTITLE_MASK        = 0x03,

      // packed data: the mirror-difference index (5 bits) and the
      // numeric-value index (7 bits) both start at bit 25
      MIRROR_SHIFT        = 25,
      MIRROR_MASK         = 0x1F,

      NUMERIC_SHIFT       = 25,
      NUMERIC_MASK        = 0x7F,

      // character value (16 bits): decomposition type in the top 5 bits
      DECOMPOSITION_SHIFT = 11,
      DECOMPOSITION_MASK  = 0x1F
  }
  522.  
  /**
   * Looks up the packed property word for a character.
   *
   * Layout of the 32-bit word (copied from the java.lang.Character
   * implementation), least significant bit first:
   *
   *   bits  0-4   character type             (31 types)
   *   bits  5-9   directionality             (18 directionalities)
   *   bit   10    mirrored flag              (2 values)
   *   bits 11-16  index of uppercase diff    (56 toupper diffs)
   *   bits 17-22  index of lowercase diff    (48 tolower diffs)
   *   bits 23-24  index of titlecase diff    (4 totitlecase diffs)
   *   bits 25-31  index of numeric value     (84 numeric values)
   *   bits 25-29  index of mirror char diff  (24 mirror char diffs),
   *               sharing its position with the numeric index
   *
   * Params:
   *  c = the Unicode character
   * Returns: the packed data for the character
   */
  /*pure*/ dchar getPackedData(dchar c) {
      // The character value is 16 bits wide: the top 5 bits are the
      // decomposition type, the low 11 bits are the PACKED_DATA index.
      const index = findCharacterValue(c) & 0x7FF;
      return PACKED_DATA[index];
  }
  546.  
  /**
   * Gets the character type.
   *
   * Params:
   *  c = the Unicode character
   * Returns: the type stored in the packed data, or CharType.UNASSIGNED
   *          without any lookup when `c` is 0x10FFFF or greater.
   */
  /*pure*/ CharType getType(dchar c) {
      if(c >= 0x10FFFF) return CharType.UNASSIGNED;

      const field = (getPackedData(c) >> TYPE_SHIFT) & TYPE_MASK;
      return cast(CharType)field;
  }
  558.  
  /**
   * Gets the character's decomposition type.
   *
   * Params:
   *  c = the Unicode character
   * Returns: the decomposition type held in the top 5 bits of the 16-bit
   *          character value (DecompositionType.NONE is the zero value).
   */
  /*pure*/ DecompositionType getDecompositionType(dchar c) {
      // The low bits of the character value are the PACKED_DATA index and
      // are shifted away here.
      const field = (findCharacterValue(c) >> DECOMPOSITION_SHIFT)
          & DECOMPOSITION_MASK;
      return cast(DecompositionType)field;
  }
  571.  
  /**
   * Converts a digit character to its value in the given radix.
   *
   * Only '0'-'9', 'a'-'z' and 'A'-'Z' are recognized; letters count from 10
   * regardless of case.
   *
   * Params:
   *  c     = the character to convert
   *  radix = the number base, MIN_RADIX to MAX_RADIX inclusive
   * Returns: the digit value, or -1 if the radix is out of range, the
   *          character is not recognized, or its value is not below `radix`.
   */
  /*pure*/ int getDigitValue(dchar c, int radix) {
      if(radix < MIN_RADIX || radix > MAX_RADIX) return -1;

      int value;

      if(c >= '0' && c <= '9')
          value = c - '0';
      else if(c >= 'a' && c <= 'z')
          value = c - 'a' + 10;
      else if(c >= 'A' && c <= 'Z')
          value = c - 'A' + 10;
      else
          return -1;

      if(value < radix) return value;

      return -1;
  }
  586.  
  /**
   * Gets the numeric value recorded for a character.
   *
   * Mirrored characters yield -1 without consulting NUMERICS: their
   * mirror-difference index starts at the same bit as the numeric index.
   *
   * Params:
   *  c = the Unicode character
   * Returns: the NUMERICS entry selected by the packed data, or -1 for a
   *          mirrored character.
   */
  /*pure*/ int getNumericValue(dchar c) {
      if(isUniMirroring(c)) return -1;

      const slot = (getPackedData(c) >> NUMERIC_SHIFT) & NUMERIC_MASK;
      return NUMERICS[slot];
  }
  592.  
  /**
   * Gets the directionality of a character.
   *
   * Params:
   *  c = the Unicode character
   * Returns: the directionality field of the packed data, or
   *          Direction.UNDEFINED when the packed data is zero or the field
   *          has all of its bits set.
   */
  /*pure*/ Direction getDirectionality(dchar c) {
      uint32_t packed = getPackedData(c);

      if(packed == 0) return Direction.UNDEFINED;

      Direction dir = cast(Direction)((packed >> DIRECTION_SHIFT) & DIRECTION_MASK);

      if(dir == DIRECTION_MASK) return Direction.UNDEFINED;

      return dir;
  }
  602.  
  /**
   * Finds the 16-bit character value (decomposition type and PACKED_DATA
   * index) for a code point.
   *
   * Params:
   *  c = the Unicode character
   * Returns: the low 16 bits of the matching table entry, LATIN1_DATA[c]
   *          for code points up to 0xFF, or 0 when the selected table is
   *          empty.
   * Throws: UCharException if `c` is above 0x10FFFF or the entry reached by
   *         the search is greater than the search key.
   */
  /*pure*/ ushort findCharacterValue(dchar c) {
      if(c > 0x10FFFF) throw new UCharException("invalid Unicode codepoint", c);

      // Latin-1 is a direct table lookup.
      if(c <= 0xFF) return LATIN1_DATA[c];

      // Move the lowest bit to bit 20: the tables keep even and odd
      // code points apart.
      dchar key = (c >> 1) | ((c & 1) << 20);

      // The bits above the low 16 select the table.
      const Range bucket = FULL_DATA[key >> 16];
      const uint[] entries = bucket.array;

      // Entries hold the code point in their high 16 bits. Lifting the key
      // and filling its low half with ones lets the loop compare whole
      // entries without shifting each one down.
      key <<= 16;
      key |= 0xFFFF;

      int hi = bucket.length - 1;
      int lo = 0;

      if(hi < 0) return 0;

      // Binary search for the entry that is not greater than the key.
      // NOTE(review): hi starts on the last entry and is never probed, so
      // with more than one entry the last one is never selected; presumably
      // the tables account for this, confirm against the table data.
      while(lo < hi - 1) {
          int mid = (hi + lo) >> 1;

          if(entries[mid] > key)
              hi = mid;
          else
              lo = mid;
      }

      if(entries[lo] > key) throw new UCharException("a suitable range was not found", c);

      // The low 16 bits carry the decomposition type and PACKED_DATA index.
      return cast(ushort)(entries[lo] & 0xFFFF);
  }
  640.  
  /// One search table: a slice of entries together with its entry count.
  struct Range {
      /// Number of entries; the search uses `length - 1` as its top index.
      int length;

      /// The entries themselves.
      uint[] array;
  }
  646.  
  // Character values for the Latin-1 range, indexed directly by code point
  // (256 entries). Each value has the decomposition type in its top 5 bits
  // and the PACKED_DATA index in the remaining bits. The trailing comment on
  // each row is the code point of the row's first entry.
  immutable ushort[] LATIN1_DATA = [
      0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, // 0x00
      0x0001, 0x0002, 0x0003, 0x0002, 0x0004, 0x0003, 0x0001, 0x0001, // 0x08
      0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, // 0x10
      0x0001, 0x0001, 0x0001, 0x0001, 0x0003, 0x0003, 0x0003, 0x0002, // 0x18
      0x0005, 0x0006, 0x0006, 0x0007, 0x0008, 0x0007, 0x0006, 0x0006, // 0x20
      0x0009, 0x000A, 0x0006, 0x000B, 0x000C, 0x000D, 0x000C, 0x000C, // 0x28
      0x000E, 0x000F, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, // 0x30
      0x0016, 0x0017, 0x000C, 0x0006, 0x0018, 0x0019, 0x001A, 0x0006, // 0x38
      0x0006, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, 0x0020, 0x0021, // 0x40
      0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, // 0x48
      0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, 0x0030, 0x0031, // 0x50
      0x0032, 0x0033, 0x0034, 0x0035, 0x0006, 0x0036, 0x0037, 0x0038, // 0x58
      0x0037, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, // 0x60
      0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // 0x68
      0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, // 0x70
      0x0050, 0x0051, 0x0052, 0x0035, 0x0019, 0x0036, 0x0019, 0x0001, // 0x78
      0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0003, 0x0001, 0x0001, // 0x80
      0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, // 0x88
      0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, // 0x90
      0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, // 0x98
      0x5853, 0x0006, 0x0008, 0x0008, 0x0008, 0x0008, 0x0054, 0x0054, // 0xA0
      0x1037, 0x0054, 0x7855, 0x0056, 0x0019, 0x0057, 0x0054, 0x1037, // 0xA8
      0x0058, 0x0059, 0x785A, 0x785B, 0x1037, 0x105C, 0x0054, 0x0006, // 0xB0
      0x1037, 0x785D, 0x7855, 0x005E, 0x305F, 0x305F, 0x305F, 0x0006, // 0xB8
      0x0860, 0x0860, 0x0860, 0x0860, 0x0860, 0x0860, 0x0060, 0x0860, // 0xC0
      0x0860, 0x0860, 0x0860, 0x0860, 0x0860, 0x0860, 0x0860, 0x0860, // 0xC8
      0x0060, 0x0860, 0x0860, 0x0860, 0x0860, 0x0860, 0x0860, 0x0019, // 0xD0
      0x0060, 0x0860, 0x0860, 0x0860, 0x0860, 0x0860, 0x0060, 0x0055, // 0xD8
      0x0861, 0x0861, 0x0861, 0x0861, 0x0861, 0x0861, 0x0061, 0x0861, // 0xE0
      0x0861, 0x0861, 0x0861, 0x0861, 0x0861, 0x0861, 0x0861, 0x0861, // 0xE8
      0x0061, 0x0861, 0x0861, 0x0861, 0x0861, 0x0861, 0x0861, 0x0019, // 0xF0
      0x0061, 0x0861, 0x0861, 0x0861, 0x0861, 0x0861, 0x0061, 0x0862  // 0xF8
  ];
  682.  
  // The full set of search tables used for code points above Latin-1.
  //
  // To build the tables, each code point was bit-shifted so that even and
  // odd characters were separated into different arrays; a table's position
  // in this list is the top bits of the shifted code point (the value left
  // after dropping its low 16 bits). Slots with no table are empty.
  //
  // Each table entry is 32 bits: the top 16 bits are the shifted code point
  // and the bottom 16 bits are the character value, whose top 5 bits are
  // the decomposition type and whose remaining bits index PACKED_DATA.
  //
  // The entry count is taken from `.length`, which is the element count
  // whether the table is declared as a static or a dynamic array.
  immutable Range[] FULL_DATA = [
      Range(cast(int)a0.length, a0),
      Range(cast(int)a1.length, a1),
      Range(0, null),
      Range(0, null),
      Range(0, null),
      Range(0, null),
      Range(0, null),
      Range(cast(int)a7.length, a7),
      Range(cast(int)a8.length, a8),
      Range(0, null),
      Range(0, null),
      Range(0, null),
      Range(0, null),
      Range(0, null),
      Range(0, null),
      Range(0, null),
      Range(cast(int)a16.length, a16),
      Range(cast(int)a17.length, a17),
      Range(0, null),
      Range(0, null),
      Range(0, null),
      Range(0, null),
      Range(0, null),
      Range(cast(int)a23.length, a23),
      Range(cast(int)a24.length, a24),
      Range(0, null),
      Range(0, null),
      Range(0, null),
      Range(0, null),
      Range(0, null),
      Range(0, null),
      Range(0, null)
  ];
  725.  
  // Uppercase differences (56 entries). toUniUpper adds the entry selected
  // by the packed data to the code point; entry 0 is "no change".
  immutable short[] UCDIFF = [
      0, -32, 743, 121, -1, -232, -300, 97,
      163, 130, 56, -2, -79, -210, -206, -205,
      -202, -203, -207, -209, -211, -213, -214, -218,
      -217, -219, -83, 84, -38, -37, -31, -64,
      -63, -62, -57, -47, -54, -86, -80, 7,
      -96, -48, -59, 8, 74, 86, 100, 128,
      112, 126, 9, -7205, -16, -26, -7264, -40
  ];
  736.  
  // Lowercase differences (48 entries). toUniLower adds the entry selected
  // by the packed data to the code point; entry 0 is "no change".
  immutable short[] LCDIFF = [
      0, 32, 1, -199, -121, 210, 206, 205,
      79, 202, 203, 207, 211, 209, 213, 214,
      218, 217, 219, 2, -97, -56, -130, -163,
      83, 38, 37, 64, 63, -60, -7, 80,
      48, 7264, -8, -74, -9, -86, -100, -112,
      -128, -126, -7517, -8383, -8262, 16, 26, 40
  ];
  746.  
  // Titlecase differences (4 entries). toUniTitle treats an entry equal to
  // TOTITLE_MASK as "use the uppercase mapping" and adds any other entry to
  // the code point.
  immutable short[] TCDIFF = [3, 1, 0, -1];
  751.  
  // Mirrored-character differences (25 entries). toUniMirror adds the entry
  // selected by the packed data to the code point of a mirrored character.
  immutable short[] MIRROR_DIFF = [
      0, 1, -1, 2, -2,
      16, -16, 3, -3, 2016,
      138, 1824, 2104, 2108, 2106,
      -138, 8, 7, -8, -7,
      -1824, -2016, -2104, -2106, -2108
  ];
  759.  
  // All possible numeric values (84 entries), selected by the numeric index
  // in the packed data.
  immutable int[] NUMERICS = [
      -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
      10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
      20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
      30, 31, 32, 33, 34, 35,
      -2, 100, 1000,
      40, 50, 60, 70, 80, 90, 10000, 500, 5000,
      36, 37, 38, 39, 41, 42, 43, 44, 45, 46, 47, 48, 49,
      200, 300, 400, 600, 700, 800, 900,
      2000, 3000, 4000, 6000, 7000, 8000, 9000,
      20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000
  ];
  774.  
  // All possible packed data values, no duplicates. getPackedData indexes
  // this table with the low 11 bits of a character value. The trailing
  // comment on each row is the index of the row's first entry.
  immutable uint[] PACKED_DATA = [
      0x00000000, 0x0000012F, 0x0000016F, 0x0000014F, 0x0000018F, 0x0000018C, 0x000001B8, 0x000000B8, // 0x000
      0x000000BA, 0x020005B5, 0x040005B6, 0x00000099, 0x000000F8, 0x00000094, 0x02000069, 0x04000069, // 0x008
      0x06000069, 0x08000069, 0x0A000069, 0x0C000069, 0x0E000069, 0x10000069, 0x12000069, 0x14000069, // 0x010
      0x060005B9, 0x000001B9, 0x080005B9, 0x16020001, 0x18020001, 0x1A020001, 0x1C020001, 0x1E020001, // 0x018
      0x20020001, 0x22020001, 0x24020001, 0x26020001, 0x28020001, 0x2A020001, 0x2C020001, 0x2E020001, // 0x020
      0x30020001, 0x32020001, 0x34020001, 0x36020001, 0x38020001, 0x3A020001, 0x3C020001, 0x3E020001, // 0x028
      0x40020001, 0x42020001, 0x44020001, 0x46020001, 0x48020001, 0x060005B5, 0x080005B6, 0x000001BB, // 0x030
      0x000001B7, 0x16000802, 0x18000802, 0x1A000802, 0x1C000802, 0x1E000802, 0x20000802, 0x22000802, // 0x038
      0x24000802, 0x26000802, 0x28000802, 0x2A000802, 0x2C000802, 0x2E000802, 0x30000802, 0x32000802, // 0x040
      0x34000802, 0x36000802, 0x38000802, 0x3A000802, 0x3C000802, 0x3E000802, 0x40000802, 0x42000802, // 0x048
      0x44000802, 0x46000802, 0x48000802, 0x000000EC, 0x000001BC, 0x00000002, 0x0A0005BD, 0x00000130, // 0x050
      0x000000BC, 0x000000B9, 0x0600006B, 0x0800006B, 0x00001002, 0x0400006B, 0x0C0005BE, 0x4A0001AB, // 0x058
      0x00020001, 0x00000802, 0x00001802, 0x00040001, 0x00060001, 0x00002002, 0x00080001, 0x000C0001, // 0x060
      0x000E0001, 0x00100001, 0x00140001, 0x00160001, 0x00180001, 0x00004002, 0x00004802, 0x00200001, // 0x068
      0x00220001, 0x00000005, 0x00A60001, 0x01805802, 0x01042003, 0x00280001, 0x002C0001, 0x00000001, // 0x070
      0x00000000, 0x00007002, 0x00007802, 0x00009802, 0x0000A802, 0x0000B802, 0x0000C002, 0x0000C802, // 0x078
      0x0000D002, 0x00000004, 0x000001A4, 0x00000106, 0x00320001, 0x00340001, 0x00360001, 0x00380001, // 0x080
      0x0000E002, 0x0000E802, 0x0000F002, 0x0000F802, 0x00010002, 0x00010802, 0x00012002, 0x00012802, // 0x088
      0x00013802, 0x003A0001, 0x003E0001, 0x00013002, 0x0000001C, 0x00000107, 0x00400001, 0x00000018, // 0x090
      0x00014802, 0x000001B4, 0x00000038, 0x00000025, 0x00000050, 0x00000058, 0x00000045, 0x00000044, // 0x098
      0x020000C9, 0x060000C9, 0x0A0000C9, 0x0E0000C9, 0x120000C9, 0x000000D8, 0x0000005C, 0x00000008, // 0x0A0
      0x02000009, 0x06000009, 0x0A000009, 0x0E000009, 0x12000009, 0x0400000B, 0x0800000B, 0x0000000B, // 0x0A8
      0x1600000B, 0x4E00000B, 0x00000006, 0x4A00000B, 0x000001B5, 0x00420001, 0x0600000B, 0x0A00000B, // 0x0B0
      0x0E00000B, 0x1200000B, 0x3E00000B, 0x5200000B, 0x5600000B, 0x5A00000B, 0x5C00000B, 0x000001B6, // 0x0B8
      0x2400000A, 0x2800000A, 0x00000010, 0x020001AB, 0x060001AB, 0x0A0001AB, 0x0E0001AB, 0x120001AB, // 0x0C0
      0x00000108, 0x00015802, 0x00440001, 0x00016002, 0x00016802, 0x00017002, 0x00017802, 0x00018002, // 0x0C8
      0x00018802, 0x00440003, 0x00460001, 0x00480003, 0x00019802, 0x004A0001, 0x004C0001, 0x004E0001, // 0x0D0
      0x003C0001, 0x00500001, 0x00520001, 0x000001BD, 0x0000018D, 0x000001D0, 0x00000250, 0x00000230, // 0x0D8
      0x040005BE, 0x000000F9, 0x0200006B, 0x0A00006B, 0x0E00006B, 0x1200006B, 0x00540001, 0x00560001, // 0x0E0
      0x000005B9, 0x045A000A, 0x085A000A, 0x0C5A000A, 0x105A000A, 0x145A000A, 0x185A000A, 0x525A000A, // 0x0E8
      0x5E5A000A, 0x0401A00A, 0x0801A00A, 0x0C01A00A, 0x1001A00A, 0x1401A00A, 0x1801A00A, 0x5201A00A, // 0x0F0
      0x5E01A00A, 0x4E00000A, 0x5C00000A, 0x0E0005B9, 0x100005B9, 0x020005B9, 0x040005B9, 0x160005B9, // 0x0F8
      0x180005B9, 0x1A0005B9, 0x200005B9, 0x220005B9, 0x240005B9, 0x260005B9, 0x040001AB, 0x080001AB, // 0x100
      0x0C0001AB, 0x100001AB, 0x140001AB, 0x180001AB, 0x1C0001AB, 0x200001AB, 0x240001AB, 0x280001AB, // 0x108
      0x0C00006B, 0x1000006B, 0x1400006B, 0x1800006B, 0x1C00006B, 0x2000006B, 0x2400006B, 0x2800006B, // 0x110
      0x005C001C, 0x0001A81C, 0x1A0001AB, 0x1E0001AB, 0x220001AB, 0x260001AB, 0x2A0001AB, 0x160001AB, // 0x118
      0x020005B6, 0x100005B6, 0x280005B9, 0x2C0005B9, 0x300005B9, 0x0001B002, 0x020005BD, 0x0600000A, // 0x120
      0x0A00000A, 0x0E00000A, 0x1200000A, 0x1600000A, 0x3E00000A, 0x0C00000B, 0x1000000B, 0x1400000B, // 0x128
      0x2E0001AB, 0x320001AB, 0x360001AB, 0x3A0001AB, 0x3E0001AB, 0x420001AB, 0x460001AB, 0x640001AB, // 0x130
      0x680001AB, 0x6A0001AB, 0x6E0001AB, 0x720001AB, 0x760001AB, 0x7A0001AB, 0x00000013, 0x00000012, // 0x138
      0x0000005A, 0x000001B0, 0x7C00000B, 0x8000000B, 0x8200000B, 0x8600000B, 0x8C00000B, 0x6000000B, // 0x140
      0x9200000B, 0x9600000B, 0x9800000B, 0x9C00000B, 0xA000000B, 0xA400000B, 0x4A0001AA, 0x040001AA, // 0x148
      0x520001AA, 0x600001AA, 0x0C0001AA, 0x5E0001AA, 0x160001AA, 0x4C0001AA, 0x4E0001AA, 0x9E0001AA, // 0x150
      0x060001AA, 0x8800000A, 0x2A0001AA, 0x005E0001, 0x0001B802, 0x0400002B, 0x0800002B, 0x1600002B, // 0x158
      0x4C00002B, 0x00002802, 0x00003002, 0x000A0001, 0x00120001, 0x00003802, 0x001A0001, 0x001C0001, // 0x160
      0x001E0001, 0x00240001, 0x00005002, 0x00006002, 0x002A0001, 0x002E0001, 0x00300001, 0x00006802, // 0x168
      0x00008002, 0x00008802, 0x00009002, 0x0000A002, 0x0000B002, 0x0000D906, 0x00011002, 0x00011802, // 0x170
      0x00014002, 0x040000C9, 0x080000C9, 0x0C0000C9, 0x100000C9, 0x140000C9, 0x04000009, 0x08000009, // 0x178
      0x0C000009, 0x10000009, 0x14000009, 0x2200000B, 0x4C00000B, 0x2A00000B, 0x5000000B, 0x5400000B, // 0x180
      0x5800000B, 0x2600000A, 0x00015002, 0x00019002, 0x00000030, 0x000001BE, 0x0000014E, 0x00000210, // 0x188
      0x000001F0, 0x00580001, 0x065A000A, 0x0A5A000A, 0x0E5A000A, 0x125A000A, 0x165A000A, 0x1A5A000A, // 0x190
      0x4C5A000A, 0x4E5A000A, 0x0601A00A, 0x0A01A00A, 0x0E01A00A, 0x1201A00A, 0x1601A00A, 0x1A01A00A, // 0x198
      0x4C01A00A, 0x4E01A00A, 0x6000000A, 0x0000000A, 0x120005B9, 0x140005B9, 0x1C0005B9, 0x1E0005B9, // 0x1A0
      0x1600006B, 0x1A00006B, 0x1E00006B, 0x2200006B, 0x2600006B, 0x2A00006B, 0x0E0005B5, 0x040005B5, // 0x1A8
      0x2A0005B9, 0x2E0005B9, 0x0200000A, 0x0400000A, 0x0800000A, 0x0C00000A, 0x1000000A, 0x1400000A, // 0x1B0
      0x2A00000A, 0x2C0001AB, 0x300001AB, 0x340001AB, 0x380001AB, 0x3C0001AB, 0x400001AB, 0x440001AB, // 0x1B8
      0x480001AB, 0x620001AB, 0x660001AB, 0x500001AB, 0x6C0001AB, 0x700001AB, 0x740001AB, 0x780001AB, // 0x1C0
      0x520001AB, 0x7E00000B, 0x5E00000B, 0x8400000B, 0x8800000B, 0x8A00000B, 0x8E00000B, 0x9000000B, // 0x1C8
      0x9400000B, 0x9A00000B, 0x9E00000B, 0xA200000B, 0xA600000B, 0x5C0001AA, 0x3E0001AA, 0x7E0001AA, // 0x1D0
      0x0600002B, 0x0A00002B, 0x2A00002B, 0x4E00002B, 0x00000019                                      // 0x1D8
  ];
  838.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement