Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Unicode General Category lookup table, built once at import time.
# Maps abbreviation -> (sortable long name, long name, description).
# One-letter keys (and 'LC') are groups; their description lists the members.
_GENERAL_CATEGORIES = {
    'C': ('Other', 'Other',
          'Cc | Cf | Cn | Co | Cs'),
    'Cc': ('Control', 'Control',
           'a C0 or C1 control code'),  # a.k.a. cntrl
    'Cf': ('Format', 'Format',
           'a format control character'),
    'Cn': ('Unassigned', 'Unassigned',
           'a reserved unassigned code point or a noncharacter'),
    'Co': ('Private Use', 'Private_Use',
           'a private-use character'),
    'Cs': ('Surrogate', 'Surrogate',
           'a surrogate code point'),
    'L': ('Letter', 'Letter',
          'Ll | Lm | Lo | Lt | Lu'),
    'LC': ('Letter, Cased', 'Cased_Letter',
           'Ll | Lt | Lu'),
    'Ll': ('Letter, Lowercase', 'Lowercase_Letter',
           'a lowercase letter'),
    'Lm': ('Letter, Modifier', 'Modifier_Letter',
           'a modifier letter'),
    'Lo': ('Letter, Other', 'Other_Letter',
           'other letters, including syllables and ideographs'),
    'Lt': ('Letter, Titlecase', 'Titlecase_Letter',
           'a digraphic character, with first part uppercase'),
    'Lu': ('Letter, Uppercase', 'Uppercase_Letter',
           'an uppercase letter'),
    'M': ('Mark', 'Mark',
          'Mc | Me | Mn'),  # a.k.a. Combining_Mark
    'Mc': ('Mark, Spacing', 'Spacing_Mark',
           'a spacing combining mark (positive advance width)'),
    'Me': ('Mark, Enclosing', 'Enclosing_Mark',
           'an enclosing combining mark'),
    'Mn': ('Mark, Nonspacing', 'Nonspacing_Mark',
           'a nonspacing combining mark (zero advance width)'),
    'N': ('Number', 'Number',
          'Nd | Nl | No'),
    'Nd': ('Number, Decimal', 'Decimal_Number',
           'a decimal digit'),  # a.k.a. digit
    'Nl': ('Number, Letter', 'Letter_Number',
           'a letterlike numeric character'),
    'No': ('Number, Other', 'Other_Number',
           'a numeric character of other type'),
    'P': ('Punctuation', 'Punctuation',
          'Pc | Pd | Pe | Pf | Pi | Po | Ps'),  # a.k.a. punct
    'Pc': ('Punctuation, Connector', 'Connector_Punctuation',
           'a connecting punctuation mark, like a tie'),
    'Pd': ('Punctuation, Dash', 'Dash_Punctuation',
           'a dash or hyphen punctuation mark'),
    'Pe': ('Punctuation, Close', 'Close_Punctuation',
           'a closing punctuation mark (of a pair)'),
    'Pf': ('Punctuation, Final', 'Final_Punctuation',
           'a final quotation mark'),
    'Pi': ('Punctuation, Initial', 'Initial_Punctuation',
           'an initial quotation mark'),
    'Po': ('Punctuation, Other', 'Other_Punctuation',
           'a punctuation mark of other type'),
    'Ps': ('Punctuation, Open', 'Open_Punctuation',
           'an opening punctuation mark (of a pair)'),
    'S': ('Symbol', 'Symbol',
          'Sc | Sk | Sm | So'),
    'Sc': ('Symbol, Currency', 'Currency_Symbol',
           'a currency sign'),
    'Sk': ('Symbol, Modifier', 'Modifier_Symbol',
           'a non-letterlike modifier symbol'),
    'Sm': ('Symbol, Math', 'Math_Symbol',
           'a symbol of mathematical use'),
    'So': ('Symbol, Other', 'Other_Symbol',
           'a symbol of other type'),
    'Z': ('Separator', 'Separator',
          'Zl | Zp | Zs'),
    'Zl': ('Separator, Line', 'Line_Separator',
           'U+2028 LINE SEPARATOR only'),
    'Zp': ('Separator, Paragraph', 'Paragraph_Separator',
           'U+2029 PARAGRAPH SEPARATOR only'),
    'Zs': ('Separator, Space', 'Space_Separator',
           'a space character (of various non-zero widths)'),
}


def decodeUnicodeGeneralCategory(abbr, sortable=True, desc=False):
    """Decode a Unicode General Category (gc) abbreviation.

    Every Unicode code point has a set of properties. One property is called
    General Category and its value indicates whether the code point represents
    a letter, numeral, symbol et cetera.

    These values can be an abbreviation (abbr) or written out (long) and can
    have a description (desc). The one-letter abbreviations are groups of
    two-letter general category abbreviations with the same initial letter.
    These groups are never used as a property value for Unicode code points,
    as can be seen in
        http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
    for example.

    Sources for development and maintenance are:
        http://www.unicode.org/reports/tr18/#General_Category_Property
        http://www.unicode.org/reports/tr44/#General_Category_Values
        http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
    Especially the latter is important as it offers the latest definitions.
    Latest update is from Unicode version 6.2.0, 2012-08-14, 16:05:11 GMT [MD]

    :param abbr: Unicode General Category code, e.g. ``'Lu'`` or ``'L'``
    :param sortable: if true (and ``desc`` is false), return the
        alphabetically sortable long name (e.g. ``'Letter, Uppercase'``);
        otherwise return the plain long name (e.g. ``'Uppercase_Letter'``)
    :param desc: if true, return the non-sortable description instead of a
        long name; takes precedence over ``sortable``
    :returns: long name or description for abbreviated Unicode General Category
    :raises ValueError: if ``abbr`` is not a known abbreviation
    """
    if abbr not in _GENERAL_CATEGORIES:
        # ValueError is still caught by callers using ``except Exception``.
        raise ValueError(
            'Unknown general category abbreviation: {0!r}'.format(abbr))

    sortable_name, long_name, description = _GENERAL_CATEGORIES[abbr]
    if desc:
        return description
    if sortable:
        return sortable_name
    return long_name
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement