Advertisement
Guest User

Untitled

a guest
Dec 13th, 2012
126
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.26 KB | None | 0 0
  1. def decodeUnicodeGeneralCategory(abbr, sortable=True, desc=False):
  2.     """Decode Unicode General Category (gc) code.
  3.    
  4.    Every Unicode code point has a set of properties. One property is called
  5.    General Category and its value indicates whether the code point represents
  6.    a letter, numeral, symbol et cetera.
  7.    
  8.    These values can be an abbreviation (abbr) or written out (long) and can
  9.    have a description (desc). The one-letter abbreviations are groups of
  10.    two-letter general categories abbreviations with the same initial letter.
  11.    These groups are never used as a property for for Unicode point properties
  12.    as can be seen in
  13.    http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
  14.    for example.
  15.  
  16.    Sources are for development and maintenance are:
  17.    http://www.unicode.org/reports/tr18/#General_Category_Property
  18.    http://www.unicode.org/reports/tr44/#General_Category_Values
  19.    http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
  20.    Especially the latter is important as it offers the latest definitions.
  21.    Latest update is from Unicode version 6.2.0, 2012-08-14, 16:05:11 GMT [MD]
  22.    
  23.    :param abbr: Unicode General Category code
  24.    :param sortable: output long is alphabetically sortable
  25.    :param desc: output is non-sortable desc instead of long
  26.    :returns: long name or description for abbreviated Unicode General Category
  27.    """
  28.     gc = {
  29.     'C' :('Other',                  'Other',
  30.     'Cc | Cf | Cn | Co | Cs'),
  31.     'Cc':('Control',                'Control',          
  32.     'a C0 or C1 control code'),# a.k.a. cntrl
  33.     'Cf':('Format',                 'Format',          
  34.     'a format control character'),
  35.     'Cn':('Unassigned',             'Unassigned',      
  36.     'a reserved unassigned code point or a noncharacter'),
  37.     'Co':('Private Use',            'Private_Use',      
  38.     'a private-use character'),
  39.     'Cs':('Surrogate',              'Surrogate',        
  40.     'a surrogate code point'),
  41.     'L' :('Letter',                 'Letter',          
  42.     'Ll | Lm | Lo | Lt | Lu'),
  43.     'LC':('Letter, Cased',          'Cased_Letter',    
  44.     'Ll | Lt | Lu'),
  45.     'Ll':('Letter, Lowercase',      'Lowercase_Letter',
  46.     'a lowercase letter'),
  47.     'Lm':('Letter, Modifier',       'Modifier_Letter',  
  48.     'a modifier letter'),
  49.     'Lo':('Letter, Other',          'Other_Letter',    
  50.     'other letters, including syllables and ideographs'),
  51.     'Lt':('Letter, Titlecase',      'Titlecase_Letter',
  52.     'a digraphic character, with first part uppercase'),
  53.     'Lu':('Letter, Uppercase',      'Uppercase_Letter',
  54.     'an uppercase letter'),
  55.     'M' :('Mark',                   'Mark',
  56.     'Mc | Me | Mn '),# a.k.a. Combining_Mark
  57.     'Mc':('Mark, Spacing',          'Spacing_Mark',    
  58.     'a spacing combining mark (positive advance width)'),
  59.     'Me':('Mark, Enclosing',        'Enclosing_Mark',  
  60.     'an enclosing combining mark'),
  61.     'Mn':('Mark, Nonspacing',       'Nonspacing_Mark',  
  62.     'a nonspacing combining mark (zero advance width)'),
  63.     'N' :('Number',                 'Number',          
  64.     'Nd | Nl | No'),
  65.     'Nd':('Number, Decimal',        'Decimal_Number',  
  66.     'a decimal digit'),# a.k.a. digit
  67.     'Nl':('Number, Letter',         'Letter_Number',        
  68.     'a letterlike numeric character'),
  69.     'No':('Number, Other',          'Other_Number',
  70.     'a numeric character of other type'),
  71.     'P' :('Punctuation',            'Punctuation',          
  72.     'Pc | Pd | Pe | Pf | Pi | Po | Ps'),# a.k.a. punct
  73.     'Pc':('Punctuation, Connector', 'Connector_Punctuation',
  74.     'a connecting punctuation mark, like a tie'),
  75.     'Pd':('Punctuation, Dash',      'Dash_Punctuation',    
  76.     'a dash or hyphen punctuation mark'),
  77.     'Pe':('Punctuation, Close',     'Close_Punctuation',    
  78.     'a closing punctuation mark (of a pair)'),
  79.     'Pf':('Punctuation, Final',     'Final_Punctuation',    
  80.     'a final quotation mark'),
  81.     'Pi':('Punctuation, Initial',   'Initial_Punctuation',
  82.     'an initial quotation mark'),
  83.     'Po':('Punctuation, Other',     'Other_Punctuation',    
  84.     'a punctuation mark of other type'),
  85.     'Ps':('Punctuation, Open',      'Open_Punctuation',    
  86.     'an opening punctuation mark (of a pair)'),
  87.     'S' :('Symbol',                 'Symbol',
  88.     'Sc | Sk | Sm | So'),
  89.     'Sc':('Symbol, Currency',       'Currency_Symbol',      
  90.     'a currency sign'),
  91.     'Sk':('Symbol, Modifier',       'Modifier_Symbol',
  92.     'a non-letterlike modifier symbol'),
  93.     'Sm':('Symbol, Math',           'Math_Symbol',          
  94.     'a symbol of mathematical use'),
  95.     'So':('Symbol, Other',          'Other_Symbol',
  96.     'a symbol of other type'),
  97.     'Z' :('Separator',              'Separator',            
  98.     'Zl | Zp | Zs'),
  99.     'Zl':('Separator, Line',        'Line_Separator',
  100.     'U+2028 LINE SEPARATOR only'),
  101.     'Zp':('Separator, Paragraph',   'Paragraph_Separator',  
  102.     'U+2029 PARAGRAPH SEPARATOR only'),
  103.     'Zs':('Separator, Space',       'Space_Separator',      
  104.     'a space character (of various non-zero widths)'),
  105.     }
  106.     if abbr not in gc:
  107.         raise Exception('Unknown general category abbreviation:', abbr)
  108.     if desc:
  109.         return gc[abbr][2]
  110.     elif sortable:
  111.         return gc[abbr][0]
  112.     else:
  113.         return gc[abbr][1]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement