Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# List every character up to U+FFFF that has a decomposition mapping and
# whose NFKC normalization is a space followed by at least one more
# character.  Runs unchanged on Python 2 and Python 3.
from itertools import count
from unicodedata import decomposition as dec, name, normalize

try:
    unichr
except NameError:
    # Python 3 dropped unichr(); chr() does the same job there.
    unichr = chr

# One report row: running index, repr of the character, the character,
# its decomposition mapping, its name, the NFKC text and the NFKC repr.
# This is the original two print formats joined by the single space the
# Python 2 trailing-comma print used to insert between them.
ROW = u'# %2i | % 9r | % 3s | % 20s | %s | %s | %r'


def space_prefixed(last=0xffff):
    """Yield ``(char, nfkc)`` pairs for code points ``0..last`` inclusive.

    A character is reported when it has a non-empty decomposition mapping
    and its NFKC form is longer than one character and starts with a
    space.  Characters that normalize to a lone space are skipped by the
    length check.

    ``last`` defaults to 0xFFFF, the range the script has always scanned.
    """
    for code in range(last + 1):
        ch = unichr(code)
        if not dec(ch):
            # No decomposition mapping: not a candidate.
            continue
        norm = normalize('NFKC', ch)
        if len(norm) > 1 and norm.startswith(u' '):
            yield ch, norm


def format_row(index, ch, norm):
    """Return the report line for ``ch`` and its NFKC form ``norm``."""
    return ROW % (index, ch, ch, dec(ch), name(ch), norm, norm)


# The counter advances only for reported characters, so rows are numbered
# consecutively from 0.
counter = count()
for ch, norm in space_prefixed():
    # print(<single expression>) behaves the same as a statement (Python 2)
    # and as a function call (Python 3).
    print(format_row(next(counter), ch, norm))
- # 0 | u'\xa8' | ¨ | <compat> 0020 0308 | DIAERESIS | ̈ | u' \u0308'
- # 1 | u'\xaf' | ¯ | <compat> 0020 0304 | MACRON | ̄ | u' \u0304'
- # 2 | u'\xb4' | ´ | <compat> 0020 0301 | ACUTE ACCENT | ́ | u' \u0301'
- # 3 | u'\xb8' | ¸ | <compat> 0020 0327 | CEDILLA | ̧ | u' \u0327'
- # 4 | u'\u02d8' | ˘ | <compat> 0020 0306 | BREVE | ̆ | u' \u0306'
- # 5 | u'\u02d9' | ˙ | <compat> 0020 0307 | DOT ABOVE | ̇ | u' \u0307'
- # 6 | u'\u02da' | ˚ | <compat> 0020 030A | RING ABOVE | ̊ | u' \u030a'
- # 7 | u'\u02db' | ˛ | <compat> 0020 0328 | OGONEK | ̨ | u' \u0328'
- # 8 | u'\u02dc' | ˜ | <compat> 0020 0303 | SMALL TILDE | ̃ | u' \u0303'
- # 9 | u'\u02dd' | ˝ | <compat> 0020 030B | DOUBLE ACUTE ACCENT | ̋ | u' \u030b'
- # 10 | u'\u037a' | ͺ | <compat> 0020 0345 | GREEK YPOGEGRAMMENI | ͅ | u' \u0345'
- # 11 | u'\u0384' | ΄ | <compat> 0020 0301 | GREEK TONOS | ́ | u' \u0301'
- # 12 | u'\u0385' | ΅ | 00A8 0301 | GREEK DIALYTIKA TONOS | ̈́ | u' \u0308\u0301'
- # 13 | u'\u1fbd' | ᾽ | <compat> 0020 0313 | GREEK KORONIS | ̓ | u' \u0313'
- # 14 | u'\u1fbf' | ᾿ | <compat> 0020 0313 | GREEK PSILI | ̓ | u' \u0313'
- # 15 | u'\u1fc0' | ῀ | <compat> 0020 0342 | GREEK PERISPOMENI | ͂ | u' \u0342'
- # 16 | u'\u1fc1' | ῁ | 00A8 0342 | GREEK DIALYTIKA AND PERISPOMENI | ̈͂ | u' \u0308\u0342'
- # 17 | u'\u1fcd' | ῍ | 1FBF 0300 | GREEK PSILI AND VARIA | ̓̀ | u' \u0313\u0300'
- # 18 | u'\u1fce' | ῎ | 1FBF 0301 | GREEK PSILI AND OXIA | ̓́ | u' \u0313\u0301'
- # 19 | u'\u1fcf' | ῏ | 1FBF 0342 | GREEK PSILI AND PERISPOMENI | ̓͂ | u' \u0313\u0342'
- # 20 | u'\u1fdd' | ῝ | 1FFE 0300 | GREEK DASIA AND VARIA | ̔̀ | u' \u0314\u0300'
- # 21 | u'\u1fde' | ῞ | 1FFE 0301 | GREEK DASIA AND OXIA | ̔́ | u' \u0314\u0301'
- # 22 | u'\u1fdf' | ῟ | 1FFE 0342 | GREEK DASIA AND PERISPOMENI | ̔͂ | u' \u0314\u0342'
- # 23 | u'\u1fed' | ῭ | 00A8 0300 | GREEK DIALYTIKA AND VARIA | ̈̀ | u' \u0308\u0300'
- # 24 | u'\u1fee' | ΅ | 0385 | GREEK DIALYTIKA AND OXIA | ̈́ | u' \u0308\u0301'
- # 25 | u'\u1ffd' | ´ | 00B4 | GREEK OXIA | ́ | u' \u0301'
- # 26 | u'\u1ffe' | ῾ | <compat> 0020 0314 | GREEK DASIA | ̔ | u' \u0314'
- # 27 | u'\u2017' | ‗ | <compat> 0020 0333 | DOUBLE LOW LINE | ̳ | u' \u0333'
- # 28 | u'\u203e' | ‾ | <compat> 0020 0305 | OVERLINE | ̅ | u' \u0305'
- # 29 | u'\u309b' | ゛ | <compat> 0020 3099 | KATAKANA-HIRAGANA VOICED SOUND MARK | ゙ | u' \u3099'
- # 30 | u'\u309c' | ゜ | <compat> 0020 309A | KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK | ゚ | u' \u309a'
- # 31 | u'\ufc5e' | ﱞ | <isolated> 0020 064C 0651 | ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM | ٌّ | u' \u064c\u0651'
- # 32 | u'\ufc5f' | ﱟ | <isolated> 0020 064D 0651 | ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM | ٍّ | u' \u064d\u0651'
- # 33 | u'\ufc60' | ﱠ | <isolated> 0020 064E 0651 | ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM | َّ | u' \u064e\u0651'
- # 34 | u'\ufc61' | ﱡ | <isolated> 0020 064F 0651 | ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM | ُّ | u' \u064f\u0651'
- # 35 | u'\ufc62' | ﱢ | <isolated> 0020 0650 0651 | ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM | ِّ | u' \u0650\u0651'
- # 36 | u'\ufc63' | ﱣ | <isolated> 0020 0651 0670 | ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM | ّٰ | u' \u0651\u0670'
- # 37 | u'\ufe49' | ﹉ | <compat> 203E | DASHED OVERLINE | ̅ | u' \u0305'
- # 38 | u'\ufe4a' | ﹊ | <compat> 203E | CENTRELINE OVERLINE | ̅ | u' \u0305'
- # 39 | u'\ufe4b' | ﹋ | <compat> 203E | WAVY OVERLINE | ̅ | u' \u0305'
- # 40 | u'\ufe4c' | ﹌ | <compat> 203E | DOUBLE WAVY OVERLINE | ̅ | u' \u0305'
- # 41 | u'\ufe70' | ﹰ | <isolated> 0020 064B | ARABIC FATHATAN ISOLATED FORM | ً | u' \u064b'
- # 42 | u'\ufe72' | ﹲ | <isolated> 0020 064C | ARABIC DAMMATAN ISOLATED FORM | ٌ | u' \u064c'
- # 43 | u'\ufe74' | ﹴ | <isolated> 0020 064D | ARABIC KASRATAN ISOLATED FORM | ٍ | u' \u064d'
- # 44 | u'\ufe76' | ﹶ | <isolated> 0020 064E | ARABIC FATHA ISOLATED FORM | َ | u' \u064e'
- # 45 | u'\ufe78' | ﹸ | <isolated> 0020 064F | ARABIC DAMMA ISOLATED FORM | ُ | u' \u064f'
- # 46 | u'\ufe7a' | ﹺ | <isolated> 0020 0650 | ARABIC KASRA ISOLATED FORM | ِ | u' \u0650'
- # 47 | u'\ufe7c' | ﹼ | <isolated> 0020 0651 | ARABIC SHADDA ISOLATED FORM | ّ | u' \u0651'
- # 48 | u'\ufe7e' | ﹾ | <isolated> 0020 0652 | ARABIC SUKUN ISOLATED FORM | ْ | u' \u0652'
- # 49 | u'\uffe3' |  ̄ | <wide> 00AF | FULLWIDTH MACRON | ̄ | u' \u0304'
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement