Untitled

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Name        check-compose.py
Description Check compose sequences
Author      Pander <[email protected]>
License     MIT License

0.1 2012-01-06  Pander <[email protected]>
Initial release
"""

import binascii
import sys

silent = False

def isUnicodeUpper(s):
    """Tell whether s looks like 'U' plus upper-case hexadecimal digits.

    Accepted are exactly a capital 'U' followed by four or five characters
    taken from 0-9 and A-F, i.e. a total length of 5 or 6. Lower-case hex
    digits are rejected.

    s  the candidate code, e.g. 'U00E9'

    Returns True when s matches, False otherwise.
    """
    upper_digits = tuple('0123456789ABCDEF')
    # Cheap structural checks first: overall length and the 'U' prefix.
    if len(s) not in (5, 6) or s[0] != 'U':
        return False
    # Every character after the prefix must be an upper-case hex digit.
    for position in range(1, len(s)):
        if s[position] not in upper_digits:
            return False
    return True

def isUnicodeLower(s):
    """Tell whether s looks like 'U' plus lower-case hexadecimal digits.

    Accepted are exactly a capital 'U' followed by four or five characters
    taken from 0-9 and a-f, i.e. a total length of 5 or 6. Upper-case hex
    digits are rejected; the leading 'U' itself must still be upper case.

    s  the candidate code, e.g. 'U00e9'

    Returns True when s matches, False otherwise.
    """
    lower_digits = tuple('0123456789abcdef')
    has_valid_shape = len(s) in (5, 6) and s[0] == 'U'
    if not has_valid_shape:
        return False
    # Check the characters following the 'U' prefix one by one.
    return all(s[index] in lower_digits for index in range(1, len(s)))

# Load Unicode information
# unicode_info maps the first ';'-separated field of each UnicodeData.txt
# line to the second field of that line.
unicode_info = {}
# The with-statement closes the file even if parsing raises.
with open('UnicodeData.txt', 'r') as unicode_file:
    for line in unicode_file:
        data = line.split(';')
        # A blank or truncated line has no second field; skip it instead of
        # aborting the whole run with an IndexError.
        if len(data) < 2:
            continue
        unicode_info[data[0]] = data[1]

def _report(message, source_line):
    """Print a diagnostic and the offending compose line unless silenced.

    message      the fully formatted diagnostic text
    source_line  the compose line the diagnostic is about, without its
                 trailing newline; it is printed indented by two spaces
    """
    if not silent:
        print(message)
        print('  ' + source_line)


# Load compose sequences
# compose_sequences maps each sequence (the text left of ': "') to a
# (char, code, name) tuple. codes, chars and names collect the distinct
# non-empty values in order of first appearance. lines counts every line
# read from the file, including the ones that are skipped.
compose_sequences = {}
codes = []
chars = []
names = []
lines = 0
# The with-statement closes the file even if a check raises.
with open('proposal', 'r') as compose_file:
    for line in compose_file:
        lines = lines + 1
        # Only lines that start with '<' define a compose sequence.
        if not line.startswith('<'):
            continue
        # Normalise spacing variants so the splits below see a single form.
        if '"\t\t# ' in line:
            line = line.replace('"\t\t# ', '"    # ')
        if ':   "' in line and '<U17f' not in line:
            line = line.replace(':   "', ': "')
        # Text echoed with diagnostics. Only a trailing newline is removed, so
        # a final line without one is not truncated.
        shown = line.rstrip('\n')

        data = line.split(': "')
        seq = data[0].strip()
        if len(data) < 2:
            # The ': "' separator is missing.
            if '<U17f' not in seq:
                _report('ERROR lines %s: unknown malformation' % lines, shown)
                # Nothing usable was parsed; do not fall through and record
                # values left over from a previous line.
                continue
            seq = seq.split(' :')[0].strip()
            _report('WARNING line %s: missing second double quote and comment with name' % lines, shown)
            char = ''
            code = ''
            name = ''
        else:
            charcodename = data[1].split('" ')
            if len(charcodename) < 2:
                # No closing quote followed by a space: neither code nor name
                # can be extracted from this line.
                _report('ERROR lines %s: unknown malformation' % lines, shown)
                continue
            char = charcodename[0].strip()
            charname = charcodename[1].strip().split(' # ')
            if len(charname) == 1:
                # Only a '# name' comment is present, no code before it.
                if False:  # this warning is switched off
                    _report('WARNING line %s: missing code (alias or Unicode code point)' % lines, shown)
                code = ''
                name = charname[0].replace('# ', '').strip()
            else:
                code = charname[0].strip()
                name = charname[1].strip()
        if False:  # TODO for generating documentation
            print('seq: %s' % seq)
            print('  char: %s' % char)
            print('  code: %s' % code)
            print('  name: %s' % name)

        # Compare against every sequence recorded so far; only the first
        # problem found is reported (hence "at least" in the messages).
        for se, (ch, co, na) in compose_sequences.items():
            existing = (lines, se, ch, co, na)
            if se == seq:
                _report('ERROR line %s: at least duplicate compose sequence' % lines, shown)
                break
            elif se.startswith(seq):
                # The new sequence is a prefix of an already recorded one.
                _report('ERROR line %s: compose sequence is blocking at least %s : "%s" %s # %s' % existing, shown)
                break
            elif seq.startswith(se):
                # An already recorded sequence is a prefix of the new one.
                _report('ERROR line %s: compose sequence is at least blocked by %s : "%s" %s # %s' % existing, shown)
                break
            elif code != '' and code == co and name != na:
                _report('WARNING line %s: conflicting names for same code for %s : "%s" %s # %s' % existing, shown)
                break
            elif code != '' and code != co and name == na:
                _report('WARNING line %s: conflicting codes for same name for %s : "%s" %s # %s' % existing, shown)
                break
            #TODO etc.

        # For codes of the form U<hex digits>, compare the comment name with
        # the name recorded in UnicodeData.txt.
        if code != '' and (isUnicodeUpper(code) or isUnicodeLower(code)):
            CODE = code[1:].upper()
            if CODE in unicode_info:
                info = unicode_info[CODE]
                if info != name:
                    _report('WARNING line %s: incorrect comment with name, should be %s from UnicodeData.txt' % (lines, info), shown)
            else:
                _report('WARNING line %s: unknown Unicode code point %s according to UnicodeData.txt' % (lines, code), shown)

        # Record the sequence; a duplicate overwrites the earlier entry.
        compose_sequences[seq] = (char, code, name)
        if code != '' and code not in codes:
            codes.append(code)
        if char != '' and char not in chars:
            chars.append(char)
        if name != '' and name not in names:
            names.append(name)

# Closing statistics over everything that was loaded; suppressed when silent.
if not silent:
    for template, collected in (
        ('INFO: checked %s compose sequences', compose_sequences),
        ('INFO: resulting in %s different chars', chars),
        ('INFO: related to %s different codes', codes),
        ('INFO: with %s different names in comment', names),
    ):
        print(template % len(collected))