Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- """
- Name check-compose.py
- Description Check compose sequences
- Author Pander <pander@users.sourceforge.net>
- License MIT License
- 0.1 2012-01-06 Pander <pander@users.sourceforge.net>
- Initial release
- """
- import binascii
- import sys
# Set to True to suppress the WARNING, ERROR and INFO messages printed by
# the checks below.
silent = False
def isUnicodeUpper(s):
    """Return True if *s* is 'U' followed by 4 or 5 upper-case hex digits.

    Only the characters 0-9 and A-F count as hex digits here, so a code
    written with lower-case letters (e.g. 'U00e9') is rejected.

    Args:
        s: candidate code string, e.g. 'U00E9' or 'U1D11E'.

    Returns:
        True when *s* has exactly that shape, False otherwise.
    """
    hex_digits = '0123456789ABCDEF'
    # Total length 5 or 6 means the 'U' prefix plus four or five digits.
    if len(s) not in (5, 6) or s[0] != 'U':
        return False
    return all(digit in hex_digits for digit in s[1:])
def isUnicodeLower(s):
    """Return True if *s* is 'U' followed by 4 or 5 lower-case hex digits.

    Only the characters 0-9 and a-f count as hex digits here; the leading
    'U' itself must still be upper-case. A code with upper-case hex letters
    (e.g. 'U00E9') is rejected.

    Args:
        s: candidate code string, e.g. 'U00e9' or 'U1d11e'.

    Returns:
        True when *s* has exactly that shape, False otherwise.
    """
    hex_digits = '0123456789abcdef'
    # Total length 5 or 6 means the 'U' prefix plus four or five digits.
    if len(s) not in (5, 6) or s[0] != 'U':
        return False
    return all(digit in hex_digits for digit in s[1:])
def _report(message, source_line):
    """Print *message* and the offending compose line unless ``silent`` is set."""
    if not silent:
        print(message)
        # Echo the line indented by two spaces, without its trailing newline.
        print('  %s' % source_line.rstrip('\n'))


# Load Unicode information: first field of each UnicodeData.txt record
# (the code point) mapped to its second field (the character name).
unicode_info = {}
with open('UnicodeData.txt', 'r') as unicode_file:
    for line in unicode_file:
        data = line.split(';')
        if len(data) < 2:
            # Blank or malformed record: nothing to store.
            continue
        unicode_info[data[0]] = data[1]

# Load compose sequences
compose_sequences = {}  # sequence -> (char, code, name)
codes = []  # distinct non-empty codes, in order of first appearance
chars = []  # distinct non-empty result characters
names = []  # distinct non-empty names taken from the trailing comment
lines = 0   # 1-based number of the line currently being checked
with open('proposal', 'r') as compose_file:
    for line in compose_file:
        lines = lines + 1
        # Only lines that start with a key (e.g. "<Multi_key> ...") are
        # compose sequences; everything else is skipped.
        if line[0] != '<':
            continue

        # Normalise separators before splitting.
        if '"\t\t# ' in line:
            line = line.replace('"\t\t# ', '" # ')
        if ': "' in line and '<U17f' not in line:
            # NOTE(review): as pasted, search and replacement text are
            # identical, so this is a no-op; the original whitespace was
            # presumably collapsed by the paste -- confirm against upstream.
            line = line.replace(': "', ': "')

        char = None
        code = None
        name = None
        data = line.split(': "')
        seq = data[0].strip()
        if len(data) < 2:
            # No ': "' separator, so there is no quoted result string.
            if '<U17f' in seq:
                # Tolerated special case: record the sequence with empty
                # result fields.
                seq = seq.split(' :')[0].strip()
                _report('WARNING line %s: missing second double quote and comment with name' % lines, line)
                char = ''
                code = ''
                name = ''
            else:
                _report('ERROR lines %s: unknown malformation' % lines, line)
                # Skip the line; there is nothing meaningful to parse or
                # register for it.
                continue
        else:
            charcodename = data[1].split('" ')
            if len(charcodename) < 2:
                # Closing quote is not followed by a space: the code/name
                # part cannot be located.
                _report('ERROR lines %s: unknown malformation' % lines, line)
                continue
            char = charcodename[0].strip()
            charname = charcodename[1].strip().split(' # ')
            if len(charname) == 1:
                # Only a comment is present; a missing code (alias or
                # Unicode code point) is accepted without a warning.
                code = ''
                name = charname[0].replace('# ', '').strip()
            else:
                code = charname[0].strip()
                name = charname[1].strip()

        # TODO: optionally dump seq/char/code/name for generating
        # documentation.

        # Compare against every sequence registered so far and report the
        # first problem found ("at least": there may be more).
        for se, (ch, co, na) in compose_sequences.items():
            if se == seq:
                _report('ERROR line %s: at least duplicate compose sequence' % lines, line)
                break
            if se.startswith(seq):
                # The new sequence is a prefix of an existing one.
                _report('ERROR line %s: compose sequence is blocking at least %s : "%s" %s # %s' % (lines, se, ch, co, na), line)
                break
            if seq.startswith(se):
                # An existing sequence is a prefix of the new one.
                _report('ERROR line %s: compose sequence is at least blocked by %s : "%s" %s # %s' % (lines, se, ch, co, na), line)
                break
            if code != '' and code == co and name != na:
                _report('WARNING line %s: conflicting names for same code for %s : "%s" %s # %s' % (lines, se, ch, co, na), line)
                break
            if code != '' and code != co and name == na:
                _report('WARNING line %s: conflicting codes for same name for %s : "%s" %s # %s' % (lines, se, ch, co, na), line)
                break
        # TODO etc.

        # For codes of the form Uxxxx / Uxxxxx, compare the comment with the
        # name recorded in UnicodeData.txt.
        if code != '' and (isUnicodeUpper(code) or isUnicodeLower(code)):
            CODE = code[1:].upper()
            if CODE in unicode_info:
                info = unicode_info[CODE]
                if info != name:
                    _report('WARNING line %s: incorrect comment with name, should be %s from UnicodeData.txt' % (lines, info), line)
            else:
                _report('WARNING line %s: unknown Unicode code point %s according to UnicodeData.txt' % (lines, code), line)

        compose_sequences[seq] = (char, code, name)
        if code != '' and code not in codes:
            codes.append(code)
        if char != '' and char not in chars:
            chars.append(char)
        if name != '' and name not in names:
            names.append(name)

if not silent:
    print('INFO: checked %s compose sequences' % len(compose_sequences))
    print('INFO: resulting in %s different chars' % len(chars))
    print('INFO: related to %s different codes' % len(codes))
    print('INFO: with %s different names in comment' % len(names))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement