Advertisement
Guest User

Untitled

a guest
Mar 19th, 2012
84
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Name        check-compose.py
Description Check compose sequences
Author      Pander <pander@users.sourceforge.net>
License     MIT License

0.1 2012-01-06  Pander <pander@users.sourceforge.net>
Initial release

0.2 2012-03-19  Pander <pander@users.sourceforge.net>
Added downloading
"""
  15.  
  16. import binascii
  17. import sys
  18. from urllib import urlretrieve
  19. from os.path import isfile, getsize
  20.  
# When True, the reports guarded by "if not silent" in this script
# (INFO, WARNING and ERROR lines) are not printed.
silent = False
  22.  
  23. def isUnicodeUpper(s):
  24.     hex = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', )
  25.     if (
  26.         len(s) == 5 and s[0] == 'U' and s[1] in hex and s[2] in hex and s[3] in hex and s[4] in hex
  27.     ) or (
  28.         len(s) == 6 and s[0] == 'U' and s[1] in hex and s[2] in hex and s[3] in hex and s[4] in hex and s[5] in hex
  29.     ):
  30.         return True
  31.     return False
  32.  
  33. def isUnicodeLower(s):
  34.     hex = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', )
  35.     if (
  36.         len(s) == 5 and s[0] == 'U' and s[1] in hex and s[2] in hex and s[3] in hex and s[4] in hex
  37.     ) or (
  38.         len(s) == 6 and s[0] == 'U' and s[1] in hex and s[2] in hex and s[3] in hex and s[4] in hex and s[5] in hex
  39.     ):
  40.         return True
  41.     return False
  42.  
  43. def download_hook(blocks_transferred, block_size, file_size):
  44.     """ A download hook to provide some feedback when downloading """
  45.     if blocks_transferred == 0:
  46.         if file_size > 0:
  47.             if not silent:
  48.                 print "INFO: Downloading", file_size, "bytes: ",
  49.         else:  
  50.             if not silent:
  51.                 print "INFO: Downloading: ",
  52.     sys.stdout.write('#')
  53.     sys.stdout.flush()
  54.  
  55. def download_file(url):
  56.     """ Downloads a file provided a URL. Returns the filename. """
  57.     """ Borks on failure """
  58.     localfilename = url.split('/')[-1]
  59.     if not isfile(localfilename) or getsize(localfilename) <= 0:
  60.         if not silent:
  61.             print "INFO: Downloading ", url, "..."
  62.         try:
  63.             urlretrieve(url, localfilename, download_hook)
  64.         except IOError, (errno, strerror):
  65.             print "I/O error(%s): %s" % (errno, strerror)
  66.             sys.exit(-1)
  67.         except:
  68.             print "Unexpected error: ", sys.exc_info()
  69.             sys.exit(-1)
  70.         print " done."
  71.     else:
  72.         if not silent:
  73.             print "INFO: Using cached file for ", url
  74.     return localfilename
  75.  
  76. # Load Unicode information
  77. unicode_info = {}
  78. unicode_file = None
  79. unicode_filename = download_file('http://www.unicode.org/Public/UNIDATA/UnicodeData.txt')
  80. try:
  81.     unicode_file = open(unicode_filename, 'r')
  82. except IOError, (errno, strerror):
  83.     print "I/O error(%s): %s" % (errno, strerror)
  84.     sys.exit(-1)
  85. except:
  86.     print "Unexpected error: ", sys.exc_info()
  87.     sys.exit(-1)
  88. for line in unicode_file.readlines():
  89.     data = line.split(';')
  90.     unicode_info[data[0]] = data[1]
  91.  
  92. # Load compose sequences
  93. compose_sequences = {}
  94. codes = []
  95. chars = []
  96. names = []
  97. compose_file = None
  98. compose_filename = download_file('http://cgit.freedesktop.org/xorg/lib/libX11/plain/nls/en_US.UTF-8/Compose.pre')
  99. try:
  100.     compose_file = open(compose_filename, 'r')
  101. except IOError, (errno, strerror):
  102.     print "I/O error(%s): %s" % (errno, strerror)
  103.     sys.exit(-1)
  104. except:
  105.     print "Unexpected error: ", sys.exc_info()[0]
  106.     sys.exit(-1)
  107. lines = 0
  108. for line in compose_file.readlines():
  109.     lines = lines + 1
  110.     if line[0] != '<':
  111.         continue
  112.     if '"\t\t# ' in line:
  113.         line = line .replace('"\t\t# ', '"    # ')
  114.     if ':   "' in line and '<U17f' not in line:
  115.         line = line.replace(':   "', ': "')
  116.     seq = None
  117.     char = None
  118.     code = None
  119.     name = None
  120.     data = line.split(': "')
  121.     seq = data[0].strip()
  122.     try:
  123.  
  124.         charcodename = data[1].split('" ')
  125.         if len(charcodename) == 1:
  126.             charcodename = data[1].split('"\t')
  127.     except IndexError:
  128.         if '<U17f' in seq:
  129.             seq = seq.split(' :')[0].strip()
  130.             if not silent:
  131.                 print 'WARNING line %s: missing second double quote and comment with name' %lines
  132.                 print ' ', line[:-1]
  133.             char = ''
  134.             code = ''
  135.             name = ''
  136.         else:
  137.             if not silent:
  138.                 print 'ERROR lines %s: unknown malformation' %lines
  139.                 print ' ', line[:-1]
  140.     if not (char == '' and code == '' and name == ''):
  141.         char = charcodename[0].strip()
  142.         charname = charcodename[1].strip().split(' # ')
  143.         if len(charname) == 1:
  144.             if False:
  145.                 if not silent:
  146.                     print 'WARNING line %s: missing code (alias or Unicode code point)' %lines
  147.                     print ' ', line[:-1]
  148.             code = ''
  149.             name = charname[0].replace('# ', '').strip()
  150.         else:
  151.             code = charname[0].strip()
  152.             name = charname[1].strip()
  153.     if False:#TODO for generating documentation
  154.         print 'seq:', seq
  155.         print '  char:', char
  156.         print '  code:', code
  157.         print '  name:', name
  158.  
  159.     for se in compose_sequences.keys():
  160.         (ch, co, na) = compose_sequences[se]
  161.         if se == seq:
  162.             if not silent:
  163.                 print 'ERROR line %s: at least duplicate compose sequence'
  164.                 print ' ', line[:-1]
  165.             break
  166.         elif seq == se[:len(seq)]:
  167.             if not silent:
  168.                 print 'ERROR line %s: compose sequence is blocking at least %s : "%s" %s # %s' %(lines, se, ch, co, na)
  169.                 print ' ', line[:-1]
  170.             break
  171.         elif seq[:len(se)] == se:
  172.             if not silent:
  173.                 print 'ERROR line %s: compose sequence is at least blocked by %s : "%s" %s # %s' %(lines, se, ch, co, na)
  174.                 print ' ', line[:-1]
  175.             break
  176.         elif code != '' and code == co and name != na:
  177.             if not silent:
  178.                 print 'WARNING line %s: non-identical character names for same code for %s : "%s" %s # %s' %(lines, se, ch, co, na)
  179.                 print ' ', line[:-1]
  180.             break
  181.         elif code != '' and code != co and name == na:
  182.             if not silent:
  183.                 print 'WARNING line %s: non-identical codes for same character name for %s : "%s" %s # %s' %(lines, se, ch, co, na)
  184.                 print ' ', line[:-1]
  185.             break
  186.         #TODO etc.
  187.  
  188.     if code != '' and (isUnicodeUpper(code) or isUnicodeLower(code)):
  189.         CODE = code[1:].upper()
  190.         if CODE in unicode_info:
  191.             info = unicode_info[CODE]
  192.             if info != name:
  193.                 if not silent:
  194.                     print 'WARNING line %s: incorrect comment with name, should be %s from UnicodeData.txt' %(lines, info)
  195.                     print ' ', line[:-1]
  196.         else:
  197.             if not silent:
  198.                 print 'WARNING line %s: unknown Unicode code point %s according to UnicodeData.txt' %(lines, code)
  199.                 print ' ', line[:-1]
  200.  
  201.     compose_sequences[seq] = (char, code, name)
  202.     if code != '' and code not in codes:
  203.         codes.append(code)
  204.     if char != '' and char not in chars:
  205.         chars.append(char)
  206.     if name != '' and name not in names:
  207.         names.append(name)
  208.  
  209. if not silent:
  210.     print 'INFO: checked %s compose sequences' %len(compose_sequences)
  211.     print 'INFO: resulting in %s different chars' %len(chars)
  212.     print 'INFO: related to %s different codes' %len(codes)
  213.     print 'INFO: with %s different names in comment' %len(names)
Advertisement
RAW Paste Data Copied
Advertisement