Advertisement
Guest User

Convert a UTF-8 string to character names

a guest
Jun 15th, 2012
97
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.62 KB | None | 0 0
  1.  
  2. $ cat ./unicode.py
  3. #!/opt/python/bin/python3
  4. """
  5. $ echo "Hello © ≠" | od -t x1
  6. 0000000 48 65 6c 6c 6f 20 c2 a9 20 e2 89 a0 0a
  7. 0000015
  8. """
  9. import sys
  10. import unicodedata
  11.  
  12. byte_list = list()
  13. for line in sys.stdin.readlines():
  14.     token_list = line.split()
  15.     token_list.pop(0)
  16.     byte_list.extend(token_list)
  17. #print(byte_list)
  18.  
  19. letter_count = 1
  20. i = 0
  21. while i < len(byte_list) - 1:
  22.     print(str(letter_count).rjust(2), end=". ")
  23.     binary_char = str(bin(int(byte_list[i], 16))[2:].zfill(8))
  24.     #print(binary_char)
  25.     extra_byte_count = binary_char.find("0")
  26.     if extra_byte_count > 0:
  27.         # Mush together the binary digits from the multiple bytes
  28.         # Ignore the first extra_byte_count+1 binary digits of binary_char
  29.         # Ignore the first two binary digits of the remaining bytes
  30.         extended_binary_string = binary_char[extra_byte_count:]
  31.         for j in range(extra_byte_count-1):
  32.             i += 1
  33.             extra_binary_char = str(bin(int(byte_list[i], 16))[2:].zfill(8))
  34.             #print(" " + extra_binary_char)
  35.             extended_binary_string += extra_binary_char[2:]
  36.         print(unicodedata.name(chr(int(extended_binary_string, 2))))
  37.     else:
  38.         print(unicodedata.name(chr(int(binary_char, 2))))
  39.     i += 1
  40.     letter_count += 1
  41.  
  42.  
  43. ##################################################################
  44.  
  45. $ echo "Hello! © ≠" | od -t x1 | ./unicode.py
  46.  1. LATIN CAPITAL LETTER H
  47.  2. LATIN SMALL LETTER E
  48.  3. LATIN SMALL LETTER L
  49.  4. LATIN SMALL LETTER L
  50.  5. LATIN SMALL LETTER O
  51.  6. EXCLAMATION MARK
  52.  7. SPACE
  53.  8. COPYRIGHT SIGN
  54.  9. SPACE
  55. 10. NOT EQUAL TO
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement