Convert a UTF-8 string to character names


$ cat ./unicode.py
#!/opt/python/bin/python3
"""
Print the Unicode name of every character in a UTF-8 byte dump.

Standard input must be the output of ``od -t x1``, for example:

$ echo "Hello © ≠" | od -t x1
0000000 48 65 6c 6c 6f 20 c2 a9 20 e2 89 a0 0a
0000015

One numbered line is printed per decoded character.  A single trailing
newline (as added by ``echo``) is not reported.
"""
import sys
import unicodedata


def parse_od_bytes(lines):
    """Return the bytes described by the lines of an ``od -t x1`` dump.

    The first token of every line is the offset column, not data.  Blank
    lines are skipped.  A line holding only ``*`` is od's marker for rows
    identical to the previous row; those rows are re-created from the gap
    between the neighbouring offsets, which are read as octal (od's default
    address radix -- a dump made with another ``-A`` radix should be
    produced with ``od -v`` so that no ``*`` lines appear).

    Raises ValueError if a data token is not a hexadecimal byte value.
    """
    values = []
    prev_offset = None
    prev_row = []
    elided = False
    for line in lines:
        tokens = line.split()
        if not tokens:
            continue
        if tokens[0] == "*":
            elided = True
            continue
        offset = tokens[0]
        row = [int(token, 16) for token in tokens[1:]]
        if elided and prev_row:
            # Rebuild the rows od collapsed into "*": they are copies of
            # the last printed row and fill the hole between the offsets.
            gap = int(offset, 8) - int(prev_offset, 8) - len(prev_row)
            values.extend(prev_row * (gap // len(prev_row)))
        elided = False
        values.extend(row)
        if row:
            prev_offset, prev_row = offset, row
    return bytes(values)


def character_names(data):
    """Return the Unicode name of each character encoded in *data*.

    *data* is decoded as UTF-8; bytes that do not form valid UTF-8 decode
    to U+FFFD and are therefore reported as REPLACEMENT CHARACTER.
    Characters without a name in the Unicode database (control characters
    such as tab or newline) are reported as ``<unnamed U+XXXX>`` instead of
    raising ValueError.
    """
    text = data.decode("utf-8", errors="replace")
    return [
        unicodedata.name(char, "<unnamed U+{:04X}>".format(ord(char)))
        for char in text
    ]


def format_report(lines):
    """Return the numbered output lines for an ``od -t x1`` dump.

    Only one trailing newline byte is dropped, so input that does not end
    with a newline keeps its last character.
    """
    data = parse_od_bytes(lines)
    if data.endswith(b"\n"):
        data = data[:-1]
    return [
        "{:>2}. {}".format(number, name)
        for number, name in enumerate(character_names(data), start=1)
    ]


def main():
    """Read an ``od -t x1`` dump from stdin and print the character names."""
    for report_line in format_report(sys.stdin):
        print(report_line)


if __name__ == "__main__":
    main()


##################################################################

$ echo "Hello! © ≠" | od -t x1 | ./unicode.py
 1. LATIN CAPITAL LETTER H
 2. LATIN SMALL LETTER E
 3. LATIN SMALL LETTER L
 4. LATIN SMALL LETTER L
 5. LATIN SMALL LETTER O
 6. EXCLAMATION MARK
 7. SPACE
 8. COPYRIGHT SIGN
 9. SPACE
10. NOT EQUAL TO