Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- $ iconv -f UTF-8 your_file -o /dev/null
- >>> a="γεια"
- >>> a
- '\xce\xb3\xce\xb5\xce\xb9\xce\xb1'
- >>> b='\xce\xb3\xce\xb5\xce\xb9\xff\xb1' # note second-to-last byte changed
- >>> print b.decode("utf_8")
- Traceback (most recent call last):
- File "<stdin>", line 1, in <module>
- File "/usr/local/lib/python2.5/encodings/utf_8.py", line 16, in decode
- return codecs.utf_8_decode(input, errors, True)
- UnicodeDecodeError: 'utf8' codec can't decode byte 0xff in position 6: unexpected code byte
- >>> try: print b.decode("utf_8")
- ... except UnicodeDecodeError, exc: pass
- ...
- >>> exc
- UnicodeDecodeError('utf8', '\xce\xb3\xce\xb5\xce\xb9\xff\xb1', 6, 7, 'unexpected code byte')
- >>> exc.args
- ('utf8', '\xce\xb3\xce\xb5\xce\xb9\xff\xb1', 6, 7, 'unexpected code byte')
- $ apt-get install moreutils
- $ isutf8 your_file
/// Finds the first byte that breaks UTF-8 lead/continuation structure.
///
/// Walks the NUL-terminated byte string `input`. Each lead byte announces how
/// many continuation bytes (10xxxxxx) must follow; every one of them is
/// checked in order.
///
/// @param input  NUL-terminated byte string to scan (must not be null).
/// @param ch     Receives the offending byte when an error is found; left
///               untouched when the string is accepted. For a sequence cut
///               short by the end of the string this is the terminator (0).
/// @return -1 if no structural error is found, otherwise the zero-based byte
///         offset of the offending byte.
///
/// Only structure is checked: overlong encodings, UTF-16 surrogates and code
/// points above U+10FFFF are not rejected, and the legacy 5- and 6-byte forms
/// (lead bytes 0xF8-0xFD) are still accepted.
int getInvalidUtf8SymbolPosition(const unsigned char *input, unsigned char &ch) {
    const unsigned char *c = input;
    while (*c != 0) {
        int nb; // number of continuation bytes expected after this lead byte

        if ((*c & 0x80) == 0x00)
            nb = 0;
        else if ((*c & 0xe0) == 0xc0)
            nb = 1;
        else if ((*c & 0xf0) == 0xe0)
            nb = 2;
        else if ((*c & 0xf8) == 0xf0)
            nb = 3;
        else if ((*c & 0xfc) == 0xf8)
            nb = 4;
        else if ((*c & 0xfe) == 0xfc)
            nb = 5;
        else {
            // Either a stray continuation byte (10xxxxxx) or 0xFE/0xFF,
            // neither of which can start a sequence.
            ch = *c;
            return static_cast<int>(c - input);
        }

        // Check continuation bytes one at a time, in order. Stopping at the
        // first mismatch also guarantees we never read past the terminating
        // NUL, because 0x00 is not a continuation byte.
        for (int i = 1; i <= nb; ++i) {
            if ((c[i] & 0xc0) != 0x80) {
                ch = c[i];
                return static_cast<int>((c + i) - input);
            }
        }

        c += nb + 1;
    }
    return -1;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement