Advertisement
Guest User

Untitled

a guest
May 25th, 2016
51
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.58 KB | None | 0 0
  1. $ iconv -f UTF-8 your_file -o /dev/null
  2.  
  3. >>> a="γεια"
  4. >>> a
  5. '\xce\xb3\xce\xb5\xce\xb9\xce\xb1'
  6. >>> b='\xce\xb3\xce\xb5\xce\xb9\xff\xb1' # note second-to-last byte changed
  7. >>> print b.decode("utf_8")
  8. Traceback (most recent call last):
  9. File "<stdin>", line 1, in <module>
  10. File "/usr/local/lib/python2.5/encodings/utf_8.py", line 16, in decode
  11. return codecs.utf_8_decode(input, errors, True)
  12. UnicodeDecodeError: 'utf8' codec can't decode byte 0xff in position 6: unexpected code byte
  13.  
  14. >>> try: print b.decode("utf_8")
  15. ... except UnicodeDecodeError, exc: pass
  16. ...
  17. >>> exc
  18. UnicodeDecodeError('utf8', '\xce\xb3\xce\xb5\xce\xb9\xff\xb1', 6, 7, 'unexpected code byte')
  19. >>> exc.args
  20. ('utf8', '\xce\xb3\xce\xb5\xce\xb9\xff\xb1', 6, 7, 'unexpected code byte')
  21.  
  22. $ apt-get install moreutils
  23. $ isutf8 your_file
  24.  
  /// Locates the first structurally invalid byte in a NUL-terminated UTF-8 string.
  ///
  /// The check is purely structural: every lead byte must be followed by the
  /// number of continuation bytes (10xxxxxx) that its bit pattern announces.
  /// Lead bytes announcing up to five continuation bytes (the legacy 5- and
  /// 6-byte forms) are still accepted. Overlong encodings, UTF-16 surrogates
  /// and code points above U+10FFFF are NOT rejected.
  ///
  /// @param input  NUL-terminated byte string to inspect.
  /// @param ch     Receives the offending byte when the string is invalid;
  ///               left untouched when the string is valid. For a sequence cut
  ///               short by the end of the string this is the terminator (0).
  /// @return -1 if the string is valid, otherwise the zero-based byte offset
  ///         of the first invalid byte.
  int getInvalidUtf8SymbolPosition(const unsigned char *input, unsigned char &ch) {
      const unsigned char *c = input;

      while (*c) {
          int nb; // number of continuation bytes announced by the lead byte

          if (!(*c & 0x80))
              nb = 0;                      // 0xxxxxxx: plain ASCII
          else if ((*c & 0xe0) == 0xc0)
              nb = 1;                      // 110xxxxx
          else if ((*c & 0xf0) == 0xe0)
              nb = 2;                      // 1110xxxx
          else if ((*c & 0xf8) == 0xf0)
              nb = 3;                      // 11110xxx
          else if ((*c & 0xfc) == 0xf8)
              nb = 4;                      // 111110xx (legacy form)
          else if ((*c & 0xfe) == 0xfc)
              nb = 5;                      // 1111110x (legacy form)
          else {
              // Either a stray continuation byte (10xxxxxx) or 0xFE/0xFF,
              // neither of which can start a sequence.
              ch = *c;
              return static_cast<int>(c - input);
          }

          // Check every continuation byte in order. The walk stops at the
          // first mismatch, and the NUL terminator is itself a mismatch, so a
          // truncated sequence never causes a read past the end of the string.
          for (int i = 1; i <= nb; ++i) {
              if ((c[i] & 0xc0) != 0x80) {
                  ch = c[i];
                  return static_cast<int>((c + i) - input);
              }
          }

          c += nb + 1;
      }

      return -1;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement