Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import logging
- import codecs
- import re
- from utils.error import Error
class UnicodingError(UnicodeError):
    """Raised by ``unicoder`` when the guessed encoding cannot decode the input.

    Derives from ``UnicodeError`` (itself a ``ValueError``) so that it can
    actually be raised with a message and so that generic ``UnicodeError`` /
    ``ValueError`` handlers around ``unicoder`` also catch it.
    """


# The detectors are byte patterns: ``br`` literals are plain ``str`` on
# Python 2 (2.6+) and ``bytes`` on Python 3, so they match undecoded input
# on both.

# Matches input that consists solely of well-formed UTF-8 sequences
# (ASCII is restricted to tab, LF, CR and the printable range).
utf8_detector = re.compile(br"""^(?:
      [\x09\x0A\x0D\x20-\x7E]            # ASCII
    | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
    | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
    | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
    | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
    | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
    | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
    | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
    )*$""", re.X)

# NOTE(review): both patterns below are anchored at start and end, so they
# only match input made up *entirely* of bytes 0x80-0xBF (respectively
# entirely of 0xA4).  Mixed text such as b"caf\xe9 \x80" therefore falls
# through to latin_1.  This looks like it was meant to be a "contains"
# test - confirm the intended heuristic before changing it.
cp1252_detector = re.compile(br'^(?:[\x80-\xBF])*$', re.X)
xa4_detector = re.compile(br'^(?:\xA4)*$', re.X)


def unicoder(string):
    """Decode a byte string to text, guessing between a few encodings.

    The encoding is chosen as follows:

    1. ``utf_8`` when the whole input is well-formed UTF-8 (this includes
       pure ASCII and the empty string).
    2. ``iso8859_15`` when the input consists only of 0xA4 bytes,
       otherwise ``cp1252`` when it consists only of bytes 0x80-0xBF.
    3. ``latin_1`` for everything else (never fails).

    Args:
        string: the undecoded byte string (``str`` on Python 2, ``bytes``
            on Python 3).

    Returns:
        The decoded text (``unicode`` on Python 2, ``str`` on Python 3).

    Raises:
        UnicodingError: if the chosen codec cannot decode the input, e.g.
            cp1252 has no mapping for byte 0x81.
    """
    if utf8_detector.match(string):
        codec = 'utf_8'
    elif cp1252_detector.match(string):
        codec = 'iso8859_15' if xa4_detector.match(string) else 'cp1252'
    else:
        codec = 'latin_1'

    try:
        # ``.decode`` is available on Python 2 ``str`` and Python 3 ``bytes``
        # alike, unlike the Python-2-only ``unicode()`` builtin.
        return string.decode(codec)
    except UnicodeDecodeError:
        # A failed decode surfaces as UnicodeDecodeError; translate it into
        # the module's own error so callers get the intended message.
        raise UnicodingError("still don't recognise encoding after trying do guess common english encodings")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement