Advertisement
Guest User

Col WIlson

a guest
Nov 12th, 2009
866
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.26 KB | None | 0 0
  1. import logging
  2. import codecs
  3. import re
  4. from utils.error import Error
  5.  
  6. class UnicodingError: pass
  7.  
  8. utf8_detector = re.compile(r"""^(?:
  9.     [\x09\x0A\x0D\x20-\x7E]            # ASCII
  10.   | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
  11.   |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
  12.   | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
  13.   |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
  14.   |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
  15.   | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
  16.   |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
  17.  )*$""", re.X)
  18.  
  19. cp1252_detector = re.compile(r'^(?:[\x80-\xBF])*$', re.X)
  20. xa4_detector = re.compile(r'^(?:\xA4)*$', re.X)
  21.  
  22. def unicoder(string):
  23.     '''make unicode'''
  24.     try:
  25.         if re.match(utf8_detector, string):
  26.             return unicode(string, 'utf_8')
  27.         if re.match(cp1252_detector, string):
  28.             if re.match(xa4_detector, string):
  29.                 return unicode(string, 'iso8859_15')
  30.             else:
  31.                 return unicode(string, 'cp1252')
  32.         return unicode(string, 'latin_1')
  33.        
  34.     except UnicodingError:
  35.         raise UnicodingError("still don't recognise encoding after trying do guess common english encodings")
  36.  
  37.  
  38.    
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement