Advertisement
Guest User

Python fix broken encodings

a guest
Jan 2nd, 2013
147
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.28 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. import os
  3. import imghdr
  4. import chardet
  5. from beets.mediafile import MediaFile
  6. import sys
  7. from pprint import PrettyPrinter
  8. import re
  9.  
  10. spaces_re = re.compile(r'\s\s+')
  11. def remove_extra_spaces(s):
  12.     return spaces_re.sub(' ', s)
  13.  
  14. class MyPrettyPrinter(PrettyPrinter):
  15.     def format(self, *args, **kwargs):
  16.         repr, readable, recursive = PrettyPrinter.format(self, *args, **kwargs)
  17.         if repr:
  18.             if repr[0] in ('"', "'"):
  19.                 repr = repr.decode('string_escape')
  20.             elif repr[0:2] in ("u'", 'u"'):
  21.                 repr = repr.decode('unicode_escape').encode(sys.stdout.encoding)
  22.         return repr, readable, recursive
  23.  
  24. def pprint(obj, stream=None, indent=1, width=80, depth=None):
  25.     printer = MyPrettyPrinter(stream=stream, indent=indent, width=width, depth=depth)
  26.     printer.pprint(obj)
  27.  
  28.  
  29. class MostFrequentEncoding(dict):
  30.     def from_attrs(self, obj):
  31.         for attr in dir(obj):
  32.             val = getattr(obj, attr)
  33.             self.feed(val)
  34.  
  35.     def feed(self, obj):
  36.         if obj and isinstance(obj, basestring):
  37.             guess = chardet.detect(obj)
  38.             encoding = guess['encoding']
  39.  
  40.             if encoding not in self:
  41.                 self.setdefault(encoding, {'confidence': 0.0, 'total': 0})
  42.  
  43.             self[encoding]['confidence'] += guess['confidence']
  44.             self[encoding]['total'] += 1
  45.  
  46.     def encodings(self):
  47.         return sorted(self, key=lambda x: self[x]['total'], reverse=True)
  48.  
  49. charset_coercing = {
  50.     ('MacCyrillic', 'windows-1251'): {'MacCyrillic': -0.1},
  51. }
  52.  
  53.  
  54. def fix_encoding(src, possible_encodings):
  55.     if not isinstance(src, basestring) or not src:
  56.         return src
  57.  
  58.     guess = chardet.detect(src)
  59.     first_encoding = guess['encoding']
  60.  
  61.     encodings = list(possible_encodings)        # copy possible encodings
  62.     if first_encoding in encodings:             # we believe chardet, so first tested
  63.         encodings.remove(first_encoding)        # encoding will be the one, detected by chardet
  64.     encodings.insert(0, first_encoding)
  65.     encodings_set = set(encodings)
  66.  
  67.     tested_encodings = { k:{'string': '', 'confidence': -1.0} for k in encodings }
  68.  
  69.     try:
  70.         lat = src.encode('latin-1') if isinstance(src, unicode) else src # make latin string
  71.     except UnicodeEncodeError:
  72.         lat = src.encode('utf-8') # may be not necessary
  73.  
  74.     while encodings:
  75.         candidate = encodings.pop(0)
  76.         if not candidate:
  77.             continue
  78.  
  79.         if not candidate in tested_encodings:
  80.             tested_encodings.setdefault(candidate, {'string': '', 'confidence': -1.0})
  81.  
  82.         try:
  83.             fixed_string = lat.decode(candidate)
  84.         except UnicodeDecodeError:
  85.             continue
  86.  
  87.         # try to detect charset again
  88.         fixed_confidence = chardet.detect(fixed_string)['confidence']
  89.         # it seems, that new confidence is usually higher, if the previous detection was right
  90.  
  91.         tested_encodings[candidate]['string'] = fixed_string
  92.         tested_encodings[candidate]['confidence'] = fixed_confidence
  93.  
  94.     # perform charset coercing
  95.     for subset, coercing_encodings in charset_coercing.items():
  96.         if set(subset).issubset(encodings_set):
  97.             for enc, penalty in coercing_encodings.items():
  98.                 tested_encodings[enc]['confidence'] += penalty
  99.  
  100.  
  101.     result = tested_encodings.get(first_encoding)
  102.     if result['confidence'] >= 0.99: # if confidence value for first detection is high, use it
  103.         return result['string']
  104.  
  105.     max_confidence_charset = max(tested_encodings, key=lambda x: tested_encodings[x]['confidence'])
  106.     return tested_encodings[max_confidence_charset]['string']
  107.  
  108.  
  109. def ascii_image(img_data):
  110.     import aalib
  111.     import Image
  112.     from cStringIO import StringIO
  113.     screen = aalib.AsciiScreen(width=80, height=40)
  114.     fp = StringIO(img_data)
  115.     image = Image.open(fp).convert('L').resize(screen.virtual_size)
  116.     screen.put_image((0, 0), image)
  117.     return screen.render()
  118.  
  119.  
  120. def extract_tags(media_file):
  121.     try:
  122.         mf = MediaFile(media_file)
  123.     except:
  124.         return {}
  125.  
  126.     mfe = MostFrequentEncoding()
  127.     mfe.from_attrs(mf)
  128.  
  129.     encodings = mfe.encodings()
  130.     tags = {}
  131.  
  132.     for attr in sorted(dir(mf)):
  133.         val = getattr(mf, attr)
  134.         if not val or callable(val) or \
  135.            attr in ['__dict__', '__doc__', '__module__', '__weakref__', 'mgfile', 'art']:
  136.             continue
  137.  
  138.         fixed = fix_encoding(val, encodings)
  139.         tags[attr] = remove_extra_spaces(fixed) if isinstance(fixed, basestring) else fixed
  140.  
  141.     if mf.art:
  142.         tags['art'] = { 'data': mf.art, 'mime': imghdr.what(None, h=mf.art) }
  143.  
  144.     return tags
  145.  
  146.  
  147. #f = '/media/Media/Music/Маврик/Одиночество/02_mavrin_sergey_mavrik_svet_dnevonoy_issyak.mp3'
  148. #pprint(extract_tags(f))
  149. #exit()
  150.  
  151. files = ( os.path.join(path,name)
  152.     for path,dirs,files in os.walk('/media/Media/Music/')
  153.         for name in files )
  154.  
  155. attrs = {}
  156.  
  157. for f in files:
  158.     print '*'*10, f
  159.     tags = extract_tags(f)
  160.  
  161.     for k in tags.keys():
  162.         if k not in attrs:
  163.             attrs.setdefault(k, 0)
  164.         attrs[k] += 1
  165.  
  166.     if 'art' in tags:
  167.         print ascii_image(tags['art']['data'])
  168.         del tags['art']
  169.  
  170.     pprint(tags)
  171.  
  172. print attrs
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement