Python fix broken encodings

# -*- coding: utf-8 -*-
import os
import imghdr
import chardet
from beets.mediafile import MediaFile
import sys
from pprint import PrettyPrinter
import re

# Matches any run of two or more consecutive whitespace characters.
spaces_re = re.compile(r'\s\s+')


def remove_extra_spaces(s):
    """Collapse each run of two or more whitespace characters in *s* to one space.

    A lone whitespace character (even a single tab or newline) is left as it
    is, because the pattern only matches runs of length two or more.
    """
    collapsed = re.sub(spaces_re, ' ', s)
    return collapsed

class MyPrettyPrinter(PrettyPrinter):
    """PrettyPrinter that un-escapes the repr of strings so they print readably."""

    def format(self, *args, **kwargs):
        """Format one object, undoing escape sequences in string reprs.

        All arguments are passed straight through to ``PrettyPrinter.format``.

        Returns:
            The ``(repr, readable, recursive)`` tuple produced by the base
            class. When the repr is a byte string that starts like a string
            literal, its escape sequences are decoded; a unicode literal is
            additionally encoded for the console.
        """
        text, readable, recursive = PrettyPrinter.format(self, *args, **kwargs)
        # Only a byte-string repr has escapes to undo (and a ``decode`` method);
        # a text repr is returned unchanged.
        if text and isinstance(text, bytes):
            if text[0] in ('"', "'"):
                text = text.decode('string_escape')
            elif text[0:2] in ("u'", 'u"'):
                # sys.stdout.encoding can be None (e.g. when output is piped),
                # which would make encode() raise; fall back to UTF-8.
                encoding = sys.stdout.encoding or 'utf-8'
                text = text.decode('unicode_escape').encode(encoding)
        return text, readable, recursive

def pprint(obj, stream=None, indent=1, width=80, depth=None):
    """Pretty-print *obj* with MyPrettyPrinter.

    The stream, indent, width and depth arguments are forwarded unchanged to
    the MyPrettyPrinter constructor.
    """
    options = {'stream': stream, 'indent': indent, 'width': width, 'depth': depth}
    MyPrettyPrinter(**options).pprint(obj)


class MostFrequentEncoding(dict):
    """Tally of the encodings chardet detects across a set of string values.

    Each key is an encoding as reported by chardet; each value is a dict
    holding the summed 'confidence' and the number of strings ('total') for
    which that encoding was detected.
    """

    def from_attrs(self, obj):
        """Feed the value of every attribute listed by ``dir(obj)`` into the tally."""
        for name in dir(obj):
            self.feed(getattr(obj, name))

    def feed(self, obj):
        """Record chardet's guess for *obj*; empty or non-string values are ignored."""
        if not obj or not isinstance(obj, basestring):
            return

        detected = chardet.detect(obj)
        # setdefault hands back the existing entry or installs a fresh one
        stats = self.setdefault(detected['encoding'], {'confidence': 0.0, 'total': 0})
        stats['confidence'] += detected['confidence']
        stats['total'] += 1

    def encodings(self):
        """Return the recorded encodings, most frequently detected first."""
        def occurrences(name):
            return self[name]['total']

        return sorted(self, key=occurrences, reverse=True)

# Confidence adjustments used by fix_encoding().
# Key: a tuple of encoding names; value: {encoding: amount added to that
# encoding's confidence}. The adjustment is applied when every encoding in the
# key tuple is among the candidate encodings, so here MacCyrillic loses 0.1
# whenever windows-1251 is a candidate as well.
charset_coercing = {
    ('MacCyrillic', 'windows-1251'): {'MacCyrillic': -0.1},
}


def fix_encoding(src, possible_encodings):
    """Try to repair a string that was decoded with the wrong charset.

    The string is turned back into bytes (via Latin-1 when it is unicode),
    decoded with each candidate encoding, and every successful decoding is
    scored by running chardet on it again.

    Args:
        src: The value to repair. Anything that is empty or not a string is
            returned unchanged.
        possible_encodings: Iterable of encoding names to try in addition to
            the encoding chardet detects for ``src``.

    Returns:
        The decoding for chardet's first guess when its (adjusted) confidence
        is at least 0.99, otherwise the decoding with the highest confidence.
        If no candidate can decode the bytes, ``src`` itself is returned
        rather than an empty string.
    """
    if not src or not isinstance(src, basestring):
        return src

    first_encoding = chardet.detect(src)['encoding']

    # We believe chardet, so its guess is tested first, followed by the
    # remaining candidates in their given order.
    candidates = [first_encoding]
    candidates.extend(enc for enc in possible_encodings if enc != first_encoding)

    try:
        # Encoding unicode as Latin-1 gives back the original byte values.
        raw = src.encode('latin-1') if isinstance(src, unicode) else src
    except UnicodeEncodeError:
        raw = src.encode('utf-8')  # may be not necessary

    decoded = {}  # encoding -> {'string': ..., 'confidence': ...}
    order = []    # successfully decoded encodings, in the order they were tried
    for candidate in candidates:
        # chardet may report no encoding at all (None); duplicates add nothing.
        if not candidate or candidate in decoded:
            continue

        try:
            fixed_string = raw.decode(candidate)
        except (UnicodeDecodeError, LookupError):
            # LookupError: the name is not a codec Python knows about.
            continue

        # Detect the charset again: the new confidence seems to be higher
        # when the candidate was the right one.
        decoded[candidate] = {
            'string': fixed_string,
            'confidence': chardet.detect(fixed_string)['confidence'],
        }
        order.append(candidate)

    if not decoded:
        # Nothing could decode the bytes; keep the original value.
        return src

    # Perform charset coercing: adjust confidences for known candidate groups.
    candidate_set = set(candidates)
    for subset, adjustments in charset_coercing.items():
        if set(subset).issubset(candidate_set):
            for enc, penalty in adjustments.items():
                if enc in decoded:
                    decoded[enc]['confidence'] += penalty

    # If the confidence for the first detection is high, use it.
    first = decoded.get(first_encoding)
    if first is not None and first['confidence'] >= 0.99:
        return first['string']

    best = max(order, key=lambda enc: decoded[enc]['confidence'])
    return decoded[best]['string']


def ascii_image(img_data):
    """Render encoded image data as ASCII art on an 80x40 aalib screen.

    Args:
        img_data: Raw bytes of an image file in a format PIL can open.

    Returns:
        The result of ``aalib.AsciiScreen.render()`` for the image.
    """
    import aalib
    from io import BytesIO
    try:
        from PIL import Image   # Pillow only provides the packaged module
    except ImportError:
        import Image            # legacy top-level PIL layout

    screen = aalib.AsciiScreen(width=80, height=40)
    # Convert to greyscale ('L') and scale to the screen's virtual size.
    image = Image.open(BytesIO(img_data)).convert('L').resize(screen.virtual_size)
    screen.put_image((0, 0), image)
    return screen.render()


def extract_tags(media_file):
    """Read the tags of a media file and repair the encoding of string values.

    Args:
        media_file: Path handed to ``MediaFile``.

    Returns:
        Dict mapping attribute name to value for every truthy, non-callable
        MediaFile attribute. String values are passed through fix_encoding()
        and remove_extra_spaces(). When the file has embedded art it is stored
        under 'art' as ``{'data': <art>, 'mime': <imghdr.what() result>}``.
        An empty dict is returned when MediaFile cannot open the file.
    """
    try:
        mf = MediaFile(media_file)
    except Exception:
        # Best effort: files that cannot be read as media yield no tags.
        # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit through.
        return {}

    mfe = MostFrequentEncoding()
    mfe.from_attrs(mf)
    encodings = mfe.encodings()

    # Internals and the binary art blob are not text tags.
    skipped = ('__dict__', '__doc__', '__module__', '__weakref__', 'mgfile', 'art')
    tags = {}

    for attr in sorted(dir(mf)):
        if attr in skipped:
            continue
        val = getattr(mf, attr)
        if not val or callable(val):
            continue

        fixed = fix_encoding(val, encodings)
        tags[attr] = remove_extra_spaces(fixed) if isinstance(fixed, basestring) else fixed

    art = mf.art  # read once
    if art:
        tags['art'] = {'data': art, 'mime': imghdr.what(None, h=art)}

    return tags


#f = '/media/Media/Music/Маврик/Одиночество/02_mavrin_sergey_mavrik_svet_dnevonoy_issyak.mp3'
#pprint(extract_tags(f))
#exit()

files = ( os.path.join(path,name)
    for path,dirs,files in os.walk('/media/Media/Music/')
        for name in files )

attrs = {}

for f in files:
    print '*'*10, f
    tags = extract_tags(f)

    for k in tags.keys():
        if k not in attrs:
            attrs.setdefault(k, 0)
        attrs[k] += 1

    if 'art' in tags:
        print ascii_image(tags['art']['data'])
        del tags['art']

    pprint(tags)

print attrs