Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- import os
- import imghdr
- import chardet
- from beets.mediafile import MediaFile
- import sys
- from pprint import PrettyPrinter
- import re
# Matches any run of two or more consecutive whitespace characters.
spaces_re = re.compile(r'\s\s+')


def remove_extra_spaces(s):
    """Return *s* with each run of 2+ whitespace characters replaced by one space.

    A lone whitespace character (including a single tab or newline) is left
    untouched, because the pattern only matches runs of at least two.
    """
    collapsed = spaces_re.sub(' ', s)
    return collapsed
class MyPrettyPrinter(PrettyPrinter):
    """PrettyPrinter that shows string contents instead of escape sequences.

    The stock printer renders non-ASCII text as ``\\xNN`` / ``\\uNNNN``
    escapes; this subclass undoes that escaping so the text is readable on
    the console.
    """

    def format(self, *args, **kwargs):
        """Return ``(text, readable, recursive)`` as ``PrettyPrinter.format`` does.

        Only the first element is altered:

        * a repr starting with a quote (a byte string) is decoded with
          ``string_escape``;
        * a repr starting with ``u'`` / ``u"`` (a unicode string) is decoded
          with ``unicode_escape`` and encoded for standard output.

        Any other repr is passed through unchanged.
        """
        # Named ``text`` rather than ``repr`` so the builtin is not shadowed.
        text, readable, recursive = PrettyPrinter.format(self, *args, **kwargs)
        if text:
            if text[0] in ('"', "'"):
                text = text.decode('string_escape')
            elif text[0:2] in ("u'", 'u"'):
                # sys.stdout.encoding is None when output is piped or
                # redirected; fall back to UTF-8 instead of crashing in
                # encode(None).
                target = getattr(sys.stdout, 'encoding', None) or 'utf-8'
                # 'replace' keeps this debugging aid from raising on
                # characters the output encoding cannot represent.
                text = text.decode('unicode_escape').encode(target, 'replace')
        return text, readable, recursive
def pprint(obj, stream=None, indent=1, width=80, depth=None):
    """Pretty-print *obj* through ``MyPrettyPrinter``.

    ``stream``, ``indent``, ``width`` and ``depth`` are forwarded unchanged to
    the printer's constructor.
    """
    options = dict(stream=stream, indent=indent, width=width, depth=depth)
    MyPrettyPrinter(**options).pprint(obj)
class MostFrequentEncoding(dict):
    """Tally of the charsets chardet guesses for a collection of strings.

    Maps an encoding name to ``{'confidence': <sum of chardet confidences>,
    'total': <number of strings guessed as that encoding>}``.
    """

    def from_attrs(self, obj):
        """Feed the value of every attribute that ``dir(obj)`` lists."""
        for attr in dir(obj):
            self.feed(getattr(obj, attr))

    def feed(self, obj):
        """Record chardet's guess for *obj*.

        Empty values and non-strings are ignored, and so are strings for
        which chardet reports no encoding.
        """
        if not (obj and isinstance(obj, basestring)):
            return
        guess = chardet.detect(obj)
        encoding = guess['encoding']
        if not encoding:
            # chardet answers with encoding None when it cannot decide; that
            # is not a candidate worth counting.
            return
        stats = self.setdefault(encoding, {'confidence': 0.0, 'total': 0})
        stats['confidence'] += guess['confidence']
        stats['total'] += 1

    def encodings(self):
        """Return the encoding names, most frequently guessed first.

        Encodings seen equally often are ordered by their accumulated
        confidence, so the result does not depend on dict ordering.
        """
        return sorted(
            self,
            key=lambda name: (self[name]['total'], self[name]['confidence']),
            reverse=True,
        )
# Confidence adjustments for charsets that are easily confused with each
# other. Each key is a tuple of encodings that must ALL be among the
# candidates; the value maps an encoding to the amount added to its score.
charset_coercing = {
    ('MacCyrillic', 'windows-1251'): {'MacCyrillic': -0.1},
}


def fix_encoding(src, possible_encodings):
    """Try to repair a string that was decoded or stored with the wrong charset.

    The string is reduced to raw bytes, decoded with every candidate
    encoding, and each decoding is scored by running chardet on it again.

    Args:
        src: value to repair. Falsy values and non-strings are returned
            unchanged.
        possible_encodings: iterable of candidate encoding names. The
            encoding chardet detects for ``src`` is always tried first.

    Returns:
        The decoding produced by chardet's first guess when its score is at
        least 0.99, otherwise the decoding with the highest score. ``src``
        itself is returned when no candidate could decode the bytes.
    """
    if not src or not isinstance(src, basestring):
        return src

    first_encoding = chardet.detect(src)['encoding']

    # chardet's own guess is tested first; the remaining candidates keep
    # their order. Empty/None names are dropped up front.
    encodings = [enc for enc in possible_encodings
                 if enc and enc != first_encoding]
    if first_encoding:
        encodings.insert(0, first_encoding)

    # Get back to raw bytes: mis-decoded tags are typically bytes that were
    # read as latin-1, so encoding to latin-1 undoes that step.
    try:
        lat = src.encode('latin-1') if isinstance(src, unicode) else src
    except UnicodeEncodeError:
        lat = src.encode('utf-8')  # may be not necessary

    tested_encodings = {}
    for candidate in encodings:
        try:
            fixed_string = lat.decode(candidate)
        except (UnicodeDecodeError, LookupError):
            # LookupError: chardet can name charsets Python has no codec for.
            continue
        # Detect the charset again: the new confidence is usually higher if
        # this candidate was the right one.
        # NOTE(review): chardet is handed the *decoded* text here; newer
        # chardet releases appear to accept only bytes - confirm against the
        # installed version.
        tested_encodings[candidate] = {
            'string': fixed_string,
            'confidence': chardet.detect(fixed_string)['confidence'],
        }

    if not tested_encodings:
        # Nothing decoded cleanly: keep the original value rather than
        # replacing the tag with an empty string.
        return src

    # Perform charset coercing once, after all candidates have been scored.
    encodings_set = set(encodings)
    for subset, coercing_encodings in charset_coercing.items():
        if set(subset).issubset(encodings_set):
            for enc, penalty in coercing_encodings.items():
                if enc in tested_encodings:
                    tested_encodings[enc]['confidence'] += penalty

    # If the score for chardet's first detection is high, trust it.
    result = tested_encodings.get(first_encoding)
    if result is not None and result['confidence'] >= 0.99:
        return result['string']

    max_confidence_charset = max(
        tested_encodings, key=lambda enc: tested_encodings[enc]['confidence'])
    return tested_encodings[max_confidence_charset]['string']
def ascii_image(img_data):
    """Render encoded image data as ASCII art.

    Args:
        img_data: the raw bytes of an image file, as found in a tag.

    Returns:
        Whatever ``aalib.AsciiScreen.render()`` produces for an 80x40 screen
        showing the image.
    """
    # Imported lazily so the optional rendering dependencies are only needed
    # when an image is actually displayed.
    import aalib
    try:
        from PIL import Image
    except ImportError:
        # Legacy PIL installs expose Image as a top-level module.
        import Image
    from cStringIO import StringIO

    screen = aalib.AsciiScreen(width=80, height=40)
    fp = StringIO(img_data)
    # Mode 'L' is greyscale; the image is scaled to the screen's virtual size.
    image = Image.open(fp).convert('L').resize(screen.virtual_size)
    screen.put_image((0, 0), image)
    return screen.render()
def extract_tags(media_file):
    """Read the tags of *media_file* and repair the encoding of text values.

    Args:
        media_file: path handed to ``MediaFile``.

    Returns:
        A dict mapping attribute name to value for every truthy,
        non-callable attribute of the ``MediaFile``. String values go through
        ``fix_encoding`` and ``remove_extra_spaces``. When the file carries
        art, ``tags['art']`` is ``{'data': <bytes>, 'mime': <imghdr result>}``.
        An empty dict is returned when the file cannot be opened.
    """
    try:
        mf = MediaFile(media_file)
    except Exception:
        # Best effort: a file that cannot be opened simply has no tags.
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # stop a long library scan.
        return {}

    # Rank the encodings chardet guesses across all attributes of this file,
    # so every tag is tested against the same candidate list.
    mfe = MostFrequentEncoding()
    mfe.from_attrs(mf)
    encodings = mfe.encodings()

    # Internals, plus 'art', which is handled separately below.
    skipped = ('__dict__', '__doc__', '__module__', '__weakref__',
               'mgfile', 'art')
    tags = {}
    for attr in sorted(dir(mf)):
        if attr in skipped:
            continue
        val = getattr(mf, attr)
        if not val or callable(val):
            continue
        fixed = fix_encoding(val, encodings)
        tags[attr] = remove_extra_spaces(fixed) if isinstance(fixed, basestring) else fixed

    if mf.art:
        # imghdr reports an image format name such as 'jpeg' (or None), not a
        # full MIME type string.
        tags['art'] = {'data': mf.art, 'mime': imghdr.what(None, h=mf.art)}
    return tags
- #f = '/media/Media/Music/Маврик/Одиночество/02_mavrin_sergey_mavrik_svet_dnevonoy_issyak.mp3'
- #pprint(extract_tags(f))
- #exit()
- files = ( os.path.join(path,name)
- for path,dirs,files in os.walk('/media/Media/Music/')
- for name in files )
- attrs = {}
- for f in files:
- print '*'*10, f
- tags = extract_tags(f)
- for k in tags.keys():
- if k not in attrs:
- attrs.setdefault(k, 0)
- attrs[k] += 1
- if 'art' in tags:
- print ascii_image(tags['art']['data'])
- del tags['art']
- pprint(tags)
- print attrs
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement