# Untitled

#!/usr/bin/env python3

import os
import sys
import urllib.parse
import urllib.request
import lxml.etree as ET
import icu
import collections
import math

class RatingDB:
    """Fuzzy index of rated songs used to carry ratings over to another library.

    Songs are read from Rhythmbox-style XML documents (``entry`` elements with
    ``type="song"``). Their title, artist and album are broken into character
    shingles so that a song from another document can be matched against the
    stored songs by Jaccard similarity.
    """

    Song = collections.namedtuple('Song', 'title artist album duration rating filesize')

    def __init__(self):
        # All rated songs read so far; a song's position is its "song index".
        self.songs = []
        # Transliterator used to normalise text before shingling.
        self.tl = icu.Transliterator.createInstance('Any-Latin; Latin-ASCII')
        # Textual attributes that take part in the matching ...
        self.attrs = ('title', 'artist', 'album')
        # ... and their weight in the overall match score.
        self.attrs_importance = {
                'title': 0.6,
                'artist': 0.2,
                'album': 0.2
        }

        # shingle -> set of song indices having that shingle in any attribute
        self.shingles_to_songs = {}
        # attribute -> list (by song index) of that attribute's shingle set
        self.songs_to_shingles = {attr: [] for attr in self.attrs}

    def shingle(self, s, l=3):
        """Return the set of length-``l`` character shingles of ``s``.

        ``s`` is transliterated with ``self.tl``, lower-cased and reduced to
        its alphanumeric characters before being cut into overlapping windows.

        A non-empty normalised string that is not longer than ``l`` yields
        itself as its only shingle, so that short names remain matchable. An
        empty normalised string yields an empty set.
        """
        t = ''.join(c for c in self.tl.transliterate(s).lower() if c.isalnum())
        if not t:
            return set()
        if len(t) <= l:
            return {t}
        # There are len(t) - l + 1 windows; the "+ 1" keeps the last one.
        return {t[i:i + l] for i in range(len(t) - l + 1)}

    def read_song(self, x):
        """Build a ``Song`` tuple from the XML element ``x``.

        Missing or empty child elements fall back to ``None`` (title, artist,
        album, rating) or zero (duration, file size). A title, artist or album
        whose text is "unknown" (case-insensitive) is mapped to ``None``.

        Raises ``ValueError`` if the duration or file size text is not numeric.
        """
        def text_or_default(key, default=None):
            elem = x.find(key)
            # An element without text (e.g. <title/>) counts as missing.
            if elem is None or elem.text is None:
                return default
            return elem.text

        def name_or_none(key):
            # Map "unknown" onto nothing
            value = text_or_default(key)
            if value is None or value.lower() == "unknown":
                return None
            return value

        # Fetch the important metadata
        title = name_or_none("title")
        artist = name_or_none("artist")
        album = name_or_none("album")
        duration = float(text_or_default("duration", 0.0))
        rating = text_or_default("rating")
        filesize = int(text_or_default("file-size", 0))

        return RatingDB.Song(title, artist, album, duration, rating, filesize)

    def read_rhythmdb(self, filename):
        """Add every rated song entry of the XML document ``filename`` to the index.

        ``filename`` is handed to ``ET.parse`` unchanged. Entries that are not
        ``<entry type="song">`` and songs without a rating are skipped.
        """
        x_root = ET.parse(filename).getroot()
        for x_child in x_root:
            # .get() tolerates entries that carry no "type" attribute at all.
            if x_child.tag != 'entry' or x_child.get('type') != 'song':
                continue

            song = self.read_song(x_child)
            if song.rating is None:
                continue
            song_idx = len(self.songs)
            self.songs.append(song)
            for attr in self.attrs:
                value = getattr(song, attr)
                shingles = set() if value is None else self.shingle(value)
                # Always append so the per-attribute lists stay aligned with
                # self.songs, even when the attribute is missing.
                self.songs_to_shingles[attr].append(shingles)
                for shingle in shingles:
                    self.shingles_to_songs.setdefault(shingle, set()).add(song_idx)

    def infer_rating(self, song):
        """Return the index of the stored song that best matches ``song``.

        For each textual attribute the Jaccard similarity between the shingles
        of ``song`` and those of every candidate is computed; similarities of
        0.5 or less are ignored. The weighted sum of the similarities is
        damped by the difference in duration when both durations are known.
        Among candidates scoring above 0.5 the one with the highest rating
        wins, ties being broken by the higher score.

        Returns ``None`` when no stored song scores above the threshold.
        """
        # For relevant attributes, compute the similarity to other songs in the
        # local database
        matched_songs = {}
        for attr in self.attrs:
            # Convert the attribute to shingles
            value = getattr(song, attr)
            if value is None:
                continue
            shingles = self.shingle(value)

            # Fetch all songs sharing at least one shingle
            candidates = set()
            for shingle in shingles:
                candidates |= self.shingles_to_songs.get(shingle, set())

            # For each candidate, compute the Jaccard similarity between its
            # shingles and the shingles for this attribute. The union is never
            # empty here: candidates only exist if ``shingles`` is non-empty.
            for song_idx in candidates:
                song_shingles = self.songs_to_shingles[attr][song_idx]
                sim = len(song_shingles & shingles) / len(song_shingles | shingles)
                if sim > 0.5:
                    if song_idx not in matched_songs:
                        matched_songs[song_idx] = {a: 0.0 for a in self.attrs}
                    matched_songs[song_idx][attr] = sim

        # Compute the overall score, i.e. the likelihood that the given song is
        # actually one of the matched_songs. Compare the song durations. Return
        # the above-threshold song with the highest rating.
        best_rating, best_song_idx, best_p = 0, None, 0.5
        for song_idx, sims in matched_songs.items():
            matched_song = self.songs[song_idx]
            p = 0.0
            for attr, w in self.attrs_importance.items():
                p += w * sims[attr]
            if song.duration > 0.0 and matched_song.duration > 0.0:
                p *= math.exp(-(song.duration - matched_song.duration)**2 / 1000)
            rating = 0 if matched_song.rating is None else int(matched_song.rating)
            if p > 0.5 and rating > best_rating:
                best_rating = rating
                best_song_idx = song_idx
                best_p = p
            elif p > best_p and rating == best_rating:
                best_song_idx = song_idx
                best_p = p
        return best_song_idx

    def transfer_ratings(self, filename):
        """Apply the indexed ratings to the XML document ``filename``.

        Every song entry that can be matched to a stored song loses its own
        ``rating`` element. For each stored song, exactly one of the entries
        matched to it (the one with the largest file size, the first one on a
        tie) then receives a new ``rating`` element with the stored rating.
        Entries that cannot be matched are left untouched.

        Each match is logged to stderr. Returns the modified root element.
        """
        matched_songs = {}
        x_root = ET.parse(filename).getroot()
        for x_child in x_root:
            if x_child.tag != 'entry' or x_child.get('type') != 'song':
                continue

            # Convert the element into a "song" element
            song = self.read_song(x_child)

            # Try to infer the rating and the logical song. Unmatched entries
            # keep whatever rating they already had.
            song_idx = self.infer_rating(song)
            if song_idx is None:
                continue
            sys.stderr.write(str(song) + ' --> ' + str(self.songs[song_idx]) + '\n')

            # Remove the "rating" element; there can be only one song matched
            # to the same logical song
            x_rating = x_child.find("rating")
            if x_rating is not None:
                x_child.remove(x_rating)

            # Remember that this song was matched to the given song index
            matched_songs.setdefault(song_idx, []).append((song, x_child))

        # For each matched logical song, find the song in the XML file with the
        # best quality. Add the "rating" tag with the inferred rating to exactly
        # one song. max() returns the first of several equally large entries,
        # so a rating is written even if no entry has a known file size.
        for song_idx, songs in matched_songs.items():
            _, x_tar = max(songs, key=lambda pair: pair[0].filesize)
            x_rating = ET.SubElement(x_tar, "rating")
            x_rating.text = str(self.songs[song_idx].rating)

        return x_root

# Index the rated songs of rhythmdb.xml and of rhythmdb.bck.xml (presumably a
# backup copy), re-apply the inferred ratings to rhythmdb.xml and print the
# resulting XML document to stdout.
db = RatingDB()
for source in ('/home/andreas/.local/share/rhythmbox/rhythmdb.xml',
               '/home/andreas/.local/share/rhythmbox/rhythmdb.bck.xml'):
    db.read_rhythmdb(source)
x_new = db.transfer_ratings('/home/andreas/.local/share/rhythmbox/rhythmdb.xml')
serialized = ET.tostring(x_new, pretty_print=True)
sys.stdout.buffer.write(serialized)