Advertisement
Guest User

Untitled

a guest
Aug 23rd, 2019
80
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.71 KB | None | 0 0
  1. #!/usr/bin/env python3
  2.  
  3. import os
  4. import sys
  5. import urllib.parse
  6. import urllib.request
  7. import lxml.etree as ET
  8. import icu
  9. import collections
  10. import math
  11.  
  12. class RatingDB:
  13.  
  14. Song = collections.namedtuple('Song', 'title artist album duration rating filesize')
  15.  
  16. def __init__(self):
  17. self.songs = []
  18. self.tl = icu.Transliterator.createInstance('Any-Latin; Latin-ASCII')
  19. self.attrs = ('title', 'artist', 'album')
  20. self.attrs_importance = {
  21. 'title': 0.6,
  22. 'artist': 0.2,
  23. 'album': 0.2
  24. }
  25.  
  26. self.shingles_to_songs = {}
  27. self.songs_to_shingles = {attr: [] for attr in self.attrs}
  28.  
  29.  
  30. def shingle(self, s, l=3):
  31. t = ''.join([c for c in self.tl.transliterate(s).lower() if c.isalnum()])
  32. return {t[i:i+l] for i in range(0, len(t) - l)}
  33.  
  34. def read_song(self, x):
  35. def text_or_default(key, default=None):
  36. elem = x.find(key)
  37. if elem is None:
  38. return default
  39. return elem.text
  40.  
  41. # Fetch the important metadata
  42. title = text_or_default("title")
  43. artist = text_or_default("artist")
  44. album = text_or_default("album")
  45. duration = float(text_or_default("duration", 0.0))
  46. rating = text_or_default("rating")
  47. filesize = int(text_or_default("file-size", 0))
  48.  
  49. # Map "unknown" onto nothing
  50. if title.lower() == "unknown":
  51. title = None
  52. if artist.lower() == "unknown":
  53. artist = None
  54. if album.lower() == "unknown":
  55. album = None
  56.  
  57. return RatingDB.Song(title, artist, album, duration, rating, filesize)
  58.  
  59.  
  60. def read_rhythmdb(self, filename):
  61. x_root = ET.parse(filename).getroot()
  62. for x_child in x_root:
  63. if x_child.tag != 'entry' or x_child.attrib['type'] != 'song':
  64. continue
  65.  
  66. song = self.read_song(x_child)
  67. if song.rating is None:
  68. continue
  69. song_idx = len(self.songs)
  70. self.songs.append(song)
  71. for attr in self.attrs:
  72. self.songs_to_shingles[attr].append(set())
  73. s = getattr(song, attr)
  74. if s is None:
  75. continue
  76. shingles = self.shingle(s)
  77. for shingle in shingles:
  78. if not shingle in self.shingles_to_songs:
  79. self.shingles_to_songs[shingle] = set()
  80. self.shingles_to_songs[shingle].add(song_idx)
  81. self.songs_to_shingles[attr][song_idx] = shingles
  82.  
  83. def infer_rating(self, song):
  84. # For relevant attributes, compute the similarity to other songs in the
  85. # local database
  86. matched_songs = {}
  87. for attr in self.attrs:
  88. # Convert the attribute to shingles
  89. value = getattr(song, attr)
  90. if value is None:
  91. continue
  92. shingles = self.shingle(value)
  93.  
  94. # Fetch all songs sharing the same shingles
  95. songs = set()
  96. for shingle in shingles:
  97. if shingle in self.shingles_to_songs:
  98. songs |= self.shingles_to_songs[shingle]
  99.  
  100. # For each song, compute the Jaccard similarity between the song
  101. # shingles and the shingles for this attribute
  102. sim = {}
  103. for song_idx in songs:
  104. song_shingles = self.songs_to_shingles[attr][song_idx]
  105. sim = len(song_shingles.intersection(shingles)) / len(song_shingles | shingles)
  106. if sim > 0.5:
  107. if not song_idx in matched_songs:
  108. matched_songs[song_idx] = {attr: 0.0 for attr in self.attrs}
  109. matched_songs[song_idx][attr] = sim
  110.  
  111. # Compute the overal score, i.e. the likelihood that the given song is
  112. # actually one of the matched_songs. Compare the song durations. Return
  113. # the above-threshold song with the highest rating.
  114. best_rating, best_song_idx, best_p = 0, None, 0.5
  115. for song_idx in matched_songs:
  116. matched_song = self.songs[song_idx]
  117. p = 0.0
  118. for attr, w in self.attrs_importance.items():
  119. p += w * matched_songs[song_idx][attr]
  120. if song.duration > 0.0 and matched_song.duration > 0.0:
  121. p *= math.exp(-(song.duration - matched_song.duration)**2 / 1000)
  122. rating = 0 if matched_song.rating is None else int(matched_song.rating)
  123. if p > 0.5 and rating > best_rating:
  124. best_rating = rating
  125. best_song_idx = song_idx
  126. best_p = p
  127. elif p > best_p and rating == best_rating:
  128. best_song_idx = song_idx
  129. best_p = p
  130. return best_song_idx
  131.  
  132. def transfer_ratings(self, filename):
  133. matched_songs = {}
  134. x_root = ET.parse(filename).getroot()
  135. for x_child in x_root:
  136. if x_child.tag != 'entry' or x_child.attrib['type'] != 'song':
  137. continue
  138.  
  139. # Convert the element into a "song" element
  140. song = self.read_song(x_child)
  141.  
  142. # Remove the "rating" element; there can be only one song matched
  143. # to the same logical song
  144. x_rating = x_child.find("rating")
  145. if not x_rating is None:
  146. x_child.remove(x_rating)
  147.  
  148. # Try to infer the rating and the logical song
  149. song_idx = self.infer_rating(song)
  150. if song_idx is None:
  151. continue
  152. sys.stderr.write(str(song) + ' --> ' + str(self.songs[song_idx]) + '\n')
  153.  
  154. # Remember that this song was matched to the given song index
  155. if not song_idx in matched_songs:
  156. matched_songs[song_idx] = []
  157. matched_songs[song_idx].append((song, x_child))
  158.  
  159. # For each matched logical song, find the song in the XML file with the
  160. # best quality. Add the "rating" tag with the inferred rating to exactly
  161. # one song.
  162. for song_idx, songs in matched_songs.items():
  163. largest_filesize, x_tar = 0, None
  164. for song, x_child in songs:
  165. if song.filesize > largest_filesize:
  166. x_tar = x_child
  167. largest_filesize = song.filesize
  168. if not x_tar is None:
  169. x_rating = ET.SubElement(x_tar, "rating")
  170. x_rating.text = str(self.songs[song_idx].rating)
  171.  
  172. return x_root
  173.  
  174. db = RatingDB()
  175. db.read_rhythmdb('/home/andreas/.local/share/rhythmbox/rhythmdb.xml')
  176. db.read_rhythmdb('/home/andreas/.local/share/rhythmbox/rhythmdb.bck.xml')
  177. x_new = db.transfer_ratings('/home/andreas/.local/share/rhythmbox/rhythmdb.xml')
  178. sys.stdout.buffer.write(ET.tostring(x_new, pretty_print=True))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement