sane-tag-music: handle more character encoding edgecases

2023-12-07 15:52:28 +00:00
parent 2c66d8cad0
commit 7f08ad01db
1 changed files with 61 additions and 22 deletions
--- a/pkgs/additional/sane-scripts/src/sane-tag-music
+++ b/pkgs/additional/sane-scripts/src/sane-tag-music
@@ -14,6 +14,34 @@ import mutagen.easyid3
 logger = logging.getLogger(__name__)
 def clean_for_loose_compare(a: str) -> str:
    """
    Normalize `a` so that two spellings of the same name compare equal.

    The input is trimmed, lowercased and stripped of a leading "the ".
    Common punctuation and spaces are then dropped, and finally everything
    except ASCII letters/digits is dropped as well -- unless that last step
    would discard more than half of the remaining characters, in which case
    the punctuation-free string is returned instead.
    """
    # rationale: tags extracted from a path often lack "rich" characters which
    # the embedded tags still have, so rich characters are removed before comparing.
    punctuation = '. &()[];:'
    ascii_alnum = 'abcdefghijklmnopqrstuvwxyz0123456789'

    text = a.strip().lower().removeprefix("the ")
    without_punctuation = "".join(ch for ch in text if ch not in punctuation)
    ascii_only = "".join(ch for ch in without_punctuation if ch in ascii_alnum)

    if len(ascii_only) < 0.5 * len(without_punctuation):
        # a *suspicious* amount was removed, probably erroneously:
        # likely a non-english album/artist/track, so keep the richer form.
        return without_punctuation
    return ascii_only
 def loose_compare_str(a: str, b: str) -> bool:
    """ True when `a` and `b` are equal after `clean_for_loose_compare` normalization """
    normalized_a = clean_for_loose_compare(a)
    normalized_b = clean_for_loose_compare(b)
    return normalized_a == normalized_b
 def loose_compare_lists(a: list[str], b: list[str]) -> bool:
    """
    True when `a` and `b` hold the same items after each item has been passed
    through `clean_for_loose_compare`. Item order is ignored.
    """
    normalized_a = [clean_for_loose_compare(item) for item in a]
    normalized_b = [clean_for_loose_compare(item) for item in b]
    # sort both sides so that only the contents matter, not the order
    normalized_a.sort()
    normalized_b.sort()
    return normalized_a == normalized_b
@dataclass
 class Tags:
    # format matches mutagen's
@@ -68,6 +96,22 @@ class Tags:
            tracknumber=tracknumber,
        )
    def trim_fields(self) -> None:
        """
        strip leading/trailing whitespace from title, artist, albumartist and album.
        only fields holding exactly one value are touched; empty or multi-valued fields are left as-is.
        """
        for field_name in ("title", "artist", "albumartist", "album"):
            values = getattr(self, field_name)
            if len(values) != 1:
                continue
            (only_value,) = values
            setattr(self, field_name, [ only_value.strip() ])
    def expand_shorthands(self) -> None:
        """ replace an artist or albumartist which is exactly ["V.A."] with ["Various Artists"] """
        shorthand = ["V.A."]
        for field_name in ("artist", "albumartist"):
            if getattr(self, field_name) == shorthand:
                setattr(self, field_name, ["Various Artists"])
    def promote_albumartist(self) -> None:
        """
        1. replace shorthands like "V.A." with "Various Artists".
@@ -75,23 +119,14 @@ class Tags:
        3. if the artist and album artist are nearly identical, try to merge them.
        """
-        if self.artist == ["V.A."]:
+        if loose_compare_lists(self.artist, self.albumartist):
-            self.artist = ["Various Artists"]
+            # artist & album artist are nearly identical:
-        if self.albumartist == ["V.A."]:
+            # probably guessed one of them from filename, which was lacking certain symbols of the actual artist.
-            self.albumartist = ["Various Artists"]
+            # recover whichever of these fields had the fewer characters removed (i.e. is longest)
-
+            if len("".join(self.artist)) > len("".join(self.albumartist)):
-        unomitable = 'abcdefghijklmnopqrstuvwxyz0123456789'
+                self.artist = self.albumartist = self.artist
-        if len(self.artist) == len(self.albumartist) == 1:
+            else:
-            filtered_artist = [i for i in self.artist[0] if i.lower() in unomitable]
+                self.artist = self.albumartist = self.albumartist
            filtered_albumartist = [i for i in self.albumartist[0] if i.lower() in unomitable]
            if filtered_artist == filtered_albumartist:
                # arist & album artist are nearly identical:
                # probably guessed one of them from filename, which was lacking certain symbols of the actual artist.
                # recover whichever of these fields had the fewer characters removed (i.e. is longest)
                if len(self.artist[0]) > len(self.albumartist[0]):
                    self.artist = self.albumartist = self.artist
                else:
                    self.artist = self.albumartist = self.albumartist
        if self.artist == []:
            self.artist = self.albumartist
@@ -99,11 +134,12 @@ class Tags:
    def rewrite_singles(self) -> None:
        """ idiom is for singles to belong to self-titled album. else each artist's singles get merged into one massive album """
-        if self.album == ["Singles"]:
+        if len(self.album) != 1:
-            if self.albumartist:
+            return
-                self.album = self.albumartist
+
-            else:
+        for artist in self.albumartist[::-1] + self.artist[::-1]:
-                self.album = self.artist
+            if loose_compare_str(self.album[0], "Singles") or loose_compare_str(self.album[0], artist):
                self.album = [ artist ]
    @staticmethod
    def from_path(p: str) -> 'Tags':
@@ -246,7 +282,10 @@ class Tagger:
        old_tags = file_.tags_on_disk()
        path_tags = Tags.from_path(path_)
        # logger.debug(f"extracted tags from {path_}: {path_tags}")
        new_tags = old_tags.union(path_tags)
        new_tags.trim_fields()
        new_tags.expand_shorthands()
        new_tags.promote_albumartist()
        new_tags.rewrite_singles()