sane-tag-music: handle more character encoding edge cases
This commit is contained in:
@@ -14,6 +14,34 @@ import mutagen.easyid3
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def clean_for_loose_compare(a: str) -> str:
    """
    Reduce a tag value to a canonical form suitable for approximate comparison.

    The goal is to help merge path-extracted tags with embedded tags. It's
    common for a tag to contain rich characters which can't be represented in
    a file name, so those are simply dropped -- but in a way which doesn't
    become useless when faced with primarily non-latin names.

    Steps: trim surrounding whitespace, lowercase, drop one leading "the ",
    remove separator/punctuation characters, then keep only ASCII letters and
    digits. If fewer than half of the characters survive that final filter,
    the text from before the final filter is returned instead.
    """
    punctuation = set('. &()[];:')
    ascii_alnum = set('abcdefghijklmnopqrstuvwxyz0123456789')

    text = a.strip().lower().removeprefix("the ")
    text = "".join(ch for ch in text if ch not in punctuation)
    ascii_only = "".join(ch for ch in text if ch in ascii_alnum)

    if 2 * len(ascii_only) < len(text):
        # we cleaned a *suspicious* amount, probably erroneously.
        # likely a non-english album/artist/track
        return text
    return ascii_only
|
||||||
|
|
||||||
|
def loose_compare_str(a: str, b: str) -> bool:
    """Report whether `a` and `b` are equal after each is passed through clean_for_loose_compare."""
    cleaned_a = clean_for_loose_compare(a)
    cleaned_b = clean_for_loose_compare(b)
    return cleaned_a == cleaned_b
|
||||||
|
|
||||||
|
def loose_compare_lists(a: list[str], b: list[str]) -> bool:
    """
    Report whether two lists hold the same loosely-cleaned values, ignoring order.

    Every item is passed through clean_for_loose_compare, each list is sorted,
    and the sorted lists are compared for equality (so duplicates count).
    """
    left = [clean_for_loose_compare(item) for item in a]
    right = [clean_for_loose_compare(item) for item in b]
    left.sort()
    right.sort()
    return left == right
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Tags:
|
class Tags:
|
||||||
# format matches mutagen's
|
# format matches mutagen's
|
||||||
@@ -68,6 +96,22 @@ class Tags:
|
|||||||
tracknumber=tracknumber,
|
tracknumber=tracknumber,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def trim_fields(self) -> None:
    """
    Strip surrounding whitespace from the title, artist, albumartist and album fields.

    Only a field holding exactly one value is rewritten (with a fresh
    one-item list); empty and multi-valued fields are left untouched.
    """
    for field in ("title", "artist", "albumartist", "album"):
        values = getattr(self, field)
        if len(values) == 1:
            setattr(self, field, [values[0].strip()])
|
||||||
|
|
||||||
|
def expand_shorthands(self) -> None:
    """
    Replace the "V.A." shorthand with "Various Artists" in artist and albumartist.

    A field is rewritten only when it is exactly the one-item list ["V.A."].
    """
    for field in ("artist", "albumartist"):
        if getattr(self, field) == ["V.A."]:
            setattr(self, field, ["Various Artists"])
|
||||||
|
|
||||||
def promote_albumartist(self) -> None:
|
def promote_albumartist(self) -> None:
|
||||||
"""
|
"""
|
||||||
1. replace shorthands like "V.A." with "Various Artists".
|
1. replace shorthands like "V.A." with "Various Artists".
|
||||||
@@ -75,23 +119,14 @@ class Tags:
|
|||||||
3. if the artist and album artist are nearly identical, try to merge them.
|
3. if the artist and album artist are nearly identical, try to merge them.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if self.artist == ["V.A."]:
|
if loose_compare_lists(self.artist, self.albumartist):
|
||||||
self.artist = ["Various Artists"]
|
# artist & album artist are nearly identical:
|
||||||
if self.albumartist == ["V.A."]:
|
# probably guessed one of them from filename, which was lacking certain symbols of the actual artist.
|
||||||
self.albumartist = ["Various Artists"]
|
# recover whichever of these fields had the fewer characters removed (i.e. is longest)
|
||||||
|
if len("".join(self.artist)) > len("".join(self.albumartist)):
|
||||||
unomitable = 'abcdefghijklmnopqrstuvwxyz0123456789'
|
self.artist = self.albumartist = self.artist
|
||||||
if len(self.artist) == len(self.albumartist) == 1:
|
else:
|
||||||
filtered_artist = [i for i in self.artist[0] if i.lower() in unomitable]
|
self.artist = self.albumartist = self.albumartist
|
||||||
filtered_albumartist = [i for i in self.albumartist[0] if i.lower() in unomitable]
|
|
||||||
if filtered_artist == filtered_albumartist:
|
|
||||||
# artist & album artist are nearly identical:
|
|
||||||
# probably guessed one of them from filename, which was lacking certain symbols of the actual artist.
|
|
||||||
# recover whichever of these fields had the fewer characters removed (i.e. is longest)
|
|
||||||
if len(self.artist[0]) > len(self.albumartist[0]):
|
|
||||||
self.artist = self.albumartist = self.artist
|
|
||||||
else:
|
|
||||||
self.artist = self.albumartist = self.albumartist
|
|
||||||
|
|
||||||
if self.artist == []:
|
if self.artist == []:
|
||||||
self.artist = self.albumartist
|
self.artist = self.albumartist
|
||||||
@@ -99,11 +134,12 @@ class Tags:
|
|||||||
|
|
||||||
def rewrite_singles(self) -> None:
|
def rewrite_singles(self) -> None:
|
||||||
""" idiom is for singles to belong to self-titled album. else each artist's singles get merged into one massive album """
|
""" idiom is for singles to belong to self-titled album. else each artist's singles get merged into one massive album """
|
||||||
if self.album == ["Singles"]:
|
if len(self.album) != 1:
|
||||||
if self.albumartist:
|
return
|
||||||
self.album = self.albumartist
|
|
||||||
else:
|
for artist in self.albumartist[::-1] + self.artist[::-1]:
|
||||||
self.album = self.artist
|
if loose_compare_str(self.album[0], "Singles") or loose_compare_str(self.album[0], artist):
|
||||||
|
self.album = [ artist ]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_path(p: str) -> 'Tags':
|
def from_path(p: str) -> 'Tags':
|
||||||
@@ -246,7 +282,10 @@ class Tagger:
|
|||||||
old_tags = file_.tags_on_disk()
|
old_tags = file_.tags_on_disk()
|
||||||
|
|
||||||
path_tags = Tags.from_path(path_)
|
path_tags = Tags.from_path(path_)
|
||||||
|
# logger.debug(f"extracted tags from {path_}: {path_tags}")
|
||||||
new_tags = old_tags.union(path_tags)
|
new_tags = old_tags.union(path_tags)
|
||||||
|
new_tags.trim_fields()
|
||||||
|
new_tags.expand_shorthands()
|
||||||
new_tags.promote_albumartist()
|
new_tags.promote_albumartist()
|
||||||
new_tags.rewrite_singles()
|
new_tags.rewrite_singles()
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user