sane-tag-music: use pykakasi for better romanization

This commit is contained in:
2024-07-10 04:42:51 +00:00
parent 56032bc040
commit 50add19b14
2 changed files with 37 additions and 12 deletions

View File

@@ -205,7 +205,7 @@ let
tag-music = static-nix-shell.mkPython3 { tag-music = static-nix-shell.mkPython3 {
pname = "sane-tag-music"; pname = "sane-tag-music";
srcRoot = ./src; srcRoot = ./src;
pkgs = [ "python3.pkgs.mutagen" "python3.pkgs.pyexiftool" "python3.pkgs.unidecode" ]; pkgs = [ "python3.pkgs.mutagen" "python3.pkgs.pyexiftool" "python3.pkgs.pykakasi" "python3.pkgs.unidecode" ];
}; };
vpn = static-nix-shell.mkBash { vpn = static-nix-shell.mkBash {
pname = "sane-vpn"; pname = "sane-vpn";

View File

@@ -1,5 +1,5 @@
#!/usr/bin/env nix-shell #!/usr/bin/env nix-shell
#!nix-shell -i python3 -p python3 -p python3.pkgs.mutagen -p python3.pkgs.pyexiftool -p python3.pkgs.unidecode #!nix-shell -i python3 -p python3 -p python3.pkgs.mutagen -p python3.pkgs.pyexiftool -p python3.pkgs.pykakasi -p python3.pkgs.unidecode
# vim: set filetype=python : # vim: set filetype=python :
# #
# standard tags: # standard tags:
@@ -88,27 +88,52 @@ import mutagen.flac
import mutagen.mp3 import mutagen.mp3
import mutagen.oggopus import mutagen.oggopus
import mutagen.oggvorbis import mutagen.oggvorbis
import pykakasi
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
kks = pykakasi.kakasi()
class MediaType(Enum): class MediaType(Enum):
Audio = "audio" Audio = "audio"
Image = "image" Image = "image"
def maybe_romanize(a: str) -> str|None:
if a == "かめりあ": return "Camellia"
if a == "お握り": return "onigiri"
if a == "存流": return "ARU"
if a.lower() == "жужжалка": return "zhuzhzhalka" # Russian
if a == "+": return "."
if a == "&": return "and"
# jp kanji -> romanization
# pykakasi does word breaking, only return the conversion if on an actual word boundary
romaji = [r['hepburn'] for r in kks.convert(a)]
if len(romaji) == 1 and romaji[0] != "" and romaji[0] != a:
return romaji[0]
# catchall/fallback
if len(a) == 1: return unidecode(a).strip()
def romanize(a: str) -> str:
    """
    transform `a` in a way which loses only a minimal amount of info

    `a` is scanned left to right. at each position the longest segment (up to
    10 characters) which `maybe_romanize` can convert is taken, its conversion is
    appended to the result, and scanning resumes right after that segment.

    multi-character conversions have their first letter upper-cased and are
    separated from their neighbors by spaces; single-character conversions are
    appended as-is.
    """
    max_segment_len = 10
    romanized = ""
    index = 0
    while index < len(a):
        # never ask for more characters than remain: every over-long slice would
        # be the same string, so this only avoids repeating identical conversions.
        longest = min(max_segment_len, len(a) - index)
        for segment_len in range(longest, 0, -1):
            r = maybe_romanize(a[index : index+segment_len])
            if r is None:
                continue
            if len(r) > 1:
                r = r[0].upper() + r[1:]
            # NOTE(review): the separators below can stack up (e.g. two adjacent
            # multi-character conversions yield two spaces). kept as-is; presumably
            # callers normalize whitespace -- confirm.
            if romanized and len(r) != 1:
                romanized += " "
            romanized += r
            if index + segment_len < len(a) and len(r) != 1:
                romanized += " "
            index += segment_len
            # restart the search from the longest segment at the new position;
            # without this the loop would keep shrinking `segment_len` and miss
            # longer matches later in the string.
            break
        else:
            # nothing converted, not even the single character: keep it verbatim
            # and move on so the outer loop always makes progress.
            romanized += a[index]
            index += 1
    return romanized
def clean_for_loose_compare(a: str) -> str: def clean_for_loose_compare(a: str) -> str:
a = romanize(a) a = romanize(a)