sane-tag-music: use pykakasi for better romanization

This commit is contained in:
2024-07-10 04:42:51 +00:00
parent 56032bc040
commit 50add19b14
2 changed files with 37 additions and 12 deletions

View File

@@ -205,7 +205,7 @@ let
# `sane-tag-music`, built from ./src via `static-nix-shell.mkPython3`.
# `pkgs` names the python packages the script needs; the list matches the `-p` arguments
# on the script's own `#!nix-shell` line (presumably the two must be kept in sync -- confirm against mkPython3).
# `pkgs` must be bound exactly once: Nix rejects a duplicate attribute within one set.
tag-music = static-nix-shell.mkPython3 {
  pname = "sane-tag-music";
  srcRoot = ./src;
  pkgs = [ "python3.pkgs.mutagen" "python3.pkgs.pyexiftool" "python3.pkgs.pykakasi" "python3.pkgs.unidecode" ];
};
vpn = static-nix-shell.mkBash {
pname = "sane-vpn";

View File

@@ -1,5 +1,5 @@
#!/usr/bin/env nix-shell
#!nix-shell -i python3 -p python3 -p python3.pkgs.mutagen -p python3.pkgs.pyexiftool -p python3.pkgs.unidecode
#!nix-shell -i python3 -p python3 -p python3.pkgs.mutagen -p python3.pkgs.pyexiftool -p python3.pkgs.pykakasi -p python3.pkgs.unidecode
# vim: set filetype=python :
#
# standard tags:
@@ -88,27 +88,52 @@ import mutagen.flac
import mutagen.mp3
import mutagen.oggopus
import mutagen.oggvorbis
import pykakasi
# module-level logger, named after this module
logger = logging.getLogger(__name__)
# pykakasi converter, created once at import time; `maybe_romanize` calls its `convert` method
kks = pykakasi.kakasi()
class MediaType(Enum):
    """
    the kinds of media distinguished here; each member's value is its lowercase name.
    """
    Audio = "audio"
    Image = "image"
def maybe_romanize(a: str) -> str|None:
    """
    try to convert `a`, taken as one indivisible unit, to latin text.

    checks are attempted in this order:
    1. hand-picked conversions which must match the whole of `a`.
    2. pykakasi: accepted only if it treats `a` as exactly one word and actually changes it.
    3. `unidecode`, but only when `a` is a single character.

    returns the conversion, or None if `a` can't be converted as a unit.
    """
    # hand-picked conversions, compared against the whole of `a`
    fixed = {
        "かめりあ": "Camellia",
        "お握り": "onigiri",
        "存流": "ARU",
        "+": ".",
        "&": "and",
    }
    if a in fixed:
        return fixed[a]
    if a.lower() == "жужжалка":  # Russian
        return "zhuzhzhalka"

    # jp kanji -> romanization
    # pykakasi does word breaking: more than one result means `a` spans several words,
    # in which case the conversion is not used.
    readings = [word['hepburn'] for word in kks.convert(a)]
    if len(readings) == 1:
        (reading,) = readings
        if reading not in ("", a):
            return reading

    # catchall/fallback: single characters only
    if len(a) != 1:
        return None
    return unidecode(a).strip()
def romanize(a: str) -> str:
    """
    transform `a` in a way which loses only a minimal amount of info

    walks `a` left to right. at each position, the longest window (at most 10 characters)
    which `maybe_romanize` is able to convert gets consumed and its conversion appended:
    - a conversion longer than one character has its first character upper-cased.
    - a conversion which is not exactly one character long is separated from
      whatever precedes/follows it by a space.
    - a one-character conversion is appended as-is, with no added spacing.

    returns the romanized string ("" for empty input).
    """
    max_segment_len = 10
    romanized = ""
    index = 0
    while index < len(a):
        # never look past the end of `a`: those slices would just repeat the same text
        longest = min(max_segment_len, len(a) - index)
        for segment_len in range(longest, 0, -1):
            r = maybe_romanize(a[index : index + segment_len])
            if r is None:
                continue
            if len(r) > 1:
                r = r[0].upper() + r[1:]
            # NOTE(review): two adjacent multi-character conversions each contribute a space
            # (trailing + leading), so the output can contain doubled spaces -- confirm callers tolerate that.
            if romanized and len(r) != 1:
                romanized += " "
            romanized += r
            if index + segment_len < len(a) and len(r) != 1:
                romanized += " "
            index += segment_len
            # restart from the longest window at the new position; without this the search
            # would carry on at the new index with ever-shorter windows and miss longer matches.
            break
        else:
            # nothing could be converted here: keep the character verbatim and move on,
            # so that the outer loop always makes progress.
            romanized += a[index]
            index += 1
    return romanized
def clean_for_loose_compare(a: str) -> str:
a = romanize(a)