sane-tag-music: use pykakasi for better romanization
This commit is contained in:
@@ -205,7 +205,7 @@ let
|
||||
# sane-tag-music: Python script packaged via static-nix-shell.
# `pkgs` lists the Python packages the script needs at runtime; the same set
# appears on the script's own `#!nix-shell` line, so presumably the two are
# meant to be kept in sync -- TODO confirm.
tag-music = static-nix-shell.mkPython3 {
  pname = "sane-tag-music";
  srcRoot = ./src;
  pkgs = [ "python3.pkgs.mutagen" "python3.pkgs.pyexiftool" "python3.pkgs.pykakasi" "python3.pkgs.unidecode" ];
};
|
||||
vpn = static-nix-shell.mkBash {
|
||||
pname = "sane-vpn";
|
||||
|
@@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env nix-shell
|
||||
#!nix-shell -i python3 -p python3 -p python3.pkgs.mutagen -p python3.pkgs.pyexiftool -p python3.pkgs.unidecode
|
||||
#!nix-shell -i python3 -p python3 -p python3.pkgs.mutagen -p python3.pkgs.pyexiftool -p python3.pkgs.pykakasi -p python3.pkgs.unidecode
|
||||
# vim: set filetype=python :
|
||||
#
|
||||
# standard tags:
|
||||
@@ -88,27 +88,52 @@ import mutagen.flac
|
||||
import mutagen.mp3
|
||||
import mutagen.oggopus
|
||||
import mutagen.oggvorbis
|
||||
import pykakasi
|
||||
|
||||
# module-level logger, named after this module
logger = logging.getLogger(__name__)
|
||||
|
||||
# shared pykakasi converter; `maybe_romanize` reads the 'hepburn' field of its results
kks = pykakasi.kakasi()
|
||||
|
||||
class MediaType(Enum):
    """
    broad category of a media file: audio or image.
    each member's value is the lowercase name of the category.
    """
    Audio = "audio"
    Image = "image"
|
||||
|
||||
def maybe_romanize(a: str) -> str|None:
|
||||
if a == "かめりあ": return "Camellia"
|
||||
if a == "お握り": return "onigiri"
|
||||
if a == "存流": return "ARU"
|
||||
if a.lower() == "жужжалка": return "zhuzhzhalka" # Russian
|
||||
if a == "+": return "."
|
||||
if a == "&": return "and"
|
||||
|
||||
# jp kanji -> romanization
|
||||
# pykakasi does word breaking, only return the conversion if on an actual word boundary
|
||||
romaji = [r['hepburn'] for r in kks.convert(a)]
|
||||
if len(romaji) == 1 and romaji[0] != "" and romaji[0] != a:
|
||||
return romaji[0]
|
||||
|
||||
# catchall/fallback
|
||||
if len(a) == 1: return unidecode(a).strip()
|
||||
|
||||
def romanize(a: str) -> str:
    """
    transform `a` in a way which loses only a minimal amount of info

    `a` is consumed left to right. at each position the longest segment (up
    to 10 characters) which `maybe_romanize` can convert is taken:
    - a conversion exactly one character long is appended as-is.
    - any other conversion is treated as a separate word: if it is longer
      than one character its first character is uppercased, and it is set
      off from its neighbors by a single space.
    """
    max_segment_len = 10
    romanized = ""
    index = 0
    while index < len(a):
        # never ask for more characters than remain in `a`
        longest = min(max_segment_len, len(a) - index)
        for segment_len in range(longest, 0, -1):
            r = maybe_romanize(a[index : index+segment_len])
            if r is None:
                continue

            is_word = len(r) != 1
            if len(r) > 1:
                r = r[0].upper() + r[1:]
            # separate words with exactly one space, even when two words
            # (or a word and converted whitespace) are adjacent
            if is_word and romanized and not romanized.endswith(" "):
                romanized += " "
            romanized += r
            if is_word and index + segment_len < len(a) and not romanized.endswith(" "):
                romanized += " "
            index += segment_len
            # restart from the longest segment length at the new position
            break
        else:
            # no segment at this position could be converted: keep the
            # character unchanged and move on, so the loop always advances
            romanized += a[index]
            index += 1
    return romanized
|
||||
|
||||
def clean_for_loose_compare(a: str) -> str:
|
||||
a = romanize(a)
|
||||
|
Reference in New Issue
Block a user