sane-tag-music: use pykakasi for better romanization
This commit is contained in:
@@ -205,7 +205,7 @@ let
|
|||||||
# NOTE(review): this package list appears to mirror the `-p` packages on the
# `#!nix-shell` line at the top of the sane-tag-music script -- keep the two in sync.
tag-music = static-nix-shell.mkPython3 {
  pname = "sane-tag-music";
  srcRoot = ./src;
  pkgs = [ "python3.pkgs.mutagen" "python3.pkgs.pyexiftool" "python3.pkgs.pykakasi" "python3.pkgs.unidecode" ];
};
|
||||||
vpn = static-nix-shell.mkBash {
|
vpn = static-nix-shell.mkBash {
|
||||||
pname = "sane-vpn";
|
pname = "sane-vpn";
|
||||||
|
@@ -1,5 +1,5 @@
|
|||||||
#!/usr/bin/env nix-shell
|
#!/usr/bin/env nix-shell
|
||||||
#!nix-shell -i python3 -p python3 -p python3.pkgs.mutagen -p python3.pkgs.pyexiftool -p python3.pkgs.unidecode
|
#!nix-shell -i python3 -p python3 -p python3.pkgs.mutagen -p python3.pkgs.pyexiftool -p python3.pkgs.pykakasi -p python3.pkgs.unidecode
|
||||||
# vim: set filetype=python :
|
# vim: set filetype=python :
|
||||||
#
|
#
|
||||||
# standard tags:
|
# standard tags:
|
||||||
@@ -88,27 +88,52 @@ import mutagen.flac
|
|||||||
import mutagen.mp3
|
import mutagen.mp3
|
||||||
import mutagen.oggopus
|
import mutagen.oggopus
|
||||||
import mutagen.oggvorbis
|
import mutagen.oggvorbis
|
||||||
|
import pykakasi
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)

# single shared pykakasi converter; `maybe_romanize` calls `kks.convert` on every
# candidate segment, so it is built once at import time instead of per call.
kks = pykakasi.kakasi()
|
||||||
|
|
||||||
class MediaType(Enum):
    # the kinds of media distinguished by this script; each value is the
    # lowercase name of the kind.
    Audio = "audio"
    Image = "image"
|
||||||
|
|
||||||
|
def maybe_romanize(a: str) -> str|None:
|
||||||
|
if a == "かめりあ": return "Camellia"
|
||||||
|
if a == "お握り": return "onigiri"
|
||||||
|
if a == "存流": return "ARU"
|
||||||
|
if a.lower() == "жужжалка": return "zhuzhzhalka" # Russian
|
||||||
|
if a == "+": return "."
|
||||||
|
if a == "&": return "and"
|
||||||
|
|
||||||
|
# jp kanji -> romanization
|
||||||
|
# pykakasi does word breaking, only return the conversion if on an actual word boundary
|
||||||
|
romaji = [r['hepburn'] for r in kks.convert(a)]
|
||||||
|
if len(romaji) == 1 and romaji[0] != "" and romaji[0] != a:
|
||||||
|
return romaji[0]
|
||||||
|
|
||||||
|
# catchall/fallback
|
||||||
|
if len(a) == 1: return unidecode(a).strip()
|
||||||
|
|
||||||
def romanize(a: str) -> str:
    """
    transform `a` in a way which loses only a minimal amount of info

    `a` is consumed left to right. at each position the longest leading segment
    (at most 10 characters) which `maybe_romanize` accepts is taken, and its
    romanization is appended to the result:
    - results longer than one character get their first letter upper-cased and
      are separated from their neighbours by a single space.
    - single-character results are appended as-is, with no separator.
    """
    romanized = ""
    index = 0
    while index < len(a):
        # never ask for more characters than remain: longer slices would all be
        # the same truncated string, converted over and over.
        longest = min(10, len(a) - index)
        for segment_len in range(longest, 0, -1):
            r = maybe_romanize(a[index : index+segment_len])
            if r is None:
                continue

            if len(r) > 1:
                r = r[0].upper() + r[1:]
            is_word = len(r) != 1
            # separate words from what came before, but never double up the space
            # which a preceding word already emitted.
            if romanized and is_word and not romanized.endswith(" "):
                romanized += " "
            romanized += r
            if index + segment_len < len(a) and is_word and not romanized.endswith(" "):
                romanized += " "
            index += segment_len
            # restart the search at the new position from the longest segment;
            # otherwise only shorter segments would be tried there and multi-character
            # matches (e.g. the hard-coded overrides) could be split apart.
            break
        else:
            # no segment was accepted, not even a single character: pass the
            # character through unchanged so the loop always makes progress.
            romanized += a[index]
            index += 1
    return romanized
|
||||||
|
|
||||||
def clean_for_loose_compare(a: str) -> str:
|
def clean_for_loose_compare(a: str) -> str:
|
||||||
a = romanize(a)
|
a = romanize(a)
|
||||||
|
Reference in New Issue
Block a user