nix-files/pkgs/additional/sane-scripts/src/sane-sync-music

#!/usr/bin/env nix-shell
#!nix-shell -i python3 -p "python3.withPackages (ps: [ ps.unidecode ])" -p ffmpeg -p sox
# vim: set filetype=python :
import argparse
import concurrent.futures
import datetime
import logging
import multiprocessing
import os
import subprocess

from pathlib import Path
from unidecode import unidecode

# module-level logger; `main` sets its level according to --verbose / --quiet
logger = logging.getLogger(__name__)

# every table below holds file suffixes *including* the leading dot.
# the rest of this file compares them against lowercased suffixes.

# audio formats treated as lossless: with `compress`, these are re-encoded to .mp3
LOSSLESS_FMTS = [
    '.flac',
    '.wav',
]

# not listed in LOSSY_FMTS, so `compress` re-encodes these to .mp3 as well
MAYBE_LOSSY_FMTS = [
    # WMA can be lossy or lossless
    '.wma',
]

# already-lossy audio: `compress` leaves the extension unchanged
LOSSY_FMTS = [
    '.aac',
    '.m4a',
    '.mp3',
    '.oga',
    '.ogg',
    '.opus',
]

# audio formats which `compat` leaves as-is.
# other audio becomes .flac (if in LOSSLESS_FMTS) or .mp3 (everything else).
COMPAT_AUDIO_FMTS = [
    '.flac',
    '.mp3',
    '.oga',
    '.ogg',
    '.opus',
]

# every suffix which is handled as audio (i.e. eligible for transcoding)
AUDIO_FMTS = LOSSLESS_FMTS + MAYBE_LOSSY_FMTS + LOSSY_FMTS

# the image/video/metadata suffixes below keep their extension in the destination
IMAGE_FMTS = [
    '.bmp',
    '.gif',
    '.jpeg',
    '.jpg',
    '.png',
]

VIDEO_FMTS = [
    '.avi',
    '.mov',
    '.mp4',
    '.webm',
]

METADATA_FMTS = [
    '.inf',
    '.lyrics',
    '.nfo',
    '.pdf',
    '.toc',
    '.txt',
]

# recognized non-audio suffixes; `Encoder.cp_or_convert` skips the samplerate probe for these
NON_AUDIO_FMTS = IMAGE_FMTS + VIDEO_FMTS + METADATA_FMTS

# suffixes which are deliberately left out of the destination (no warning is logged for them).
# NOTE(review): lookups use a lowercased suffix, and a bare dotfile such as `.DS_Store`
# has an empty `Path.suffix`, so the '.DS_Store' entry looks unmatchable -- confirm.
IGNORE = [
    '.DS_Store',
    '.cue',
    '.log',
    '.m3u',
    '.nsf_',
]

def approx_eq(a: float, b: float, threshold: float) -> bool:
    '''
    report whether `a` and `b` are within `threshold` of each other (inclusive).
    '''
    difference = b - a
    return -threshold <= difference <= threshold

def clean_name(path: Path) -> Path:
    '''
    rewrite `path` so that most filesystems/protocols can reliably store it:
    - each path component is passed through `unidecode`,
    - a fixed set of punctuation characters is dropped from each component,
    - the final suffix (file extension) is lowercased.
    '''
    # characters which are deleted from every path component
    forbidden = '"\'!@#$%^&*()[]{};:,<>?`~|\\/'
    drop_forbidden = str.maketrans('', '', forbidden)

    cleaned = Path()
    for component in path.parts:
        cleaned = cleaned / unidecode(component).translate(drop_forbidden)

    lowered_ext = cleaned.suffix.lower()
    return cleaned.with_suffix(lowered_ext)

class TranscodePreferences:
    def __init__(self, compress: bool, compat: bool):
        self.compress = compress
        self.compat = compat

    def get_output(self, input_ext: str) -> str | None:
        """
        for some source type (e.g. `.wav`), return the desired output type (e.g. `.mp3`).
        returns `.null` to indicate the file shouldn't be copied.
        returns `None` if i don't understand the source file.
        """
        desired_output = None

        if input_ext in AUDIO_FMTS:
            desired_output = input_ext
            if self.compress:
                desired_output = self.get_compressed_audio_output(desired_output)
            if self.compat:
                desired_output = self.get_compat_audio_output(desired_output)
        elif input_ext in IMAGE_FMTS:
            desired_output = input_ext
        elif input_ext in VIDEO_FMTS:
            desired_output = input_ext
        elif input_ext in METADATA_FMTS:
            desired_output = input_ext
        elif input_ext in IGNORE:
            desired_output = ".null"

        return desired_output

    def desired_samplerate(self, input_samplerate: int | None) -> int | None:
        samplerate_map = {
            192000: 48000 if self.compress else 192000,
            96000:  48000 if self.compress else 96000,
            88200:  44100 if self.compress else 88200,
            # preserve as-is
            48000:  48000,
            44100:  44100,
        }
        return samplerate_map.get(input_samplerate)

    def get_compressed_audio_output(self, input_ext: str) -> str:
        if input_ext in LOSSY_FMTS:
            return input_ext
        else:
            return ".mp3"

    def get_compat_audio_output(self, input_ext: str) -> str:
        if input_ext in COMPAT_AUDIO_FMTS:
            return input_ext
        elif input_ext in LOSSLESS_FMTS:
            return ".flac"
        else:
            return ".mp3"


class Encoder:
    """
    performs the subprocess/filesystem operations needed to place one source file at its destination.
    every mutating operation in this class is routed through `destructive`,
    so that `dry_run` can suppress it.
    """
    def __init__(self, prefs: TranscodePreferences, dry_run: bool = False):
        self.prefs = prefs
        self.dry_run = dry_run

    def destructive(self, default_, f, *args, **kwargs):
        """
        invoke `f(*args, **kwargs)` and return its result -- unless this is a dry-run,
        in which case the call is only logged (at debug level) and `default_` is returned instead.
        """
        if self.dry_run:
            pretty_args = ", ".join(
               [repr(a) for a in args]
               + [f"{k}={v!r}" for k, v in kwargs.items()]
            )
            logger.debug(f"[dry-run: not invoking]: {f.__name__}({pretty_args})")
            return default_
        else:
            return f(*args, **kwargs)

    def _check_output(self, args: list[str], quiet: bool = False) -> bytes:
        """
        run the command `args` and return its raw stdout.
        anything the command wrote to stderr is logged as an error, unless `quiet`.
        raises `subprocess.CalledProcessError` if the command exits non-zero.
        """
        res = subprocess.run(args, capture_output=True)

        # stderr is captured as bytes: decode it so the log shows text instead of a `b'...'` repr
        stderr = res.stderr.decode("utf-8", errors="replace").strip()
        if stderr and not quiet:
            logger.error(stderr)

        res.check_returncode()
        return res.stdout

    def check_output(self, args: list[str], has_side_effect=True, **kwargs) -> bytes:
        """
        run the command `args` and return its raw stdout (bytes).
        commands with side effects are skipped during a dry-run (yielding `b''`);
        pass `has_side_effect=False` for read-only commands which should always run.
        extra keyword arguments are forwarded to `_check_output`.
        """
        if has_side_effect:
            return self.destructive(b'', self._check_output, args, **kwargs)
        else:
            return self._check_output(args, **kwargs)

    def cp(self, source: Path, dest: Path) -> None:
        """ copy `source` to `dest` by invoking `cp` """
        logger.info(f'copying {source} -> {dest}')
        self.check_output(['cp', str(source), str(dest)])

    def ensure_dir(self, dir: Path) -> None:
        """ create `dir` (and any missing parents); it's not an error if it already exists """
        self.destructive(None, os.makedirs, str(dir), exist_ok=True)

    def remove(self, path: Path) -> None:
        """ delete the file at `path` """
        self.destructive(None, os.remove, path)

    def convert(self, source: Path, dest: Path, target_samplerate: int | None) -> None:
        """
        transcode `source` into an mp3 at `dest` using ffmpeg.
        `target_samplerate`, if given, is passed to ffmpeg as the output sample rate (`-ar`).
        `dest` must end in `.mp3`: no other output format is implemented.
        """
        assert dest.suffix == '.mp3', "conversion to a target other than mp3 not yet supported"
        logger.info(f'converting {source} -> {dest}')

        samplerate_flags = ['-ar', str(target_samplerate)] if target_samplerate else []

        self.check_output([
            'ffmpeg',
            '-loglevel', 'warning',
            '-y',  # force overwrite
            '-i', str(source),
            '-codec:v', 'copy',
            '-codec:a', 'libmp3lame',
            '-qscale:a', '0'
        ] + samplerate_flags + [str(dest)])

    def cp_or_convert(self, source: Path, dest: Path) -> None:
        """
        place `source` at `dest`: transcode it if the sample rate or the file extension
        needs to change, else copy it verbatim.
        afterwards, set the mtime of `dest` to match that of `source`.
        """
        source_ext = source.suffix.lower()

        source_samplerate = None
        if source_ext not in NON_AUDIO_FMTS:
            try:
                source_samplerate = int(
                    self.check_output(
                        ['soxi', '-r', str(source)],
                        has_side_effect=False,
                        quiet=True,
                    ).decode("utf-8").strip()
                )
            except (subprocess.CalledProcessError, OSError, ValueError):
                # CalledProcessError: soxi rejected the file.
                # OSError: soxi could not be launched.
                # ValueError: its output wasn't an integer (this includes decode failures).
                # anything else (e.g. KeyboardInterrupt) is deliberately left to propagate.
                if source_ext in ['.aac', '.m4a', '.wma']:
                    # sox is known to not support these formats
                    logger.debug(f'unsupported extension for samplerate: {source}')
                else:
                    logger.warning(f'unable to obtain samplerate for {source}')

        target_samplerate = self.prefs.desired_samplerate(source_samplerate)
        if source_samplerate and not target_samplerate:
            logger.warning(f'unable to map source sample rate: {source_samplerate}')

        # no target rate means there is nothing to resample *to*: keep the source rate.
        # (comparing against `None` here would force a conversion for every unmapped rate,
        # which re-encodes mp3s for no reason and fails outright for non-mp3 destinations.)
        needs_resample = target_samplerate is not None and source_samplerate != target_samplerate

        if needs_resample:
            # resampling -> convert
            self.convert(source, dest, target_samplerate)
        elif source_ext != dest.suffix:
            # transcoding -> convert
            self.convert(source, dest, target_samplerate)
        else:
            # neither resampling nor transcoding -> simple copy will suffice
            self.cp(source, dest)

        # in all these cases, on success, synchronize the `mtime` to be in agreement
        st = os.stat(source)
        mtime = st.st_mtime
        atime = datetime.datetime.now().timestamp()
        self.destructive(None, os.utime, str(dest), (atime, mtime))

class Sync:
    """
    computes which files differ between `in_dir` and `out_dir`,
    and applies the deletions/copies needed to bring `out_dir` up to date.
    """
    def __init__(self, encoder: Encoder, in_dir: str, out_dir: str, force_copy: bool = False):
        self.encoder = encoder
        self.in_dir = in_dir
        self.out_dir = out_dir
        # force_copy: re-copy files even when their mtimes already agree
        self.force_copy = force_copy

    def target_name(self, source_name: Path) -> Path | None:
        """
        map a source path (relative to in_dir) to its destination path (relative to out_dir).
        returns `None` if the file should not be synced: either it's explicitly ignored,
        or its filetype isn't recognized (in which case a warning is logged).
        """
        n = clean_name(source_name)
        output_type = self.encoder.prefs.get_output(n.suffix)

        if output_type is None:
            logger.warning(f"skipping {source_name} because i don't recognize its filetype ({n.suffix})")
            return None

        if output_type == ".null":
            return None
        elif output_type == n.suffix:
            return n
        else:
            # a changed filetype is *appended* to the cleaned name, e.g. `song.wav` -> `song.wav.mp3`
            return Path(str(n) + output_type)

    def calculate_delta(self) -> tuple[set[Path], set[tuple[Path, Path]], set[tuple[Path, Path]]]:
        '''
        Returns, as a tuple:
        - dest files which need to be deleted
        - new files to copy (in-path/out-path pairs)
        - existing files which need to be updated (in-path/out-path pairs)

        all returned paths are relative to in_dir/out_dir.
        '''
        in_files = { p.relative_to(self.in_dir) for p in Path(self.in_dir).rglob("*") if not p.is_dir() }
        logger.info(f'found {len(in_files)} files in source')

        # create a map from source path to dest path, dropping files which shouldn't be synced
        in_out_map = {}
        for in_f in in_files:
            out_f = self.target_name(in_f)
            if out_f is not None:
                in_out_map[in_f] = out_f
        # report the number of files which survived the filter above, not the raw input count
        logger.info(f'recognized {len(in_out_map)} source files as media')

        existing_out_files = { p.relative_to(self.out_dir) for p in Path(self.out_dir).rglob("*") if not p.is_dir() }
        logger.info(f'found {len(existing_out_files)} files in dest')

        # materialize as a set: membership tests against a `dict.values()` view are linear scans
        expected_out_files = set(in_out_map.values())

        to_del = {
          f for f in existing_out_files
          if f not in expected_out_files
        }
        logger.info(f'found {len(to_del)} files to delete')

        to_copy = {
          (in_f, out_f) for (in_f, out_f) in in_out_map.items()
          if out_f not in existing_out_files
        }
        logger.info(f'found {len(to_copy)} files to copy')

        to_update = {
          (in_f, out_f) for (in_f, out_f) in in_out_map.items()
          if (in_f, out_f) not in to_copy and (self.force_copy or self.needs_update(in_f, out_f))
        }
        logger.info(f'found {len(to_update)} files to update')

        return to_del, to_copy, to_update

    def needs_update(self, src: Path, dest: Path) -> bool:
        '''
        report whether `dest` is out of date: true when its mtime differs from
        that of `src` by more than 120 seconds.

        files are relative to in_dir/out_dir
        '''
        src_stat = os.stat(Path(self.in_dir) / src)
        dest_stat = os.stat(Path(self.out_dir) / dest)
        return not approx_eq(src_stat.st_mtime, dest_stat.st_mtime, threshold=120.0)

    def rm_dest_files(self, files: list[Path]) -> None:
        '''
        remove each of `files` from the destination, along with any directories left empty by that.

        files are relative to out_dir
        '''
        for f in files:
            logger.info(f'removing {f}')
            f = Path(self.out_dir) / f
            self.encoder.remove(f)
            # if the directory is empty after removing this file, then remove the directory (and possibly prune its parents too)
            # NOTE(review): `os.removedirs` keeps pruning upward for as long as the parents are empty;
            # nothing here stops it at out_dir -- confirm that's acceptable.
            if not os.listdir(f.parent):
                os.removedirs(f.parent)

    def copy_one(self, src_name: Path, dest_name: Path) -> None:
        '''
        copy (or transcode) a single file, creating the destination directory first.

        path names are relative to in_dir/out_dir
        '''
        source = Path(self.in_dir) / src_name
        dest = Path(self.out_dir) / dest_name

        self.encoder.ensure_dir(dest.parent)

        self.encoder.cp_or_convert(source, dest)

    def try_invoke(self, f, *args) -> None:
        """
        try to invoke `f` with the provided `args`, and log if it fails.
        this overcomes the issue that background tasks which fail via Exception otherwise do so silently.
        """
        try:
            f(*args)
        except Exception as e:
            logger.error(f"task failed: {e}")

    def cp_files(self, file_pairs: list[tuple[Path, Path]], jobs: int):
        """
        copy every (source, dest) pair in `file_pairs`, using up to `jobs` worker threads.
        a failure to copy one file is logged and does not stop the others.
        """
        logger.info(f'using {jobs} jobs to copy {len(file_pairs)} files')
        # leaving the `with` block waits for every submitted task to finish
        with concurrent.futures.ThreadPoolExecutor(max_workers=jobs) as executor:
            for src_f, dest_f in file_pairs:
                executor.submit(self.try_invoke, self.copy_one, src_f, dest_f)


def sync_all(
    in_dir: str,
    out_dir: str,
    compress: bool = False,
    compat: bool = False,
    force_copy: bool = False,
    dry_run: bool = False,
    jobs: int | None = None,
) -> None:
    """
    synchronize `out_dir` with `in_dir`: delete stale destination files,
    then copy/transcode whichever files are new or out of date.

    - compress: compress audio files (to mp3).
    - compat: convert poorly supported audio formats to better-supported ones.
    - force_copy: also re-copy files which already exist in the destination.
    - dry_run: don't run any command which has side effects.
    - jobs: number of worker threads; `None` (or 0) means one per CPU core.
    """
    prefs = TranscodePreferences(compress=compress, compat=compat)
    encoder = Encoder(prefs, dry_run=dry_run)
    sync = Sync(encoder, in_dir, out_dir, force_copy=force_copy)
    to_del, to_copy, to_update = sync.calculate_delta()

    # the deltas are sets: sort them so that files are processed in a stable order
    sync.rm_dest_files(sorted(to_del))
    sync.cp_files(sorted(to_copy) + sorted(to_update), jobs=jobs or multiprocessing.cpu_count())

def main() -> None:
    """
    command-line entry point: configure logging, parse the arguments, and run the sync.
    """
    logging.basicConfig()
    logger.setLevel(logging.INFO)

    # (name, add_argument options) for each argument, in the order they're registered
    cli_spec = [
        ("src", dict(help="source directory")),
        ("dest", dict(help="destination directory")),
        ("--compress", dict(action='store_true', help="compress audio files (to mp3)")),
        ("--compat", dict(action='store_true', help="convert poorly supported file formats to better-supported formats (e.g. avoid wma)")),
        ("--jobs", dict(help="number of cores to compress music with (default: all CPU cores)", default=None, type=int)),
        ("--dry-run", dict(action='store_true', help="don't actually run any commands")),
        ("--verbose", dict(action='store_true', help="more logging")),
        ("--quiet", dict(action='store_true', help="less logging")),
        ("--force-copy", dict(action='store_true', help="copy over files that already exist (in case metadata needs updating)")),
    ]

    parser = argparse.ArgumentParser(description="synchronize music from one directory to another, possibly compressing it")
    for name, options in cli_spec:
        parser.add_argument(name, **options)

    args = parser.parse_args()

    # --quiet wins if both --quiet and --verbose are given
    if args.quiet:
        logger.setLevel(logging.WARN)
    elif args.verbose:
        logger.setLevel(logging.DEBUG)

    sync_all(
        args.src,
        args.dest,
        compress=args.compress,
        compat=args.compat,
        force_copy=args.force_copy,
        dry_run=args.dry_run,
        jobs=args.jobs,
    )

# only run the sync when this file is executed as a script, not when it's imported
if __name__ == '__main__':
    main()