sane-sync-music: add --compress and --compat options

2023-12-08 10:24:48 +00:00 · 2023-12-08 10:24:48 +00:00 · 4caf61387e
commit 4caf61387e
parent ab020327f4
1 changed files with 239 additions and 87 deletions
--- a/pkgs/additional/sane-scripts/src/sane-sync-music
+++ b/pkgs/additional/sane-scripts/src/sane-sync-music
@ -13,56 +13,163 @@ from unidecode import unidecode

 logger = logging.getLogger(__name__)

-MAKE_MP3 = [
+LOSSLESS_FMTS = [
    '.flac',
+    '.wav',
+]
+
+MAYBE_LOSSY_FMTS = [
+    # WMA can be lossy or lossless
+    '.wma',
+]
+
+LOSSY_FMTS = [
+    '.aac',
+    '.m4a',
+    '.mp3',
    '.oga',
    '.ogg',
    '.opus',
-    '.wav',
-    '.wma',
 ]
-COPY_RAW = [
-    '.aac',
+
+COMPAT_AUDIO_FMTS = [
+    '.flac',
+    '.mp3',
+    '.oga',
+    '.ogg',
+    '.opus',
+]
+
+AUDIO_FMTS = LOSSLESS_FMTS + MAYBE_LOSSY_FMTS + LOSSY_FMTS
+
+IMAGE_FMTS = [
    '.bmp',
    '.gif',
    '.jpeg',
    '.jpg',
-    '.m4a',
-    '.mp3',
    '.png',
 ]
-IGNORE = [
-    '.DS_Store',
+
+VIDEO_FMTS = [
    '.avi',
-    '.cue',
-    '.inf',
-    '.log',
-    '.lyrics',
-    '.m3u',
    '.mov',
    '.mp4',
-    '.nsf_',
-    '.pdf',
-    '.toc',
-    '.txt',
    '.webm',
 ]

+METADATA_FMTS = [
+    '.inf',
+    '.lyrics',
+    '.nfo',
+    '.pdf',
+    '.toc',
+    '.txt',
+]
+
+NON_AUDIO_FMTS = IMAGE_FMTS + VIDEO_FMTS + METADATA_FMTS
+
+IGNORE = [
+    '.DS_Store',
+    '.cue',
+    '.log',
+    '.m3u',
+    '.nsf_',
+]
+
+def clean_name(path: Path) -> Path:
+    '''
+    transform a path into something which most filesystems/protocols can reliably support.
+    also removes irregularities like uppercase file extensions.
+
+    each path component is passed through `unidecode` and then stripped of punctuation
+    that is commonly special to shells, filesystems or URLs.
+    a component which ends up empty contributes nothing to the result.
+    if nothing at all survives, the empty path is returned.
+    '''
+    # the set of forbidden characters never changes, so define it once, outside the loop.
+    blacklisted = '"\'!@#$%^&*()[]{};:,<>?`~|\\/'
+    out_path = Path()
+    for part in path.parts:
+        part = unidecode(part)
+        part = ''.join(c for c in part if c not in blacklisted)
+        out_path /= part
+
+    if not out_path.name:
+        # every character was stripped (or `path` was empty to begin with).
+        # `with_suffix` raises ValueError on a path with an empty name, so return as-is.
+        return out_path
+
+    return out_path.with_suffix(out_path.suffix.lower())
+
+class TranscodePreferences:
+    '''
+    user-selected policy deciding which file type each source file should become in the destination.
+
+    - `compress`: anything not already in LOSSY_FMTS is targeted at mp3,
+      and sample rates above 48000 are reduced to 48000 or 44100.
+    - `compat`: anything outside COMPAT_AUDIO_FMTS is targeted at flac
+      (if the source is in LOSSLESS_FMTS) or mp3 (otherwise).
+    '''
+    def __init__(self, compress: bool, compat: bool):
+        self.compress = compress
+        self.compat = compat
+
+    def get_output(self, input_ext: str) -> str | None:
+        """
+        map a source extension (e.g. `.wav`) to the extension the synced file should have (e.g. `.mp3`).
+        - `.null` means the file is recognized but should not be copied at all.
+        - `None` means the extension isn't one this tool knows about.
+        """
+        if input_ext in AUDIO_FMTS:
+            # compression is decided first; the compat pass then operates on that result.
+            output_ext = input_ext
+            if self.compress:
+                output_ext = self.get_compressed_audio_output(output_ext)
+            if self.compat:
+                output_ext = self.get_compat_audio_output(output_ext)
+            return output_ext
+
+        # images, videos and metadata keep their extension
+        for passthrough_fmts in (IMAGE_FMTS, VIDEO_FMTS, METADATA_FMTS):
+            if input_ext in passthrough_fmts:
+                return input_ext
+
+        if input_ext in IGNORE:
+            return ".null"
+
+        return None
+
+    def desired_samplerate(self, input_samplerate: int | None) -> int | None:
+        """
+        return the sample rate the output should use for a given input sample rate.
+        returns `None` for any rate not in the table (including an input of `None`).
+        """
+        # every known rate maps to itself by default...
+        preserved = {rate: rate for rate in (192000, 96000, 88200, 48000, 44100)}
+        # ...and the high rates are overridden when compressing.
+        reduced = {192000: 48000, 96000: 48000, 88200: 44100}
+        table = (preserved | reduced) if self.compress else preserved
+        return table.get(input_samplerate)
+
+    def get_compressed_audio_output(self, input_ext: str) -> str:
+        """ already-lossy formats are left alone; everything else becomes mp3. """
+        return input_ext if input_ext in LOSSY_FMTS else ".mp3"
+
+    def get_compat_audio_output(self, input_ext: str) -> str:
+        """ well-supported formats are left alone; others become flac (lossless sources) or mp3. """
+        if input_ext in COMPAT_AUDIO_FMTS:
+            return input_ext
+        return ".flac" if input_ext in LOSSLESS_FMTS else ".mp3"
+
+
 class Encoder:
-    def __init__(self, dry_run: bool = False):
+    def __init__(self, prefs: TranscodePreferences, dry_run: bool = False):
+        self.prefs = prefs
        self.dry_run = dry_run

-    def check_output_no_sideeffect(self, args: list[str]) -> bytes:
-        return subprocess.check_output(args)
+    def _check_output(self, args: list[str], quiet: bool = False) -> bytes:
+        res = subprocess.run(args, capture_output=True)

-    def check_output(self, args: list[str]) -> str:
-        if self.dry_run:
+        stderr = res.stderr.strip()
+        if stderr and not quiet:
+            logger.error(stderr)
+
+        res.check_returncode()
+        return res.stdout
+
+    def check_output(self, args: list[str], has_side_effect=True, **kwargs) -> str:
+        if self.dry_run and has_side_effect:
            logger.debug("not invoking because dry run: " + ' '.join(args))
            return b""
        else:
-            return subprocess.check_output(args)
+            return self._check_output(args, **kwargs)

    def cp(self, source: Path, dest: Path) -> None:
+        logger.info(f'copying {source} -> {dest}')
        self.check_output(['cp', str(source), str(dest)])

    def ensure_dir(self, dir: Path) -> None:
@ -78,51 +185,52 @@ class Encoder:
        else:
            os.remove(path)

-    def convert(self, source: Path, dest: Path) -> None:
-        source_samplerate = None
-        try:
-            source_samplerate = int(self.check_output_no_sideeffect(['soxi', '-r', str(source)]).decode("utf-8").strip())
-        except:
-            if str(source).endswith('.wma'):
-                logging.debug(f'unsupported extension for samplerate: {source}')
-            else:
-                logging.warning(f'unable to obtain samplerate for {source}')
+    def convert(self, source: Path, dest: Path, target_samplerate: int | None) -> None:
+        assert dest.suffix == '.mp3', "conversion to a target other than mp3 not yet supported"
+        logger.info(f'converting {source} -> {dest}')

-        samplerate_map = {
-            192000: 48000,
-            96000: 48000,
-            88200: 44100,
-            # preserve as-is
-            48000: 48000,
-            44100: 44100,
-        }
-
-        target_samplerate = samplerate_map.get(source_samplerate)
-        if source_samplerate and not target_samplerate:
-            logging.warning(f'unable to map source sample rate: {source_samplerate}')
        samplerate_flags = ['-ar', str(target_samplerate)] if target_samplerate else []

        self.check_output([
            'ffmpeg',
+            '-loglevel', 'warning',
            '-i', str(source),
-            '-y', # overwrite output
            '-codec:v', 'copy',
            '-codec:a', 'libmp3lame',
            '-qscale:a', '0'
        ] + samplerate_flags + [str(dest)])

+    def cp_or_convert(self, source: Path, dest: Path) -> None:
+        '''
+        place `source` at `dest`, re-encoding only when that's actually required:
+        - the sample rate has to change (per `self.prefs.desired_samplerate`), or
+        - `dest` has a different extension than `source`.
+        otherwise the file is copied verbatim.
+        '''
+        source_samplerate = None
+        if source.suffix.lower() not in NON_AUDIO_FMTS:
+            try:
+                source_samplerate = int(
+                    self.check_output(
+                        ['soxi', '-r', str(source)],
+                        has_side_effect=False,
+                        quiet=True,
+                    ).decode("utf-8").strip()
+                )
+            except Exception:
+                # best-effort probe: without a samplerate we simply won't resample.
+                # `Exception` (not a bare `except`) so KeyboardInterrupt/SystemExit still propagate.
+                if source.suffix.lower() in ['.aac', '.m4a', '.wma']:
+                    # sox is known to not support these formats
+                    logger.debug(f'unsupported extension for samplerate: {source}')
+                else:
+                    logger.warning(f'unable to obtain samplerate for {source}')

-def clean_name(path: str) -> Path:
-    '''
-    transform a path into something which most filesystems/protocols can reliably support
-    '''
-    out_path = Path()
-    for part in path.parts:
-        blacklisted = '"\'!@#$%^&*()[]{};:,<>?`~|\\/'
-        part = unidecode(part)
-        part = ''.join(c for c in part if c not in blacklisted)
-        out_path /= part
-    return out_path
+        target_samplerate = self.prefs.desired_samplerate(source_samplerate)
+        if source_samplerate and not target_samplerate:
+            logger.warning(f'unable to map source sample rate: {source_samplerate}')
+
+        # an unmapped rate (target_samplerate is None) means "no preference", not "resample":
+        # comparing it against the source rate directly would force a needless re-encode
+        # of every file whose rate isn't in the table.
+        needs_resample = target_samplerate is not None and source_samplerate != target_samplerate
+        needs_transcode = source.suffix.lower() != dest.suffix
+
+        if needs_resample or needs_transcode:
+            self.convert(source, dest, target_samplerate)
+        else:
+            # neither resampling nor transcoding -> simple copy will suffice
+            self.cp(source, dest)

 class Sync:
    def __init__(self, encoder: Encoder, in_dir: str, out_dir: str, force_copy: bool = False):
@ -131,66 +239,100 @@ class Sync:
        self.out_dir = out_dir
        self.force_copy = force_copy

-    def target_name(self, source_name: str) -> Path:
+    def target_name(self, source_name: Path) -> Path | None:
        n = clean_name(source_name)
-        if n.suffix in MAKE_MP3:
-            return Path(str(n) + '.mp3')
-        else:
-            return n
+        output_type = self.encoder.prefs.get_output(n.suffix)

-    def calculate_delta(self) -> tuple[set[Path], set[Path]]:
+        if output_type is None:
+            logger.warning(f"skipping {source_name} because i don't recognize its filetype ({n.suffix})")
+            return None
+
+        if output_type == ".null":
+            return None
+        elif output_type == n.suffix:
+            return n
+        else:
+            return Path(str(n) + output_type)
+
+    def calculate_delta(self) -> tuple[set[Path], set[tuple[Path, Path]]]:
        '''
-        Returns the set of dest files which need to be deleted, followed by the files to copy
+        Returns, as a tuple:
+        - dest files which need to be deleted
+        - files to copy (in-path/out-path pairs)
+
+        all returned paths are relative to in_dir/out_dir.
        '''
        in_files = { p.relative_to(self.in_dir) for p in Path(self.in_dir).rglob("*") if not p.is_dir() }
        logger.info(f'found {len(in_files)} files in source')
+
+        # source files for which target_name() returns None are left out of the sync entirely.
+        in_out_map = {}
+        for in_f in in_files:
+            out_f = self.target_name(in_f)
+            if out_f is not None:
+                in_out_map[in_f] = out_f
+        logger.info(f'recognized {len(in_out_map)} source files as media')
+
        existing_out_files = { p.relative_to(self.out_dir) for p in Path(self.out_dir).rglob("*") if not p.is_dir() }
        logger.info(f'found {len(existing_out_files)} files in dest')

-        expected_out_files = { self.target_name(n) for n in in_files }
+        # a real set, so the membership test below is O(1) instead of a scan over a dict view.
+        expected_out_files = set(in_out_map.values())

        to_del = { f for f in existing_out_files if f not in expected_out_files }
        logger.info(f'found {len(to_del)} files to delete')
-        to_copy = { f for f in in_files if (self.force_copy or self.target_name(f) not in existing_out_files) and f.suffix not in IGNORE }
+        to_copy = { (in_f, out_f) for (in_f, out_f) in in_out_map.items() if (self.force_copy or out_f not in existing_out_files) }
        logger.info(f'found {len(to_copy)} files to copy')

        return to_del, to_copy

    def rm_dest_files(self, files: set[Path]) -> None:
+        '''
+        files are relative to out_dir
+        '''
        for f in files:
-            logger.info(f'removing {f} because it does not exist on host')
+            logger.info(f'removing {f}')
            self.encoder.remove(Path(self.out_dir) / f)

-    def copy_one(self, name: Path) -> None:
-        source = self.in_dir / name
-        dest = self.out_dir / self.target_name(name)
+    def copy_one(self, src_name: Path, dest_name: Path) -> None:
+        '''
+        path names are relative to in_dir/out_dir
+        '''
+        source = Path(self.in_dir) / src_name
+        dest = Path(self.out_dir) / dest_name

        self.encoder.ensure_dir(dest.parent)

-        if source.suffix in MAKE_MP3:
-            logger.debug(f'converting {source} -> {dest}')
-            self.encoder.convert(source, dest)
-        elif source.suffix in COPY_RAW:
-            logger.debug(f'copying {source} -> {dest}')
-            self.encoder.cp(source, dest)
-        else:
-            logger.warning(f"skipping {source} because I don't know what to do with that file type")
+        self.encoder.cp_or_convert(source, dest)

-    def cp_src_files(self, src_names: set[Path], jobs: int):
-        logger.info(f'using {jobs} jobs to copy {len(src_names)} files')
-        # Parallel(n_jobs=jobs)(delayed(copy_one)(encoder, in_dir / n, out_dir / target_name(n)) for n in src_names)
+    def try_invoke(self, f, *args) -> None:
+        """
+        try to invoke `f` with the provided `args`, and log if it fails.
+        this overcomes the issue that background tasks which fail via Exception otherwise do so silently.
+        """
+        try:
+            f(*args)
+        except Exception as e:
+            logger.error(f"task failed: {e}")
+
+    def cp_files(self, file_pairs: set[tuple[Path, Path]], jobs: int):
+        logger.info(f'using {jobs} jobs to copy {len(file_pairs)} files')
        with concurrent.futures.ThreadPoolExecutor(max_workers=jobs) as executor:
-            for n in src_names:
-                executor.submit(self.copy_one, n)
+            for src_f, dest_f in file_pairs:
+                executor.submit(self.try_invoke, self.copy_one, src_f, dest_f)


-def sync_all(in_dir: str, out_dir: str, jobs: int = None, dry_run: bool = False, force_copy: bool = False) -> None:
-    encoder = Encoder(dry_run=dry_run)
+def sync_all(
+    in_dir: str,
+    out_dir: str,
+    compress: bool = False,
+    compat: bool = False,
+    force_copy: bool = False,
+    dry_run: bool = False,
+    jobs: int = None,
+) -> None:
+    prefs = TranscodePreferences(compress=compress, compat=compat)
+    encoder = Encoder(prefs, dry_run=dry_run)
    sync = Sync(encoder, in_dir, out_dir, force_copy=force_copy)
    to_del, to_copy = sync.calculate_delta()

    sync.rm_dest_files(to_del)
-    sync.cp_src_files(to_copy, jobs = jobs or multiprocessing.cpu_count())
+    sync.cp_files(to_copy, jobs = jobs or multiprocessing.cpu_count())

 def main() -> None:
    logging.basicConfig()
@ -199,6 +341,8 @@ def main() -> None:
    parser = argparse.ArgumentParser(description="synchronize music from one directory to another, possibly compressing it")
    parser.add_argument("src", help="source directory")
    parser.add_argument("dest", help="destination directory")
+    parser.add_argument("--compress", action='store_true', help="compress audio files (to mp3)")
+    parser.add_argument("--compat", action='store_true', help="convert poorly supported file formats to better-supported formats (e.g. avoid wma)")
    parser.add_argument("--jobs", help="number of cores to compress music with (default: all CPU cores)", default=None, type=int)
    parser.add_argument("--dry-run", action='store_true', help="don't actually run any commands")
    parser.add_argument("--verbose", action='store_true', help="more logging")
@ -212,7 +356,15 @@ def main() -> None:
    if args.quiet:
        logger.setLevel(logging.WARN)

-    sync_all(args.src, args.dest, args.jobs, args.dry_run, args.force_copy)
+    sync_all(
+        args.src,
+        args.dest,
+        compress=args.compress,
+        compat=args.compat,
+        force_copy=args.force_copy,
+        dry_run=args.dry_run,
+        jobs=args.jobs,
+    )

 if __name__ == '__main__':
    main()