diff --git a/pkgs/additional/sane-scripts/src/sane-sync-music b/pkgs/additional/sane-scripts/src/sane-sync-music index 8b389664..ddf1ce66 100755 --- a/pkgs/additional/sane-scripts/src/sane-sync-music +++ b/pkgs/additional/sane-scripts/src/sane-sync-music @@ -13,56 +13,163 @@ from unidecode import unidecode logger = logging.getLogger(__name__) -MAKE_MP3 = [ +LOSSLESS_FMTS = [ '.flac', + '.wav', +] + +MAYBE_LOSSY_FMTS = [ + # WMA can be lossy or lossless + '.wma', +] + +LOSSY_FMTS = [ + '.aac', + '.m4a', + '.mp3', '.oga', '.ogg', '.opus', - '.wav', - '.wma', ] -COPY_RAW = [ - '.aac', + +COMPAT_AUDIO_FMTS = [ + '.flac', + '.mp3', + '.oga', + '.ogg', + '.opus', +] + +AUDIO_FMTS = LOSSLESS_FMTS + MAYBE_LOSSY_FMTS + LOSSY_FMTS + +IMAGE_FMTS = [ '.bmp', '.gif', '.jpeg', '.jpg', - '.m4a', - '.mp3', '.png', ] -IGNORE = [ - '.DS_Store', + +VIDEO_FMTS = [ '.avi', - '.cue', - '.inf', - '.log', - '.lyrics', - '.m3u', '.mov', '.mp4', - '.nsf_', - '.pdf', - '.toc', - '.txt', '.webm', ] +METADATA_FMTS = [ + '.inf', + '.lyrics', + '.nfo', + '.pdf', + '.toc', + '.txt', +] + +NON_AUDIO_FMTS = IMAGE_FMTS + VIDEO_FMTS + METADATA_FMTS + +IGNORE = [ + '.DS_Store', + '.cue', + '.log', + '.m3u', + '.nsf_', +] + +def clean_name(path: Path) -> Path: + ''' + transform a path into something which most filesystems/protocols can reliably support. + also removes irregularities like uppercase file extensions. + ''' + out_path = Path() + for part in path.parts: + blacklisted = '"\'!@#$%^&*()[]{};:,<>?`~|\\/' + part = unidecode(part) + part = ''.join(c for c in part if c not in blacklisted) + out_path /= part + + return out_path.with_suffix(out_path.suffix.lower()) + +class TranscodePreferences: + def __init__(self, compress: bool, compat: bool): + self.compress = compress + self.compat = compat + + def get_output(self, input_ext: str) -> str | None: + """ + for some source type (e.g. `.wav`), return the desired output type (e.g. `.mp3`). + returns `.null` to indicate the file shouldn't be copied. + returns `None` if i don't understand the source file. + """ + desired_output = None + + if input_ext in AUDIO_FMTS: + desired_output = input_ext + if self.compress: + desired_output = self.get_compressed_audio_output(desired_output) + if self.compat: + desired_output = self.get_compat_audio_output(desired_output) + elif input_ext in IMAGE_FMTS: + desired_output = input_ext + elif input_ext in VIDEO_FMTS: + desired_output = input_ext + elif input_ext in METADATA_FMTS: + desired_output = input_ext + elif input_ext in IGNORE: + desired_output = ".null" + + return desired_output + + def desired_samplerate(self, input_samplerate: int | None) -> int | None: + samplerate_map = { + 192000: 48000 if self.compress else 192000, + 96000: 48000 if self.compress else 96000, + 88200: 44100 if self.compress else 88200, + # preserve as-is + 48000: 48000, + 44100: 44100, + } + return samplerate_map.get(input_samplerate) + + def get_compressed_audio_output(self, input_ext: str) -> str: + if input_ext in LOSSY_FMTS: + return input_ext + else: + return ".mp3" + + def get_compat_audio_output(self, input_ext: str) -> str: + if input_ext in COMPAT_AUDIO_FMTS: + return input_ext + elif input_ext in LOSSLESS_FMTS: + return ".flac" + else: + return ".mp3" + + class Encoder: - def __init__(self, dry_run: bool = False): + def __init__(self, prefs: TranscodePreferences, dry_run: bool = False): + self.prefs = prefs self.dry_run = dry_run - def check_output_no_sideeffect(self, args: list[str]) -> bytes: - return subprocess.check_output(args) + def _check_output(self, args: list[str], quiet: bool = False) -> bytes: + res = subprocess.run(args, capture_output=True) - def check_output(self, args: list[str]) -> str: - if self.dry_run: + stderr = res.stderr.strip() + if stderr and not quiet: + logger.error(stderr) + + res.check_returncode() + return res.stdout + + def check_output(self, args: list[str], has_side_effect=True, **kwargs) -> str: + if self.dry_run and has_side_effect: logger.debug("not invoking because dry run: " + ' '.join(args)) return b"" else: - return subprocess.check_output(args) + return self._check_output(args, **kwargs) def cp(self, source: Path, dest: Path) -> None: + logger.info(f'copying {source} -> {dest}') self.check_output(['cp', str(source), str(dest)]) def ensure_dir(self, dir: Path) -> None: @@ -78,51 +185,52 @@ class Encoder: else: os.remove(path) - def convert(self, source: Path, dest: Path) -> None: - source_samplerate = None - try: - source_samplerate = int(self.check_output_no_sideeffect(['soxi', '-r', str(source)]).decode("utf-8").strip()) - except: - if str(source).endswith('.wma'): - logging.debug(f'unsupported extension for samplerate: {source}') - else: - logging.warning(f'unable to obtain samplerate for {source}') + def convert(self, source: Path, dest: Path, target_samplerate: int | None) -> None: + assert dest.suffix == '.mp3', "conversion to a target other than mp3 not yet supported" + logger.info(f'converting {source} -> {dest}') - samplerate_map = { - 192000: 48000, - 96000: 48000, - 88200: 44100, - # preserve as-is - 48000: 48000, - 44100: 44100, - } - - target_samplerate = samplerate_map.get(source_samplerate) - if source_samplerate and not target_samplerate: - logging.warning(f'unable to map source sample rate: {source_samplerate}') samplerate_flags = ['-ar', str(target_samplerate)] if target_samplerate else [] self.check_output([ 'ffmpeg', + '-loglevel', 'warning', '-i', str(source), - '-y', # overwrite output '-codec:v', 'copy', '-codec:a', 'libmp3lame', '-qscale:a', '0' ] + samplerate_flags + [str(dest)]) + def cp_or_convert(self, source: Path, dest: Path) -> None: + source_samplerate = None + if source.suffix.lower() not in NON_AUDIO_FMTS: + try: + source_samplerate = int( + self.check_output( + ['soxi', '-r', str(source)], + has_side_effect=False, + quiet=True, + ).decode("utf-8").strip() + ) + except: + if source.suffix.lower() in ['.aac', '.m4a', '.wma']: + # sox is known to not support these formats + logging.debug(f'unsupported extension for samplerate: {source}') + else: + logging.warning(f'unable to obtain samplerate for {source}') -def clean_name(path: str) -> Path: - ''' - transform a path into something which most filesystems/protocols can reliably support - ''' - out_path = Path() - for part in path.parts: - blacklisted = '"\'!@#$%^&*()[]{};:,<>?`~|\\/' - part = unidecode(part) - part = ''.join(c for c in part if c not in blacklisted) - out_path /= part - return out_path + target_samplerate = self.prefs.desired_samplerate(source_samplerate) + if source_samplerate and not target_samplerate: + logging.warning(f'unable to map source sample rate: {source_samplerate}') + + if source_samplerate != target_samplerate: + # resampling -> convert + self.convert(source, dest, target_samplerate) + elif source.suffix.lower() != dest.suffix: + # transcoding -> convert + self.convert(source, dest, target_samplerate) + else: + # neither resampling nor transcoding -> simple copy will suffice + self.cp(source, dest) class Sync: def __init__(self, encoder: Encoder, in_dir: str, out_dir: str, force_copy: bool = False): @@ -131,66 +239,100 @@ class Sync: self.out_dir = out_dir self.force_copy = force_copy - def target_name(self, source_name: str) -> Path: + def target_name(self, source_name: Path) -> Path | None: n = clean_name(source_name) - if n.suffix in MAKE_MP3: - return Path(str(n) + '.mp3') - else: - return n + output_type = self.encoder.prefs.get_output(n.suffix) - def calculate_delta(self) -> tuple[set[Path], set[Path]]: + if output_type is None: + logger.warning(f"skipping {source_name} because i don't recognize its filetype ({n.suffix})") + return None + + if output_type == ".null": + return None + elif output_type == n.suffix: + return n + else: + return Path(str(n) + output_type) + + def calculate_delta(self) -> tuple[set[Path], set[tuple[Path, Path]]]: ''' - Returns the set of dest files which need to be deleted, followed by the files to copy + Returns, as a tuple: + - dest files which need to be deleted + - files to copy (in-path/out-path pairs) + + all returned paths are relative to in_dir/out_dir. ''' in_files = { p.relative_to(self.in_dir) for p in Path(self.in_dir).rglob("*") if not p.is_dir() } logger.info(f'found {len(in_files)} files in source') + + in_out_map = ((in_f, self.target_name(in_f)) for in_f in in_files) + in_out_map = dict((in_f, out_f) for (in_f, out_f) in in_out_map if out_f is not None) + logger.info(f'recognized {len(in_files)} source files as media') + existing_out_files = { p.relative_to(self.out_dir) for p in Path(self.out_dir).rglob("*") if not p.is_dir() } logger.info(f'found {len(existing_out_files)} files in dest') - expected_out_files = { self.target_name(n) for n in in_files } + expected_out_files = in_out_map.values() to_del = { f for f in existing_out_files if f not in expected_out_files } logger.info(f'found {len(to_del)} files to delete') - to_copy = { f for f in in_files if (self.force_copy or self.target_name(f) not in existing_out_files) and f.suffix not in IGNORE } + to_copy = { (in_f, out_f) for (in_f, out_f) in in_out_map.items() if (self.force_copy or out_f not in existing_out_files) } logger.info(f'found {len(to_copy)} files to copy') return to_del, to_copy def rm_dest_files(self, files: set[Path]) -> None: + ''' + files are relative to out_dir + ''' for f in files: - logger.info(f'removing {f} because it does not exist on host') + logger.info(f'removing {f}') self.encoder.remove(Path(self.out_dir) / f) - def copy_one(self, name: Path) -> None: - source = self.in_dir / name - dest = self.out_dir / self.target_name(name) + def copy_one(self, src_name: Path, dest_name: Path) -> None: + ''' + path names are relative to in_dir/out_dir + ''' + source = Path(self.in_dir) / src_name + dest = Path(self.out_dir) / dest_name self.encoder.ensure_dir(dest.parent) - if source.suffix in MAKE_MP3: - logger.debug(f'converting {source} -> {dest}') - self.encoder.convert(source, dest) - elif source.suffix in COPY_RAW: - logger.debug(f'copying {source} -> {dest}') - self.encoder.cp(source, dest) - else: - logger.warning(f"skipping {source} because I don't know what to do with that file type") + self.encoder.cp_or_convert(source, dest) - def cp_src_files(self, src_names: set[Path], jobs: int): - logger.info(f'using {jobs} jobs to copy {len(src_names)} files') - # Parallel(n_jobs=jobs)(delayed(copy_one)(encoder, in_dir / n, out_dir / target_name(n)) for n in src_names) + def try_invoke(self, f, *args) -> None: + """ + try to invoke `f` with the provided `args`, and log if it fails. + this overcomes the issue that background tasks which fail via Exception otherwise do so silently. + """ + try: + f(*args) + except Exception as e: + logger.error(f"task failed: {e}") + + def cp_files(self, file_pairs: set[tuple[Path, Path]], jobs: int): + logger.info(f'using {jobs} jobs to copy {len(file_pairs)} files') with concurrent.futures.ThreadPoolExecutor(max_workers=jobs) as executor: - for n in src_names: - executor.submit(self.copy_one, n) + for src_f, dest_f in file_pairs: + executor.submit(self.try_invoke, self.copy_one, src_f, dest_f) -def sync_all(in_dir: str, out_dir: str, jobs: int = None, dry_run: bool = False, force_copy: bool = False) -> None: - encoder = Encoder(dry_run=dry_run) +def sync_all( + in_dir: str, + out_dir: str, + compress: bool = False, + compat: bool = False, + force_copy: bool = False, + dry_run: bool = False, + jobs: int = None, +) -> None: + prefs = TranscodePreferences(compress=compress, compat=compat) + encoder = Encoder(prefs, dry_run=dry_run) sync = Sync(encoder, in_dir, out_dir, force_copy=force_copy) to_del, to_copy = sync.calculate_delta() sync.rm_dest_files(to_del) - sync.cp_src_files(to_copy, jobs = jobs or multiprocessing.cpu_count()) + sync.cp_files(to_copy, jobs = jobs or multiprocessing.cpu_count()) def main() -> None: logging.basicConfig() @@ -199,6 +341,8 @@ def main() -> None: parser = argparse.ArgumentParser(description="synchronize music from one directory to another, possibly compressing it") parser.add_argument("src", help="source directory") parser.add_argument("dest", help="destination directory") + parser.add_argument("--compress", action='store_true', help="compress audio files (to mp3)") + parser.add_argument("--compat", action='store_true', help="convert poorly supported file formats to better-supported formats (e.g. avoid wma)") parser.add_argument("--jobs", help="number of cores to compress music with (default: all CPU cores)", default=None, type=int) parser.add_argument("--dry-run", action='store_true', help="don't actually run any commands") parser.add_argument("--verbose", action='store_true', help="more logging") @@ -212,7 +356,15 @@ def main() -> None: if args.quiet: logger.setLevel(logging.WARN) - sync_all(args.src, args.dest, args.jobs, args.dry_run, args.force_copy) + sync_all( + args.src, + args.dest, + compress=args.compress, + compat=args.compat, + force_copy=args.force_copy, + dry_run=args.dry_run, + jobs=args.jobs, + ) if __name__ == '__main__': main()