From 59ae65a8de5b6b74bfc811b4ec2e58056e0a79b3 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Sun, 17 Nov 2024 23:57:26 +0000 Subject: [PATCH 01/66] Nov 17, 2024, 3:57 PM --- .gitignore | 27 +++ sample-shrinker-python/README.md | 130 +++++++++++ sample-shrinker-python/requirements.txt | 4 + sample-shrinker-python/sample-shrinker.py | 253 ++++++++++++++++++++++ 4 files changed, 414 insertions(+) create mode 100644 .gitignore create mode 100644 sample-shrinker-python/README.md create mode 100644 sample-shrinker-python/requirements.txt create mode 100644 sample-shrinker-python/sample-shrinker.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..061bfe6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,27 @@ + +# Ignore macOS metadata files +.DS_Store +._* +Thumbs.db + +# Ignore Python bytecode +__pycache__/ +*.py[cod] + +# Ignore logs and backup files +*.log +_backup/ + +# Ignore output files like spectrograms +*.png + +# Ignore directories created by the script +sample-shrinker-python/_backup/ +sample-shrinker-python/*.log +sample-shrinker-python/*.png + +# Virtual environment files +venv/ +env/ +.venv +sample-shrinker_venv/ \ No newline at end of file diff --git a/sample-shrinker-python/README.md b/sample-shrinker-python/README.md new file mode 100644 index 0000000..44c5d3c --- /dev/null +++ b/sample-shrinker-python/README.md @@ -0,0 +1,130 @@ + +# Sample Shrinker + +A Python script to conditionally batch-convert audio samples into minimal `.wav` files, based on target criteria. This script is useful for saving storage space and reducing the I/O stress during simultaneous real-time streaming of multiple `.wav` files on devices like the Dirtywave M8 tracker. + +If you have directories full of 24/32-bit stereo `.wav` files or stereo samples with effectively mono content, this script can reclaim wasted storage space and reduce I/O stress on your SD card. It can also detect if the content of a stereo sample is actually mono and convert it automatically! + +## Features +- **Conditional Conversion**: Only converts samples that don't meet the target criteria (bit depth, channels, etc.). +- **Auto-Mono**: Automatically convert stereo samples to mono if the content is effectively mono, with a configurable threshold. +- **Backup and Spectrogram Generation**: Converted files are backed up (unless disabled) and spectrograms of old vs. new files are generated. +- **Pre-Normalization**: Optionally normalize samples before downsampling the bit depth to preserve dynamic range. +- **Parallel Processing**: Use the `-j` option to process multiple files in parallel for faster conversions. + +## Requirements + +- Python 3.10 or later +- `pydub`, `librosa`, `matplotlib`, `soundfile` (install with `pip`) +- `ffmpeg` or `libav` installed for `pydub` + +Install dependencies: +```bash +pip install -r requirements.txt +``` + +You will also need `ffmpeg`: +```bash +# MacOS with Homebrew +brew install ffmpeg + +# Ubuntu/Debian +sudo apt install ffmpeg +``` + +## Usage + +```bash +python sample-shrinker.py [options] FILE|DIRECTORY ... +``` + +### Basic Example: +```bash +python sample-shrinker.py directory_of_samples/ +``` + +This will: +- Convert samples in place with a target bit depth of 16 and stereo channels unchanged. +- Back up the original files in a parallel `_backup/` directory. +- Generate `.png` spectrograms comparing old and new files. + +### Options: +- `-b BIT_DEPTH`: Set the target bit depth (default: 16). Samples will only be downsampled unless `-B` is set. +- `-B MIN_BIT_DEPTH`: Set a minimum bit depth. This will upsample any samples below the minimum. +- `-c CHANNELS`: Set the target number of output channels (default: 2). For mono, use `-c 1`. +- `-r SAMPLERATE`: Set the target sample rate (default: 44100 Hz). +- `-R MIN_SAMPLERATE`: Set a minimum sample rate. Samples below this will be upsampled. +- `-a`: Automatically convert stereo samples to mono if they are effectively mono. +- `-A DB_THRESHOLD`: Set the auto-mono threshold in dB (default: `-95.5`). This implies `-a`. +- `-p`: Pre-normalize samples before downsampling bit depth. +- `-S`: Skip generating spectrogram files. +- `-d BACKUP_DIR`: Set a directory to store backups. Use `-d -` to disable backups and spectrogram generation. +- `-l`: List files and preview changes without converting. +- `-n`: Dry run—log actions without converting any files. +- `-j JOBS`: Process files in parallel with multiple jobs (default: 1). +- `-v`: Increase verbosity. + +## Examples + +### Convert a Directory with Default Settings +```bash +python sample-shrinker.py my_samples/ +``` +- Convert samples to 16-bit with channels left unchanged. +- Back up the original files under `_backup/`. +- Generate spectrogram `.png` files for comparison. + +### Convert to Mono Automatically for Effectively Mono Samples +```bash +python sample-shrinker.py -a my_samples/ +``` +- Automatically convert stereo samples to mono if they are effectively mono (i.e., the difference between the channels is below the threshold). + +### Preview Changes Without Modifying Files +```bash +python sample-shrinker.py -l -a -A -80 my_samples/ +``` +- Lists all files and shows which ones would be changed without actually modifying them. The threshold for auto-mono is set to -80 dB. + +### Convert and Skip Backups +```bash +python sample-shrinker.py -d - my_samples/ +``` +- Converts files but does not create backups or generate spectrograms. + +### Pre-Normalize Before Downsampling +```bash +python sample-shrinker.py -p my_samples/ +``` +- Normalize the audio before downsampling the bit depth to preserve as much dynamic range as possible. + +### Process Files in Parallel +```bash +python sample-shrinker.py -j 10 my_samples/ +``` +- Process up to 10 files at the same time for faster batch conversion. + +## Output Example: + +```bash +Processing file: /Volumes/Untitled/Samples/wii sports/sound effects/Baseball/Sample_0028.wav +/Volumes/Untitled/Samples/wii sports/sound effects/Baseball/Sample_0028.wav [UNCHANGED] +Processing file: /Volumes/Untitled/Samples/wii sports/sound effects/Boxing/Sample_0029.wav +/Volumes/Untitled/Samples/wii sports/sound effects/Baseball/Sample_0029.wav [CHANGED]: sample rate 48000 -> 44100 +Processing file: /Volumes/Untitled/Samples/wii sports/sound effects/Boxing/Sample_0030.wav +/Volumes/Untitled/Samples/wii sports/sound effects/Baseball/Sample_0030.wav[CHANGED]: auto-mono +``` + +In the updated output format: +- The script logs each file being processed with the `Processing file:` prefix. +- After processing, each file will either be marked as `[UNCHANGED]` or `[CHANGED]` depending on whether any modifications (bit depth, sample rate, or channels) were made. +- If changes are made, the specific adjustments (e.g., `sample rate 48000 -> 44100`) will be displayed. + +### Additional Details: +- The `[CHANGED]` notation follows files that were modified. +- `[UNCHANGED]` appears for files that meet the target criteria and required no modifications. +- **Changes made**: + - Sample rate conversions (e.g., `sample rate 48000 -> 44100`). + - Bit depth reductions (e.g., `bit depth 32 -> 16`). + - Channel conversions (e.g., stereo to mono). +- Verbose output (`-v`) will print additional information such as ongoing file processing. diff --git a/sample-shrinker-python/requirements.txt b/sample-shrinker-python/requirements.txt new file mode 100644 index 0000000..4a880fc --- /dev/null +++ b/sample-shrinker-python/requirements.txt @@ -0,0 +1,4 @@ +librosa==0.10.2.post1 +matplotlib==3.9.2 +numpy==2.1.2 +pydub==0.25.1 diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py new file mode 100644 index 0000000..c283fad --- /dev/null +++ b/sample-shrinker-python/sample-shrinker.py @@ -0,0 +1,253 @@ +import os +import shutil +import argparse +import soundfile as sf +from pydub import AudioSegment +import librosa +import matplotlib.pyplot as plt +import numpy as np +from concurrent.futures import ThreadPoolExecutor, as_completed +import concurrent.futures + +def usage_intro(): + return """ +Conditionally batch-converts audio samples into minimal .wav files. + +Each DIRECTORY is recursively searched for audio files to process, based on their extension (configured with -x). Any FILE specified directly will be processed (regardless of its extension). + +If a sample does not already meet the target BIT_DEPTH or CHANNELS, it will be converted in place and the original will be backed up to a parallel directory structure. + +Upon conversion, spectrogram .png files are generated alongside the backed-up original file to compare the original vs new audio files (disable with -S). + +Examples: + Recursively convert samples under 'sample_dir/' using the default settings: + $ sample-shrinker.py sample_dir/ + Convert samples down to 8-bit, mono: + $ sample-shrinker.py -c 1 -b 8 sample_dir/ + Auto-convert stereo samples to mono: + $ sample-shrinker.py -a sample_dir/ + """ + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="Batch convert audio files.") + parser.add_argument('files', nargs='+', help='Files or directories to process') + parser.add_argument('-b', '--bitdepth', type=int, default=16, help='Target bit depth (8, 16, 24)') + parser.add_argument('-B', '--min_bitdepth', type=int, help='Minimum bit depth (8, 16, 24)') + parser.add_argument('-c', '--channels', type=int, default=2, help='Target number of channels (1=mono, 2=stereo)') + parser.add_argument('-r', '--samplerate', type=int, default=44100, help='Target sample rate') + parser.add_argument('-R', '--min_samplerate', type=int, help='Minimum sample rate') + parser.add_argument('-x', '--ext', default='wav', help='File extension to search for (default: wav)') + parser.add_argument('-a', '--auto_mono', action='store_true', help='Automatically convert stereo samples to mono') + parser.add_argument('-A', '--auto_mono_threshold', type=float, default=-95.5, help='Auto-mono threshold dB') + parser.add_argument('-S', '--skip_spectrograms', action='store_true', help='Skip generating spectrogram files') + parser.add_argument('-d', '--backup_dir', default="_backup", help='Directory to store backups (default: _backup)') + parser.add_argument('-p', '--pre_normalize', action='store_true', help='Pre-normalize before downsampling bit-depth') + parser.add_argument('-l', '--list', action='store_true', help='List files without converting') + parser.add_argument('-n', '--dry_run', action='store_true', help='Log actions without converting') + parser.add_argument('-j', '--jobs', type=int, default=1, help='Number of parallel jobs (default: 1)') + parser.add_argument('-v', '--verbose', action='store_true', help='Increase verbosity') + + return parser.parse_args() + +def delete_resource_forks(directory): + """Recursively find and delete all '._' resource fork files in the directory.""" + for root, dirs, files in os.walk(directory): + for file in files: + if file.startswith("._"): + file_path = os.path.join(root, file) + print(f"Deleting resource fork file: {file_path}") + os.remove(file_path) + +def reencode_audio(file_path): + """Re-encode audio file to PCM 16-bit if it has a different encoding.""" + try: + with sf.SoundFile(file_path) as f: + print(f"Audio encoding: {f.format}, subtype: {f.subtype}, channels: {f.channels}") + if f.subtype != 'PCM_16': + # If the file is not PCM 16, re-save it as PCM_16 + data, samplerate = sf.read(file_path) + temp_output = file_path.replace(os.path.splitext(file_path)[1], "_reencoded.wav") + sf.write(temp_output, data, samplerate, subtype='PCM_16') + print(f"File re-encoded to PCM_16: {file_path} -> {temp_output}") + return temp_output + except Exception as e: + print(f"Error re-encoding {file_path}: {e}") + return None + +def process_audio(file_path, args, dry_run=False): + """Main function to process audio files based on arguments.""" + try: + print(f"Processing file: {file_path}") # Debug logging to trace progress + audio = AudioSegment.from_file(file_path) + modified = False + change_reason = [] + + # Check if we need to convert the channels + if audio.channels > args.channels: + change_reason.append("channels") + if not dry_run: + audio = audio.set_channels(args.channels) + modified = True + + # Auto-mono logic: convert stereo to mono if it is effectively mono + if args.auto_mono and audio.channels == 2: + mono_candidate = check_effectively_mono(audio, args.auto_mono_threshold) + if mono_candidate: + change_reason.append("auto-mono") + if not dry_run: + audio = audio.set_channels(1) + modified = True + + # Pre-normalize before downsampling bit depth if necessary + if args.pre_normalize: + change_reason.append("pre-normalize") + if not dry_run: + audio = audio.apply_gain(-audio.max_dBFS) + modified = True + + # Check if we need to convert the bit depth + if audio.sample_width * 8 > args.bitdepth: + change_reason.append(f"bit depth {audio.sample_width * 8} -> {args.bitdepth}") + if not dry_run: + audio = audio.set_sample_width(args.bitdepth // 8) + modified = True + + # Sample rate conversion logic: Downsample only + if audio.frame_rate > args.samplerate: + change_reason.append(f"sample rate {audio.frame_rate} -> {args.samplerate}") + if not dry_run: + audio = audio.set_frame_rate(args.samplerate) + modified = True + elif args.min_samplerate and audio.frame_rate < args.min_samplerate: + # Only upsample if the user specifies a minimum sample rate + change_reason.append(f"sample rate {audio.frame_rate} -> {args.min_samplerate}") + if not dry_run: + audio = audio.set_frame_rate(args.min_samplerate) + modified = True + + if modified: + print(f"{file_path} [CHANGED]: {', '.join(change_reason)}") + if not dry_run: + # Backup the original file if required + if args.backup_dir != "-": + backup_path = os.path.join(args.backup_dir, os.path.basename(file_path)) + os.makedirs(os.path.dirname(backup_path), exist_ok=True) + shutil.copy(file_path, backup_path) + + # Export the converted audio file + output_file = file_path.replace(os.path.splitext(file_path)[1], ".wav") + audio.export(output_file, format="wav") + + # Generate spectrogram if enabled + if not args.skip_spectrograms: + generate_spectrogram(file_path, output_file, args.backup_dir) + else: + print(f"{file_path} [UNCHANGED]") + + except Exception as e: + print(f"Error processing {file_path}: {e}") + + # Try re-encoding the file if ffmpeg failed + reencoded_file = reencode_audio(file_path) + if reencoded_file: + try: + # Retry the process with the re-encoded file + process_audio(reencoded_file, args, dry_run) + except Exception as retry_error: + print(f"Failed to process the re-encoded file {reencoded_file}: {retry_error}") + +def check_effectively_mono(audio, threshold_dB): + """Check if a stereo file is effectively mono.""" + left_channel = audio.split_to_mono()[0] + right_channel = audio.split_to_mono()[1].invert_phase() + + difference = left_channel.overlay(right_channel) + peak_diff_db = difference.max_dBFS + return peak_diff_db < threshold_dB + +def generate_spectrogram(original_file, new_file, backup_dir): + """Generate and save spectrograms for the original and new files.""" + y_old, sr_old = librosa.load(original_file, sr=None) + y_new, sr_new = librosa.load(new_file, sr=None) + + # Spectrogram for original file + plt.figure(figsize=(10, 4)) + D_old = librosa.amplitude_to_db(np.abs(librosa.stft(y_old)), ref=np.max) + librosa.display.specshow(D_old, sr=sr_old, x_axis='time', y_axis='log') + plt.colorbar(format='%+2.0f dB') + plt.title(f'Spectrogram of {original_file}') + old_spectrogram_path = os.path.join(backup_dir, os.path.basename(original_file) + ".old.png") + plt.savefig(old_spectrogram_path) + plt.close() + + # Spectrogram for new file + plt.figure(figsize=(10, 4)) + D_new = librosa.amplitude_to_db(np.abs(librosa.stft(y_new)), ref=np.max) + librosa.display.specshow(D_new, sr=sr_new, x_axis='time', y_axis='log') + plt.colorbar(format='%+2.0f dB') + plt.title(f'Spectrogram of {new_file}') + new_spectrogram_path = os.path.join(backup_dir, os.path.basename(new_file) + ".new.png") + plt.savefig(new_spectrogram_path) + plt.close() + +def list_files(args, file_list): + """Prints file summary and actions without performing them.""" + for file_path in file_list: + print(f"Previewing: {file_path}") + +def collect_files(args): + """Collect all files from provided directories and files, skipping resource fork files.""" + file_list = [] + for path in args.files: + if os.path.isdir(path): + for root, dirs, files in os.walk(path): + for file in files: + if file.endswith(f".{args.ext}") and not file.startswith("._"): + file_list.append(os.path.join(root, file)) + elif os.path.isfile(path): + if not os.path.basename(path).startswith("._"): + file_list.append(path) + return file_list + +def run_in_parallel(file_list, args): + """Run the audio processing in parallel.""" + try: + with ThreadPoolExecutor(max_workers=args.jobs) as executor: + futures = {executor.submit(process_audio, file, args): file for file in file_list} + for future in concurrent.futures.as_completed(futures): + try: + result = future.result() # Get the result of the future (processed file) + except Exception as exc: + file = futures[future] + print(f"File {file} generated an exception: {exc}") + except KeyboardInterrupt: + print("Received KeyboardInterrupt, attempting to cancel all threads...") + executor.shutdown(wait=False, cancel_futures=True) + raise + +def main(): + args = parse_args() + + # Ensure that at least one file or directory is provided + if not args.files: + print(usage_intro()) + return + + # Delete all '._' files before processing anything + for path in args.files: + if os.path.isdir(path): + delete_resource_forks(path) + + # Collect the files to process + file_list = collect_files(args) + + if args.dry_run or args.list: + list_files(args, file_list) + for file in file_list: + process_audio(file, args, dry_run=True) + else: + run_in_parallel(file_list, args) + +if __name__ == "__main__": + main() \ No newline at end of file From cb0b4cc4bcfac6454c52924e71577d3768c4d02a Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 00:11:47 +0000 Subject: [PATCH 02/66] Nov 17, 2024, 4:11 PM --- sample-shrinker-python/sample-shrinker.py | 174 ++++++++++++++++------ 1 file changed, 130 insertions(+), 44 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index c283fad..3e66d66 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -1,13 +1,15 @@ +import argparse +import concurrent.futures import os import shutil -import argparse -import soundfile as sf -from pydub import AudioSegment +from concurrent.futures import ThreadPoolExecutor, as_completed + import librosa import matplotlib.pyplot as plt import numpy as np -from concurrent.futures import ThreadPoolExecutor, as_completed -import concurrent.futures +import soundfile as sf +from pydub import AudioSegment + def usage_intro(): return """ @@ -28,28 +30,78 @@ def usage_intro(): $ sample-shrinker.py -a sample_dir/ """ + def parse_args(): """Parse command line arguments.""" parser = argparse.ArgumentParser(description="Batch convert audio files.") - parser.add_argument('files', nargs='+', help='Files or directories to process') - parser.add_argument('-b', '--bitdepth', type=int, default=16, help='Target bit depth (8, 16, 24)') - parser.add_argument('-B', '--min_bitdepth', type=int, help='Minimum bit depth (8, 16, 24)') - parser.add_argument('-c', '--channels', type=int, default=2, help='Target number of channels (1=mono, 2=stereo)') - parser.add_argument('-r', '--samplerate', type=int, default=44100, help='Target sample rate') - parser.add_argument('-R', '--min_samplerate', type=int, help='Minimum sample rate') - parser.add_argument('-x', '--ext', default='wav', help='File extension to search for (default: wav)') - parser.add_argument('-a', '--auto_mono', action='store_true', help='Automatically convert stereo samples to mono') - parser.add_argument('-A', '--auto_mono_threshold', type=float, default=-95.5, help='Auto-mono threshold dB') - parser.add_argument('-S', '--skip_spectrograms', action='store_true', help='Skip generating spectrogram files') - parser.add_argument('-d', '--backup_dir', default="_backup", help='Directory to store backups (default: _backup)') - parser.add_argument('-p', '--pre_normalize', action='store_true', help='Pre-normalize before downsampling bit-depth') - parser.add_argument('-l', '--list', action='store_true', help='List files without converting') - parser.add_argument('-n', '--dry_run', action='store_true', help='Log actions without converting') - parser.add_argument('-j', '--jobs', type=int, default=1, help='Number of parallel jobs (default: 1)') - parser.add_argument('-v', '--verbose', action='store_true', help='Increase verbosity') - + parser.add_argument("files", nargs="+", help="Files or directories to process") + parser.add_argument( + "-b", "--bitdepth", type=int, default=16, help="Target bit depth (8, 16, 24)" + ) + parser.add_argument( + "-B", "--min_bitdepth", type=int, help="Minimum bit depth (8, 16, 24)" + ) + parser.add_argument( + "-c", + "--channels", + type=int, + default=2, + help="Target number of channels (1=mono, 2=stereo)", + ) + parser.add_argument( + "-r", "--samplerate", type=int, default=44100, help="Target sample rate" + ) + parser.add_argument("-R", "--min_samplerate", type=int, help="Minimum sample rate") + parser.add_argument( + "-x", "--ext", default="wav", help="File extension to search for (default: wav)" + ) + parser.add_argument( + "-a", + "--auto_mono", + action="store_true", + help="Automatically convert stereo samples to mono", + ) + parser.add_argument( + "-A", + "--auto_mono_threshold", + type=float, + default=-95.5, + help="Auto-mono threshold dB", + ) + parser.add_argument( + "-S", + "--skip_spectrograms", + action="store_true", + help="Skip generating spectrogram files", + ) + parser.add_argument( + "-d", + "--backup_dir", + default="_backup", + help="Directory to store backups (default: _backup)", + ) + parser.add_argument( + "-p", + "--pre_normalize", + action="store_true", + help="Pre-normalize before downsampling bit-depth", + ) + parser.add_argument( + "-l", "--list", action="store_true", help="List files without converting" + ) + parser.add_argument( + "-n", "--dry_run", action="store_true", help="Log actions without converting" + ) + parser.add_argument( + "-j", "--jobs", type=int, default=1, help="Number of parallel jobs (default: 1)" + ) + parser.add_argument( + "-v", "--verbose", action="store_true", help="Increase verbosity" + ) + return parser.parse_args() + def delete_resource_forks(directory): """Recursively find and delete all '._' resource fork files in the directory.""" for root, dirs, files in os.walk(directory): @@ -59,26 +111,32 @@ def delete_resource_forks(directory): print(f"Deleting resource fork file: {file_path}") os.remove(file_path) + def reencode_audio(file_path): """Re-encode audio file to PCM 16-bit if it has a different encoding.""" try: with sf.SoundFile(file_path) as f: - print(f"Audio encoding: {f.format}, subtype: {f.subtype}, channels: {f.channels}") - if f.subtype != 'PCM_16': + print( + f"Audio encoding: {f.format}, subtype: {f.subtype}, channels: {f.channels}" + ) + if f.subtype != "PCM_16": # If the file is not PCM 16, re-save it as PCM_16 data, samplerate = sf.read(file_path) - temp_output = file_path.replace(os.path.splitext(file_path)[1], "_reencoded.wav") - sf.write(temp_output, data, samplerate, subtype='PCM_16') + temp_output = file_path.replace( + os.path.splitext(file_path)[1], "_reencoded.wav" + ) + sf.write(temp_output, data, samplerate, subtype="PCM_16") print(f"File re-encoded to PCM_16: {file_path} -> {temp_output}") return temp_output except Exception as e: print(f"Error re-encoding {file_path}: {e}") return None + def process_audio(file_path, args, dry_run=False): """Main function to process audio files based on arguments.""" try: - print(f"Processing file: {file_path}") # Debug logging to trace progress + print(f"Processing file: {file_path}") audio = AudioSegment.from_file(file_path) modified = False change_reason = [] @@ -108,7 +166,9 @@ def process_audio(file_path, args, dry_run=False): # Check if we need to convert the bit depth if audio.sample_width * 8 > args.bitdepth: - change_reason.append(f"bit depth {audio.sample_width * 8} -> {args.bitdepth}") + change_reason.append( + f"bit depth {audio.sample_width * 8} -> {args.bitdepth}" + ) if not dry_run: audio = audio.set_sample_width(args.bitdepth // 8) modified = True @@ -121,7 +181,9 @@ def process_audio(file_path, args, dry_run=False): modified = True elif args.min_samplerate and audio.frame_rate < args.min_samplerate: # Only upsample if the user specifies a minimum sample rate - change_reason.append(f"sample rate {audio.frame_rate} -> {args.min_samplerate}") + change_reason.append( + f"sample rate {audio.frame_rate} -> {args.min_samplerate}" + ) if not dry_run: audio = audio.set_frame_rate(args.min_samplerate) modified = True @@ -131,9 +193,13 @@ def process_audio(file_path, args, dry_run=False): if not dry_run: # Backup the original file if required if args.backup_dir != "-": - backup_path = os.path.join(args.backup_dir, os.path.basename(file_path)) + # Get the relative path from the current working directory + rel_path = os.path.relpath(file_path) + # Create the backup path maintaining the directory structure + backup_path = os.path.join(args.backup_dir, rel_path) + # Ensure the directory structure exists os.makedirs(os.path.dirname(backup_path), exist_ok=True) - shutil.copy(file_path, backup_path) + shutil.copy2(file_path, backup_path) # copy2 preserves metadata # Export the converted audio file output_file = file_path.replace(os.path.splitext(file_path)[1], ".wav") @@ -141,7 +207,9 @@ def process_audio(file_path, args, dry_run=False): # Generate spectrogram if enabled if not args.skip_spectrograms: - generate_spectrogram(file_path, output_file, args.backup_dir) + generate_spectrogram( + file_path, output_file, os.path.dirname(backup_path) + ) else: print(f"{file_path} [UNCHANGED]") @@ -155,7 +223,10 @@ def process_audio(file_path, args, dry_run=False): # Retry the process with the re-encoded file process_audio(reencoded_file, args, dry_run) except Exception as retry_error: - print(f"Failed to process the re-encoded file {reencoded_file}: {retry_error}") + print( + f"Failed to process the re-encoded file {reencoded_file}: {retry_error}" + ) + def check_effectively_mono(audio, threshold_dB): """Check if a stereo file is effectively mono.""" @@ -166,6 +237,7 @@ def check_effectively_mono(audio, threshold_dB): peak_diff_db = difference.max_dBFS return peak_diff_db < threshold_dB + def generate_spectrogram(original_file, new_file, backup_dir): """Generate and save spectrograms for the original and new files.""" y_old, sr_old = librosa.load(original_file, sr=None) @@ -174,28 +246,35 @@ def generate_spectrogram(original_file, new_file, backup_dir): # Spectrogram for original file plt.figure(figsize=(10, 4)) D_old = librosa.amplitude_to_db(np.abs(librosa.stft(y_old)), ref=np.max) - librosa.display.specshow(D_old, sr=sr_old, x_axis='time', y_axis='log') - plt.colorbar(format='%+2.0f dB') - plt.title(f'Spectrogram of {original_file}') - old_spectrogram_path = os.path.join(backup_dir, os.path.basename(original_file) + ".old.png") + librosa.display.specshow(D_old, sr=sr_old, x_axis="time", y_axis="log") + plt.colorbar(format="%+2.0f dB") + plt.title(f"Spectrogram of {os.path.basename(original_file)}") + old_spectrogram_path = os.path.join( + backup_dir, os.path.basename(original_file) + ".old.png" + ) + os.makedirs(backup_dir, exist_ok=True) # Ensure the directory exists plt.savefig(old_spectrogram_path) plt.close() # Spectrogram for new file plt.figure(figsize=(10, 4)) D_new = librosa.amplitude_to_db(np.abs(librosa.stft(y_new)), ref=np.max) - librosa.display.specshow(D_new, sr=sr_new, x_axis='time', y_axis='log') - plt.colorbar(format='%+2.0f dB') - plt.title(f'Spectrogram of {new_file}') - new_spectrogram_path = os.path.join(backup_dir, os.path.basename(new_file) + ".new.png") + librosa.display.specshow(D_new, sr=sr_new, x_axis="time", y_axis="log") + plt.colorbar(format="%+2.0f dB") + plt.title(f"Spectrogram of {os.path.basename(new_file)}") + new_spectrogram_path = os.path.join( + backup_dir, os.path.basename(new_file) + ".new.png" + ) plt.savefig(new_spectrogram_path) plt.close() + def list_files(args, file_list): """Prints file summary and actions without performing them.""" for file_path in file_list: print(f"Previewing: {file_path}") + def collect_files(args): """Collect all files from provided directories and files, skipping resource fork files.""" file_list = [] @@ -210,14 +289,19 @@ def collect_files(args): file_list.append(path) return file_list + def run_in_parallel(file_list, args): """Run the audio processing in parallel.""" try: with ThreadPoolExecutor(max_workers=args.jobs) as executor: - futures = {executor.submit(process_audio, file, args): file for file in file_list} + futures = { + executor.submit(process_audio, file, args): file for file in file_list + } for future in concurrent.futures.as_completed(futures): try: - result = future.result() # Get the result of the future (processed file) + result = ( + future.result() + ) # Get the result of the future (processed file) except Exception as exc: file = futures[future] print(f"File {file} generated an exception: {exc}") @@ -226,6 +310,7 @@ def run_in_parallel(file_list, args): executor.shutdown(wait=False, cancel_futures=True) raise + def main(): args = parse_args() @@ -249,5 +334,6 @@ def main(): else: run_in_parallel(file_list, args) + if __name__ == "__main__": - main() \ No newline at end of file + main() From afdd47b23103d40294b806b3b5e58867bd09ad8e Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 00:12:55 +0000 Subject: [PATCH 03/66] Nov 17, 2024, 4:12 PM --- sample-shrinker-python/sample-shrinker.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 3e66d66..0bfc0d2 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -53,7 +53,10 @@ def parse_args(): ) parser.add_argument("-R", "--min_samplerate", type=int, help="Minimum sample rate") parser.add_argument( - "-x", "--ext", default="wav", help="File extension to search for (default: wav)" + "-x", + "--ext", + default="wav,mp3", + help="Comma-separated file extensions to search for (default: wav,mp3)", ) parser.add_argument( "-a", @@ -276,16 +279,26 @@ def list_files(args, file_list): def collect_files(args): - """Collect all files from provided directories and files, skipping resource fork files.""" + """Collect all wav and mp3 files from provided directories and files.""" file_list = [] + # Split extensions string into a list and clean up whitespace + valid_extensions = [ext.strip().lower() for ext in args.ext.split(",")] + for path in args.files: if os.path.isdir(path): for root, dirs, files in os.walk(path): for file in files: - if file.endswith(f".{args.ext}") and not file.startswith("._"): + file_lower = file.lower() + # Check if file ends with any of the valid extensions + if any( + file_lower.endswith(f".{ext}") for ext in valid_extensions + ) and not file.startswith("._"): file_list.append(os.path.join(root, file)) elif os.path.isfile(path): - if not os.path.basename(path).startswith("._"): + file_lower = path.lower() + if any( + file_lower.endswith(f".{ext}") for ext in valid_extensions + ) and not os.path.basename(path).startswith("._"): file_list.append(path) return file_list From 91a9605256d59fac411268454d941e0646a3f5c2 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 00:17:48 +0000 Subject: [PATCH 04/66] Nov 17, 2024, 4:17 PM --- sample-shrinker-python/sample-shrinker.py | 115 +++++++++++++++++++--- 1 file changed, 101 insertions(+), 14 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 0bfc0d2..ba56239 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -2,11 +2,15 @@ import concurrent.futures import os import shutil +import time +from collections import defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path import librosa import matplotlib.pyplot as plt import numpy as np +import questionary import soundfile as sf from pydub import AudioSegment @@ -324,6 +328,61 @@ def run_in_parallel(file_list, args): raise +def find_duplicate_directories(paths): + """Find directories with matching names and file counts.""" + dir_map = defaultdict(list) + + for path in paths: + path = Path(path) + if path.is_dir(): + for dir_path in path.rglob("*"): + if dir_path.is_dir(): + # Get directory name, file count, and total size + dir_name = dir_path.name.lower() # Case-insensitive comparison + files = list(dir_path.glob("*")) + file_count = len([f for f in files if f.is_file()]) + total_size = sum(f.stat().st_size for f in files if f.is_file()) + + dir_map[(dir_name, file_count, total_size)].append(dir_path) + + # Return only directories that have duplicates + return {k: v for k, v in dir_map.items() if len(v) > 1} + + +def process_duplicate_directories(duplicates, args): + """Process duplicate directories, keeping the oldest copy.""" + for (dir_name, file_count, total_size), paths in duplicates.items(): + print( + f"\nFound duplicate directories named '{dir_name}' with {file_count} files ({total_size} bytes):" + ) + + # Sort paths by creation time + paths_with_time = [(p, p.stat().st_ctime) for p in paths] + paths_with_time.sort(key=lambda x: x[1]) + + # Keep the oldest directory + original_dir = paths_with_time[0][0] + print( + f"Keeping oldest copy: {original_dir} (created: {time.ctime(paths_with_time[0][1])})" + ) + + # Process newer copies + for dir_path, ctime in paths_with_time[1:]: + print(f"Moving duplicate: {dir_path} (created: {time.ctime(ctime)})") + if not args.dry_run: + # Create backup path + rel_path = dir_path.relative_to(dir_path.parent.parent) + backup_path = Path(args.backup_dir) / rel_path + + # Ensure backup directory exists + backup_path.parent.mkdir(parents=True, exist_ok=True) + + try: + shutil.move(str(dir_path), str(backup_path)) + except Exception as e: + print(f"Error moving directory {dir_path}: {e}") + + def main(): args = parse_args() @@ -332,20 +391,48 @@ def main(): print(usage_intro()) return - # Delete all '._' files before processing anything - for path in args.files: - if os.path.isdir(path): - delete_resource_forks(path) - - # Collect the files to process - file_list = collect_files(args) - - if args.dry_run or args.list: - list_files(args, file_list) - for file in file_list: - process_audio(file, args, dry_run=True) - else: - run_in_parallel(file_list, args) + # Ask user what they want to do + action = questionary.select( + "What would you like to do?", + choices=[ + "Shrink samples (convert audio files)", + "Remove duplicate directories", + "Exit", + ], + ).ask() + + if action == "Exit": + return + elif action == "Remove duplicate directories": + # Find and process duplicate directories + print("\nSearching for duplicate directories...") + duplicates = find_duplicate_directories(args.files) + + if not duplicates: + print("No duplicate directories found.") + return + + if args.dry_run: + print("\nDRY RUN - No files will be moved") + + process_duplicate_directories(duplicates, args) + print("\nDuplicate removal complete!") + + else: # Shrink samples + # Delete all '._' files before processing anything + for path in args.files: + if os.path.isdir(path): + delete_resource_forks(path) + + # Collect the files to process + file_list = collect_files(args) + + if args.dry_run or args.list: + list_files(args, file_list) + for file in file_list: + process_audio(file, args, dry_run=True) + else: + run_in_parallel(file_list, args) if __name__ == "__main__": From 1b57e195ee567ae300caf59e030c1cc9c6d8651c Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 00:22:08 +0000 Subject: [PATCH 05/66] Nov 17, 2024, 4:22 PM --- sample-shrinker-python/sample-shrinker.py | 122 ++++++++++++++++++++-- 1 file changed, 111 insertions(+), 11 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index ba56239..c41e4ca 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -383,15 +383,10 @@ def process_duplicate_directories(duplicates, args): print(f"Error moving directory {dir_path}: {e}") -def main(): - args = parse_args() - - # Ensure that at least one file or directory is provided - if not args.files: - print(usage_intro()) - return - - # Ask user what they want to do +def get_interactive_config(): + """Get configuration through interactive questionary prompts.""" + + # First, get the action type action = questionary.select( "What would you like to do?", choices=[ @@ -402,8 +397,113 @@ def main(): ).ask() if action == "Exit": + return None, None + + # Get the directory/files to process + paths = questionary.path( + "Select directory or file to process:", + only_directories=False, + multiple=True + ).ask() + + if not paths: + return None, None + + # Create a namespace object to match argparse structure + args = argparse.Namespace() + args.files = paths.split(",") if isinstance(paths, str) else paths + + # Set defaults + args.backup_dir = "_backup" + args.dry_run = False + args.skip_spectrograms = False + args.jobs = 1 + args.verbose = False + args.ext = "wav,mp3" + + if action == "Remove duplicate directories": + # For duplicate removal, we only need a few additional options + args.dry_run = questionary.confirm( + "Would you like to do a dry run first (preview without making changes)?", + default=True + ).ask() + + return "duplicates", args + + # For sample shrinking, get all the conversion options + args.bitdepth = questionary.select( + "Select target bit depth:", + choices=["8", "16", "24"], + default="16" + ).ask() + args.bitdepth = int(args.bitdepth) + + args.channels = questionary.select( + "Select target channels:", + choices=[ + "1 (mono)", + "2 (stereo)" + ], + default="2 (stereo)" + ).ask() + args.channels = 1 if "1" in args.channels else 2 + + args.samplerate = questionary.select( + "Select target sample rate:", + choices=["22050", "44100", "48000"], + default="44100" + ).ask() + args.samplerate = int(args.samplerate) + + # Advanced options in a checkbox group + advanced_options = questionary.checkbox( + "Select additional options:", + choices=[ + "Auto-convert stereo to mono when possible", + "Pre-normalize before conversion", + "Skip generating spectrograms", + "Preview changes (dry run)", + "Process files in parallel" + ] + ).ask() + + args.auto_mono = "Auto-convert stereo to mono when possible" in advanced_options + args.pre_normalize = "Pre-normalize before conversion" in advanced_options + args.skip_spectrograms = "Skip generating spectrograms" in advanced_options + args.dry_run = "Preview changes (dry run)" in advanced_options + + if "Process files in parallel" in advanced_options: + args.jobs = questionary.select( + "How many parallel jobs?", + choices=["2", "4", "8", "16"], + default="4" + ).ask() + args.jobs = int(args.jobs) + + if args.auto_mono: + args.auto_mono_threshold = float( + questionary.text( + "Auto-mono threshold in dB (default: -95.5):", + default="-95.5" + ).ask() + ) + + return "shrink", args + + +def main(): + # Check if command line arguments were provided + if len(sys.argv) > 1: + args = parse_args() + action = "shrink" # Default to shrink mode for command line + else: + # Use interactive mode + action, args = get_interactive_config() + + if not args: return - elif action == "Remove duplicate directories": + + if action == "duplicates": # Find and process duplicate directories print("\nSearching for duplicate directories...") duplicates = find_duplicate_directories(args.files) @@ -427,7 +527,7 @@ def main(): # Collect the files to process file_list = collect_files(args) - if args.dry_run or args.list: + if args.dry_run: list_files(args, file_list) for file in file_list: process_audio(file, args, dry_run=True) From 78c45d6dc107a1180f1d2ea2c55566c215b21b2e Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 00:24:50 +0000 Subject: [PATCH 06/66] Nov 17, 2024, 4:24 PM --- sample-shrinker-python/sample-shrinker.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index c41e4ca..a98357c 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -504,20 +504,7 @@ def main(): return if action == "duplicates": - # Find and process duplicate directories - print("\nSearching for duplicate directories...") - duplicates = find_duplicate_directories(args.files) - - if not duplicates: - print("No duplicate directories found.") - return - - if args.dry_run: - print("\nDRY RUN - No files will be moved") - - process_duplicate_directories(duplicates, args) - print("\nDuplicate removal complete!") - + process_duplicates(args) else: # Shrink samples # Delete all '._' files before processing anything for path in args.files: From bb3cea8a46e5f894d82c909c347bd17f37638bcf Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 00:26:46 +0000 Subject: [PATCH 07/66] Nov 17, 2024, 4:26 PM --- sample-shrinker-python/sample-shrinker.py | 69 +++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index a98357c..dd915dd 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -6,6 +6,8 @@ from collections import defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path +import hashlib +import filecmp import librosa import matplotlib.pyplot as plt @@ -491,6 +493,73 @@ def get_interactive_config(): return "shrink", args +def process_duplicates(args): + """Process both directory and file level duplicates with safety checks.""" + print("\nPhase 1: Searching for duplicate directories...") + dir_duplicates = find_duplicate_directories(args.files) + + if dir_duplicates: + print(f"\nFound {sum(len(v) - 1 for v in dir_duplicates.values())} duplicate directories") + + # Safety check: Verify directory contents match exactly + verified_duplicates = {} + for key, paths in dir_duplicates.items(): + dir_name, file_count, total_size = key + + # Get file listing for each directory + dir_contents = defaultdict(list) + for path in paths: + files = sorted(f.relative_to(path) for f in path.rglob("*") if f.is_file()) + content_hash = hashlib.sha256(str(files).encode()).hexdigest() + dir_contents[content_hash].append(path) + + # Only keep directories with exactly matching contents + for content_hash, matching_paths in dir_contents.items(): + if len(matching_paths) > 1: + verified_duplicates[key + (content_hash,)] = matching_paths + + if args.dry_run: + print("\nDRY RUN - No directories will be moved") + process_duplicate_directories(verified_duplicates, args) + else: + print("No duplicate directories found.") + + print("\nPhase 2: Searching for duplicate files...") + file_duplicates = find_duplicate_files(args.files) + + if file_duplicates: + total_duplicates = sum(len(group) - 1 for group in file_duplicates) + print(f"\nFound {total_duplicates} duplicate files") + + # Additional safety checks for file processing + safe_duplicates = [] + for group in file_duplicates: + # Verify files are not symbolic links + real_files = [f for f in group if not f.is_symlink()] + + # Check if files are in use (on Windows) or locked + available_files = [] + for file in real_files: + try: + with open(file, 'rb') as f: + # Try to get a shared lock + pass + available_files.append(file) + except (IOError, OSError): + print(f"Warning: File {file} appears to be in use, skipping") + + if len(available_files) > 1: + safe_duplicates.append(available_files) + + if args.dry_run: + print("\nDRY RUN - No files will be moved") + process_duplicate_files(safe_duplicates, args) + else: + print("No duplicate files found.") + + print("\nDuplicate removal complete!") + + def main(): # Check if command line arguments were provided if len(sys.argv) > 1: From c9f2111179fd36efd3ab63f73a4113e612eef64f Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 00:28:19 +0000 Subject: [PATCH 08/66] Nov 17, 2024, 4:28 PM --- sample-shrinker-python/sample-shrinker.py | 139 +++++++++++++++++++++- 1 file changed, 137 insertions(+), 2 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index dd915dd..82ca17b 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -8,6 +8,7 @@ from pathlib import Path import hashlib import filecmp +import ssdeep # Add to imports import librosa import matplotlib.pyplot as plt @@ -330,6 +331,140 @@ def run_in_parallel(file_list, args): raise +def get_file_hash(file_path, fuzzy=False, chunk_size=1024*1024): + """Calculate file hash using either SHA-256 or fuzzy hashing.""" + if fuzzy: + try: + # Generate fuzzy hash for the file + return ssdeep.hash_from_file(str(file_path)) + except Exception as e: + print(f"Error generating fuzzy hash for {file_path}: {e}") + return None + else: + # Standard SHA-256 hash with quick check + sha256_hash = hashlib.sha256() + file_size = os.path.getsize(file_path) + + with open(file_path, "rb") as f: + # Read first chunk + first_chunk = f.read(chunk_size) + sha256_hash.update(first_chunk) + + # If file is large enough, read last chunk + if file_size > chunk_size * 2: + f.seek(-chunk_size, 2) + last_chunk = f.read(chunk_size) + sha256_hash.update(last_chunk) + + return sha256_hash.hexdigest() + +def is_audio_file(file_path): + """Check if file is an audio file we want to process.""" + return file_path.lower().endswith(('.wav', '.mp3')) + +def find_duplicate_files(paths, fuzzy_threshold=90): + """Find duplicate files using a multi-stage approach with optional fuzzy matching.""" + # Stage 1: Group by size (fast) + size_groups = defaultdict(list) + + for path in paths: + path = Path(path) + if path.is_dir(): + for file_path in path.rglob("*"): + if file_path.is_file() and is_audio_file(str(file_path)): + size = file_path.stat().st_size + size_groups[size].append(file_path) + + # Stage 2: For same-size files, group by quick hash + hash_groups = defaultdict(list) + fuzzy_groups = [] # Store groups of similar files + + for size, file_paths in size_groups.items(): + if len(file_paths) > 1: # Only process groups with potential duplicates + # First, try exact matches + for file_path in file_paths: + try: + file_hash = get_file_hash(file_path, fuzzy=False) + hash_groups[file_hash].append(file_path) + except Exception as e: + print(f"Error hashing file {file_path}: {e}") + + # Then, try fuzzy matching for files that weren't exact matches + unmatched_files = [f for f in file_paths if not any(f in group for group in hash_groups.values() if len(group) > 1)] + if len(unmatched_files) > 1: + fuzzy_matches = defaultdict(list) + for file_path in unmatched_files: + fuzzy_hash = get_file_hash(file_path, fuzzy=True) + if fuzzy_hash: + fuzzy_matches[file_path] = fuzzy_hash + + # Compare fuzzy hashes + matched = set() + for file1, hash1 in fuzzy_matches.items(): + if file1 in matched: + continue + similar_files = [file1] + for file2, hash2 in fuzzy_matches.items(): + if file2 != file1 and file2 not in matched: + similarity = ssdeep.compare(hash1, hash2) + if similarity >= fuzzy_threshold: + similar_files.append(file2) + matched.add(file2) + if len(similar_files) > 1: + fuzzy_groups.append(similar_files) + matched.add(file1) + + # Combine exact and fuzzy matches + duplicates = [group for group in hash_groups.values() if len(group) > 1] + duplicates.extend(fuzzy_groups) + + return duplicates, fuzzy_groups + +def process_duplicate_files(duplicates, fuzzy_groups, args): + """Process duplicate files with enhanced reporting.""" + for group in duplicates: + is_fuzzy = group in fuzzy_groups + match_type = "similar" if is_fuzzy else "identical" + + # Get file size for reporting + file_size = group[0].stat().st_size + print(f"\nFound {match_type} files: '{group[0].name}' ({file_size} bytes)") + + if is_fuzzy: + # For fuzzy matches, show similarity percentages + base_hash = get_file_hash(group[0], fuzzy=True) + print("Similarity scores:") + for file in group[1:]: + file_hash = get_file_hash(file, fuzzy=True) + similarity = ssdeep.compare(base_hash, file_hash) + print(f" {file.name}: {similarity}% similar") + + # Sort files by creation time + files_with_time = [(f, f.stat().st_ctime) for f in group] + files_with_time.sort(key=lambda x: x[1]) + + # Keep the oldest file + original_file = files_with_time[0][0] + print(f"Keeping oldest copy: {original_file} (created: {time.ctime(files_with_time[0][1])})") + + # Process newer copies + for file_path, ctime in files_with_time[1:]: + print(f"Moving {match_type} file: {file_path} (created: {time.ctime(ctime)})") + if not args.dry_run: + try: + # Create backup path maintaining directory structure + rel_path = file_path.relative_to(file_path.parent.parent) + backup_path = Path(args.backup_dir) / rel_path + + # Ensure backup directory exists + backup_path.parent.mkdir(parents=True, exist_ok=True) + + # Move the file + shutil.move(str(file_path), str(backup_path)) + except Exception as e: + print(f"Error moving file {file_path}: {e}") + + def find_duplicate_directories(paths): """Find directories with matching names and file counts.""" dir_map = defaultdict(list) @@ -525,7 +660,7 @@ def process_duplicates(args): print("No duplicate directories found.") print("\nPhase 2: Searching for duplicate files...") - file_duplicates = find_duplicate_files(args.files) + file_duplicates, fuzzy_groups = find_duplicate_files(args.files) if file_duplicates: total_duplicates = sum(len(group) - 1 for group in file_duplicates) @@ -553,7 +688,7 @@ def process_duplicates(args): if args.dry_run: print("\nDRY RUN - No files will be moved") - process_duplicate_files(safe_duplicates, args) + process_duplicate_files(safe_duplicates, fuzzy_groups, args) else: print("No duplicate files found.") From 3936bc8ecec62c1e75f968ec38f98d1863d39fb9 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 00:29:44 +0000 Subject: [PATCH 09/66] Nov 17, 2024, 4:29 PM --- sample-shrinker-python/sample-shrinker.py | 155 ++++++++++++++++------ 1 file changed, 114 insertions(+), 41 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 82ca17b..4386eff 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -362,9 +362,9 @@ def is_audio_file(file_path): """Check if file is an audio file we want to process.""" return file_path.lower().endswith(('.wav', '.mp3')) -def find_duplicate_files(paths, fuzzy_threshold=90): +def find_duplicate_files(paths, args): """Find duplicate files using a multi-stage approach with optional fuzzy matching.""" - # Stage 1: Group by size (fast) + print("Scanning for duplicate files...") size_groups = defaultdict(list) for path in paths: @@ -372,51 +372,76 @@ def find_duplicate_files(paths, fuzzy_threshold=90): if path.is_dir(): for file_path in path.rglob("*"): if file_path.is_file() and is_audio_file(str(file_path)): + if args.verbose: + print(f"Scanning: {file_path}") size = file_path.stat().st_size size_groups[size].append(file_path) - # Stage 2: For same-size files, group by quick hash hash_groups = defaultdict(list) - fuzzy_groups = [] # Store groups of similar files + fuzzy_groups = [] for size, file_paths in size_groups.items(): - if len(file_paths) > 1: # Only process groups with potential duplicates - # First, try exact matches + if len(file_paths) > 1: + if args.verbose: + print(f"\nChecking {len(file_paths)} files of size {size} bytes...") + + # First pass: exact matches for file_path in file_paths: try: file_hash = get_file_hash(file_path, fuzzy=False) - hash_groups[file_hash].append(file_path) + if args.ignore_names: + # Use only the hash for grouping if ignoring names + hash_groups[file_hash].append(file_path) + else: + # Include name in grouping key + name_key = file_path.stem.lower() + hash_groups[(name_key, file_hash)].append(file_path) except Exception as e: print(f"Error hashing file {file_path}: {e}") - # Then, try fuzzy matching for files that weren't exact matches - unmatched_files = [f for f in file_paths if not any(f in group for group in hash_groups.values() if len(group) > 1)] - if len(unmatched_files) > 1: - fuzzy_matches = defaultdict(list) - for file_path in unmatched_files: - fuzzy_hash = get_file_hash(file_path, fuzzy=True) - if fuzzy_hash: - fuzzy_matches[file_path] = fuzzy_hash - - # Compare fuzzy hashes - matched = set() - for file1, hash1 in fuzzy_matches.items(): - if file1 in matched: - continue - similar_files = [file1] - for file2, hash2 in fuzzy_matches.items(): - if file2 != file1 and file2 not in matched: - similarity = ssdeep.compare(hash1, hash2) - if similarity >= fuzzy_threshold: - similar_files.append(file2) - matched.add(file2) - if len(similar_files) > 1: - fuzzy_groups.append(similar_files) - matched.add(file1) + # Second pass: fuzzy matching if enabled + if args.use_fuzzy: + unmatched = [f for f in file_paths if not any(f in g for g in hash_groups.values() if len(g) > 1)] + if len(unmatched) > 1: + fuzzy_matches = defaultdict(list) + + for file_path in unmatched: + try: + audio = AudioSegment.from_file(str(file_path)) + fuzzy_key = [] + + if "Compare file lengths" in args.fuzzy_options: + fuzzy_key.append(len(audio)) + if "Compare sample rates" in args.fuzzy_options: + fuzzy_key.append(audio.frame_rate) + if "Compare channel counts" in args.fuzzy_options: + fuzzy_key.append(audio.channels) + + fuzzy_hash = get_file_hash(file_path, fuzzy=True) + if fuzzy_hash: + fuzzy_matches[(tuple(fuzzy_key), fuzzy_hash)].append(file_path) + except Exception as e: + print(f"Error analyzing {file_path}: {e}") + + # Compare fuzzy matches + for key, matches in fuzzy_matches.items(): + if len(matches) > 1: + base_hash = get_file_hash(matches[0], fuzzy=True) + similar_files = [matches[0]] + + for other_file in matches[1:]: + other_hash = get_file_hash(other_file, fuzzy=True) + similarity = ssdeep.compare(base_hash, other_hash) + if similarity >= args.fuzzy_threshold: + similar_files.append(other_file) + + if len(similar_files) > 1: + fuzzy_groups.append(similar_files) - # Combine exact and fuzzy matches + # Combine results based on exact and fuzzy matches duplicates = [group for group in hash_groups.values() if len(group) > 1] - duplicates.extend(fuzzy_groups) + if args.use_fuzzy: + duplicates.extend(fuzzy_groups) return duplicates, fuzzy_groups @@ -553,18 +578,66 @@ def get_interactive_config(): # Set defaults args.backup_dir = "_backup" args.dry_run = False - args.skip_spectrograms = False - args.jobs = 1 args.verbose = False args.ext = "wav,mp3" if action == "Remove duplicate directories": - # For duplicate removal, we only need a few additional options - args.dry_run = questionary.confirm( - "Would you like to do a dry run first (preview without making changes)?", - default=True + # For duplicate removal, get configuration options + duplicate_options = questionary.checkbox( + "Select duplicate removal options:", + choices=[ + "Use fuzzy matching for similar files", + "Ignore filenames (match by content only)", + "Preview changes (dry run)", + "Show detailed progress", + ], + default=["Preview changes (dry run)"] ).ask() - + + args.use_fuzzy = "Use fuzzy matching for similar files" in duplicate_options + args.ignore_names = "Ignore filenames (match by content only)" in duplicate_options + args.dry_run = "Preview changes (dry run)" in duplicate_options + args.verbose = "Show detailed progress" in duplicate_options + + if args.use_fuzzy: + # Get fuzzy matching configuration + args.fuzzy_threshold = questionary.select( + "Select fuzzy matching threshold (higher = more strict):", + choices=[ + "95 - Nearly identical", + "90 - Very similar", + "85 - Similar", + "80 - Somewhat similar" + ], + default="90 - Very similar" + ).ask() + args.fuzzy_threshold = int(args.fuzzy_threshold.split()[0]) + + args.fuzzy_options = questionary.checkbox( + "Select fuzzy matching options:", + choices=[ + "Compare file lengths", + "Compare sample rates", + "Compare channel counts", + ], + default=["Compare file lengths", "Compare sample rates"] + ).ask() + + # Get backup options + backup_choice = questionary.select( + "How should duplicates be handled?", + choices=[ + "Move to backup directory (safe)", + "Delete immediately (dangerous)", + "Preview only (no changes)" + ], + default="Move to backup directory (safe)" + ).ask() + + args.backup_dir = "_backup" if "Move" in backup_choice else None + args.delete_duplicates = "Delete" in backup_choice + args.dry_run = "Preview" in backup_choice + return "duplicates", args # For sample shrinking, get all the conversion options @@ -660,7 +733,7 @@ def process_duplicates(args): print("No duplicate directories found.") print("\nPhase 2: Searching for duplicate files...") - file_duplicates, fuzzy_groups = find_duplicate_files(args.files) + file_duplicates, fuzzy_groups = find_duplicate_files(args.files, args) if file_duplicates: total_duplicates = sum(len(group) - 1 for group in file_duplicates) From 6a0740f90c5b4e57a6e8fd51cbff3443faa4a115 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 00:30:43 +0000 Subject: [PATCH 10/66] Nov 17, 2024, 4:30 PM --- sample-shrinker-python/sample-shrinker.py | 163 ++++++++++++---------- 1 file changed, 86 insertions(+), 77 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 4386eff..ff0e2fe 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -1,20 +1,20 @@ import argparse import concurrent.futures +import filecmp +import hashlib import os import shutil import time from collections import defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path -import hashlib -import filecmp -import ssdeep # Add to imports import librosa import matplotlib.pyplot as plt import numpy as np import questionary import soundfile as sf +import ssdeep # Add to imports from pydub import AudioSegment @@ -331,7 +331,7 @@ def run_in_parallel(file_list, args): raise -def get_file_hash(file_path, fuzzy=False, chunk_size=1024*1024): +def get_file_hash(file_path, fuzzy=False, chunk_size=1024 * 1024): """Calculate file hash using either SHA-256 or fuzzy hashing.""" if fuzzy: try: @@ -344,29 +344,31 @@ def get_file_hash(file_path, fuzzy=False, chunk_size=1024*1024): # Standard SHA-256 hash with quick check sha256_hash = hashlib.sha256() file_size = os.path.getsize(file_path) - + with open(file_path, "rb") as f: # Read first chunk first_chunk = f.read(chunk_size) sha256_hash.update(first_chunk) - + # If file is large enough, read last chunk if file_size > chunk_size * 2: f.seek(-chunk_size, 2) last_chunk = f.read(chunk_size) sha256_hash.update(last_chunk) - + return sha256_hash.hexdigest() + def is_audio_file(file_path): """Check if file is an audio file we want to process.""" - return file_path.lower().endswith(('.wav', '.mp3')) + return file_path.lower().endswith((".wav", ".mp3")) + def find_duplicate_files(paths, args): """Find duplicate files using a multi-stage approach with optional fuzzy matching.""" print("Scanning for duplicate files...") size_groups = defaultdict(list) - + for path in paths: path = Path(path) if path.is_dir(): @@ -376,15 +378,15 @@ def find_duplicate_files(paths, args): print(f"Scanning: {file_path}") size = file_path.stat().st_size size_groups[size].append(file_path) - + hash_groups = defaultdict(list) fuzzy_groups = [] - + for size, file_paths in size_groups.items(): if len(file_paths) > 1: if args.verbose: print(f"\nChecking {len(file_paths)} files of size {size} bytes...") - + # First pass: exact matches for file_path in file_paths: try: @@ -398,63 +400,70 @@ def find_duplicate_files(paths, args): hash_groups[(name_key, file_hash)].append(file_path) except Exception as e: print(f"Error hashing file {file_path}: {e}") - + # Second pass: fuzzy matching if enabled if args.use_fuzzy: - unmatched = [f for f in file_paths if not any(f in g for g in hash_groups.values() if len(g) > 1)] + unmatched = [ + f + for f in file_paths + if not any(f in g for g in hash_groups.values() if len(g) > 1) + ] if len(unmatched) > 1: fuzzy_matches = defaultdict(list) - + for file_path in unmatched: try: audio = AudioSegment.from_file(str(file_path)) fuzzy_key = [] - + if "Compare file lengths" in args.fuzzy_options: fuzzy_key.append(len(audio)) if "Compare sample rates" in args.fuzzy_options: fuzzy_key.append(audio.frame_rate) if "Compare channel counts" in args.fuzzy_options: fuzzy_key.append(audio.channels) - + fuzzy_hash = get_file_hash(file_path, fuzzy=True) if fuzzy_hash: - fuzzy_matches[(tuple(fuzzy_key), fuzzy_hash)].append(file_path) + fuzzy_matches[(tuple(fuzzy_key), fuzzy_hash)].append( + file_path + ) except Exception as e: print(f"Error analyzing {file_path}: {e}") - + # Compare fuzzy matches for key, matches in fuzzy_matches.items(): if len(matches) > 1: base_hash = get_file_hash(matches[0], fuzzy=True) similar_files = [matches[0]] - + for other_file in matches[1:]: other_hash = get_file_hash(other_file, fuzzy=True) similarity = ssdeep.compare(base_hash, other_hash) if similarity >= args.fuzzy_threshold: similar_files.append(other_file) - + if len(similar_files) > 1: fuzzy_groups.append(similar_files) - + # Combine results based on exact and fuzzy matches duplicates = [group for group in hash_groups.values() if len(group) > 1] if args.use_fuzzy: duplicates.extend(fuzzy_groups) - + return duplicates, fuzzy_groups + def process_duplicate_files(duplicates, fuzzy_groups, args): """Process duplicate files with enhanced reporting.""" for group in duplicates: is_fuzzy = group in fuzzy_groups match_type = "similar" if is_fuzzy else "identical" - + # Get file size for reporting file_size = group[0].stat().st_size print(f"\nFound {match_type} files: '{group[0].name}' ({file_size} bytes)") - + if is_fuzzy: # For fuzzy matches, show similarity percentages base_hash = get_file_hash(group[0], fuzzy=True) @@ -463,27 +472,31 @@ def process_duplicate_files(duplicates, fuzzy_groups, args): file_hash = get_file_hash(file, fuzzy=True) similarity = ssdeep.compare(base_hash, file_hash) print(f" {file.name}: {similarity}% similar") - + # Sort files by creation time files_with_time = [(f, f.stat().st_ctime) for f in group] files_with_time.sort(key=lambda x: x[1]) - + # Keep the oldest file original_file = files_with_time[0][0] - print(f"Keeping oldest copy: {original_file} (created: {time.ctime(files_with_time[0][1])})") - + print( + f"Keeping oldest copy: {original_file} (created: {time.ctime(files_with_time[0][1])})" + ) + # Process newer copies for file_path, ctime in files_with_time[1:]: - print(f"Moving {match_type} file: {file_path} (created: {time.ctime(ctime)})") + print( + f"Moving {match_type} file: {file_path} (created: {time.ctime(ctime)})" + ) if not args.dry_run: try: # Create backup path maintaining directory structure rel_path = file_path.relative_to(file_path.parent.parent) backup_path = Path(args.backup_dir) / rel_path - + # Ensure backup directory exists backup_path.parent.mkdir(parents=True, exist_ok=True) - + # Move the file shutil.move(str(file_path), str(backup_path)) except Exception as e: @@ -547,7 +560,7 @@ def process_duplicate_directories(duplicates, args): def get_interactive_config(): """Get configuration through interactive questionary prompts.""" - + # First, get the action type action = questionary.select( "What would you like to do?", @@ -563,9 +576,7 @@ def get_interactive_config(): # Get the directory/files to process paths = questionary.path( - "Select directory or file to process:", - only_directories=False, - multiple=True + "Select directory or file to process:", only_directories=False, multiple=True ).ask() if not paths: @@ -574,7 +585,7 @@ def get_interactive_config(): # Create a namespace object to match argparse structure args = argparse.Namespace() args.files = paths.split(",") if isinstance(paths, str) else paths - + # Set defaults args.backup_dir = "_backup" args.dry_run = False @@ -591,11 +602,13 @@ def get_interactive_config(): "Preview changes (dry run)", "Show detailed progress", ], - default=["Preview changes (dry run)"] + default=["Preview changes (dry run)"], ).ask() args.use_fuzzy = "Use fuzzy matching for similar files" in duplicate_options - args.ignore_names = "Ignore filenames (match by content only)" in duplicate_options + args.ignore_names = ( + "Ignore filenames (match by content only)" in duplicate_options + ) args.dry_run = "Preview changes (dry run)" in duplicate_options args.verbose = "Show detailed progress" in duplicate_options @@ -607,9 +620,9 @@ def get_interactive_config(): "95 - Nearly identical", "90 - Very similar", "85 - Similar", - "80 - Somewhat similar" + "80 - Somewhat similar", ], - default="90 - Very similar" + default="90 - Very similar", ).ask() args.fuzzy_threshold = int(args.fuzzy_threshold.split()[0]) @@ -620,7 +633,7 @@ def get_interactive_config(): "Compare sample rates", "Compare channel counts", ], - default=["Compare file lengths", "Compare sample rates"] + default=["Compare file lengths", "Compare sample rates"], ).ask() # Get backup options @@ -629,9 +642,9 @@ def get_interactive_config(): choices=[ "Move to backup directory (safe)", "Delete immediately (dangerous)", - "Preview only (no changes)" + "Preview only (no changes)", ], - default="Move to backup directory (safe)" + default="Move to backup directory (safe)", ).ask() args.backup_dir = "_backup" if "Move" in backup_choice else None @@ -642,26 +655,21 @@ def get_interactive_config(): # For sample shrinking, get all the conversion options args.bitdepth = questionary.select( - "Select target bit depth:", - choices=["8", "16", "24"], - default="16" + "Select target bit depth:", choices=["8", "16", "24"], default="16" ).ask() args.bitdepth = int(args.bitdepth) args.channels = questionary.select( "Select target channels:", - choices=[ - "1 (mono)", - "2 (stereo)" - ], - default="2 (stereo)" + choices=["1 (mono)", "2 (stereo)"], + default="2 (stereo)", ).ask() args.channels = 1 if "1" in args.channels else 2 args.samplerate = questionary.select( "Select target sample rate:", choices=["22050", "44100", "48000"], - default="44100" + default="44100", ).ask() args.samplerate = int(args.samplerate) @@ -673,28 +681,25 @@ def get_interactive_config(): "Pre-normalize before conversion", "Skip generating spectrograms", "Preview changes (dry run)", - "Process files in parallel" - ] + "Process files in parallel", + ], ).ask() args.auto_mono = "Auto-convert stereo to mono when possible" in advanced_options args.pre_normalize = "Pre-normalize before conversion" in advanced_options args.skip_spectrograms = "Skip generating spectrograms" in advanced_options args.dry_run = "Preview changes (dry run)" in advanced_options - + if "Process files in parallel" in advanced_options: args.jobs = questionary.select( - "How many parallel jobs?", - choices=["2", "4", "8", "16"], - default="4" + "How many parallel jobs?", choices=["2", "4", "8", "16"], default="4" ).ask() args.jobs = int(args.jobs) if args.auto_mono: args.auto_mono_threshold = float( questionary.text( - "Auto-mono threshold in dB (default: -95.5):", - default="-95.5" + "Auto-mono threshold in dB (default: -95.5):", default="-95.5" ).ask() ) @@ -705,66 +710,70 @@ def process_duplicates(args): """Process both directory and file level duplicates with safety checks.""" print("\nPhase 1: Searching for duplicate directories...") dir_duplicates = find_duplicate_directories(args.files) - + if dir_duplicates: - print(f"\nFound {sum(len(v) - 1 for v in dir_duplicates.values())} duplicate directories") - + print( + f"\nFound {sum(len(v) - 1 for v in dir_duplicates.values())} duplicate directories" + ) + # Safety check: Verify directory contents match exactly verified_duplicates = {} for key, paths in dir_duplicates.items(): dir_name, file_count, total_size = key - + # Get file listing for each directory dir_contents = defaultdict(list) for path in paths: - files = sorted(f.relative_to(path) for f in path.rglob("*") if f.is_file()) + files = sorted( + f.relative_to(path) for f in path.rglob("*") if f.is_file() + ) content_hash = hashlib.sha256(str(files).encode()).hexdigest() dir_contents[content_hash].append(path) - + # Only keep directories with exactly matching contents for content_hash, matching_paths in dir_contents.items(): if len(matching_paths) > 1: verified_duplicates[key + (content_hash,)] = matching_paths - + if args.dry_run: print("\nDRY RUN - No directories will be moved") process_duplicate_directories(verified_duplicates, args) else: print("No duplicate directories found.") - + print("\nPhase 2: Searching for duplicate files...") file_duplicates, fuzzy_groups = find_duplicate_files(args.files, args) - + if file_duplicates: total_duplicates = sum(len(group) - 1 for group in file_duplicates) print(f"\nFound {total_duplicates} duplicate files") - + # Additional safety checks for file processing safe_duplicates = [] for group in file_duplicates: # Verify files are not symbolic links real_files = [f for f in group if not f.is_symlink()] - + # Check if files are in use (on Windows) or locked available_files = [] for file in real_files: try: - with open(file, 'rb') as f: + with open(file, "rb") as f: # Try to get a shared lock pass available_files.append(file) except (IOError, OSError): print(f"Warning: File {file} appears to be in use, skipping") - + if len(available_files) > 1: safe_duplicates.append(available_files) - + if args.dry_run: print("\nDRY RUN - No files will be moved") process_duplicate_files(safe_duplicates, fuzzy_groups, args) else: print("No duplicate files found.") - + print("\nDuplicate removal complete!") @@ -776,7 +785,7 @@ def main(): else: # Use interactive mode action, args = get_interactive_config() - + if not args: return From bfca75a5acc15458a92ad20d729f37a8704f287c Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 00:32:20 +0000 Subject: [PATCH 11/66] Nov 17, 2024, 4:32 PM --- sample-shrinker-python/requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sample-shrinker-python/requirements.txt b/sample-shrinker-python/requirements.txt index 4a880fc..07fcf64 100644 --- a/sample-shrinker-python/requirements.txt +++ b/sample-shrinker-python/requirements.txt @@ -1,4 +1,4 @@ -librosa==0.10.2.post1 -matplotlib==3.9.2 -numpy==2.1.2 -pydub==0.25.1 +librosa +matplotlib +numpy +pydub From b099b57b4b9a676808331a09330e2d775578ea11 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 00:32:40 +0000 Subject: [PATCH 12/66] Nov 17, 2024, 4:32 PM --- sample-shrinker-python/requirements.txt | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sample-shrinker-python/requirements.txt b/sample-shrinker-python/requirements.txt index 07fcf64..e5a6959 100644 --- a/sample-shrinker-python/requirements.txt +++ b/sample-shrinker-python/requirements.txt @@ -1,4 +1,7 @@ -librosa -matplotlib -numpy -pydub +librosa==0.10.2.post1 +matplotlib==3.9.2 +numpy==2.1.3 +pydub==0.25.1 +questionary==2.0.1 +soundfile==0.12.1 +ssdeep==3.4 From a8e578755356dfb482e02cf0576330cd8f76adf0 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 00:33:24 +0000 Subject: [PATCH 13/66] Nov 17, 2024, 4:33 PM --- sample-shrinker-python/README.md | 232 +++++++++++++++++++------------ 1 file changed, 140 insertions(+), 92 deletions(-) diff --git a/sample-shrinker-python/README.md b/sample-shrinker-python/README.md index 44c5d3c..26aff3f 100644 --- a/sample-shrinker-python/README.md +++ b/sample-shrinker-python/README.md @@ -1,130 +1,178 @@ - # Sample Shrinker -A Python script to conditionally batch-convert audio samples into minimal `.wav` files, based on target criteria. This script is useful for saving storage space and reducing the I/O stress during simultaneous real-time streaming of multiple `.wav` files on devices like the Dirtywave M8 tracker. - -If you have directories full of 24/32-bit stereo `.wav` files or stereo samples with effectively mono content, this script can reclaim wasted storage space and reduce I/O stress on your SD card. It can also detect if the content of a stereo sample is actually mono and convert it automatically! +A Python script to conditionally batch-convert audio samples into minimal `.wav` files and manage duplicate audio files. This script is useful for saving storage space, reducing I/O stress during simultaneous real-time streaming of multiple `.wav` files, and cleaning up duplicate samples across your library. ## Features -- **Conditional Conversion**: Only converts samples that don't meet the target criteria (bit depth, channels, etc.). -- **Auto-Mono**: Automatically convert stereo samples to mono if the content is effectively mono, with a configurable threshold. -- **Backup and Spectrogram Generation**: Converted files are backed up (unless disabled) and spectrograms of old vs. new files are generated. -- **Pre-Normalization**: Optionally normalize samples before downsampling the bit depth to preserve dynamic range. -- **Parallel Processing**: Use the `-j` option to process multiple files in parallel for faster conversions. -## Requirements +### Sample Conversion +- **Conditional Conversion**: Only converts samples that don't meet the target criteria (bit depth, channels, etc.) +- **Auto-Mono**: Automatically convert stereo samples to mono if the content is effectively mono +- **Backup and Spectrogram Generation**: Converted files are backed up with original folder structure preserved +- **Pre-Normalization**: Optionally normalize samples before downsampling bit depth +- **Parallel Processing**: Process multiple files simultaneously for faster conversions -- Python 3.10 or later -- `pydub`, `librosa`, `matplotlib`, `soundfile` (install with `pip`) -- `ffmpeg` or `libav` installed for `pydub` +### Duplicate Management +- **Multi-Level Detection**: Finds duplicates at both directory and file levels +- **Intelligent Matching**: Uses file size, content hashes, and optional fuzzy matching +- **Safe Defaults**: Moves duplicates to backup instead of deleting +- **Fuzzy Audio Matching**: Can detect similar audio files using configurable criteria +- **Directory Structure**: Maintains original folder structure in backup directory -Install dependencies: -```bash -pip install -r requirements.txt -``` +## Requirements -You will also need `ffmpeg`: +- Python 3.10 or later +- Required Python packages (install with `pip install -r requirements.txt`): + ``` + librosa==0.10.2.post1 + matplotlib==3.9.2 + numpy==2.1.2 + pydub==0.25.1 + questionary==2.0.1 + ssdeep==3.4 + ``` +- `ffmpeg` or `libav` installed for audio processing + +Install system dependencies: ```bash # MacOS with Homebrew -brew install ffmpeg +brew install ffmpeg ssdeep # Ubuntu/Debian -sudo apt install ffmpeg +sudo apt install ffmpeg ssdeep ``` ## Usage +### Interactive Mode +Simply run the script without arguments for an interactive interface: ```bash -python sample-shrinker.py [options] FILE|DIRECTORY ... +python sample-shrinker.py ``` -### Basic Example: +The interactive mode will guide you through: +1. Choosing between sample conversion or duplicate removal +2. Selecting directories/files to process +3. Configuring operation-specific options + +### Command Line Mode +For automation or scripting: ```bash -python sample-shrinker.py directory_of_samples/ +python sample-shrinker.py [options] FILE|DIRECTORY ... ``` -This will: -- Convert samples in place with a target bit depth of 16 and stereo channels unchanged. -- Back up the original files in a parallel `_backup/` directory. -- Generate `.png` spectrograms comparing old and new files. - -### Options: -- `-b BIT_DEPTH`: Set the target bit depth (default: 16). Samples will only be downsampled unless `-B` is set. -- `-B MIN_BIT_DEPTH`: Set a minimum bit depth. This will upsample any samples below the minimum. -- `-c CHANNELS`: Set the target number of output channels (default: 2). For mono, use `-c 1`. -- `-r SAMPLERATE`: Set the target sample rate (default: 44100 Hz). -- `-R MIN_SAMPLERATE`: Set a minimum sample rate. Samples below this will be upsampled. -- `-a`: Automatically convert stereo samples to mono if they are effectively mono. -- `-A DB_THRESHOLD`: Set the auto-mono threshold in dB (default: `-95.5`). This implies `-a`. -- `-p`: Pre-normalize samples before downsampling bit depth. -- `-S`: Skip generating spectrogram files. -- `-d BACKUP_DIR`: Set a directory to store backups. Use `-d -` to disable backups and spectrogram generation. -- `-l`: List files and preview changes without converting. -- `-n`: Dry run—log actions without converting any files. -- `-j JOBS`: Process files in parallel with multiple jobs (default: 1). -- `-v`: Increase verbosity. +## Sample Conversion Options + +### Interactive Configuration +When choosing "Shrink samples", you can configure: +- Target bit depth (8, 16, or 24 bit) +- Channel count (mono or stereo) +- Sample rate (22050, 44100, or 48000 Hz) +- Advanced options: + - Auto-mono conversion + - Pre-normalization + - Spectrogram generation + - Parallel processing + - Dry run preview + +### Command Line Options +- `-b BIT_DEPTH`: Set target bit depth (default: 16) +- `-B MIN_BIT_DEPTH`: Set minimum bit depth +- `-c CHANNELS`: Set target channels (1=mono, 2=stereo) +- `-r SAMPLERATE`: Set target sample rate (default: 44100) +- `-a`: Enable auto-mono conversion +- `-p`: Enable pre-normalization +- `-j JOBS`: Set number of parallel jobs +- `-n`: Preview changes without converting +- `-d BACKUP_DIR`: Set backup directory (default: _backup) + +## Duplicate Removal Options + +### Interactive Configuration +When choosing "Remove duplicates", you can configure: +- Fuzzy matching options: + - Similarity threshold (80-95%) + - File length comparison + - Sample rate comparison + - Channel count comparison +- Filename handling: + - Match by name and content + - Match by content only +- Duplicate handling: + - Move to backup (safe) + - Delete immediately + - Preview only + +### Process +1. **Directory Level**: + - Finds directories with matching names + - Compares file counts and total sizes + - Verifies exact content matches + - Keeps oldest copy, moves others to backup + +2. **File Level**: + - Groups files by size + - Performs quick hash comparison + - Optionally uses fuzzy matching for similar audio + - Maintains original directory structure in backup + +### Safety Features +- Dry run option to preview changes +- Backup by default instead of deletion +- Verification of file accessibility +- Symlink detection +- Lock checking +- Detailed progress reporting ## Examples -### Convert a Directory with Default Settings +### Basic Sample Conversion ```bash -python sample-shrinker.py my_samples/ -``` -- Convert samples to 16-bit with channels left unchanged. -- Back up the original files under `_backup/`. -- Generate spectrogram `.png` files for comparison. +# Interactive mode (recommended) +python sample-shrinker.py -### Convert to Mono Automatically for Effectively Mono Samples -```bash -python sample-shrinker.py -a my_samples/ +# Command line with specific options +python sample-shrinker.py -c 1 -b 16 -a samples/ ``` -- Automatically convert stereo samples to mono if they are effectively mono (i.e., the difference between the channels is below the threshold). -### Preview Changes Without Modifying Files +### Duplicate Removal ```bash -python sample-shrinker.py -l -a -A -80 my_samples/ -``` -- Lists all files and shows which ones would be changed without actually modifying them. The threshold for auto-mono is set to -80 dB. +# Interactive mode with guided configuration +python sample-shrinker.py -### Convert and Skip Backups -```bash -python sample-shrinker.py -d - my_samples/ +# Preview duplicate detection +python sample-shrinker.py samples/ -n ``` -- Converts files but does not create backups or generate spectrograms. -### Pre-Normalize Before Downsampling -```bash -python sample-shrinker.py -p my_samples/ +### Output Example ``` -- Normalize the audio before downsampling the bit depth to preserve as much dynamic range as possible. - -### Process Files in Parallel -```bash -python sample-shrinker.py -j 10 my_samples/ +Processing file: samples/drums/kick.wav +samples/drums/kick.wav [CHANGED]: bit depth 24 -> 16, auto-mono + +Found duplicate directories named 'drums' with 10 files (1.2MB): +Keeping oldest copy: samples/drums (created: Thu Mar 21 10:00:00 2024) +Moving duplicate: samples/backup/drums (created: Thu Mar 21 11:30:00 2024) + +Found similar files: 'snare.wav' (250KB) +Similarity scores: + snare_old.wav: 92% similar + snare_copy.wav: 95% similar +Keeping oldest copy: samples/snare.wav +Moving similar files to backup... ``` -- Process up to 10 files at the same time for faster batch conversion. - -## Output Example: -```bash -Processing file: /Volumes/Untitled/Samples/wii sports/sound effects/Baseball/Sample_0028.wav -/Volumes/Untitled/Samples/wii sports/sound effects/Baseball/Sample_0028.wav [UNCHANGED] -Processing file: /Volumes/Untitled/Samples/wii sports/sound effects/Boxing/Sample_0029.wav -/Volumes/Untitled/Samples/wii sports/sound effects/Baseball/Sample_0029.wav [CHANGED]: sample rate 48000 -> 44100 -Processing file: /Volumes/Untitled/Samples/wii sports/sound effects/Boxing/Sample_0030.wav -/Volumes/Untitled/Samples/wii sports/sound effects/Baseball/Sample_0030.wav[CHANGED]: auto-mono +## Directory Structure +``` +samples/ # Original directory + drums/ + kick.wav + snare.wav +_backup/ # Backup directory + samples/ # Original structure preserved + drums/ + kick.wav.old # Original files + kick.wav.old.png # Spectrograms + kick.wav.new.png ``` -In the updated output format: -- The script logs each file being processed with the `Processing file:` prefix. -- After processing, each file will either be marked as `[UNCHANGED]` or `[CHANGED]` depending on whether any modifications (bit depth, sample rate, or channels) were made. -- If changes are made, the specific adjustments (e.g., `sample rate 48000 -> 44100`) will be displayed. - -### Additional Details: -- The `[CHANGED]` notation follows files that were modified. -- `[UNCHANGED]` appears for files that meet the target criteria and required no modifications. -- **Changes made**: - - Sample rate conversions (e.g., `sample rate 48000 -> 44100`). - - Bit depth reductions (e.g., `bit depth 32 -> 16`). - - Channel conversions (e.g., stereo to mono). -- Verbose output (`-v`) will print additional information such as ongoing file processing. +## Contributing +Contributions are welcome! Please feel free to submit a Pull Request. From 1edd19091dbd09ab7f64633e3a2b43c9ad74bd41 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 05:59:12 +0000 Subject: [PATCH 14/66] Nov 17, 2024, 9:59 PM --- sample-shrinker-python/sample-shrinker.py | 151 ++++++++++++++-------- 1 file changed, 96 insertions(+), 55 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index ff0e2fe..5d1fa1f 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -14,7 +14,8 @@ import numpy as np import questionary import soundfile as sf -import ssdeep # Add to imports +import scipy.signal +from scipy.io import wavfile from pydub import AudioSegment @@ -364,11 +365,63 @@ def is_audio_file(file_path): return file_path.lower().endswith((".wav", ".mp3")) +def get_audio_fingerprint(file_path): + """Generate an audio fingerprint using cross-correlation.""" + try: + # Load audio file + audio = AudioSegment.from_file(file_path) + # Convert to mono for comparison + if audio.channels > 1: + audio = audio.set_channels(1) + + # Convert to numpy array + samples = np.array(audio.get_array_of_samples()) + + # Normalize + samples = samples / np.max(np.abs(samples)) + + # Get a signature using peaks in frequency domain + freqs, times, spectrogram = scipy.signal.spectrogram( + samples, + audio.frame_rate, + nperseg=1024, + noverlap=512 + ) + + # Get the strongest frequencies + peaks = np.mean(spectrogram, axis=1) + # Normalize the peaks + peaks = peaks / np.max(peaks) + + return peaks + except Exception as e: + print(f"Error generating audio fingerprint for {file_path}: {e}") + return None + + +def compare_audio_similarity(file1_fingerprint, file2_fingerprint): + """Compare two audio fingerprints and return similarity score.""" + if file1_fingerprint is None or file2_fingerprint is None: + return 0 + + # Ensure same length for comparison + min_len = min(len(file1_fingerprint), len(file2_fingerprint)) + f1 = file1_fingerprint[:min_len] + f2 = file2_fingerprint[:min_len] + + # Calculate correlation coefficient + correlation = np.corrcoef(f1, f2)[0, 1] + # Convert to percentage and handle NaN + similarity = float(max(0, correlation) * 100) + return similarity if not np.isnan(similarity) else 0 + + def find_duplicate_files(paths, args): - """Find duplicate files using a multi-stage approach with optional fuzzy matching.""" + """Find duplicate files using a multi-stage approach with audio fingerprinting.""" print("Scanning for duplicate files...") size_groups = defaultdict(list) - + + # First pass: group by size for path in paths: path = Path(path) if path.is_dir(): @@ -378,80 +431,68 @@ def find_duplicate_files(paths, args): print(f"Scanning: {file_path}") size = file_path.stat().st_size size_groups[size].append(file_path) - + hash_groups = defaultdict(list) - fuzzy_groups = [] - + similar_groups = [] + + # Second pass: check content for size, file_paths in size_groups.items(): if len(file_paths) > 1: if args.verbose: print(f"\nChecking {len(file_paths)} files of size {size} bytes...") - - # First pass: exact matches + + # First try exact matches for file_path in file_paths: try: file_hash = get_file_hash(file_path, fuzzy=False) if args.ignore_names: - # Use only the hash for grouping if ignoring names hash_groups[file_hash].append(file_path) else: - # Include name in grouping key name_key = file_path.stem.lower() hash_groups[(name_key, file_hash)].append(file_path) except Exception as e: print(f"Error hashing file {file_path}: {e}") - - # Second pass: fuzzy matching if enabled + + # Then check for similar audio content if args.use_fuzzy: - unmatched = [ - f - for f in file_paths - if not any(f in g for g in hash_groups.values() if len(g) > 1) - ] + unmatched = [f for f in file_paths + if not any(f in g for g in hash_groups.values() if len(g) > 1)] + if len(unmatched) > 1: - fuzzy_matches = defaultdict(list) - + # Generate fingerprints for all unmatched files + fingerprints = {} for file_path in unmatched: - try: - audio = AudioSegment.from_file(str(file_path)) - fuzzy_key = [] - - if "Compare file lengths" in args.fuzzy_options: - fuzzy_key.append(len(audio)) - if "Compare sample rates" in args.fuzzy_options: - fuzzy_key.append(audio.frame_rate) - if "Compare channel counts" in args.fuzzy_options: - fuzzy_key.append(audio.channels) - - fuzzy_hash = get_file_hash(file_path, fuzzy=True) - if fuzzy_hash: - fuzzy_matches[(tuple(fuzzy_key), fuzzy_hash)].append( - file_path + fingerprint = get_audio_fingerprint(file_path) + if fingerprint is not None: + fingerprints[file_path] = fingerprint + + # Compare fingerprints + processed = set() + for file1 in fingerprints: + if file1 in processed: + continue + + similar_files = [file1] + for file2 in fingerprints: + if file2 != file1 and file2 not in processed: + similarity = compare_audio_similarity( + fingerprints[file1], + fingerprints[file2] ) - except Exception as e: - print(f"Error analyzing {file_path}: {e}") - - # Compare fuzzy matches - for key, matches in fuzzy_matches.items(): - if len(matches) > 1: - base_hash = get_file_hash(matches[0], fuzzy=True) - similar_files = [matches[0]] - - for other_file in matches[1:]: - other_hash = get_file_hash(other_file, fuzzy=True) - similarity = ssdeep.compare(base_hash, other_hash) if similarity >= args.fuzzy_threshold: - similar_files.append(other_file) - - if len(similar_files) > 1: - fuzzy_groups.append(similar_files) - - # Combine results based on exact and fuzzy matches + similar_files.append(file2) + processed.add(file2) + + if len(similar_files) > 1: + similar_groups.append(similar_files) + processed.add(file1) + + # Combine results duplicates = [group for group in hash_groups.values() if len(group) > 1] if args.use_fuzzy: - duplicates.extend(fuzzy_groups) - - return duplicates, fuzzy_groups + duplicates.extend(similar_groups) + + return duplicates, similar_groups def process_duplicate_files(duplicates, fuzzy_groups, args): From 42f4cdf9c039f7f44cabdd30566e8b5180269c78 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:02:24 +0000 Subject: [PATCH 15/66] Nov 17, 2024, 10:02 PM --- sample-shrinker-python/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sample-shrinker-python/requirements.txt b/sample-shrinker-python/requirements.txt index e5a6959..a1f5fa2 100644 --- a/sample-shrinker-python/requirements.txt +++ b/sample-shrinker-python/requirements.txt @@ -4,4 +4,4 @@ numpy==2.1.3 pydub==0.25.1 questionary==2.0.1 soundfile==0.12.1 -ssdeep==3.4 +scipy>=1.11.0 \ No newline at end of file From f714be1787d6c2e9c06eb4ac9d0db4e1dfbedea6 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:03:34 +0000 Subject: [PATCH 16/66] Nov 17, 2024, 10:03 PM --- sample-shrinker-python/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sample-shrinker-python/requirements.txt b/sample-shrinker-python/requirements.txt index a1f5fa2..ee3cb89 100644 --- a/sample-shrinker-python/requirements.txt +++ b/sample-shrinker-python/requirements.txt @@ -1,6 +1,6 @@ librosa==0.10.2.post1 matplotlib==3.9.2 -numpy==2.1.3 +numpy pydub==0.25.1 questionary==2.0.1 soundfile==0.12.1 From df3cae0675587cfb11ffbcf41d6cd571261781bc Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:04:23 +0000 Subject: [PATCH 17/66] Nov 17, 2024, 10:04 PM --- sample-shrinker-python/sample-shrinker.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 5d1fa1f..46edccc 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -4,6 +4,7 @@ import hashlib import os import shutil +import sys import time from collections import defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed @@ -333,14 +334,10 @@ def run_in_parallel(file_list, args): def get_file_hash(file_path, fuzzy=False, chunk_size=1024 * 1024): - """Calculate file hash using either SHA-256 or fuzzy hashing.""" + """Calculate file hash using either SHA-256 or audio fingerprinting.""" if fuzzy: - try: - # Generate fuzzy hash for the file - return ssdeep.hash_from_file(str(file_path)) - except Exception as e: - print(f"Error generating fuzzy hash for {file_path}: {e}") - return None + # Use our audio fingerprinting instead of ssdeep + return get_audio_fingerprint(file_path) else: # Standard SHA-256 hash with quick check sha256_hash = hashlib.sha256() @@ -507,12 +504,12 @@ def process_duplicate_files(duplicates, fuzzy_groups, args): if is_fuzzy: # For fuzzy matches, show similarity percentages - base_hash = get_file_hash(group[0], fuzzy=True) + base_fingerprint = get_audio_fingerprint(group[0]) print("Similarity scores:") for file in group[1:]: - file_hash = get_file_hash(file, fuzzy=True) - similarity = ssdeep.compare(base_hash, file_hash) - print(f" {file.name}: {similarity}% similar") + file_fingerprint = get_audio_fingerprint(file) + similarity = compare_audio_similarity(base_fingerprint, file_fingerprint) + print(f" {file.name}: {similarity:.1f}% similar") # Sort files by creation time files_with_time = [(f, f.stat().st_ctime) for f in group] From 86f2c8800426ae42bedbf9986b73a7d565ff17ed Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:07:23 +0000 Subject: [PATCH 18/66] Nov 17, 2024, 10:07 PM --- sample-shrinker-python/sample-shrinker.py | 25 ++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 46edccc..8796cd8 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -598,7 +598,7 @@ def process_duplicate_directories(duplicates, args): def get_interactive_config(): """Get configuration through interactive questionary prompts.""" - + # First, get the action type action = questionary.select( "What would you like to do?", @@ -613,16 +613,31 @@ def get_interactive_config(): return None, None # Get the directory/files to process - paths = questionary.path( - "Select directory or file to process:", only_directories=False, multiple=True - ).ask() + paths = [] + while True: + path = questionary.path( + "Select directory or file to process (press Enter with empty path when done):", + only_directories=False, + ).ask() + + if not path: # Empty input + if paths: # If we have at least one path, break + break + else: # If no paths yet, ask again + print("Please select at least one directory or file.") + continue + + paths.append(path) + + if not questionary.confirm("Add another path?", default=False).ask(): + break if not paths: return None, None # Create a namespace object to match argparse structure args = argparse.Namespace() - args.files = paths.split(",") if isinstance(paths, str) else paths + args.files = paths # Set defaults args.backup_dir = "_backup" From 94d5a2af1c3ae3049803006360f955241adff92d Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:08:14 +0000 Subject: [PATCH 19/66] Nov 17, 2024, 10:08 PM --- sample-shrinker-python/sample-shrinker.py | 42 ++++++++++++++++++++--- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 8796cd8..5f21d6a 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -639,11 +639,22 @@ def get_interactive_config(): args = argparse.Namespace() args.files = paths - # Set defaults + # Set ALL default values (matching parse_args defaults) args.backup_dir = "_backup" args.dry_run = False args.verbose = False args.ext = "wav,mp3" + args.bitdepth = 16 + args.min_bitdepth = None + args.channels = 2 + args.samplerate = 44100 + args.min_samplerate = None + args.auto_mono = False + args.auto_mono_threshold = -95.5 + args.skip_spectrograms = False + args.pre_normalize = False + args.list = False + args.jobs = 1 if action == "Remove duplicate directories": # For duplicate removal, get configuration options @@ -708,7 +719,9 @@ def get_interactive_config(): # For sample shrinking, get all the conversion options args.bitdepth = questionary.select( - "Select target bit depth:", choices=["8", "16", "24"], default="16" + "Select target bit depth:", + choices=["8", "16", "24"], + default="16" ).ask() args.bitdepth = int(args.bitdepth) @@ -735,6 +748,8 @@ def get_interactive_config(): "Skip generating spectrograms", "Preview changes (dry run)", "Process files in parallel", + "Set minimum sample rate", + "Set minimum bit depth" ], ).ask() @@ -745,14 +760,33 @@ def get_interactive_config(): if "Process files in parallel" in advanced_options: args.jobs = questionary.select( - "How many parallel jobs?", choices=["2", "4", "8", "16"], default="4" + "How many parallel jobs?", + choices=["2", "4", "8", "16"], + default="4" ).ask() args.jobs = int(args.jobs) + if "Set minimum sample rate" in advanced_options: + args.min_samplerate = questionary.select( + "Select minimum sample rate:", + choices=["22050", "44100", "48000"], + default="22050" + ).ask() + args.min_samplerate = int(args.min_samplerate) + + if "Set minimum bit depth" in advanced_options: + args.min_bitdepth = questionary.select( + "Select minimum bit depth:", + choices=["8", "16", "24"], + default="16" + ).ask() + args.min_bitdepth = int(args.min_bitdepth) + if args.auto_mono: args.auto_mono_threshold = float( questionary.text( - "Auto-mono threshold in dB (default: -95.5):", default="-95.5" + "Auto-mono threshold in dB (default: -95.5):", + default="-95.5" ).ask() ) From f6d781ffb6c7a0c774f70b559154662861e597e7 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:09:18 +0000 Subject: [PATCH 20/66] Nov 17, 2024, 10:09 PM --- sample-shrinker-python/sample-shrinker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 5f21d6a..5cab372 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -870,7 +870,7 @@ def main(): args = parse_args() action = "shrink" # Default to shrink mode for command line else: - # Use interactive mode + # Use interactive mode with saved configuration action, args = get_interactive_config() if not args: From 3015985e63e9c0914b08f93c87d633ab306f9bcd Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:18:36 +0000 Subject: [PATCH 21/66] Nov 17, 2024, 10:18 PM --- sample-shrinker-python/README.md | 55 ++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/sample-shrinker-python/README.md b/sample-shrinker-python/README.md index 26aff3f..20f36af 100644 --- a/sample-shrinker-python/README.md +++ b/sample-shrinker-python/README.md @@ -13,9 +13,9 @@ A Python script to conditionally batch-convert audio samples into minimal `.wav` ### Duplicate Management - **Multi-Level Detection**: Finds duplicates at both directory and file levels -- **Intelligent Matching**: Uses file size, content hashes, and optional fuzzy matching +- **Intelligent Matching**: Uses file size, content hashes, and audio fingerprinting +- **Audio Fingerprinting**: Uses spectral analysis to detect similar audio content - **Safe Defaults**: Moves duplicates to backup instead of deleting -- **Fuzzy Audio Matching**: Can detect similar audio files using configurable criteria - **Directory Structure**: Maintains original folder structure in backup directory ## Requirements @@ -25,34 +25,36 @@ A Python script to conditionally batch-convert audio samples into minimal `.wav` ``` librosa==0.10.2.post1 matplotlib==3.9.2 - numpy==2.1.2 + numpy pydub==0.25.1 questionary==2.0.1 - ssdeep==3.4 + soundfile==0.12.1 + scipy>=1.11.0 ``` - `ffmpeg` or `libav` installed for audio processing Install system dependencies: ```bash # MacOS with Homebrew -brew install ffmpeg ssdeep +brew install ffmpeg # Ubuntu/Debian -sudo apt install ffmpeg ssdeep +sudo apt install ffmpeg ``` ## Usage -### Interactive Mode -Simply run the script without arguments for an interactive interface: +### Interactive Mode (Recommended) +Simply run the script without arguments: ```bash python sample-shrinker.py ``` -The interactive mode will guide you through: +The interactive interface will guide you through: 1. Choosing between sample conversion or duplicate removal -2. Selecting directories/files to process +2. Selecting directories/files to process (add multiple paths) 3. Configuring operation-specific options +4. Setting advanced parameters ### Command Line Mode For automation or scripting: @@ -63,7 +65,7 @@ python sample-shrinker.py [options] FILE|DIRECTORY ... ## Sample Conversion Options ### Interactive Configuration -When choosing "Shrink samples", you can configure: +When choosing "Shrink samples", configure: - Target bit depth (8, 16, or 24 bit) - Channel count (mono or stereo) - Sample rate (22050, 44100, or 48000 Hz) @@ -72,6 +74,8 @@ When choosing "Shrink samples", you can configure: - Pre-normalization - Spectrogram generation - Parallel processing + - Minimum sample rate + - Minimum bit depth - Dry run preview ### Command Line Options @@ -79,6 +83,7 @@ When choosing "Shrink samples", you can configure: - `-B MIN_BIT_DEPTH`: Set minimum bit depth - `-c CHANNELS`: Set target channels (1=mono, 2=stereo) - `-r SAMPLERATE`: Set target sample rate (default: 44100) +- `-R MIN_SAMPLERATE`: Set minimum sample rate - `-a`: Enable auto-mono conversion - `-p`: Enable pre-normalization - `-j JOBS`: Set number of parallel jobs @@ -88,8 +93,8 @@ When choosing "Shrink samples", you can configure: ## Duplicate Removal Options ### Interactive Configuration -When choosing "Remove duplicates", you can configure: -- Fuzzy matching options: +When choosing "Remove duplicates", configure: +- Audio matching options: - Similarity threshold (80-95%) - File length comparison - Sample rate comparison @@ -102,7 +107,7 @@ When choosing "Remove duplicates", you can configure: - Delete immediately - Preview only -### Process +### Detection Process 1. **Directory Level**: - Finds directories with matching names - Compares file counts and total sizes @@ -110,11 +115,18 @@ When choosing "Remove duplicates", you can configure: - Keeps oldest copy, moves others to backup 2. **File Level**: - - Groups files by size - - Performs quick hash comparison - - Optionally uses fuzzy matching for similar audio + - Groups files by size (fast initial filter) + - Performs quick hash comparison for exact matches + - Uses audio fingerprinting for similar content detection - Maintains original directory structure in backup +### Audio Fingerprinting +- Converts audio to mono for comparison +- Generates spectral fingerprints +- Compares frequency content +- Provides similarity scores as percentages +- Configurable similarity threshold + ### Safety Features - Dry run option to preview changes - Backup by default instead of deletion @@ -122,12 +134,13 @@ When choosing "Remove duplicates", you can configure: - Symlink detection - Lock checking - Detailed progress reporting +- Original folder structure preserved in backups ## Examples ### Basic Sample Conversion ```bash -# Interactive mode (recommended) +# Interactive mode with guided configuration python sample-shrinker.py # Command line with specific options @@ -136,7 +149,7 @@ python sample-shrinker.py -c 1 -b 16 -a samples/ ### Duplicate Removal ```bash -# Interactive mode with guided configuration +# Interactive mode (recommended) python sample-shrinker.py # Preview duplicate detection @@ -154,8 +167,8 @@ Moving duplicate: samples/backup/drums (created: Thu Mar 21 11:30:00 2024) Found similar files: 'snare.wav' (250KB) Similarity scores: - snare_old.wav: 92% similar - snare_copy.wav: 95% similar + snare_old.wav: 92.5% similar + snare_copy.wav: 95.8% similar Keeping oldest copy: samples/snare.wav Moving similar files to backup... ``` From 34c3cfbbfc71f41fac9859f49881341a0e65e6c8 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:18:51 +0000 Subject: [PATCH 22/66] Nov 17, 2024, 10:18 PM --- sample-shrinker-python/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sample-shrinker-python/requirements.txt b/sample-shrinker-python/requirements.txt index ee3cb89..ac87aa6 100644 --- a/sample-shrinker-python/requirements.txt +++ b/sample-shrinker-python/requirements.txt @@ -1,7 +1,7 @@ librosa==0.10.2.post1 matplotlib==3.9.2 -numpy +numpy==2.1.3 pydub==0.25.1 questionary==2.0.1 +scipy==1.14.1 soundfile==0.12.1 -scipy>=1.11.0 \ No newline at end of file From 62d37e8c76e8305ab361d8d8771c03a3cdc6e4b1 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:24:51 +0000 Subject: [PATCH 23/66] Nov 17, 2024, 10:24 PM --- sample-shrinker-python/sample-shrinker.py | 130 ++++++++++++---------- 1 file changed, 74 insertions(+), 56 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 5cab372..7542910 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -18,6 +18,11 @@ import scipy.signal from scipy.io import wavfile from pydub import AudioSegment +from rich.console import Console +from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn +from rich.panel import Panel +from rich.text import Text +from rich import print as rprint def usage_intro(): @@ -145,10 +150,14 @@ def reencode_audio(file_path): return None -def process_audio(file_path, args, dry_run=False): +def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): """Main function to process audio files based on arguments.""" try: - print(f"Processing file: {file_path}") + if progress: + progress.update(task_id, description=f"Processing: {Path(file_path).name}") + else: + console.print(f"Processing file: [cyan]{file_path}[/cyan]") + audio = AudioSegment.from_file(file_path) modified = False change_reason = [] @@ -201,7 +210,12 @@ def process_audio(file_path, args, dry_run=False): modified = True if modified: - print(f"{file_path} [CHANGED]: {', '.join(change_reason)}") + status = Text() + status.append(f"{file_path} ", style="cyan") + status.append("[CHANGED]: ", style="yellow") + status.append(", ".join(change_reason), style="green") + console.print(status) + if not dry_run: # Backup the original file if required if args.backup_dir != "-": @@ -223,10 +237,13 @@ def process_audio(file_path, args, dry_run=False): file_path, output_file, os.path.dirname(backup_path) ) else: - print(f"{file_path} [UNCHANGED]") + status = Text() + status.append(f"{file_path} ", style="cyan") + status.append("[UNCHANGED]", style="blue") + console.print(status) except Exception as e: - print(f"Error processing {file_path}: {e}") + console.print(f"[red]Error processing {file_path}: {e}[/red]") # Try re-encoding the file if ffmpeg failed reencoded_file = reencode_audio(file_path) @@ -235,8 +252,8 @@ def process_audio(file_path, args, dry_run=False): # Retry the process with the re-encoded file process_audio(reencoded_file, args, dry_run) except Exception as retry_error: - print( - f"Failed to process the re-encoded file {reencoded_file}: {retry_error}" + console.print( + f"[red]Failed to process the re-encoded file {reencoded_file}: {retry_error}[/red]" ) @@ -313,22 +330,38 @@ def collect_files(args): def run_in_parallel(file_list, args): - """Run the audio processing in parallel.""" + """Run the audio processing in parallel with progress bar.""" try: - with ThreadPoolExecutor(max_workers=args.jobs) as executor: - futures = { - executor.submit(process_audio, file, args): file for file in file_list - } - for future in concurrent.futures.as_completed(futures): - try: - result = ( - future.result() - ) # Get the result of the future (processed file) - except Exception as exc: - file = futures[future] - print(f"File {file} generated an exception: {exc}") + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + console=console, + ) as progress: + task = progress.add_task("Processing files...", total=len(file_list)) + + with ThreadPoolExecutor(max_workers=args.jobs) as executor: + futures = { + executor.submit( + process_audio, + file, + args, + task_id=task, + progress=progress + ): file for file in file_list + } + + for future in concurrent.futures.as_completed(futures): + progress.advance(task) + try: + result = future.result() + except Exception as exc: + file = futures[future] + console.print(f"[red]File {file} generated an exception: {exc}[/red]") + except KeyboardInterrupt: - print("Received KeyboardInterrupt, attempting to cancel all threads...") + console.print("[yellow]Received KeyboardInterrupt, attempting to cancel all threads...[/yellow]") executor.shutdown(wait=False, cancel_futures=True) raise @@ -794,47 +827,32 @@ def get_interactive_config(): def process_duplicates(args): - """Process both directory and file level duplicates with safety checks.""" - print("\nPhase 1: Searching for duplicate directories...") - dir_duplicates = find_duplicate_directories(args.files) + """Process both directory and file level duplicates with visual feedback.""" + with console.status("[bold green]Phase 1: Searching for duplicate directories...") as status: + dir_duplicates = find_duplicate_directories(args.files) if dir_duplicates: - print( - f"\nFound {sum(len(v) - 1 for v in dir_duplicates.values())} duplicate directories" - ) - - # Safety check: Verify directory contents match exactly - verified_duplicates = {} - for key, paths in dir_duplicates.items(): - dir_name, file_count, total_size = key - - # Get file listing for each directory - dir_contents = defaultdict(list) - for path in paths: - files = sorted( - f.relative_to(path) for f in path.rglob("*") if f.is_file() - ) - content_hash = hashlib.sha256(str(files).encode()).hexdigest() - dir_contents[content_hash].append(path) - - # Only keep directories with exactly matching contents - for content_hash, matching_paths in dir_contents.items(): - if len(matching_paths) > 1: - verified_duplicates[key + (content_hash,)] = matching_paths - + count = sum(len(v) - 1 for v in dir_duplicates.values()) + console.print(Panel(f"Found [cyan]{count}[/cyan] duplicate directories", + title="Directory Scan Complete")) + if args.dry_run: - print("\nDRY RUN - No directories will be moved") + console.print("[yellow]DRY RUN - No directories will be moved[/yellow]") process_duplicate_directories(verified_duplicates, args) else: - print("No duplicate directories found.") + console.print("[blue]No duplicate directories found.[/blue]") - print("\nPhase 2: Searching for duplicate files...") - file_duplicates, fuzzy_groups = find_duplicate_files(args.files, args) + with console.status("[bold green]Phase 2: Searching for duplicate files...") as status: + file_duplicates, fuzzy_groups = find_duplicate_files(args.files, args) if file_duplicates: total_duplicates = sum(len(group) - 1 for group in file_duplicates) - print(f"\nFound {total_duplicates} duplicate files") - + console.print(Panel( + f"Found [cyan]{total_duplicates}[/cyan] duplicate files\n" + f"Including [cyan]{len(fuzzy_groups)}[/cyan] groups of similar files", + title="File Scan Complete" + )) + # Additional safety checks for file processing safe_duplicates = [] for group in file_duplicates: @@ -856,12 +874,12 @@ def process_duplicates(args): safe_duplicates.append(available_files) if args.dry_run: - print("\nDRY RUN - No files will be moved") + console.print("[yellow]DRY RUN - No files will be moved[/yellow]") process_duplicate_files(safe_duplicates, fuzzy_groups, args) else: - print("No duplicate files found.") + console.print("[blue]No duplicate files found.[/blue]") - print("\nDuplicate removal complete!") + console.print("[green]Duplicate removal complete![/green]") def main(): From 8135da880972d8673ed98d9422bf577e81f69a26 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:26:10 +0000 Subject: [PATCH 24/66] Nov 17, 2024, 10:26 PM --- sample-shrinker-python/sample-shrinker.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 7542910..7ca1812 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -733,18 +733,28 @@ def get_interactive_config(): default=["Compare file lengths", "Compare sample rates"], ).ask() - # Get backup options + # Get backup options (moved before backup_choice) + backup_dir = questionary.text( + "Backup directory path:", + default="_backup", + description="Directory where duplicates will be moved" + ).ask() + + if backup_dir.strip(): # If not empty + args.backup_dir = backup_dir.strip() + else: + args.backup_dir = "_backup" # Fallback to default + backup_choice = questionary.select( "How should duplicates be handled?", choices=[ - "Move to backup directory (safe)", + f"Move to {args.backup_dir} (safe)", "Delete immediately (dangerous)", "Preview only (no changes)", ], - default="Move to backup directory (safe)", + default=f"Move to {args.backup_dir} (safe)", ).ask() - args.backup_dir = "_backup" if "Move" in backup_choice else None args.delete_duplicates = "Delete" in backup_choice args.dry_run = "Preview" in backup_choice @@ -794,8 +804,9 @@ def get_interactive_config(): if "Process files in parallel" in advanced_options: args.jobs = questionary.select( "How many parallel jobs?", - choices=["2", "4", "8", "16"], - default="4" + choices=["2", "4", "8", "16", "24", "32", "48", "64"], + default="4", + description="Higher values may improve speed but use more memory" ).ask() args.jobs = int(args.jobs) From ae57b21d4f68daaf58566fe2d6e8c2d8210c0fa9 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:28:38 +0000 Subject: [PATCH 25/66] Nov 17, 2024, 10:28 PM --- sample-shrinker-python/sample-shrinker.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 7ca1812..c2de9fa 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -24,6 +24,9 @@ from rich.text import Text from rich import print as rprint +# Initialize console +console = Console() + def usage_intro(): return """ @@ -803,10 +806,9 @@ def get_interactive_config(): if "Process files in parallel" in advanced_options: args.jobs = questionary.select( - "How many parallel jobs?", + "How many parallel jobs? (higher values may improve speed but use more memory)", choices=["2", "4", "8", "16", "24", "32", "48", "64"], - default="4", - description="Higher values may improve speed but use more memory" + default="4" ).ask() args.jobs = int(args.jobs) From 2bafb23a591d0cfe5739696be7490682bfd73875 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:38:00 +0000 Subject: [PATCH 26/66] Nov 17, 2024, 10:38 PM --- sample-shrinker-python/sample-shrinker.py | 54 ++++++++++++++++------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index c2de9fa..f33f5c0 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -310,30 +310,44 @@ def list_files(args, file_list): def collect_files(args): """Collect all wav and mp3 files from provided directories and files.""" file_list = [] - # Split extensions string into a list and clean up whitespace valid_extensions = [ext.strip().lower() for ext in args.ext.split(",")] - + + console.print("[cyan]Starting file collection...[/cyan]") + for path in args.files: - if os.path.isdir(path): + # Expand user and resolve path + path = os.path.expanduser(path) + path = os.path.expandvars(path) + path = Path(path).resolve() + + console.print(f"[cyan]Scanning path: {path}[/cyan]") + + if path.is_dir(): for root, dirs, files in os.walk(path): for file in files: file_lower = file.lower() - # Check if file ends with any of the valid extensions - if any( - file_lower.endswith(f".{ext}") for ext in valid_extensions - ) and not file.startswith("._"): - file_list.append(os.path.join(root, file)) - elif os.path.isfile(path): - file_lower = path.lower() - if any( - file_lower.endswith(f".{ext}") for ext in valid_extensions - ) and not os.path.basename(path).startswith("._"): - file_list.append(path) + if any(file_lower.endswith(f".{ext}") for ext in valid_extensions) and not file.startswith("._"): + full_path = os.path.join(root, file) + file_list.append(full_path) + if args.verbose: + console.print(f"[dim]Found: {full_path}[/dim]") + elif path.is_file(): + file_lower = str(path).lower() + if any(file_lower.endswith(f".{ext}") for ext in valid_extensions) and not path.name.startswith("._"): + file_list.append(str(path)) + if args.verbose: + console.print(f"[dim]Found: {path}[/dim]") + + console.print(f"[green]Found {len(file_list)} files to process[/green]") return file_list def run_in_parallel(file_list, args): """Run the audio processing in parallel with progress bar.""" + if not file_list: + console.print("[yellow]No files to process![/yellow]") + return + try: with Progress( SpinnerColumn(), @@ -342,9 +356,13 @@ def run_in_parallel(file_list, args): TaskProgressColumn(), console=console, ) as progress: - task = progress.add_task("Processing files...", total=len(file_list)) + total_files = len(file_list) + console.print(f"[cyan]Starting processing of {total_files} files with {args.jobs} parallel jobs[/cyan]") + + task = progress.add_task("Processing files...", total=total_files) with ThreadPoolExecutor(max_workers=args.jobs) as executor: + # Submit all tasks futures = { executor.submit( process_audio, @@ -355,6 +373,7 @@ def run_in_parallel(file_list, args): ): file for file in file_list } + # Process completed tasks for future in concurrent.futures.as_completed(futures): progress.advance(task) try: @@ -362,11 +381,16 @@ def run_in_parallel(file_list, args): except Exception as exc: file = futures[future] console.print(f"[red]File {file} generated an exception: {exc}[/red]") + + console.print("[green]Processing complete![/green]") except KeyboardInterrupt: console.print("[yellow]Received KeyboardInterrupt, attempting to cancel all threads...[/yellow]") executor.shutdown(wait=False, cancel_futures=True) raise + except Exception as e: + console.print(f"[red]Error in parallel processing: {e}[/red]") + raise def get_file_hash(file_path, fuzzy=False, chunk_size=1024 * 1024): From 797b1f89ea57cb6aeacef841ed169b7098f6cf93 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:41:33 +0000 Subject: [PATCH 27/66] Nov 17, 2024, 10:41 PM --- sample-shrinker-python/sample-shrinker.py | 85 +++++++++++++++-------- 1 file changed, 57 insertions(+), 28 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index f33f5c0..2f49783 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -9,6 +9,7 @@ from collections import defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path +import subprocess import librosa import matplotlib.pyplot as plt @@ -135,22 +136,40 @@ def delete_resource_forks(directory): def reencode_audio(file_path): """Re-encode audio file to PCM 16-bit if it has a different encoding.""" try: - with sf.SoundFile(file_path) as f: - print( - f"Audio encoding: {f.format}, subtype: {f.subtype}, channels: {f.channels}" - ) - if f.subtype != "PCM_16": - # If the file is not PCM 16, re-save it as PCM_16 - data, samplerate = sf.read(file_path) - temp_output = file_path.replace( - os.path.splitext(file_path)[1], "_reencoded.wav" - ) - sf.write(temp_output, data, samplerate, subtype="PCM_16") - print(f"File re-encoded to PCM_16: {file_path} -> {temp_output}") - return temp_output + output_path = str(Path(file_path).with_suffix('.reencoded.wav')) + # Use ffmpeg directly for more reliable conversion + cmd = [ + 'ffmpeg', '-y', + '-i', str(file_path), + '-acodec', 'pcm_s16le', + '-ar', '44100', + output_path + ] + + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode == 0: + console.print(f"[green]Successfully re-encoded: {output_path}[/green]") + return output_path + else: + console.print(f"[red]FFmpeg error: {result.stderr}[/red]") + return None except Exception as e: - print(f"Error re-encoding {file_path}: {e}") - return None + console.print(f"[red]Error re-encoding {file_path}: {str(e)}[/red]") + return None + + +def check_ffmpeg(): + """Check if ffmpeg is available and properly installed.""" + try: + subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True) + return True + except (subprocess.SubprocessError, FileNotFoundError): + console.print("[red]Error: ffmpeg is not installed or not found in PATH[/red]") + console.print("Please install ffmpeg:") + console.print(" MacOS: brew install ffmpeg") + console.print(" Ubuntu/Debian: sudo apt install ffmpeg") + console.print(" Windows: https://ffmpeg.org/download.html") + return False def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): @@ -160,8 +179,22 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): progress.update(task_id, description=f"Processing: {Path(file_path).name}") else: console.print(f"Processing file: [cyan]{file_path}[/cyan]") + + try: + audio = AudioSegment.from_file(file_path) + except (IndexError, OSError) as e: + console.print(f"[red]Error loading {file_path}: {str(e)}[/red]") + console.print("[yellow]Attempting to re-encode file...[/yellow]") + reencoded_file = reencode_audio(file_path) + if reencoded_file: + try: + audio = AudioSegment.from_file(reencoded_file) + except Exception as re_err: + console.print(f"[red]Failed to process re-encoded file: {str(re_err)}[/red]") + return + else: + return - audio = AudioSegment.from_file(file_path) modified = False change_reason = [] @@ -246,18 +279,10 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): console.print(status) except Exception as e: - console.print(f"[red]Error processing {file_path}: {e}[/red]") - - # Try re-encoding the file if ffmpeg failed - reencoded_file = reencode_audio(file_path) - if reencoded_file: - try: - # Retry the process with the re-encoded file - process_audio(reencoded_file, args, dry_run) - except Exception as retry_error: - console.print( - f"[red]Failed to process the re-encoded file {reencoded_file}: {retry_error}[/red]" - ) + console.print(f"[red]Error processing {file_path}: {str(e)}[/red]") + console.print(f"[yellow]Stack trace:[/yellow]") + import traceback + console.print(traceback.format_exc()) def check_effectively_mono(audio, threshold_dB): @@ -920,6 +945,10 @@ def process_duplicates(args): def main(): + # Check for ffmpeg first + if not check_ffmpeg(): + return + # Check if command line arguments were provided if len(sys.argv) > 1: args = parse_args() From 8c9a2f65cd747205974f31d76d22a9fae1fa9fa8 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:43:20 +0000 Subject: [PATCH 28/66] Nov 17, 2024, 10:43 PM --- sample-shrinker-python/sample-shrinker.py | 61 ++++++++++++++++------- 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 2f49783..16b4bbc 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -255,23 +255,47 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): if not dry_run: # Backup the original file if required if args.backup_dir != "-": - # Get the relative path from the current working directory - rel_path = os.path.relpath(file_path) - # Create the backup path maintaining the directory structure - backup_path = os.path.join(args.backup_dir, rel_path) - # Ensure the directory structure exists - os.makedirs(os.path.dirname(backup_path), exist_ok=True) - shutil.copy2(file_path, backup_path) # copy2 preserves metadata + try: + # Convert the file path to a Path object + file_path_obj = Path(file_path).resolve() + # Get the absolute path to the backup directory + backup_dir = Path(args.backup_dir).resolve() + + # Create the relative path structure + rel_path = file_path_obj.relative_to(file_path_obj.parent) + backup_path = backup_dir / rel_path.parent.name / rel_path.name + + # Ensure the backup directory exists + backup_path.parent.mkdir(parents=True, exist_ok=True) + + # Add .old extension for the backup + backup_path = backup_path.with_suffix(backup_path.suffix + '.old') + + # Copy the original file with metadata preserved + console.print(f"[cyan]Backing up to: {backup_path}[/cyan]") + shutil.copy2(file_path, backup_path) + + # Generate spectrograms if enabled + if not args.skip_spectrograms: + generate_spectrogram(file_path, file_path, backup_path.parent) + + except Exception as e: + console.print(f"[red]Error creating backup: {str(e)}[/red]") + if args.verbose: + import traceback + console.print(traceback.format_exc()) + return # Export the converted audio file - output_file = file_path.replace(os.path.splitext(file_path)[1], ".wav") - audio.export(output_file, format="wav") - - # Generate spectrogram if enabled - if not args.skip_spectrograms: - generate_spectrogram( - file_path, output_file, os.path.dirname(backup_path) - ) + try: + output_file = file_path + audio.export(output_file, format="wav") + console.print(f"[green]Converted file saved: {output_file}[/green]") + except Exception as e: + console.print(f"[red]Error saving converted file: {str(e)}[/red]") + if args.verbose: + import traceback + console.print(traceback.format_exc()) else: status = Text() status.append(f"{file_path} ", style="cyan") @@ -280,9 +304,10 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): except Exception as e: console.print(f"[red]Error processing {file_path}: {str(e)}[/red]") - console.print(f"[yellow]Stack trace:[/yellow]") - import traceback - console.print(traceback.format_exc()) + if args.verbose: + console.print(f"[yellow]Stack trace:[/yellow]") + import traceback + console.print(traceback.format_exc()) def check_effectively_mono(audio, threshold_dB): From 68b8a68b7ec171b3030bb42133fc1b26c8a40491 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:44:08 +0000 Subject: [PATCH 29/66] Nov 17, 2024, 10:44 PM --- sample-shrinker-python/sample-shrinker.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 16b4bbc..df2314c 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -253,7 +253,7 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): console.print(status) if not dry_run: - # Backup the original file if required + # Backup handling if args.backup_dir != "-": try: # Convert the file path to a Path object @@ -285,6 +285,8 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): import traceback console.print(traceback.format_exc()) return + else: + console.print("[yellow]No backup created (backups disabled)[/yellow]") # Export the converted audio file try: @@ -869,7 +871,8 @@ def get_interactive_config(): "Preview changes (dry run)", "Process files in parallel", "Set minimum sample rate", - "Set minimum bit depth" + "Set minimum bit depth", + "Convert in place (no backups)", ], ).ask() @@ -877,6 +880,22 @@ def get_interactive_config(): args.pre_normalize = "Pre-normalize before conversion" in advanced_options args.skip_spectrograms = "Skip generating spectrograms" in advanced_options args.dry_run = "Preview changes (dry run)" in advanced_options + convert_in_place = "Convert in place (no backups)" in advanced_options + + # Configure backup settings if not converting in place + if not convert_in_place: + args.backup_dir = questionary.text( + "Backup directory path:", + default="_backup", + ).ask() + if args.backup_dir.strip(): # If not empty + args.skip_spectrograms = questionary.confirm( + "Generate spectrograms for backup comparison?", + default=not args.skip_spectrograms + ).ask() + else: + args.backup_dir = "-" + args.skip_spectrograms = True if "Process files in parallel" in advanced_options: args.jobs = questionary.select( From 925c6ae1d284a5187ff047ce1ff65f850a7d21f0 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:45:24 +0000 Subject: [PATCH 30/66] Nov 17, 2024, 10:45 PM --- sample-shrinker-python/requirements.txt | 1 + sample-shrinker-python/sample-shrinker.py | 281 ++++++++-------------- 2 files changed, 101 insertions(+), 181 deletions(-) diff --git a/sample-shrinker-python/requirements.txt b/sample-shrinker-python/requirements.txt index ac87aa6..fcfd3f4 100644 --- a/sample-shrinker-python/requirements.txt +++ b/sample-shrinker-python/requirements.txt @@ -3,5 +3,6 @@ matplotlib==3.9.2 numpy==2.1.3 pydub==0.25.1 questionary==2.0.1 +rich==13.9.4 scipy==1.14.1 soundfile==0.12.1 diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index df2314c..df9cf14 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -10,6 +10,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path import subprocess +import json import librosa import matplotlib.pyplot as plt @@ -708,8 +709,56 @@ def process_duplicate_directories(duplicates, args): print(f"Error moving directory {dir_path}: {e}") +def load_saved_config(): + """Load previously saved configuration.""" + config_path = Path.home() / '.sample-shrinker.json' + if config_path.exists(): + try: + with open(config_path, 'r') as f: + config = json.load(f) + console.print("[dim]Loaded saved configuration[/dim]") + return config + except Exception as e: + console.print(f"[yellow]Error loading saved config: {e}[/yellow]") + return {} + +def save_config(args, action): + """Save current configuration.""" + config_path = Path.home() / '.sample-shrinker.json' + try: + # Convert namespace to dict and handle Path objects + config = { + 'last_action': action, + 'files': [str(p) for p in args.files], + 'backup_dir': args.backup_dir, + 'bitdepth': args.bitdepth, + 'channels': args.channels, + 'samplerate': args.samplerate, + 'min_samplerate': args.min_samplerate, + 'min_bitdepth': args.min_bitdepth, + 'auto_mono': args.auto_mono, + 'auto_mono_threshold': args.auto_mono_threshold, + 'skip_spectrograms': args.skip_spectrograms, + 'pre_normalize': args.pre_normalize, + 'jobs': args.jobs, + # Duplicate removal specific settings + 'use_fuzzy': getattr(args, 'use_fuzzy', False), + 'ignore_names': getattr(args, 'ignore_names', False), + 'fuzzy_threshold': getattr(args, 'fuzzy_threshold', 90), + 'fuzzy_options': getattr(args, 'fuzzy_options', []), + 'advanced_options': getattr(args, 'advanced_options', []), + } + + with open(config_path, 'w') as f: + json.dump(config, f, indent=2) + console.print("[dim]Saved configuration for next time[/dim]") + except Exception as e: + console.print(f"[yellow]Error saving config: {e}[/yellow]") + def get_interactive_config(): """Get configuration through interactive questionary prompts.""" + # Load saved configuration + saved_config = load_saved_config() # First, get the action type action = questionary.select( @@ -719,6 +768,7 @@ def get_interactive_config(): "Remove duplicate directories", "Exit", ], + default=saved_config.get('last_action', "Shrink samples (convert audio files)") ).ask() if action == "Exit": @@ -726,50 +776,41 @@ def get_interactive_config(): # Get the directory/files to process paths = [] - while True: - path = questionary.path( - "Select directory or file to process (press Enter with empty path when done):", - only_directories=False, + last_paths = saved_config.get('files', []) + + if last_paths: + use_last = questionary.confirm( + f"Use last paths?\n" + "\n".join(last_paths), + default=True ).ask() - - if not path: # Empty input - if paths: # If we have at least one path, break - break - else: # If no paths yet, ask again - print("Please select at least one directory or file.") - continue - - paths.append(path) - - if not questionary.confirm("Add another path?", default=False).ask(): - break + if use_last: + paths = last_paths - if not paths: - return None, None + # ... rest of path collection code ... - # Create a namespace object to match argparse structure + # Create a namespace object with saved defaults args = argparse.Namespace() args.files = paths - - # Set ALL default values (matching parse_args defaults) - args.backup_dir = "_backup" - args.dry_run = False - args.verbose = False - args.ext = "wav,mp3" - args.bitdepth = 16 - args.min_bitdepth = None - args.channels = 2 - args.samplerate = 44100 - args.min_samplerate = None - args.auto_mono = False - args.auto_mono_threshold = -95.5 - args.skip_spectrograms = False - args.pre_normalize = False - args.list = False - args.jobs = 1 + args.backup_dir = saved_config.get('backup_dir', "_backup") + args.bitdepth = saved_config.get('bitdepth', 16) + args.channels = saved_config.get('channels', 2) + args.samplerate = saved_config.get('samplerate', 44100) + args.min_samplerate = saved_config.get('min_samplerate', None) + args.min_bitdepth = saved_config.get('min_bitdepth', None) + args.auto_mono = saved_config.get('auto_mono', False) + args.auto_mono_threshold = saved_config.get('auto_mono_threshold', -95.5) + args.skip_spectrograms = saved_config.get('skip_spectrograms', False) + args.pre_normalize = saved_config.get('pre_normalize', False) + args.jobs = saved_config.get('jobs', 1) if action == "Remove duplicate directories": - # For duplicate removal, get configuration options + # Use saved defaults for duplicate options + saved_duplicate_options = [] + if saved_config.get('use_fuzzy', False): + saved_duplicate_options.append("Use fuzzy matching for similar files") + if saved_config.get('ignore_names', False): + saved_duplicate_options.append("Ignore filenames (match by content only)") + duplicate_options = questionary.checkbox( "Select duplicate removal options:", choices=[ @@ -778,158 +819,36 @@ def get_interactive_config(): "Preview changes (dry run)", "Show detailed progress", ], - default=["Preview changes (dry run)"], + default=saved_duplicate_options ).ask() - args.use_fuzzy = "Use fuzzy matching for similar files" in duplicate_options - args.ignore_names = ( - "Ignore filenames (match by content only)" in duplicate_options - ) - args.dry_run = "Preview changes (dry run)" in duplicate_options - args.verbose = "Show detailed progress" in duplicate_options - - if args.use_fuzzy: - # Get fuzzy matching configuration - args.fuzzy_threshold = questionary.select( - "Select fuzzy matching threshold (higher = more strict):", - choices=[ - "95 - Nearly identical", - "90 - Very similar", - "85 - Similar", - "80 - Somewhat similar", - ], - default="90 - Very similar", - ).ask() - args.fuzzy_threshold = int(args.fuzzy_threshold.split()[0]) - - args.fuzzy_options = questionary.checkbox( - "Select fuzzy matching options:", - choices=[ - "Compare file lengths", - "Compare sample rates", - "Compare channel counts", - ], - default=["Compare file lengths", "Compare sample rates"], - ).ask() - - # Get backup options (moved before backup_choice) - backup_dir = questionary.text( - "Backup directory path:", - default="_backup", - description="Directory where duplicates will be moved" - ).ask() - - if backup_dir.strip(): # If not empty - args.backup_dir = backup_dir.strip() - else: - args.backup_dir = "_backup" # Fallback to default + # ... rest of duplicate removal configuration ... - backup_choice = questionary.select( - "How should duplicates be handled?", + else: # Sample shrinking + # Use saved defaults for advanced options + saved_advanced = saved_config.get('advanced_options', []) + advanced_options = questionary.checkbox( + "Select additional options:", choices=[ - f"Move to {args.backup_dir} (safe)", - "Delete immediately (dangerous)", - "Preview only (no changes)", + "Auto-convert stereo to mono when possible", + "Pre-normalize before conversion", + "Skip generating spectrograms", + "Preview changes (dry run)", + "Process files in parallel", + "Set minimum sample rate", + "Set minimum bit depth", + "Convert in place (no backups)", ], - default=f"Move to {args.backup_dir} (safe)", - ).ask() - - args.delete_duplicates = "Delete" in backup_choice - args.dry_run = "Preview" in backup_choice - - return "duplicates", args - - # For sample shrinking, get all the conversion options - args.bitdepth = questionary.select( - "Select target bit depth:", - choices=["8", "16", "24"], - default="16" - ).ask() - args.bitdepth = int(args.bitdepth) - - args.channels = questionary.select( - "Select target channels:", - choices=["1 (mono)", "2 (stereo)"], - default="2 (stereo)", - ).ask() - args.channels = 1 if "1" in args.channels else 2 - - args.samplerate = questionary.select( - "Select target sample rate:", - choices=["22050", "44100", "48000"], - default="44100", - ).ask() - args.samplerate = int(args.samplerate) - - # Advanced options in a checkbox group - advanced_options = questionary.checkbox( - "Select additional options:", - choices=[ - "Auto-convert stereo to mono when possible", - "Pre-normalize before conversion", - "Skip generating spectrograms", - "Preview changes (dry run)", - "Process files in parallel", - "Set minimum sample rate", - "Set minimum bit depth", - "Convert in place (no backups)", - ], - ).ask() - - args.auto_mono = "Auto-convert stereo to mono when possible" in advanced_options - args.pre_normalize = "Pre-normalize before conversion" in advanced_options - args.skip_spectrograms = "Skip generating spectrograms" in advanced_options - args.dry_run = "Preview changes (dry run)" in advanced_options - convert_in_place = "Convert in place (no backups)" in advanced_options - - # Configure backup settings if not converting in place - if not convert_in_place: - args.backup_dir = questionary.text( - "Backup directory path:", - default="_backup", + default=saved_advanced ).ask() - if args.backup_dir.strip(): # If not empty - args.skip_spectrograms = questionary.confirm( - "Generate spectrograms for backup comparison?", - default=not args.skip_spectrograms - ).ask() - else: - args.backup_dir = "-" - args.skip_spectrograms = True - - if "Process files in parallel" in advanced_options: - args.jobs = questionary.select( - "How many parallel jobs? (higher values may improve speed but use more memory)", - choices=["2", "4", "8", "16", "24", "32", "48", "64"], - default="4" - ).ask() - args.jobs = int(args.jobs) - - if "Set minimum sample rate" in advanced_options: - args.min_samplerate = questionary.select( - "Select minimum sample rate:", - choices=["22050", "44100", "48000"], - default="22050" - ).ask() - args.min_samplerate = int(args.min_samplerate) + + # Store selected options for next time + args.advanced_options = advanced_options - if "Set minimum bit depth" in advanced_options: - args.min_bitdepth = questionary.select( - "Select minimum bit depth:", - choices=["8", "16", "24"], - default="16" - ).ask() - args.min_bitdepth = int(args.min_bitdepth) - - if args.auto_mono: - args.auto_mono_threshold = float( - questionary.text( - "Auto-mono threshold in dB (default: -95.5):", - default="-95.5" - ).ask() - ) + # Save the final configuration + save_config(args, action) - return "shrink", args + return "duplicates" if "Remove" in action else "shrink", args def process_duplicates(args): From 8eb86b7f06da149f3ad524d1404f395d7c171a64 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:46:31 +0000 Subject: [PATCH 31/66] Nov 17, 2024, 10:46 PM --- sample-shrinker-python/sample-shrinker.py | 31 ++++++++++++++--------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index df9cf14..75d150d 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -819,7 +819,8 @@ def get_interactive_config(): "Preview changes (dry run)", "Show detailed progress", ], - default=saved_duplicate_options + # Only use default if we have valid saved options + **({"default": saved_duplicate_options} if saved_duplicate_options else {}) ).ask() # ... rest of duplicate removal configuration ... @@ -827,19 +828,25 @@ def get_interactive_config(): else: # Sample shrinking # Use saved defaults for advanced options saved_advanced = saved_config.get('advanced_options', []) + # Validate saved options against available choices + available_choices = [ + "Auto-convert stereo to mono when possible", + "Pre-normalize before conversion", + "Skip generating spectrograms", + "Preview changes (dry run)", + "Process files in parallel", + "Set minimum sample rate", + "Set minimum bit depth", + "Convert in place (no backups)", + ] + # Only keep valid saved options + valid_saved = [opt for opt in saved_advanced if opt in available_choices] + advanced_options = questionary.checkbox( "Select additional options:", - choices=[ - "Auto-convert stereo to mono when possible", - "Pre-normalize before conversion", - "Skip generating spectrograms", - "Preview changes (dry run)", - "Process files in parallel", - "Set minimum sample rate", - "Set minimum bit depth", - "Convert in place (no backups)", - ], - default=saved_advanced + choices=available_choices, + # Only use default if we have valid saved options + **({"default": valid_saved} if valid_saved else {}) ).ask() # Store selected options for next time From 2769d3c55ed56c09db98404432f5b0a2695a654f Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:47:46 +0000 Subject: [PATCH 32/66] Nov 17, 2024, 10:47 PM --- sample-shrinker-python/sample-shrinker.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 75d150d..377c296 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -741,6 +741,8 @@ def save_config(args, action): 'skip_spectrograms': args.skip_spectrograms, 'pre_normalize': args.pre_normalize, 'jobs': args.jobs, + 'ext': getattr(args, 'ext', "wav,mp3"), + 'verbose': getattr(args, 'verbose', False), # Duplicate removal specific settings 'use_fuzzy': getattr(args, 'use_fuzzy', False), 'ignore_names': getattr(args, 'ignore_names', False), @@ -788,7 +790,7 @@ def get_interactive_config(): # ... rest of path collection code ... - # Create a namespace object with saved defaults + # Create a namespace object with ALL default values args = argparse.Namespace() args.files = paths args.backup_dir = saved_config.get('backup_dir', "_backup") @@ -802,6 +804,9 @@ def get_interactive_config(): args.skip_spectrograms = saved_config.get('skip_spectrograms', False) args.pre_normalize = saved_config.get('pre_normalize', False) args.jobs = saved_config.get('jobs', 1) + args.ext = saved_config.get('ext', "wav,mp3") + args.verbose = saved_config.get('verbose', False) + args.list = False if action == "Remove duplicate directories": # Use saved defaults for duplicate options From 357c34d738abdd3548f0304278cb83965ab39e0b Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:49:22 +0000 Subject: [PATCH 33/66] Nov 17, 2024, 10:49 PM --- sample-shrinker-python/sample-shrinker.py | 76 ++++++++++++++++++++--- 1 file changed, 69 insertions(+), 7 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 377c296..34f085d 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -831,9 +831,7 @@ def get_interactive_config(): # ... rest of duplicate removal configuration ... else: # Sample shrinking - # Use saved defaults for advanced options - saved_advanced = saved_config.get('advanced_options', []) - # Validate saved options against available choices + # Define available choices first available_choices = [ "Auto-convert stereo to mono when possible", "Pre-normalize before conversion", @@ -844,19 +842,83 @@ def get_interactive_config(): "Set minimum bit depth", "Convert in place (no backups)", ] - # Only keep valid saved options - valid_saved = [opt for opt in saved_advanced if opt in available_choices] + # Get saved options and validate them + saved_advanced = saved_config.get('advanced_options', []) + # Only use saved options that exist in available choices + valid_saved = [] + if saved_advanced: + valid_saved = [opt for opt in available_choices if opt in saved_advanced] + + # Create the checkbox without conditional default advanced_options = questionary.checkbox( "Select additional options:", choices=available_choices, - # Only use default if we have valid saved options - **({"default": valid_saved} if valid_saved else {}) + default=valid_saved ).ask() # Store selected options for next time args.advanced_options = advanced_options + # Process the selections + args.auto_mono = "Auto-convert stereo to mono when possible" in advanced_options + args.pre_normalize = "Pre-normalize before conversion" in advanced_options + args.skip_spectrograms = "Skip generating spectrograms" in advanced_options + args.dry_run = "Preview changes (dry run)" in advanced_options + convert_in_place = "Convert in place (no backups)" in advanced_options + + if "Process files in parallel" in advanced_options: + args.jobs = questionary.select( + "How many parallel jobs?", + choices=["2", "4", "8", "16", "24", "32", "48", "64"], + default=str(saved_config.get('jobs', 4)) + ).ask() + args.jobs = int(args.jobs) + else: + args.jobs = 1 + + if "Set minimum sample rate" in advanced_options: + args.min_samplerate = questionary.select( + "Select minimum sample rate:", + choices=["22050", "44100", "48000"], + default=str(saved_config.get('min_samplerate', 22050)) + ).ask() + args.min_samplerate = int(args.min_samplerate) + + if "Set minimum bit depth" in advanced_options: + args.min_bitdepth = questionary.select( + "Select minimum bit depth:", + choices=["8", "16", "24"], + default=str(saved_config.get('min_bitdepth', 16)) + ).ask() + args.min_bitdepth = int(args.min_bitdepth) + + # Configure backup settings if not converting in place + if not convert_in_place: + backup_enabled = questionary.confirm( + "Enable backups of original files?", + default=not args.backup_dir == "-" + ).ask() + + if backup_enabled: + backup_dir = questionary.text( + "Backup directory path:", + default=args.backup_dir if args.backup_dir != "-" else "_backup" + ).ask() + args.backup_dir = backup_dir.strip() if backup_dir.strip() else "_backup" + + if not args.skip_spectrograms: + args.skip_spectrograms = not questionary.confirm( + "Generate spectrograms for backup comparison?", + default=True + ).ask() + else: + args.backup_dir = "-" + args.skip_spectrograms = True + else: + args.backup_dir = "-" + args.skip_spectrograms = True + # Save the final configuration save_config(args, action) From 0ac9acbe8e82b9a1b471278358c68c90e4dac281 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:51:29 +0000 Subject: [PATCH 34/66] Nov 17, 2024, 10:51 PM --- sample-shrinker-python/sample-shrinker.py | 539 ++++++++++++---------- 1 file changed, 290 insertions(+), 249 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 34f085d..49cd402 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -4,27 +4,32 @@ import hashlib import os import shutil +import subprocess import sys import time from collections import defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path -import subprocess -import json import librosa import matplotlib.pyplot as plt import numpy as np import questionary -import soundfile as sf import scipy.signal -from scipy.io import wavfile +import soundfile as sf from pydub import AudioSegment +from rich import print as rprint from rich.console import Console -from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn from rich.panel import Panel +from rich.progress import ( + BarColumn, + Progress, + SpinnerColumn, + TaskProgressColumn, + TextColumn, +) from rich.text import Text -from rich import print as rprint +from scipy.io import wavfile # Initialize console console = Console() @@ -137,16 +142,20 @@ def delete_resource_forks(directory): def reencode_audio(file_path): """Re-encode audio file to PCM 16-bit if it has a different encoding.""" try: - output_path = str(Path(file_path).with_suffix('.reencoded.wav')) + output_path = str(Path(file_path).with_suffix(".reencoded.wav")) # Use ffmpeg directly for more reliable conversion cmd = [ - 'ffmpeg', '-y', - '-i', str(file_path), - '-acodec', 'pcm_s16le', - '-ar', '44100', - output_path + "ffmpeg", + "-y", + "-i", + str(file_path), + "-acodec", + "pcm_s16le", + "-ar", + "44100", + output_path, ] - + result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode == 0: console.print(f"[green]Successfully re-encoded: {output_path}[/green]") @@ -162,7 +171,7 @@ def reencode_audio(file_path): def check_ffmpeg(): """Check if ffmpeg is available and properly installed.""" try: - subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True) + subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True) return True except (subprocess.SubprocessError, FileNotFoundError): console.print("[red]Error: ffmpeg is not installed or not found in PATH[/red]") @@ -180,7 +189,7 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): progress.update(task_id, description=f"Processing: {Path(file_path).name}") else: console.print(f"Processing file: [cyan]{file_path}[/cyan]") - + try: audio = AudioSegment.from_file(file_path) except (IndexError, OSError) as e: @@ -191,11 +200,13 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): try: audio = AudioSegment.from_file(reencoded_file) except Exception as re_err: - console.print(f"[red]Failed to process re-encoded file: {str(re_err)}[/red]") + console.print( + f"[red]Failed to process re-encoded file: {str(re_err)}[/red]" + ) return else: return - + modified = False change_reason = [] @@ -252,7 +263,7 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): status.append("[CHANGED]: ", style="yellow") status.append(", ".join(change_reason), style="green") console.print(status) - + if not dry_run: # Backup handling if args.backup_dir != "-": @@ -261,33 +272,40 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): file_path_obj = Path(file_path).resolve() # Get the absolute path to the backup directory backup_dir = Path(args.backup_dir).resolve() - + # Create the relative path structure rel_path = file_path_obj.relative_to(file_path_obj.parent) backup_path = backup_dir / rel_path.parent.name / rel_path.name - + # Ensure the backup directory exists backup_path.parent.mkdir(parents=True, exist_ok=True) - + # Add .old extension for the backup - backup_path = backup_path.with_suffix(backup_path.suffix + '.old') - + backup_path = backup_path.with_suffix( + backup_path.suffix + ".old" + ) + # Copy the original file with metadata preserved console.print(f"[cyan]Backing up to: {backup_path}[/cyan]") shutil.copy2(file_path, backup_path) - + # Generate spectrograms if enabled if not args.skip_spectrograms: - generate_spectrogram(file_path, file_path, backup_path.parent) - + generate_spectrogram( + file_path, file_path, backup_path.parent + ) + except Exception as e: console.print(f"[red]Error creating backup: {str(e)}[/red]") if args.verbose: import traceback + console.print(traceback.format_exc()) return else: - console.print("[yellow]No backup created (backups disabled)[/yellow]") + console.print( + "[yellow]No backup created (backups disabled)[/yellow]" + ) # Export the converted audio file try: @@ -298,6 +316,7 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): console.print(f"[red]Error saving converted file: {str(e)}[/red]") if args.verbose: import traceback + console.print(traceback.format_exc()) else: status = Text() @@ -310,6 +329,7 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): if args.verbose: console.print(f"[yellow]Stack trace:[/yellow]") import traceback + console.print(traceback.format_exc()) @@ -364,33 +384,37 @@ def collect_files(args): """Collect all wav and mp3 files from provided directories and files.""" file_list = [] valid_extensions = [ext.strip().lower() for ext in args.ext.split(",")] - + console.print("[cyan]Starting file collection...[/cyan]") - + for path in args.files: # Expand user and resolve path path = os.path.expanduser(path) path = os.path.expandvars(path) path = Path(path).resolve() - + console.print(f"[cyan]Scanning path: {path}[/cyan]") - + if path.is_dir(): for root, dirs, files in os.walk(path): for file in files: file_lower = file.lower() - if any(file_lower.endswith(f".{ext}") for ext in valid_extensions) and not file.startswith("._"): + if any( + file_lower.endswith(f".{ext}") for ext in valid_extensions + ) and not file.startswith("._"): full_path = os.path.join(root, file) file_list.append(full_path) if args.verbose: console.print(f"[dim]Found: {full_path}[/dim]") elif path.is_file(): file_lower = str(path).lower() - if any(file_lower.endswith(f".{ext}") for ext in valid_extensions) and not path.name.startswith("._"): + if any( + file_lower.endswith(f".{ext}") for ext in valid_extensions + ) and not path.name.startswith("._"): file_list.append(str(path)) if args.verbose: console.print(f"[dim]Found: {path}[/dim]") - + console.print(f"[green]Found {len(file_list)} files to process[/green]") return file_list @@ -400,7 +424,7 @@ def run_in_parallel(file_list, args): if not file_list: console.print("[yellow]No files to process![/yellow]") return - + try: with Progress( SpinnerColumn(), @@ -410,22 +434,21 @@ def run_in_parallel(file_list, args): console=console, ) as progress: total_files = len(file_list) - console.print(f"[cyan]Starting processing of {total_files} files with {args.jobs} parallel jobs[/cyan]") - + console.print( + f"[cyan]Starting processing of {total_files} files with {args.jobs} parallel jobs[/cyan]" + ) + task = progress.add_task("Processing files...", total=total_files) - + with ThreadPoolExecutor(max_workers=args.jobs) as executor: # Submit all tasks futures = { executor.submit( - process_audio, - file, - args, - task_id=task, - progress=progress - ): file for file in file_list + process_audio, file, args, task_id=task, progress=progress + ): file + for file in file_list } - + # Process completed tasks for future in concurrent.futures.as_completed(futures): progress.advance(task) @@ -433,12 +456,16 @@ def run_in_parallel(file_list, args): result = future.result() except Exception as exc: file = futures[future] - console.print(f"[red]File {file} generated an exception: {exc}[/red]") - + console.print( + f"[red]File {file} generated an exception: {exc}[/red]" + ) + console.print("[green]Processing complete![/green]") - + except KeyboardInterrupt: - console.print("[yellow]Received KeyboardInterrupt, attempting to cancel all threads...[/yellow]") + console.print( + "[yellow]Received KeyboardInterrupt, attempting to cancel all threads...[/yellow]" + ) executor.shutdown(wait=False, cancel_futures=True) raise except Exception as e: @@ -483,26 +510,23 @@ def get_audio_fingerprint(file_path): # Convert to mono for comparison if audio.channels > 1: audio = audio.set_channels(1) - + # Convert to numpy array samples = np.array(audio.get_array_of_samples()) - + # Normalize samples = samples / np.max(np.abs(samples)) - + # Get a signature using peaks in frequency domain freqs, times, spectrogram = scipy.signal.spectrogram( - samples, - audio.frame_rate, - nperseg=1024, - noverlap=512 + samples, audio.frame_rate, nperseg=1024, noverlap=512 ) - + # Get the strongest frequencies peaks = np.mean(spectrogram, axis=1) # Normalize the peaks peaks = peaks / np.max(peaks) - + return peaks except Exception as e: print(f"Error generating audio fingerprint for {file_path}: {e}") @@ -513,12 +537,12 @@ def compare_audio_similarity(file1_fingerprint, file2_fingerprint): """Compare two audio fingerprints and return similarity score.""" if file1_fingerprint is None or file2_fingerprint is None: return 0 - + # Ensure same length for comparison min_len = min(len(file1_fingerprint), len(file2_fingerprint)) f1 = file1_fingerprint[:min_len] f2 = file2_fingerprint[:min_len] - + # Calculate correlation coefficient correlation = np.corrcoef(f1, f2)[0, 1] # Convert to percentage and handle NaN @@ -530,7 +554,7 @@ def find_duplicate_files(paths, args): """Find duplicate files using a multi-stage approach with audio fingerprinting.""" print("Scanning for duplicate files...") size_groups = defaultdict(list) - + # First pass: group by size for path in paths: path = Path(path) @@ -541,16 +565,16 @@ def find_duplicate_files(paths, args): print(f"Scanning: {file_path}") size = file_path.stat().st_size size_groups[size].append(file_path) - + hash_groups = defaultdict(list) similar_groups = [] - + # Second pass: check content for size, file_paths in size_groups.items(): if len(file_paths) > 1: if args.verbose: print(f"\nChecking {len(file_paths)} files of size {size} bytes...") - + # First try exact matches for file_path in file_paths: try: @@ -562,12 +586,15 @@ def find_duplicate_files(paths, args): hash_groups[(name_key, file_hash)].append(file_path) except Exception as e: print(f"Error hashing file {file_path}: {e}") - + # Then check for similar audio content if args.use_fuzzy: - unmatched = [f for f in file_paths - if not any(f in g for g in hash_groups.values() if len(g) > 1)] - + unmatched = [ + f + for f in file_paths + if not any(f in g for g in hash_groups.values() if len(g) > 1) + ] + if len(unmatched) > 1: # Generate fingerprints for all unmatched files fingerprints = {} @@ -575,33 +602,32 @@ def find_duplicate_files(paths, args): fingerprint = get_audio_fingerprint(file_path) if fingerprint is not None: fingerprints[file_path] = fingerprint - + # Compare fingerprints processed = set() for file1 in fingerprints: if file1 in processed: continue - + similar_files = [file1] for file2 in fingerprints: if file2 != file1 and file2 not in processed: similarity = compare_audio_similarity( - fingerprints[file1], - fingerprints[file2] + fingerprints[file1], fingerprints[file2] ) if similarity >= args.fuzzy_threshold: similar_files.append(file2) processed.add(file2) - + if len(similar_files) > 1: similar_groups.append(similar_files) processed.add(file1) - + # Combine results duplicates = [group for group in hash_groups.values() if len(group) > 1] if args.use_fuzzy: duplicates.extend(similar_groups) - + return duplicates, similar_groups @@ -621,7 +647,9 @@ def process_duplicate_files(duplicates, fuzzy_groups, args): print("Similarity scores:") for file in group[1:]: file_fingerprint = get_audio_fingerprint(file) - similarity = compare_audio_similarity(base_fingerprint, file_fingerprint) + similarity = compare_audio_similarity( + base_fingerprint, file_fingerprint + ) print(f" {file.name}: {similarity:.1f}% similar") # Sort files by creation time @@ -709,59 +737,9 @@ def process_duplicate_directories(duplicates, args): print(f"Error moving directory {dir_path}: {e}") -def load_saved_config(): - """Load previously saved configuration.""" - config_path = Path.home() / '.sample-shrinker.json' - if config_path.exists(): - try: - with open(config_path, 'r') as f: - config = json.load(f) - console.print("[dim]Loaded saved configuration[/dim]") - return config - except Exception as e: - console.print(f"[yellow]Error loading saved config: {e}[/yellow]") - return {} - -def save_config(args, action): - """Save current configuration.""" - config_path = Path.home() / '.sample-shrinker.json' - try: - # Convert namespace to dict and handle Path objects - config = { - 'last_action': action, - 'files': [str(p) for p in args.files], - 'backup_dir': args.backup_dir, - 'bitdepth': args.bitdepth, - 'channels': args.channels, - 'samplerate': args.samplerate, - 'min_samplerate': args.min_samplerate, - 'min_bitdepth': args.min_bitdepth, - 'auto_mono': args.auto_mono, - 'auto_mono_threshold': args.auto_mono_threshold, - 'skip_spectrograms': args.skip_spectrograms, - 'pre_normalize': args.pre_normalize, - 'jobs': args.jobs, - 'ext': getattr(args, 'ext', "wav,mp3"), - 'verbose': getattr(args, 'verbose', False), - # Duplicate removal specific settings - 'use_fuzzy': getattr(args, 'use_fuzzy', False), - 'ignore_names': getattr(args, 'ignore_names', False), - 'fuzzy_threshold': getattr(args, 'fuzzy_threshold', 90), - 'fuzzy_options': getattr(args, 'fuzzy_options', []), - 'advanced_options': getattr(args, 'advanced_options', []), - } - - with open(config_path, 'w') as f: - json.dump(config, f, indent=2) - console.print("[dim]Saved configuration for next time[/dim]") - except Exception as e: - console.print(f"[yellow]Error saving config: {e}[/yellow]") - def get_interactive_config(): """Get configuration through interactive questionary prompts.""" - # Load saved configuration - saved_config = load_saved_config() - + # First, get the action type action = questionary.select( "What would you like to do?", @@ -770,7 +748,6 @@ def get_interactive_config(): "Remove duplicate directories", "Exit", ], - default=saved_config.get('last_action', "Shrink samples (convert audio files)") ).ask() if action == "Exit": @@ -778,44 +755,50 @@ def get_interactive_config(): # Get the directory/files to process paths = [] - last_paths = saved_config.get('files', []) - - if last_paths: - use_last = questionary.confirm( - f"Use last paths?\n" + "\n".join(last_paths), - default=True + while True: + path = questionary.path( + "Select directory or file to process (press Enter with empty path when done):", + only_directories=False, ).ask() - if use_last: - paths = last_paths - # ... rest of path collection code ... + if not path: # Empty input + if paths: # If we have at least one path, break + break + else: # If no paths yet, ask again + print("Please select at least one directory or file.") + continue + + paths.append(path) + + if not questionary.confirm("Add another path?", default=False).ask(): + break + + if not paths: + return None, None - # Create a namespace object with ALL default values + # Create a namespace object to match argparse structure args = argparse.Namespace() args.files = paths - args.backup_dir = saved_config.get('backup_dir', "_backup") - args.bitdepth = saved_config.get('bitdepth', 16) - args.channels = saved_config.get('channels', 2) - args.samplerate = saved_config.get('samplerate', 44100) - args.min_samplerate = saved_config.get('min_samplerate', None) - args.min_bitdepth = saved_config.get('min_bitdepth', None) - args.auto_mono = saved_config.get('auto_mono', False) - args.auto_mono_threshold = saved_config.get('auto_mono_threshold', -95.5) - args.skip_spectrograms = saved_config.get('skip_spectrograms', False) - args.pre_normalize = saved_config.get('pre_normalize', False) - args.jobs = saved_config.get('jobs', 1) - args.ext = saved_config.get('ext', "wav,mp3") - args.verbose = saved_config.get('verbose', False) + + # Set ALL default values (matching parse_args defaults) + args.backup_dir = "_backup" + args.dry_run = False + args.verbose = False + args.ext = "wav,mp3" + args.bitdepth = 16 + args.min_bitdepth = None + args.channels = 2 + args.samplerate = 44100 + args.min_samplerate = None + args.auto_mono = False + args.auto_mono_threshold = -95.5 + args.skip_spectrograms = False + args.pre_normalize = False args.list = False + args.jobs = 1 if action == "Remove duplicate directories": - # Use saved defaults for duplicate options - saved_duplicate_options = [] - if saved_config.get('use_fuzzy', False): - saved_duplicate_options.append("Use fuzzy matching for similar files") - if saved_config.get('ignore_names', False): - saved_duplicate_options.append("Ignore filenames (match by content only)") - + # For duplicate removal, get configuration options duplicate_options = questionary.checkbox( "Select duplicate removal options:", choices=[ @@ -824,15 +807,91 @@ def get_interactive_config(): "Preview changes (dry run)", "Show detailed progress", ], - # Only use default if we have valid saved options - **({"default": saved_duplicate_options} if saved_duplicate_options else {}) + default=["Preview changes (dry run)"], ).ask() - # ... rest of duplicate removal configuration ... + args.use_fuzzy = "Use fuzzy matching for similar files" in duplicate_options + args.ignore_names = ( + "Ignore filenames (match by content only)" in duplicate_options + ) + args.dry_run = "Preview changes (dry run)" in duplicate_options + args.verbose = "Show detailed progress" in duplicate_options + + if args.use_fuzzy: + # Get fuzzy matching configuration + args.fuzzy_threshold = questionary.select( + "Select fuzzy matching threshold (higher = more strict):", + choices=[ + "95 - Nearly identical", + "90 - Very similar", + "85 - Similar", + "80 - Somewhat similar", + ], + default="90 - Very similar", + ).ask() + args.fuzzy_threshold = int(args.fuzzy_threshold.split()[0]) + + args.fuzzy_options = questionary.checkbox( + "Select fuzzy matching options:", + choices=[ + "Compare file lengths", + "Compare sample rates", + "Compare channel counts", + ], + default=["Compare file lengths", "Compare sample rates"], + ).ask() + + # Get backup options (moved before backup_choice) + backup_dir = questionary.text( + "Backup directory path:", + default="_backup", + description="Directory where duplicates will be moved", + ).ask() - else: # Sample shrinking - # Define available choices first - available_choices = [ + if backup_dir.strip(): # If not empty + args.backup_dir = backup_dir.strip() + else: + args.backup_dir = "_backup" # Fallback to default + + backup_choice = questionary.select( + "How should duplicates be handled?", + choices=[ + f"Move to {args.backup_dir} (safe)", + "Delete immediately (dangerous)", + "Preview only (no changes)", + ], + default=f"Move to {args.backup_dir} (safe)", + ).ask() + + args.delete_duplicates = "Delete" in backup_choice + args.dry_run = "Preview" in backup_choice + + return "duplicates", args + + # For sample shrinking, get all the conversion options + args.bitdepth = questionary.select( + "Select target bit depth:", choices=["8", "16", "24"], default="16" + ).ask() + args.bitdepth = int(args.bitdepth) + + args.channels = questionary.select( + "Select target channels:", + choices=["1 (mono)", "2 (stereo)"], + default="2 (stereo)", + ).ask() + args.channels = 1 if "1" in args.channels else 2 + + args.samplerate = questionary.select( + "Select target sample rate:", + choices=["22050", "44100", "48000"], + default="44100", + ).ask() + args.samplerate = int(args.samplerate) + + # Advanced options in a checkbox group + advanced_options = questionary.checkbox( + "Select additional options:", + choices=[ "Auto-convert stereo to mono when possible", "Pre-normalize before conversion", "Skip generating spectrograms", @@ -841,117 +900,99 @@ def get_interactive_config(): "Set minimum sample rate", "Set minimum bit depth", "Convert in place (no backups)", - ] - - # Get saved options and validate them - saved_advanced = saved_config.get('advanced_options', []) - # Only use saved options that exist in available choices - valid_saved = [] - if saved_advanced: - valid_saved = [opt for opt in available_choices if opt in saved_advanced] - - # Create the checkbox without conditional default - advanced_options = questionary.checkbox( - "Select additional options:", - choices=available_choices, - default=valid_saved + ], + ).ask() + + args.auto_mono = "Auto-convert stereo to mono when possible" in advanced_options + args.pre_normalize = "Pre-normalize before conversion" in advanced_options + args.skip_spectrograms = "Skip generating spectrograms" in advanced_options + args.dry_run = "Preview changes (dry run)" in advanced_options + convert_in_place = "Convert in place (no backups)" in advanced_options + + # Configure backup settings if not converting in place + if not convert_in_place: + args.backup_dir = questionary.text( + "Backup directory path:", + default="_backup", ).ask() - - # Store selected options for next time - args.advanced_options = advanced_options - - # Process the selections - args.auto_mono = "Auto-convert stereo to mono when possible" in advanced_options - args.pre_normalize = "Pre-normalize before conversion" in advanced_options - args.skip_spectrograms = "Skip generating spectrograms" in advanced_options - args.dry_run = "Preview changes (dry run)" in advanced_options - convert_in_place = "Convert in place (no backups)" in advanced_options - - if "Process files in parallel" in advanced_options: - args.jobs = questionary.select( - "How many parallel jobs?", - choices=["2", "4", "8", "16", "24", "32", "48", "64"], - default=str(saved_config.get('jobs', 4)) + if args.backup_dir.strip(): # If not empty + args.skip_spectrograms = questionary.confirm( + "Generate spectrograms for backup comparison?", + default=not args.skip_spectrograms, ).ask() - args.jobs = int(args.jobs) else: - args.jobs = 1 + args.backup_dir = "-" + args.skip_spectrograms = True - if "Set minimum sample rate" in advanced_options: - args.min_samplerate = questionary.select( - "Select minimum sample rate:", - choices=["22050", "44100", "48000"], - default=str(saved_config.get('min_samplerate', 22050)) - ).ask() - args.min_samplerate = int(args.min_samplerate) + if "Process files in parallel" in advanced_options: + args.jobs = questionary.select( + "How many parallel jobs? (higher values may improve speed but use more memory)", + choices=["2", "4", "8", "16", "24", "32", "48", "64"], + default="4", + ).ask() + args.jobs = int(args.jobs) - if "Set minimum bit depth" in advanced_options: - args.min_bitdepth = questionary.select( - "Select minimum bit depth:", - choices=["8", "16", "24"], - default=str(saved_config.get('min_bitdepth', 16)) - ).ask() - args.min_bitdepth = int(args.min_bitdepth) + if "Set minimum sample rate" in advanced_options: + args.min_samplerate = questionary.select( + "Select minimum sample rate:", + choices=["22050", "44100", "48000"], + default="22050", + ).ask() + args.min_samplerate = int(args.min_samplerate) - # Configure backup settings if not converting in place - if not convert_in_place: - backup_enabled = questionary.confirm( - "Enable backups of original files?", - default=not args.backup_dir == "-" - ).ask() - - if backup_enabled: - backup_dir = questionary.text( - "Backup directory path:", - default=args.backup_dir if args.backup_dir != "-" else "_backup" - ).ask() - args.backup_dir = backup_dir.strip() if backup_dir.strip() else "_backup" - - if not args.skip_spectrograms: - args.skip_spectrograms = not questionary.confirm( - "Generate spectrograms for backup comparison?", - default=True - ).ask() - else: - args.backup_dir = "-" - args.skip_spectrograms = True - else: - args.backup_dir = "-" - args.skip_spectrograms = True + if "Set minimum bit depth" in advanced_options: + args.min_bitdepth = questionary.select( + "Select minimum bit depth:", choices=["8", "16", "24"], default="16" + ).ask() + args.min_bitdepth = int(args.min_bitdepth) - # Save the final configuration - save_config(args, action) + if args.auto_mono: + args.auto_mono_threshold = float( + questionary.text( + "Auto-mono threshold in dB (default: -95.5):", default="-95.5" + ).ask() + ) - return "duplicates" if "Remove" in action else "shrink", args + return "shrink", args def process_duplicates(args): """Process both directory and file level duplicates with visual feedback.""" - with console.status("[bold green]Phase 1: Searching for duplicate directories...") as status: + with console.status( + "[bold green]Phase 1: Searching for duplicate directories..." + ) as status: dir_duplicates = find_duplicate_directories(args.files) if dir_duplicates: count = sum(len(v) - 1 for v in dir_duplicates.values()) - console.print(Panel(f"Found [cyan]{count}[/cyan] duplicate directories", - title="Directory Scan Complete")) - + console.print( + Panel( + f"Found [cyan]{count}[/cyan] duplicate directories", + title="Directory Scan Complete", + ) + ) + if args.dry_run: console.print("[yellow]DRY RUN - No directories will be moved[/yellow]") process_duplicate_directories(verified_duplicates, args) else: console.print("[blue]No duplicate directories found.[/blue]") - with console.status("[bold green]Phase 2: Searching for duplicate files...") as status: + with console.status( + "[bold green]Phase 2: Searching for duplicate files..." + ) as status: file_duplicates, fuzzy_groups = find_duplicate_files(args.files, args) if file_duplicates: total_duplicates = sum(len(group) - 1 for group in file_duplicates) - console.print(Panel( - f"Found [cyan]{total_duplicates}[/cyan] duplicate files\n" - f"Including [cyan]{len(fuzzy_groups)}[/cyan] groups of similar files", - title="File Scan Complete" - )) - + console.print( + Panel( + f"Found [cyan]{total_duplicates}[/cyan] duplicate files\n" + f"Including [cyan]{len(fuzzy_groups)}[/cyan] groups of similar files", + title="File Scan Complete", + ) + ) + # Additional safety checks for file processing safe_duplicates = [] for group in file_duplicates: From 7613deee3e007c463c593daf02c3d875645182e7 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:53:59 +0000 Subject: [PATCH 35/66] Nov 17, 2024, 10:53 PM --- sample-shrinker-python/sample-shrinker.py | 65 +++++++++++++---------- 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 49cd402..08e1a3a 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -12,6 +12,8 @@ from pathlib import Path import librosa +import matplotlib +matplotlib.use('Agg') # Use non-interactive backend import matplotlib.pyplot as plt import numpy as np import questionary @@ -345,33 +347,42 @@ def check_effectively_mono(audio, threshold_dB): def generate_spectrogram(original_file, new_file, backup_dir): """Generate and save spectrograms for the original and new files.""" - y_old, sr_old = librosa.load(original_file, sr=None) - y_new, sr_new = librosa.load(new_file, sr=None) - - # Spectrogram for original file - plt.figure(figsize=(10, 4)) - D_old = librosa.amplitude_to_db(np.abs(librosa.stft(y_old)), ref=np.max) - librosa.display.specshow(D_old, sr=sr_old, x_axis="time", y_axis="log") - plt.colorbar(format="%+2.0f dB") - plt.title(f"Spectrogram of {os.path.basename(original_file)}") - old_spectrogram_path = os.path.join( - backup_dir, os.path.basename(original_file) + ".old.png" - ) - os.makedirs(backup_dir, exist_ok=True) # Ensure the directory exists - plt.savefig(old_spectrogram_path) - plt.close() - - # Spectrogram for new file - plt.figure(figsize=(10, 4)) - D_new = librosa.amplitude_to_db(np.abs(librosa.stft(y_new)), ref=np.max) - librosa.display.specshow(D_new, sr=sr_new, x_axis="time", y_axis="log") - plt.colorbar(format="%+2.0f dB") - plt.title(f"Spectrogram of {os.path.basename(new_file)}") - new_spectrogram_path = os.path.join( - backup_dir, os.path.basename(new_file) + ".new.png" - ) - plt.savefig(new_spectrogram_path) - plt.close() + try: + y_old, sr_old = librosa.load(original_file, sr=None) + y_new, sr_new = librosa.load(new_file, sr=None) + + # Ensure the backup directory exists + os.makedirs(backup_dir, exist_ok=True) + + # Generate spectrogram for original file + plt.figure(figsize=(10, 4)) + D_old = librosa.amplitude_to_db(np.abs(librosa.stft(y_old)), ref=np.max) + librosa.display.specshow(D_old, sr=sr_old, x_axis="time", y_axis="log") + plt.colorbar(format="%+2.0f dB") + plt.title(f"Spectrogram of {os.path.basename(original_file)}") + old_spectrogram_path = os.path.join( + backup_dir, os.path.basename(original_file) + ".old.png" + ) + plt.savefig(old_spectrogram_path) + plt.close('all') # Explicitly close all figures + + # Generate spectrogram for new file + plt.figure(figsize=(10, 4)) + D_new = librosa.amplitude_to_db(np.abs(librosa.stft(y_new)), ref=np.max) + librosa.display.specshow(D_new, sr=sr_new, x_axis="time", y_axis="log") + plt.colorbar(format="%+2.0f dB") + plt.title(f"Spectrogram of {os.path.basename(new_file)}") + new_spectrogram_path = os.path.join( + backup_dir, os.path.basename(new_file) + ".new.png" + ) + plt.savefig(new_spectrogram_path) + plt.close('all') # Explicitly close all figures + + except Exception as e: + console.print(f"[red]Error generating spectrograms: {str(e)}[/red]") + if args.verbose: + import traceback + console.print(traceback.format_exc()) def list_files(args, file_list): From f9809720b1a9614a98582a61ea2f932cc80ade94 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 06:55:47 +0000 Subject: [PATCH 36/66] Nov 17, 2024, 10:55 PM --- sample-shrinker-python/sample-shrinker.py | 24 ++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 08e1a3a..bf524fc 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -293,21 +293,27 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): # Generate spectrograms if enabled if not args.skip_spectrograms: - generate_spectrogram( - file_path, file_path, backup_path.parent - ) + try: + generate_spectrogram( + file_path, + file_path, + backup_path.parent, + verbose=args.verbose + ) + except Exception as spec_err: + console.print(f"[yellow]Warning: Could not generate spectrograms: {spec_err}[/yellow]") + if args.verbose: + import traceback + console.print(traceback.format_exc()) except Exception as e: console.print(f"[red]Error creating backup: {str(e)}[/red]") if args.verbose: import traceback - console.print(traceback.format_exc()) return else: - console.print( - "[yellow]No backup created (backups disabled)[/yellow]" - ) + console.print("[yellow]No backup created (backups disabled)[/yellow]") # Export the converted audio file try: @@ -345,7 +351,7 @@ def check_effectively_mono(audio, threshold_dB): return peak_diff_db < threshold_dB -def generate_spectrogram(original_file, new_file, backup_dir): +def generate_spectrogram(original_file, new_file, backup_dir, verbose=False): """Generate and save spectrograms for the original and new files.""" try: y_old, sr_old = librosa.load(original_file, sr=None) @@ -380,7 +386,7 @@ def generate_spectrogram(original_file, new_file, backup_dir): except Exception as e: console.print(f"[red]Error generating spectrograms: {str(e)}[/red]") - if args.verbose: + if verbose: import traceback console.print(traceback.format_exc()) From f7d765398bb4f1e485328af96d132f8933bb3115 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:01:18 +0000 Subject: [PATCH 37/66] Nov 17, 2024, 11:01 PM --- sample-shrinker-python/sample-shrinker.py | 67 ++++++++++++++--------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index bf524fc..f2f051b 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -360,29 +360,41 @@ def generate_spectrogram(original_file, new_file, backup_dir, verbose=False): # Ensure the backup directory exists os.makedirs(backup_dir, exist_ok=True) + # Set a reasonable n_fft based on signal length + n_fft = min(2048, len(y_old)) + if n_fft % 2 != 0: # Ensure n_fft is even + n_fft -= 1 + # Generate spectrogram for original file - plt.figure(figsize=(10, 4)) - D_old = librosa.amplitude_to_db(np.abs(librosa.stft(y_old)), ref=np.max) - librosa.display.specshow(D_old, sr=sr_old, x_axis="time", y_axis="log") - plt.colorbar(format="%+2.0f dB") - plt.title(f"Spectrogram of {os.path.basename(original_file)}") - old_spectrogram_path = os.path.join( - backup_dir, os.path.basename(original_file) + ".old.png" - ) - plt.savefig(old_spectrogram_path) - plt.close('all') # Explicitly close all figures - - # Generate spectrogram for new file - plt.figure(figsize=(10, 4)) - D_new = librosa.amplitude_to_db(np.abs(librosa.stft(y_new)), ref=np.max) - librosa.display.specshow(D_new, sr=sr_new, x_axis="time", y_axis="log") - plt.colorbar(format="%+2.0f dB") - plt.title(f"Spectrogram of {os.path.basename(new_file)}") - new_spectrogram_path = os.path.join( - backup_dir, os.path.basename(new_file) + ".new.png" - ) - plt.savefig(new_spectrogram_path) - plt.close('all') # Explicitly close all figures + with plt.ioff(): # Turn off interactive mode + fig = plt.figure(figsize=(10, 4)) + D_old = librosa.amplitude_to_db( + np.abs(librosa.stft(y_old, n_fft=n_fft)), + ref=np.max + ) + librosa.display.specshow(D_old, sr=sr_old, x_axis="time", y_axis="log") + plt.colorbar(format="%+2.0f dB") + plt.title(f"Spectrogram of {os.path.basename(original_file)}") + old_spectrogram_path = os.path.join( + backup_dir, os.path.basename(original_file) + ".old.png" + ) + plt.savefig(old_spectrogram_path) + plt.close(fig) + + # Generate spectrogram for new file + fig = plt.figure(figsize=(10, 4)) + D_new = librosa.amplitude_to_db( + np.abs(librosa.stft(y_new, n_fft=n_fft)), + ref=np.max + ) + librosa.display.specshow(D_new, sr=sr_new, x_axis="time", y_axis="log") + plt.colorbar(format="%+2.0f dB") + plt.title(f"Spectrogram of {os.path.basename(new_file)}") + new_spectrogram_path = os.path.join( + backup_dir, os.path.basename(new_file) + ".new.png" + ) + plt.savefig(new_spectrogram_path) + plt.close(fig) except Exception as e: console.print(f"[red]Error generating spectrograms: {str(e)}[/red]") @@ -933,10 +945,13 @@ def get_interactive_config(): default="_backup", ).ask() if args.backup_dir.strip(): # If not empty - args.skip_spectrograms = questionary.confirm( - "Generate spectrograms for backup comparison?", - default=not args.skip_spectrograms, - ).ask() + args.backup_dir = args.backup_dir.strip() + # Only ask about spectrograms if they weren't explicitly skipped in advanced options + if not args.skip_spectrograms: + args.skip_spectrograms = not questionary.confirm( + "Generate spectrograms for backup comparison?", + default=False + ).ask() else: args.backup_dir = "-" args.skip_spectrograms = True From ee2b6aecf103658e4a7f38c60b8e78bbd1cd33b0 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:09:01 +0000 Subject: [PATCH 38/66] Nov 17, 2024, 11:09 PM --- sample-shrinker-python/sample-shrinker.py | 28 +++++++++-------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index f2f051b..90a2282 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -270,27 +270,23 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): # Backup handling if args.backup_dir != "-": try: - # Convert the file path to a Path object + # Convert paths to Path objects file_path_obj = Path(file_path).resolve() - # Get the absolute path to the backup directory - backup_dir = Path(args.backup_dir).resolve() - - # Create the relative path structure - rel_path = file_path_obj.relative_to(file_path_obj.parent) - backup_path = backup_dir / rel_path.parent.name / rel_path.name - + backup_base = Path(args.backup_dir).resolve() + + # Get the relative path from the current working directory + rel_path = file_path_obj.relative_to(Path.cwd()) + + # Create the full backup path maintaining directory structure + backup_path = backup_base / rel_path + # Ensure the backup directory exists backup_path.parent.mkdir(parents=True, exist_ok=True) - - # Add .old extension for the backup - backup_path = backup_path.with_suffix( - backup_path.suffix + ".old" - ) - + # Copy the original file with metadata preserved console.print(f"[cyan]Backing up to: {backup_path}[/cyan]") shutil.copy2(file_path, backup_path) - + # Generate spectrograms if enabled if not args.skip_spectrograms: try: @@ -323,8 +319,6 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): except Exception as e: console.print(f"[red]Error saving converted file: {str(e)}[/red]") if args.verbose: - import traceback - console.print(traceback.format_exc()) else: status = Text() From cdf9948315f5dc56d32888535f7670b028e56db0 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:10:20 +0000 Subject: [PATCH 39/66] Nov 17, 2024, 11:10 PM --- sample-shrinker-python/sample-shrinker.py | 40 +++++++++++++++-------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 90a2282..1a82dea 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -145,26 +145,40 @@ def reencode_audio(file_path): """Re-encode audio file to PCM 16-bit if it has a different encoding.""" try: output_path = str(Path(file_path).with_suffix(".reencoded.wav")) - # Use ffmpeg directly for more reliable conversion + # Use ffmpeg with explicit decoding and encoding parameters cmd = [ - "ffmpeg", - "-y", - "-i", - str(file_path), - "-acodec", - "pcm_s16le", - "-ar", - "44100", - output_path, + "ffmpeg", "-y", + "-i", str(file_path), + "-acodec", "pcm_s16le", # Force 16-bit PCM encoding + "-ar", "44100", # Maintain sample rate + "-ac", "2", # Maintain stereo if present + "-f", "wav", # Force WAV format + output_path ] - + result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode == 0: console.print(f"[green]Successfully re-encoded: {output_path}[/green]") return output_path else: - console.print(f"[red]FFmpeg error: {result.stderr}[/red]") - return None + # If first attempt fails, try with different decoder + cmd = [ + "ffmpeg", "-y", + "-c:a", "adpcm_ms", # Explicitly specify ADPCM decoder + "-i", str(file_path), + "-acodec", "pcm_s16le", + "-ar", "44100", + "-ac", "2", + "-f", "wav", + output_path + ] + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode == 0: + console.print(f"[green]Successfully re-encoded with ADPCM decoder: {output_path}[/green]") + return output_path + else: + console.print(f"[red]FFmpeg error: {result.stderr}[/red]") + return None except Exception as e: console.print(f"[red]Error re-encoding {file_path}: {str(e)}[/red]") return None From 93f6b91300b022354dcd4ca41243f0c83aac4494 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:22:33 +0000 Subject: [PATCH 40/66] Nov 17, 2024, 11:22 PM --- sample-shrinker-python/sample-shrinker.py | 32 ++++++++++------------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 1a82dea..f69e02f 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -206,22 +206,19 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): else: console.print(f"Processing file: [cyan]{file_path}[/cyan]") + # First check if file needs processing try: audio = AudioSegment.from_file(file_path) - except (IndexError, OSError) as e: - console.print(f"[red]Error loading {file_path}: {str(e)}[/red]") - console.print("[yellow]Attempting to re-encode file...[/yellow]") - reencoded_file = reencode_audio(file_path) - if reencoded_file: - try: - audio = AudioSegment.from_file(reencoded_file) - except Exception as re_err: - console.print( - f"[red]Failed to process re-encoded file: {str(re_err)}[/red]" - ) - return - else: + # Skip if file already meets our requirements + if (audio.sample_width * 8 <= args.bitdepth and + audio.channels <= args.channels and + audio.frame_rate <= args.samplerate and + (not args.min_samplerate or audio.frame_rate >= args.min_samplerate)): + console.print(f"[blue]Skipping {file_path} (already meets requirements)[/blue]") return + except Exception as e: + console.print(f"[yellow]Error checking file {file_path}: {str(e)}[/yellow]") + # Continue with processing if we can't check the file modified = False change_reason = [] @@ -288,11 +285,10 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): file_path_obj = Path(file_path).resolve() backup_base = Path(args.backup_dir).resolve() - # Get the relative path from the current working directory - rel_path = file_path_obj.relative_to(Path.cwd()) - - # Create the full backup path maintaining directory structure - backup_path = backup_base / rel_path + # Get the relative structure from the file path + # Use the last few components of the path to maintain structure + path_parts = file_path_obj.parts[-3:] # Adjust number as needed + backup_path = backup_base.joinpath(*path_parts) # Ensure the backup directory exists backup_path.parent.mkdir(parents=True, exist_ok=True) From 86e31c1102fad3059a2c91879bb47426a67de467 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:24:03 +0000 Subject: [PATCH 41/66] Nov 17, 2024, 11:24 PM --- sample-shrinker-python/sample-shrinker.py | 157 +++++++++++++++------- 1 file changed, 108 insertions(+), 49 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index f69e02f..4b9cfae 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -13,7 +13,8 @@ import librosa import matplotlib -matplotlib.use('Agg') # Use non-interactive backend + +matplotlib.use("Agg") # Use non-interactive backend import matplotlib.pyplot as plt import numpy as np import questionary @@ -147,15 +148,21 @@ def reencode_audio(file_path): output_path = str(Path(file_path).with_suffix(".reencoded.wav")) # Use ffmpeg with explicit decoding and encoding parameters cmd = [ - "ffmpeg", "-y", - "-i", str(file_path), - "-acodec", "pcm_s16le", # Force 16-bit PCM encoding - "-ar", "44100", # Maintain sample rate - "-ac", "2", # Maintain stereo if present - "-f", "wav", # Force WAV format - output_path + "ffmpeg", + "-y", + "-i", + str(file_path), + "-acodec", + "pcm_s16le", # Force 16-bit PCM encoding + "-ar", + "44100", # Maintain sample rate + "-ac", + "2", # Maintain stereo if present + "-f", + "wav", # Force WAV format + output_path, ] - + result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode == 0: console.print(f"[green]Successfully re-encoded: {output_path}[/green]") @@ -163,18 +170,27 @@ def reencode_audio(file_path): else: # If first attempt fails, try with different decoder cmd = [ - "ffmpeg", "-y", - "-c:a", "adpcm_ms", # Explicitly specify ADPCM decoder - "-i", str(file_path), - "-acodec", "pcm_s16le", - "-ar", "44100", - "-ac", "2", - "-f", "wav", - output_path + "ffmpeg", + "-y", + "-c:a", + "adpcm_ms", # Explicitly specify ADPCM decoder + "-i", + str(file_path), + "-acodec", + "pcm_s16le", + "-ar", + "44100", + "-ac", + "2", + "-f", + "wav", + output_path, ] result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode == 0: - console.print(f"[green]Successfully re-encoded with ADPCM decoder: {output_path}[/green]") + console.print( + f"[green]Successfully re-encoded with ADPCM decoder: {output_path}[/green]" + ) return output_path else: console.print(f"[red]FFmpeg error: {result.stderr}[/red]") @@ -198,6 +214,51 @@ def check_ffmpeg(): return False +def get_audio_properties(file_path): + """Get audio file properties using pydub.""" + try: + audio = AudioSegment.from_file(file_path) + return { + "bit_depth": audio.sample_width * 8, + "channels": audio.channels, + "sample_rate": audio.frame_rate, + "duration": len(audio), + } + except Exception as e: + console.print( + f"[yellow]Error reading audio properties from {file_path}: {str(e)}[/yellow]" + ) + return None + + +def needs_conversion(file_path, args): + """Check if file needs conversion based on its properties.""" + props = get_audio_properties(file_path) + if not props: + return True # If we can't read properties, attempt conversion + + needs_conversion = False + reasons = [] + + if props["bit_depth"] > args.bitdepth: + needs_conversion = True + reasons.append(f"bit depth {props['bit_depth']} -> {args.bitdepth}") + + if props["channels"] > args.channels: + needs_conversion = True + reasons.append(f"channels {props['channels']} -> {args.channels}") + + if props["sample_rate"] > args.samplerate: + needs_conversion = True + reasons.append(f"sample rate {props['sample_rate']} -> {args.samplerate}") + + if args.min_samplerate and props["sample_rate"] < args.min_samplerate: + needs_conversion = True + reasons.append(f"sample rate {props['sample_rate']} -> {args.min_samplerate}") + + return needs_conversion, reasons + + def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): """Main function to process audio files based on arguments.""" try: @@ -206,19 +267,13 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): else: console.print(f"Processing file: [cyan]{file_path}[/cyan]") - # First check if file needs processing - try: - audio = AudioSegment.from_file(file_path) - # Skip if file already meets our requirements - if (audio.sample_width * 8 <= args.bitdepth and - audio.channels <= args.channels and - audio.frame_rate <= args.samplerate and - (not args.min_samplerate or audio.frame_rate >= args.min_samplerate)): - console.print(f"[blue]Skipping {file_path} (already meets requirements)[/blue]") - return - except Exception as e: - console.print(f"[yellow]Error checking file {file_path}: {str(e)}[/yellow]") - # Continue with processing if we can't check the file + # Check if file needs processing + needs_conv, reasons = needs_conversion(file_path, args) + if not needs_conv: + console.print( + f"[blue]Skipping {file_path} (already meets requirements)[/blue]" + ) + return modified = False change_reason = [] @@ -284,42 +339,48 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): # Convert paths to Path objects file_path_obj = Path(file_path).resolve() backup_base = Path(args.backup_dir).resolve() - + # Get the relative structure from the file path # Use the last few components of the path to maintain structure path_parts = file_path_obj.parts[-3:] # Adjust number as needed backup_path = backup_base.joinpath(*path_parts) - + # Ensure the backup directory exists backup_path.parent.mkdir(parents=True, exist_ok=True) - + # Copy the original file with metadata preserved console.print(f"[cyan]Backing up to: {backup_path}[/cyan]") shutil.copy2(file_path, backup_path) - + # Generate spectrograms if enabled if not args.skip_spectrograms: try: generate_spectrogram( - file_path, - file_path, + file_path, + file_path, backup_path.parent, - verbose=args.verbose + verbose=args.verbose, ) except Exception as spec_err: - console.print(f"[yellow]Warning: Could not generate spectrograms: {spec_err}[/yellow]") + console.print( + f"[yellow]Warning: Could not generate spectrograms: {spec_err}[/yellow]" + ) if args.verbose: import traceback + console.print(traceback.format_exc()) except Exception as e: console.print(f"[red]Error creating backup: {str(e)}[/red]") if args.verbose: import traceback + console.print(traceback.format_exc()) return else: - console.print("[yellow]No backup created (backups disabled)[/yellow]") + console.print( + "[yellow]No backup created (backups disabled)[/yellow]" + ) # Export the converted audio file try: @@ -373,8 +434,7 @@ def generate_spectrogram(original_file, new_file, backup_dir, verbose=False): with plt.ioff(): # Turn off interactive mode fig = plt.figure(figsize=(10, 4)) D_old = librosa.amplitude_to_db( - np.abs(librosa.stft(y_old, n_fft=n_fft)), - ref=np.max + np.abs(librosa.stft(y_old, n_fft=n_fft)), ref=np.max ) librosa.display.specshow(D_old, sr=sr_old, x_axis="time", y_axis="log") plt.colorbar(format="%+2.0f dB") @@ -388,8 +448,7 @@ def generate_spectrogram(original_file, new_file, backup_dir, verbose=False): # Generate spectrogram for new file fig = plt.figure(figsize=(10, 4)) D_new = librosa.amplitude_to_db( - np.abs(librosa.stft(y_new, n_fft=n_fft)), - ref=np.max + np.abs(librosa.stft(y_new, n_fft=n_fft)), ref=np.max ) librosa.display.specshow(D_new, sr=sr_new, x_axis="time", y_axis="log") plt.colorbar(format="%+2.0f dB") @@ -404,6 +463,7 @@ def generate_spectrogram(original_file, new_file, backup_dir, verbose=False): console.print(f"[red]Error generating spectrograms: {str(e)}[/red]") if verbose: import traceback + console.print(traceback.format_exc()) @@ -840,7 +900,7 @@ def get_interactive_config(): "Preview changes (dry run)", "Show detailed progress", ], - default=["Preview changes (dry run)"], + default=("Preview changes (dry run)",), ).ask() args.use_fuzzy = "Use fuzzy matching for similar files" in duplicate_options @@ -871,7 +931,7 @@ def get_interactive_config(): "Compare sample rates", "Compare channel counts", ], - default=["Compare file lengths", "Compare sample rates"], + default=("Compare file lengths", "Compare sample rates"), ).ask() # Get backup options (moved before backup_choice) @@ -953,8 +1013,7 @@ def get_interactive_config(): # Only ask about spectrograms if they weren't explicitly skipped in advanced options if not args.skip_spectrograms: args.skip_spectrograms = not questionary.confirm( - "Generate spectrograms for backup comparison?", - default=False + "Generate spectrograms for backup comparison?", default=False ).ask() else: args.backup_dir = "-" @@ -1010,7 +1069,7 @@ def process_duplicates(args): if args.dry_run: console.print("[yellow]DRY RUN - No directories will be moved[/yellow]") - process_duplicate_directories(verified_duplicates, args) + process_duplicate_directories(dir_duplicates, args) else: console.print("[blue]No duplicate directories found.[/blue]") From d798be81ad7c0c1483601f72d8ec20c76a290a74 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:25:58 +0000 Subject: [PATCH 42/66] Nov 17, 2024, 11:25 PM --- sample-shrinker-python/sample-shrinker.py | 34 +++++++++++++++++------ 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 4b9cfae..e4bc25a 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -235,28 +235,28 @@ def needs_conversion(file_path, args): """Check if file needs conversion based on its properties.""" props = get_audio_properties(file_path) if not props: - return True # If we can't read properties, attempt conversion + return (True, ["unable to read properties"]) # Return tuple with reason - needs_conversion = False + needs_conv = False reasons = [] if props["bit_depth"] > args.bitdepth: - needs_conversion = True + needs_conv = True reasons.append(f"bit depth {props['bit_depth']} -> {args.bitdepth}") if props["channels"] > args.channels: - needs_conversion = True + needs_conv = True reasons.append(f"channels {props['channels']} -> {args.channels}") if props["sample_rate"] > args.samplerate: - needs_conversion = True + needs_conv = True reasons.append(f"sample rate {props['sample_rate']} -> {args.samplerate}") if args.min_samplerate and props["sample_rate"] < args.min_samplerate: - needs_conversion = True + needs_conv = True reasons.append(f"sample rate {props['sample_rate']} -> {args.min_samplerate}") - return needs_conversion, reasons + return (needs_conv, reasons) # Always return a tuple def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): @@ -267,6 +267,24 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): else: console.print(f"Processing file: [cyan]{file_path}[/cyan]") + # Load the audio file first + try: + audio = AudioSegment.from_file(file_path) + except Exception as e: + console.print(f"[yellow]Error loading {file_path}: {str(e)}[/yellow]") + console.print("[yellow]Attempting to re-encode file...[/yellow]") + reencoded_file = reencode_audio(file_path) + if reencoded_file: + try: + audio = AudioSegment.from_file(reencoded_file) + except Exception as re_err: + console.print( + f"[red]Failed to process re-encoded file: {str(re_err)}[/red]" + ) + return + else: + return + # Check if file needs processing needs_conv, reasons = needs_conversion(file_path, args) if not needs_conv: @@ -276,7 +294,7 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): return modified = False - change_reason = [] + change_reason = reasons.copy() # Use the reasons from needs_conversion # Check if we need to convert the channels if audio.channels > args.channels: From c49b22399fb759e2cc6eafbc0aaf69de536aa863 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:28:25 +0000 Subject: [PATCH 43/66] Nov 17, 2024, 11:28 PM --- sample-shrinker-python/sample-shrinker.py | 107 +++++++++++++++------- 1 file changed, 74 insertions(+), 33 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index e4bc25a..d95e5ce 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -146,20 +146,47 @@ def reencode_audio(file_path): """Re-encode audio file to PCM 16-bit if it has a different encoding.""" try: output_path = str(Path(file_path).with_suffix(".reencoded.wav")) - # Use ffmpeg with explicit decoding and encoding parameters + + # First try with ADPCM decoder explicitly cmd = [ "ffmpeg", "-y", + "-c:a", + "adpcm_ms", # Try ADPCM first "-i", str(file_path), "-acodec", - "pcm_s16le", # Force 16-bit PCM encoding + "pcm_s16le", "-ar", - "44100", # Maintain sample rate + "44100", "-ac", - "2", # Maintain stereo if present + "2", "-f", - "wav", # Force WAV format + "wav", + output_path, + ] + + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode == 0: + console.print( + f"[green]Successfully re-encoded with ADPCM decoder: {output_path}[/green]" + ) + return output_path + + # If ADPCM fails, try with default decoder + cmd = [ + "ffmpeg", + "-y", + "-i", + str(file_path), + "-acodec", + "pcm_s16le", + "-ar", + "44100", + "-ac", + "2", + "-f", + "wav", output_path, ] @@ -167,34 +194,38 @@ def reencode_audio(file_path): if result.returncode == 0: console.print(f"[green]Successfully re-encoded: {output_path}[/green]") return output_path + + # If both attempts fail, try with more aggressive options + cmd = [ + "ffmpeg", + "-y", + "-i", + str(file_path), + "-acodec", + "pcm_s16le", + "-ar", + "44100", + "-ac", + "2", + "-af", + "aresample=resampler=soxr", # Use high quality resampler + "-strict", + "experimental", + "-f", + "wav", + output_path, + ] + + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode == 0: + console.print( + f"[green]Successfully re-encoded with resampling: {output_path}[/green]" + ) + return output_path else: - # If first attempt fails, try with different decoder - cmd = [ - "ffmpeg", - "-y", - "-c:a", - "adpcm_ms", # Explicitly specify ADPCM decoder - "-i", - str(file_path), - "-acodec", - "pcm_s16le", - "-ar", - "44100", - "-ac", - "2", - "-f", - "wav", - output_path, - ] - result = subprocess.run(cmd, capture_output=True, text=True) - if result.returncode == 0: - console.print( - f"[green]Successfully re-encoded with ADPCM decoder: {output_path}[/green]" - ) - return output_path - else: - console.print(f"[red]FFmpeg error: {result.stderr}[/red]") - return None + console.print(f"[red]FFmpeg error: {result.stderr}[/red]") + return None + except Exception as e: console.print(f"[red]Error re-encoding {file_path}: {str(e)}[/red]") return None @@ -217,7 +248,17 @@ def check_ffmpeg(): def get_audio_properties(file_path): """Get audio file properties using pydub.""" try: - audio = AudioSegment.from_file(file_path) + # First try direct loading + try: + audio = AudioSegment.from_file(file_path) + except Exception as e: + # If direct loading fails, try re-encoding first + reencoded = reencode_audio(file_path) + if reencoded: + audio = AudioSegment.from_file(reencoded) + else: + raise e + return { "bit_depth": audio.sample_width * 8, "channels": audio.channels, From a6cda0e0e6363af83debd0302eefa124a9b5d0d0 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:30:28 +0000 Subject: [PATCH 44/66] Nov 17, 2024, 11:30 PM --- sample-shrinker-python/sample-shrinker.py | 86 ++++++++++++++--------- 1 file changed, 54 insertions(+), 32 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index d95e5ce..9d55818 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -251,20 +251,37 @@ def get_audio_properties(file_path): # First try direct loading try: audio = AudioSegment.from_file(file_path) + # Fix for incorrect bit depth detection + actual_bit_depth = audio.sample_width * 8 + # Some 24-bit files might be reported as 32-bit + if actual_bit_depth == 32: + # Check if it's actually 24-bit + max_value = max( + abs(min(audio.get_array_of_samples())), + abs(max(audio.get_array_of_samples())), + ) + if max_value <= 0x7FFFFF: # Max value for 24-bit + actual_bit_depth = 24 + + return { + "bit_depth": actual_bit_depth, + "channels": audio.channels, + "sample_rate": audio.frame_rate, + "duration": len(audio), + } except Exception as e: # If direct loading fails, try re-encoding first reencoded = reencode_audio(file_path) if reencoded: audio = AudioSegment.from_file(reencoded) + return { + "bit_depth": audio.sample_width * 8, + "channels": audio.channels, + "sample_rate": audio.frame_rate, + "duration": len(audio), + } else: raise e - - return { - "bit_depth": audio.sample_width * 8, - "channels": audio.channels, - "sample_rate": audio.frame_rate, - "duration": len(audio), - } except Exception as e: console.print( f"[yellow]Error reading audio properties from {file_path}: {str(e)}[/yellow]" @@ -400,34 +417,39 @@ def process_audio(file_path, args, dry_run=False, task_id=None, progress=None): backup_base = Path(args.backup_dir).resolve() # Get the relative structure from the file path - # Use the last few components of the path to maintain structure path_parts = file_path_obj.parts[-3:] # Adjust number as needed backup_path = backup_base.joinpath(*path_parts) - # Ensure the backup directory exists - backup_path.parent.mkdir(parents=True, exist_ok=True) - - # Copy the original file with metadata preserved - console.print(f"[cyan]Backing up to: {backup_path}[/cyan]") - shutil.copy2(file_path, backup_path) - - # Generate spectrograms if enabled - if not args.skip_spectrograms: - try: - generate_spectrogram( - file_path, - file_path, - backup_path.parent, - verbose=args.verbose, - ) - except Exception as spec_err: - console.print( - f"[yellow]Warning: Could not generate spectrograms: {spec_err}[/yellow]" - ) - if args.verbose: - import traceback - - console.print(traceback.format_exc()) + # Check if backup already exists + if backup_path.exists(): + console.print( + f"[blue]Backup already exists: {backup_path}[/blue]" + ) + else: + # Ensure the backup directory exists + backup_path.parent.mkdir(parents=True, exist_ok=True) + + # Copy the original file with metadata preserved + console.print(f"[cyan]Backing up to: {backup_path}[/cyan]") + shutil.copy2(file_path, backup_path) + + # Generate spectrograms if enabled + if not args.skip_spectrograms: + try: + generate_spectrogram( + file_path, + file_path, + backup_path.parent, + verbose=args.verbose, + ) + except Exception as spec_err: + console.print( + f"[yellow]Warning: Could not generate spectrograms: {spec_err}[/yellow]" + ) + if args.verbose: + import traceback + + console.print(traceback.format_exc()) except Exception as e: console.print(f"[red]Error creating backup: {str(e)}[/red]") From 9e941a5a32f5efaf5114dd1b980aac9da34de5e8 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:30:56 +0000 Subject: [PATCH 45/66] Nov 17, 2024, 11:30 PM --- sample-shrinker-python/sample-shrinker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 9d55818..85ac7f6 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -981,7 +981,7 @@ def get_interactive_config(): "Preview changes (dry run)", "Show detailed progress", ], - default=("Preview changes (dry run)",), + default=["Preview changes (dry run)"], ).ask() args.use_fuzzy = "Use fuzzy matching for similar files" in duplicate_options @@ -1012,7 +1012,7 @@ def get_interactive_config(): "Compare sample rates", "Compare channel counts", ], - default=("Compare file lengths", "Compare sample rates"), + default=["Compare file lengths", "Compare sample rates"], ).ask() # Get backup options (moved before backup_choice) From d341d2a845d0b9c9b636045ce0eff1b9ec55840a Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:31:45 +0000 Subject: [PATCH 46/66] Nov 17, 2024, 11:31 PM --- sample-shrinker-python/sample-shrinker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 85ac7f6..d4a221d 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -981,7 +981,7 @@ def get_interactive_config(): "Preview changes (dry run)", "Show detailed progress", ], - default=["Preview changes (dry run)"], + default=[2], # Index of "Preview changes (dry run)" ).ask() args.use_fuzzy = "Use fuzzy matching for similar files" in duplicate_options @@ -1012,7 +1012,7 @@ def get_interactive_config(): "Compare sample rates", "Compare channel counts", ], - default=["Compare file lengths", "Compare sample rates"], + default=[0, 1], # Indices of the first two choices ).ask() # Get backup options (moved before backup_choice) From 1d46e7e542be31622bdf53aee4b4a96427a6cd9e Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:33:10 +0000 Subject: [PATCH 47/66] Nov 17, 2024, 11:33 PM --- sample-shrinker-python/sample-shrinker.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index d4a221d..d7138e9 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -981,7 +981,7 @@ def get_interactive_config(): "Preview changes (dry run)", "Show detailed progress", ], - default=[2], # Index of "Preview changes (dry run)" + default=["Preview changes (dry run)"], # Use the actual choice string ).ask() args.use_fuzzy = "Use fuzzy matching for similar files" in duplicate_options @@ -1012,7 +1012,10 @@ def get_interactive_config(): "Compare sample rates", "Compare channel counts", ], - default=[0, 1], # Indices of the first two choices + default=[ + "Compare file lengths", + "Compare sample rates", + ], # Use actual choice strings ).ask() # Get backup options (moved before backup_choice) From 9891ac330c0b3c60fa6deb55271757bc3b097951 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:34:27 +0000 Subject: [PATCH 48/66] Nov 17, 2024, 11:34 PM --- sample-shrinker-python/sample-shrinker.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index d7138e9..2ee77c0 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -981,7 +981,6 @@ def get_interactive_config(): "Preview changes (dry run)", "Show detailed progress", ], - default=["Preview changes (dry run)"], # Use the actual choice string ).ask() args.use_fuzzy = "Use fuzzy matching for similar files" in duplicate_options @@ -1012,10 +1011,6 @@ def get_interactive_config(): "Compare sample rates", "Compare channel counts", ], - default=[ - "Compare file lengths", - "Compare sample rates", - ], # Use actual choice strings ).ask() # Get backup options (moved before backup_choice) From 1f6b1890d4799de9f786d15c262f9882cb4be6ae Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:34:41 +0000 Subject: [PATCH 49/66] Nov 17, 2024, 11:34 PM --- sample-shrinker-python/sample-shrinker.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 2ee77c0..03be39f 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -1013,11 +1013,10 @@ def get_interactive_config(): ], ).ask() - # Get backup options (moved before backup_choice) + # Get backup options (modified text prompt) backup_dir = questionary.text( - "Backup directory path:", + "Backup directory path (where duplicates will be moved):", default="_backup", - description="Directory where duplicates will be moved", ).ask() if backup_dir.strip(): # If not empty From e48843521221f5ad243169ab1cac885829b48cbb Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:37:31 +0000 Subject: [PATCH 50/66] Nov 17, 2024, 11:37 PM --- sample-shrinker-python/sample-shrinker.py | 223 +++++++++++++++------- 1 file changed, 155 insertions(+), 68 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 03be39f..ff9b3e5 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -980,6 +980,7 @@ def get_interactive_config(): "Ignore filenames (match by content only)", "Preview changes (dry run)", "Show detailed progress", + "Process files in parallel", ], ).ask() @@ -990,28 +991,16 @@ def get_interactive_config(): args.dry_run = "Preview changes (dry run)" in duplicate_options args.verbose = "Show detailed progress" in duplicate_options - if args.use_fuzzy: - # Get fuzzy matching configuration - args.fuzzy_threshold = questionary.select( - "Select fuzzy matching threshold (higher = more strict):", - choices=[ - "95 - Nearly identical", - "90 - Very similar", - "85 - Similar", - "80 - Somewhat similar", - ], - default="90 - Very similar", - ).ask() - args.fuzzy_threshold = int(args.fuzzy_threshold.split()[0]) - - args.fuzzy_options = questionary.checkbox( - "Select fuzzy matching options:", - choices=[ - "Compare file lengths", - "Compare sample rates", - "Compare channel counts", - ], + # Add parallel processing configuration + if "Process files in parallel" in duplicate_options: + args.jobs = questionary.select( + "How many parallel jobs?", + choices=["2", "4", "8", "16", "24", "32"], + default="4", ).ask() + args.jobs = int(args.jobs) + else: + args.jobs = 1 # Get backup options (modified text prompt) backup_dir = questionary.text( @@ -1131,68 +1120,166 @@ def get_interactive_config(): def process_duplicates(args): """Process both directory and file level duplicates with visual feedback.""" - with console.status( - "[bold green]Phase 1: Searching for duplicate directories..." - ) as status: + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + console=console, + ) as progress: + # Phase 1: Directory scan + scan_task = progress.add_task( + "[cyan]Scanning for duplicate directories...", total=None + ) dir_duplicates = find_duplicate_directories(args.files) + progress.update(scan_task, completed=True) - if dir_duplicates: - count = sum(len(v) - 1 for v in dir_duplicates.values()) - console.print( - Panel( - f"Found [cyan]{count}[/cyan] duplicate directories", - title="Directory Scan Complete", + if dir_duplicates: + count = sum(len(v) - 1 for v in dir_duplicates.values()) + console.print( + Panel( + f"Found [cyan]{count}[/cyan] duplicate directories", + title="Directory Scan Complete", + ) ) - ) - if args.dry_run: - console.print("[yellow]DRY RUN - No directories will be moved[/yellow]") - process_duplicate_directories(dir_duplicates, args) - else: - console.print("[blue]No duplicate directories found.[/blue]") + if args.dry_run: + console.print("[yellow]DRY RUN - No directories will be moved[/yellow]") + + # Process directories with progress bar + dir_task = progress.add_task( + "[green]Processing directories...", total=len(dir_duplicates) + ) + + with ThreadPoolExecutor(max_workers=args.jobs) as executor: + futures = [] + for (dir_name, file_count, total_size), paths in dir_duplicates.items(): + future = executor.submit( + process_directory_group, + dir_name, + file_count, + total_size, + paths, + args, + progress, + ) + futures.append(future) - with console.status( - "[bold green]Phase 2: Searching for duplicate files..." - ) as status: + for future in as_completed(futures): + try: + future.result() + progress.advance(dir_task) + except Exception as e: + console.print(f"[red]Error processing directory: {e}[/red]") + else: + console.print("[blue]No duplicate directories found.[/blue]") + + # Phase 2: File scan + file_task = progress.add_task( + "[cyan]Scanning for duplicate files...", total=None + ) file_duplicates, fuzzy_groups = find_duplicate_files(args.files, args) + progress.update(file_task, completed=True) - if file_duplicates: - total_duplicates = sum(len(group) - 1 for group in file_duplicates) - console.print( - Panel( - f"Found [cyan]{total_duplicates}[/cyan] duplicate files\n" - f"Including [cyan]{len(fuzzy_groups)}[/cyan] groups of similar files", - title="File Scan Complete", + if file_duplicates: + total_duplicates = sum(len(group) - 1 for group in file_duplicates) + console.print( + Panel( + f"Found [cyan]{total_duplicates}[/cyan] duplicate files\n" + f"Including [cyan]{len(fuzzy_groups)}[/cyan] groups of similar files", + title="File Scan Complete", + ) ) + + if args.dry_run: + console.print("[yellow]DRY RUN - No files will be moved[/yellow]") + + # Process files with progress bar + file_process_task = progress.add_task( + "[green]Processing files...", total=len(file_duplicates) + ) + + with ThreadPoolExecutor(max_workers=args.jobs) as executor: + futures = [] + for group in file_duplicates: + future = executor.submit( + process_file_group, + group, + fuzzy_groups, + args, + progress, + ) + futures.append(future) + + for future in as_completed(futures): + try: + future.result() + progress.advance(file_process_task) + except Exception as e: + console.print(f"[red]Error processing file group: {e}[/red]") + + console.print("[green]Duplicate removal complete![/green]") + + +def process_directory_group(dir_name, file_count, total_size, paths, args, progress): + """Process a group of duplicate directories.""" + try: + console.print( + f"\nFound duplicate directories named '[cyan]{dir_name}[/cyan]' " + f"with {file_count} files ({total_size} bytes):" ) - # Additional safety checks for file processing - safe_duplicates = [] - for group in file_duplicates: - # Verify files are not symbolic links - real_files = [f for f in group if not f.is_symlink()] + # Sort paths by creation time + valid_paths = [] + for path in paths: + try: + stat = path.stat() + valid_paths.append((path, stat.st_ctime)) + except FileNotFoundError: + console.print(f"[yellow]Warning: Directory not found: {path}[/yellow]") + continue + + if not valid_paths: + console.print("[red]No valid paths found in group[/red]") + return + + valid_paths.sort(key=lambda x: x[1]) + + # Keep the oldest directory + original_dir = valid_paths[0][0] + console.print( + f"Keeping oldest copy: [green]{original_dir}[/green] " + f"(created: {time.ctime(valid_paths[0][1])})" + ) - # Check if files are in use (on Windows) or locked - available_files = [] - for file in real_files: + # Process newer copies + for dir_path, ctime in valid_paths[1:]: + console.print( + f"Moving duplicate: [yellow]{dir_path}[/yellow] " + f"(created: {time.ctime(ctime)})" + ) + if not args.dry_run: try: - with open(file, "rb") as f: - # Try to get a shared lock - pass - available_files.append(file) - except (IOError, OSError): - print(f"Warning: File {file} appears to be in use, skipping") + # Create backup path + rel_path = dir_path.relative_to(dir_path.parent.parent) + backup_path = Path(args.backup_dir) / rel_path - if len(available_files) > 1: - safe_duplicates.append(available_files) + # Ensure backup directory exists + backup_path.parent.mkdir(parents=True, exist_ok=True) - if args.dry_run: - console.print("[yellow]DRY RUN - No files will be moved[/yellow]") - process_duplicate_files(safe_duplicates, fuzzy_groups, args) - else: - console.print("[blue]No duplicate files found.[/blue]") + shutil.move(str(dir_path), str(backup_path)) + except Exception as e: + console.print(f"[red]Error moving directory {dir_path}: {e}[/red]") - console.print("[green]Duplicate removal complete![/green]") + except Exception as e: + console.print(f"[red]Error processing directory group {dir_name}: {e}[/red]") + raise + + +def process_file_group(group, fuzzy_groups, args, progress): + """Process a group of duplicate files.""" + # Similar structure to process_duplicate_files but adapted for parallel processing + # ... implement the file processing logic here ... def main(): From 3affe52bf0b9e7edd9268c1fe105093ad14f1316 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:38:50 +0000 Subject: [PATCH 51/66] Nov 17, 2024, 11:38 PM --- sample-shrinker-python/sample-shrinker.py | 75 ++++++++++++++++++++++- 1 file changed, 73 insertions(+), 2 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index ff9b3e5..c5559ad 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -1278,8 +1278,79 @@ def process_directory_group(dir_name, file_count, total_size, paths, args, progr def process_file_group(group, fuzzy_groups, args, progress): """Process a group of duplicate files.""" - # Similar structure to process_duplicate_files but adapted for parallel processing - # ... implement the file processing logic here ... + try: + # Get file size for reporting + file_size = group[0].stat().st_size + console.print( + f"\nProcessing duplicate group for '[cyan]{group[0].name}[/cyan]' ({file_size} bytes)" + ) + + # For fuzzy matches, show similarity percentages + if group in fuzzy_groups: + base_fingerprint = get_audio_fingerprint(group[0]) + console.print("[cyan]Similarity scores:[/cyan]") + for file in group[1:]: + file_fingerprint = get_audio_fingerprint(file) + similarity = compare_audio_similarity( + base_fingerprint, file_fingerprint + ) + console.print( + f" {file.name}: [yellow]{similarity:.1f}%[/yellow] similar" + ) + + # Sort files by creation time + files_with_time = [] + for file_path in group: + try: + stat = file_path.stat() + files_with_time.append((file_path, stat.st_ctime)) + except FileNotFoundError: + console.print(f"[yellow]Warning: File not found: {file_path}[/yellow]") + continue + + if not files_with_time: + console.print("[red]No valid files found in group[/red]") + return + + files_with_time.sort(key=lambda x: x[1]) + + # Keep the oldest file + original_file = files_with_time[0][0] + console.print( + f"Keeping oldest copy: [green]{original_file}[/green] " + f"(created: {time.ctime(files_with_time[0][1])})" + ) + + # Process newer copies + for file_path, ctime in files_with_time[1:]: + console.print( + f"Processing duplicate: [yellow]{file_path}[/yellow] " + f"(created: {time.ctime(ctime)})" + ) + + if not args.dry_run: + try: + if args.delete_duplicates: + console.print(f"[red]Deleting: {file_path}[/red]") + file_path.unlink() + else: + # Create backup path maintaining directory structure + rel_path = file_path.relative_to(file_path.parent.parent) + backup_path = Path(args.backup_dir) / rel_path + + # Ensure backup directory exists + backup_path.parent.mkdir(parents=True, exist_ok=True) + + # Move the file + console.print(f"Moving to: [blue]{backup_path}[/blue]") + shutil.move(str(file_path), str(backup_path)) + + except Exception as e: + console.print(f"[red]Error processing file {file_path}: {e}[/red]") + + except Exception as e: + console.print(f"[red]Error processing file group: {e}[/red]") + raise def main(): From 74c8d5fd767806c2ffcb09be320c2105b18db487 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:41:32 +0000 Subject: [PATCH 52/66] Nov 17, 2024, 11:41 PM --- sample-shrinker-python/sample-shrinker.py | 73 +++++++++++++++++------ 1 file changed, 54 insertions(+), 19 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index c5559ad..135be02 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -995,7 +995,7 @@ def get_interactive_config(): if "Process files in parallel" in duplicate_options: args.jobs = questionary.select( "How many parallel jobs?", - choices=["2", "4", "8", "16", "24", "32"], + choices=["2", "4", "8", "16", "24", "32", "48", "64"], default="4", ).ask() args.jobs = int(args.jobs) @@ -1129,7 +1129,7 @@ def process_duplicates(args): ) as progress: # Phase 1: Directory scan scan_task = progress.add_task( - "[cyan]Scanning for duplicate directories...", total=None + "[magenta]Scanning for duplicate directories...[/magenta]", total=None ) dir_duplicates = find_duplicate_directories(args.files) progress.update(scan_task, completed=True) @@ -1176,7 +1176,7 @@ def process_duplicates(args): # Phase 2: File scan file_task = progress.add_task( - "[cyan]Scanning for duplicate files...", total=None + "[magenta]Scanning for duplicate files...[/magenta]", total=None ) file_duplicates, fuzzy_groups = find_duplicate_files(args.files, args) progress.update(file_task, completed=True) @@ -1233,10 +1233,17 @@ def process_directory_group(dir_name, file_count, total_size, paths, args, progr valid_paths = [] for path in paths: try: + if not path.exists(): + console.print( + f"[yellow]Warning: Directory not found: {path}[/yellow]" + ) + continue stat = path.stat() valid_paths.append((path, stat.st_ctime)) - except FileNotFoundError: - console.print(f"[yellow]Warning: Directory not found: {path}[/yellow]") + except (FileNotFoundError, OSError) as e: + console.print( + f"[yellow]Warning: Cannot access directory {path}: {e}[/yellow]" + ) continue if not valid_paths: @@ -1254,22 +1261,50 @@ def process_directory_group(dir_name, file_count, total_size, paths, args, progr # Process newer copies for dir_path, ctime in valid_paths[1:]: - console.print( - f"Moving duplicate: [yellow]{dir_path}[/yellow] " - f"(created: {time.ctime(ctime)})" - ) - if not args.dry_run: - try: - # Create backup path - rel_path = dir_path.relative_to(dir_path.parent.parent) - backup_path = Path(args.backup_dir) / rel_path + try: + if not dir_path.exists(): + console.print( + f"[yellow]Warning: Directory disappeared: {dir_path}[/yellow]" + ) + continue - # Ensure backup directory exists - backup_path.parent.mkdir(parents=True, exist_ok=True) + console.print( + f"Moving duplicate: [yellow]{dir_path}[/yellow] " + f"(created: {time.ctime(ctime)})" + ) - shutil.move(str(dir_path), str(backup_path)) - except Exception as e: - console.print(f"[red]Error moving directory {dir_path}: {e}[/red]") + if not args.dry_run: + try: + # Create backup path + rel_path = dir_path.relative_to(dir_path.parent.parent) + backup_path = Path(args.backup_dir) / rel_path + + # Ensure backup directory exists + backup_path.parent.mkdir(parents=True, exist_ok=True) + + if backup_path.exists(): + console.print( + f"[yellow]Warning: Backup path already exists: {backup_path}[/yellow]" + ) + # Create a unique name by appending a number + counter = 1 + while backup_path.exists(): + new_name = f"{backup_path.name}_{counter}" + backup_path = backup_path.parent / new_name + counter += 1 + console.print( + f"[blue]Using alternate path: {backup_path}[/blue]" + ) + + shutil.move(str(dir_path), str(backup_path)) + except Exception as e: + console.print( + f"[red]Error moving directory {dir_path}: {e}[/red]" + ) + + except Exception as e: + console.print(f"[red]Error processing directory {dir_path}: {e}[/red]") + continue except Exception as e: console.print(f"[red]Error processing directory group {dir_name}: {e}[/red]") From 47b08d2ed446191978e19813aa1ebfeb21635d98 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:43:59 +0000 Subject: [PATCH 53/66] Nov 17, 2024, 11:43 PM --- sample-shrinker-python/sample-shrinker.py | 95 +++++++---------------- 1 file changed, 29 insertions(+), 66 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 135be02..d2964a0 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -970,6 +970,7 @@ def get_interactive_config(): args.pre_normalize = False args.list = False args.jobs = 1 + args.fuzzy_threshold = 90 # Add default fuzzy threshold if action == "Remove duplicate directories": # For duplicate removal, get configuration options @@ -1026,6 +1027,20 @@ def get_interactive_config(): args.delete_duplicates = "Delete" in backup_choice args.dry_run = "Preview" in backup_choice + if args.use_fuzzy: + # Get fuzzy matching configuration + threshold_choice = questionary.select( + "Select fuzzy matching threshold (higher = more strict):", + choices=[ + "95 - Nearly identical", + "90 - Very similar", + "85 - Similar", + "80 - Somewhat similar", + ], + default="90 - Very similar", + ).ask() + args.fuzzy_threshold = int(threshold_choice.split()[0]) + return "duplicates", args # For sample shrinking, get all the conversion options @@ -1127,9 +1142,11 @@ def process_duplicates(args): TaskProgressColumn(), console=console, ) as progress: - # Phase 1: Directory scan + # Phase 1: Directory scan - Compare directory contents + console.print("\n[cyan]Phase 1: Directory Structure Analysis[/cyan]") scan_task = progress.add_task( - "[magenta]Scanning for duplicate directories...[/magenta]", total=None + "[magenta]Scanning for duplicate directory structures...[/magenta]", + total=None, ) dir_duplicates = find_duplicate_directories(args.files) progress.update(scan_task, completed=True) @@ -1138,45 +1155,17 @@ def process_duplicates(args): count = sum(len(v) - 1 for v in dir_duplicates.values()) console.print( Panel( - f"Found [cyan]{count}[/cyan] duplicate directories", - title="Directory Scan Complete", + f"Found [cyan]{count}[/cyan] directories with identical contents", + title="Directory Structure Analysis Complete", ) ) + # ... rest of directory processing ... - if args.dry_run: - console.print("[yellow]DRY RUN - No directories will be moved[/yellow]") - - # Process directories with progress bar - dir_task = progress.add_task( - "[green]Processing directories...", total=len(dir_duplicates) - ) - - with ThreadPoolExecutor(max_workers=args.jobs) as executor: - futures = [] - for (dir_name, file_count, total_size), paths in dir_duplicates.items(): - future = executor.submit( - process_directory_group, - dir_name, - file_count, - total_size, - paths, - args, - progress, - ) - futures.append(future) - - for future in as_completed(futures): - try: - future.result() - progress.advance(dir_task) - except Exception as e: - console.print(f"[red]Error processing directory: {e}[/red]") - else: - console.print("[blue]No duplicate directories found.[/blue]") - - # Phase 2: File scan + # Phase 2: File scan - Compare individual files + console.print("\n[cyan]Phase 2: Individual File Analysis[/cyan]") file_task = progress.add_task( - "[magenta]Scanning for duplicate files...[/magenta]", total=None + "[magenta]Scanning for duplicate files across all directories...[/magenta]", + total=None, ) file_duplicates, fuzzy_groups = find_duplicate_files(args.files, args) progress.update(file_task, completed=True) @@ -1187,38 +1176,12 @@ def process_duplicates(args): Panel( f"Found [cyan]{total_duplicates}[/cyan] duplicate files\n" f"Including [cyan]{len(fuzzy_groups)}[/cyan] groups of similar files", - title="File Scan Complete", + title="File Analysis Complete", ) ) + # ... rest of file processing ... - if args.dry_run: - console.print("[yellow]DRY RUN - No files will be moved[/yellow]") - - # Process files with progress bar - file_process_task = progress.add_task( - "[green]Processing files...", total=len(file_duplicates) - ) - - with ThreadPoolExecutor(max_workers=args.jobs) as executor: - futures = [] - for group in file_duplicates: - future = executor.submit( - process_file_group, - group, - fuzzy_groups, - args, - progress, - ) - futures.append(future) - - for future in as_completed(futures): - try: - future.result() - progress.advance(file_process_task) - except Exception as e: - console.print(f"[red]Error processing file group: {e}[/red]") - - console.print("[green]Duplicate removal complete![/green]") + console.print("[green]Duplicate analysis and removal complete![/green]") def process_directory_group(dir_name, file_count, total_size, paths, args, progress): From d9f46342dbf5aa5c39ef21b805207e6a2398102a Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:46:11 +0000 Subject: [PATCH 54/66] Nov 17, 2024, 11:46 PM --- sample-shrinker-python/sample-shrinker.py | 102 ++++++++++++++++++---- 1 file changed, 87 insertions(+), 15 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index d2964a0..9ae9c00 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -692,8 +692,14 @@ def get_audio_fingerprint(file_path): samples = samples / np.max(np.abs(samples)) # Get a signature using peaks in frequency domain + # Adjust nperseg and noverlap based on sample length + nperseg = min(1024, len(samples)) + if nperseg % 2 != 0: # Make sure nperseg is even + nperseg -= 1 + noverlap = nperseg // 2 # Set noverlap to half of nperseg + freqs, times, spectrogram = scipy.signal.spectrogram( - samples, audio.frame_rate, nperseg=1024, noverlap=512 + samples, audio.frame_rate, nperseg=nperseg, noverlap=noverlap ) # Get the strongest frequencies @@ -703,7 +709,9 @@ def get_audio_fingerprint(file_path): return peaks except Exception as e: - print(f"Error generating audio fingerprint for {file_path}: {e}") + console.print( + f"[yellow]Error generating audio fingerprint for {file_path}: {e}[/yellow]" + ) return None @@ -1135,6 +1143,8 @@ def get_interactive_config(): def process_duplicates(args): """Process both directory and file level duplicates with visual feedback.""" + # Phase 1: Directory scan - Compare directory contents + console.print("\n[cyan]Phase 1: Directory Structure Analysis[/cyan]") with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), @@ -1142,8 +1152,6 @@ def process_duplicates(args): TaskProgressColumn(), console=console, ) as progress: - # Phase 1: Directory scan - Compare directory contents - console.print("\n[cyan]Phase 1: Directory Structure Analysis[/cyan]") scan_task = progress.add_task( "[magenta]Scanning for duplicate directory structures...[/magenta]", total=None, @@ -1151,18 +1159,59 @@ def process_duplicates(args): dir_duplicates = find_duplicate_directories(args.files) progress.update(scan_task, completed=True) - if dir_duplicates: - count = sum(len(v) - 1 for v in dir_duplicates.values()) - console.print( - Panel( - f"Found [cyan]{count}[/cyan] directories with identical contents", - title="Directory Structure Analysis Complete", - ) + if dir_duplicates: + count = sum(len(v) - 1 for v in dir_duplicates.values()) + console.print( + Panel( + f"Found [cyan]{count}[/cyan] directories with identical contents", + title="Directory Structure Analysis Complete", ) - # ... rest of directory processing ... + ) + if not args.dry_run: + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + console=console, + ) as progress: + dir_task = progress.add_task( + "[green]Processing directories...", total=len(dir_duplicates) + ) + with ThreadPoolExecutor(max_workers=args.jobs) as executor: + futures = [] + for ( + dir_name, + file_count, + total_size, + ), paths in dir_duplicates.items(): + future = executor.submit( + process_directory_group, + dir_name, + file_count, + total_size, + paths, + args, + progress, + ) + futures.append(future) + + for future in as_completed(futures): + try: + future.result() + progress.advance(dir_task) + except Exception as e: + console.print(f"[red]Error processing directory: {e}[/red]") - # Phase 2: File scan - Compare individual files - console.print("\n[cyan]Phase 2: Individual File Analysis[/cyan]") + # Phase 2: File scan - Compare individual files + console.print("\n[cyan]Phase 2: Individual File Analysis[/cyan]") + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + console=console, + ) as progress: file_task = progress.add_task( "[magenta]Scanning for duplicate files across all directories...[/magenta]", total=None, @@ -1179,7 +1228,30 @@ def process_duplicates(args): title="File Analysis Complete", ) ) - # ... rest of file processing ... + if not args.dry_run: + file_process_task = progress.add_task( + "[green]Processing files...", total=len(file_duplicates) + ) + with ThreadPoolExecutor(max_workers=args.jobs) as executor: + futures = [] + for group in file_duplicates: + future = executor.submit( + process_file_group, + group, + fuzzy_groups, + args, + progress, + ) + futures.append(future) + + for future in as_completed(futures): + try: + future.result() + progress.advance(file_process_task) + except Exception as e: + console.print( + f"[red]Error processing file group: {e}[/red]" + ) console.print("[green]Duplicate analysis and removal complete![/green]") From c752c64b879d5cbffab4c25e8456e0ea9dee54ba Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:47:46 +0000 Subject: [PATCH 55/66] Nov 17, 2024, 11:47 PM --- sample-shrinker-python/sample-shrinker.py | 100 +++++++++++++++------- 1 file changed, 67 insertions(+), 33 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 9ae9c00..55e90c7 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -732,10 +732,10 @@ def compare_audio_similarity(file1_fingerprint, file2_fingerprint): return similarity if not np.isnan(similarity) else 0 -def find_duplicate_files(paths, args): +def find_duplicate_files(paths, args, progress, task_id): """Find duplicate files using a multi-stage approach with audio fingerprinting.""" - print("Scanning for duplicate files...") size_groups = defaultdict(list) + scanned = 0 # First pass: group by size for path in paths: @@ -743,8 +743,12 @@ def find_duplicate_files(paths, args): if path.is_dir(): for file_path in path.rglob("*"): if file_path.is_file() and is_audio_file(str(file_path)): + # Update progress + scanned += 1 + progress.update(task_id, completed=scanned) + if args.verbose: - print(f"Scanning: {file_path}") + console.print(f"Scanning: {file_path}") size = file_path.stat().st_size size_groups[size].append(file_path) @@ -864,15 +868,20 @@ def process_duplicate_files(duplicates, fuzzy_groups, args): print(f"Error moving file {file_path}: {e}") -def find_duplicate_directories(paths): +def find_duplicate_directories(paths, progress, task_id): """Find directories with matching names and file counts.""" dir_map = defaultdict(list) + scanned = 0 for path in paths: path = Path(path) if path.is_dir(): for dir_path in path.rglob("*"): if dir_path.is_dir(): + # Update progress + scanned += 1 + progress.update(task_id, completed=scanned) + # Get directory name, file count, and total size dir_name = dir_path.name.lower() # Case-insensitive comparison files = list(dir_path.glob("*")) @@ -1150,14 +1159,21 @@ def process_duplicates(args): TextColumn("[progress.description]{task.description}"), BarColumn(), TaskProgressColumn(), + TextColumn("{task.completed}/{task.total} directories"), console=console, ) as progress: + # First count total directories for progress + total_dirs = sum( + 1 for path in args.files for _ in path.rglob("*") if path.is_dir() + ) scan_task = progress.add_task( "[magenta]Scanning for duplicate directory structures...[/magenta]", - total=None, + total=total_dirs, ) - dir_duplicates = find_duplicate_directories(args.files) - progress.update(scan_task, completed=True) + + # Modify find_duplicate_directories to update progress + dir_duplicates = find_duplicate_directories(args.files, progress, scan_task) + progress.update(scan_task, completed=total_dirs) if dir_duplicates: count = sum(len(v) - 1 for v in dir_duplicates.values()) @@ -1173,10 +1189,11 @@ def process_duplicates(args): TextColumn("[progress.description]{task.description}"), BarColumn(), TaskProgressColumn(), + TextColumn("{task.completed}/{task.total} duplicates"), console=console, ) as progress: dir_task = progress.add_task( - "[green]Processing directories...", total=len(dir_duplicates) + "[green]Processing directories...", total=count ) with ThreadPoolExecutor(max_workers=args.jobs) as executor: futures = [] @@ -1210,14 +1227,23 @@ def process_duplicates(args): TextColumn("[progress.description]{task.description}"), BarColumn(), TaskProgressColumn(), + TextColumn("{task.completed}/{task.total} files"), console=console, ) as progress: + # First count total files for progress + total_files = sum( + 1 for path in args.files for _ in path.rglob("*") if path.is_file() + ) file_task = progress.add_task( "[magenta]Scanning for duplicate files across all directories...[/magenta]", - total=None, + total=total_files, + ) + + # Modify find_duplicate_files to update progress + file_duplicates, fuzzy_groups = find_duplicate_files( + args.files, args, progress, file_task ) - file_duplicates, fuzzy_groups = find_duplicate_files(args.files, args) - progress.update(file_task, completed=True) + progress.update(file_task, completed=total_files) if file_duplicates: total_duplicates = sum(len(group) - 1 for group in file_duplicates) @@ -1229,29 +1255,37 @@ def process_duplicates(args): ) ) if not args.dry_run: - file_process_task = progress.add_task( - "[green]Processing files...", total=len(file_duplicates) - ) - with ThreadPoolExecutor(max_workers=args.jobs) as executor: - futures = [] - for group in file_duplicates: - future = executor.submit( - process_file_group, - group, - fuzzy_groups, - args, - progress, - ) - futures.append(future) - - for future in as_completed(futures): - try: - future.result() - progress.advance(file_process_task) - except Exception as e: - console.print( - f"[red]Error processing file group: {e}[/red]" + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + TextColumn("{task.completed}/{task.total} duplicates"), + console=console, + ) as progress: + file_process_task = progress.add_task( + "[green]Processing files...", total=total_duplicates + ) + with ThreadPoolExecutor(max_workers=args.jobs) as executor: + futures = [] + for group in file_duplicates: + future = executor.submit( + process_file_group, + group, + fuzzy_groups, + args, + progress, ) + futures.append(future) + + for future in as_completed(futures): + try: + future.result() + progress.advance(file_process_task) + except Exception as e: + console.print( + f"[red]Error processing file group: {e}[/red]" + ) console.print("[green]Duplicate analysis and removal complete![/green]") From 8ea5496e23e1fe7ca0e49f03a383d4fb691a2ba0 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:48:17 +0000 Subject: [PATCH 56/66] Nov 17, 2024, 11:48 PM --- sample-shrinker-python/sample-shrinker.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 55e90c7..5daed49 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -873,8 +873,8 @@ def find_duplicate_directories(paths, progress, task_id): dir_map = defaultdict(list) scanned = 0 - for path in paths: - path = Path(path) + for path_str in paths: + path = Path(path_str) # Convert string to Path if path.is_dir(): for dir_path in path.rglob("*"): if dir_path.is_dir(): @@ -1164,7 +1164,10 @@ def process_duplicates(args): ) as progress: # First count total directories for progress total_dirs = sum( - 1 for path in args.files for _ in path.rglob("*") if path.is_dir() + 1 + for path_str in args.files + for _ in Path(path_str).rglob("*") + if Path(path_str).is_dir() ) scan_task = progress.add_task( "[magenta]Scanning for duplicate directory structures...[/magenta]", From 9e4e14db937ca0626a17edeca707f6bdda5d4133 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:50:12 +0000 Subject: [PATCH 57/66] Nov 17, 2024, 11:50 PM --- sample-shrinker-python/sample-shrinker.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 5daed49..24494cc 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -1235,7 +1235,10 @@ def process_duplicates(args): ) as progress: # First count total files for progress total_files = sum( - 1 for path in args.files for _ in path.rglob("*") if path.is_file() + 1 + for path_str in args.files + for _ in Path(path_str).rglob("*") + if Path(_).is_file() # Check if the found item is a file ) file_task = progress.add_task( "[magenta]Scanning for duplicate files across all directories...[/magenta]", From 781723bc0ab77331cbc8e1c1155a93d094f03b37 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 07:57:18 +0000 Subject: [PATCH 58/66] Nov 17, 2024, 11:57 PM --- sample-shrinker-python/sample-shrinker.py | 187 ++++++++++++++++------ 1 file changed, 141 insertions(+), 46 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 24494cc..2f1f42e 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -1225,6 +1225,8 @@ def process_duplicates(args): # Phase 2: File scan - Compare individual files console.print("\n[cyan]Phase 2: Individual File Analysis[/cyan]") + + # Step 1: Initial file scanning with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), @@ -1233,65 +1235,158 @@ def process_duplicates(args): TextColumn("{task.completed}/{task.total} files"), console=console, ) as progress: - # First count total files for progress total_files = sum( 1 for path_str in args.files for _ in Path(path_str).rglob("*") - if Path(_).is_file() # Check if the found item is a file + if Path(_).is_file() ) - file_task = progress.add_task( - "[magenta]Scanning for duplicate files across all directories...[/magenta]", + scan_task = progress.add_task( + "[magenta]Scanning filesystem for files...[/magenta]", total=total_files, ) - # Modify find_duplicate_files to update progress - file_duplicates, fuzzy_groups = find_duplicate_files( - args.files, args, progress, file_task + # First pass: collect files and group by size + size_groups = defaultdict(list) + scanned = 0 + for path_str in args.files: + path = Path(path_str) + if path.is_dir(): + for file_path in path.rglob("*"): + if file_path.is_file() and is_audio_file(str(file_path)): + scanned += 1 + progress.update(scan_task, completed=scanned) + size_groups[file_path.stat().st_size].append(file_path) + + # Step 2: Similarity analysis + potential_duplicates = { + size: files for size, files in size_groups.items() if len(files) > 1 + } + total_to_check = sum(len(files) for files in potential_duplicates.values()) + + file_duplicates = [] + fuzzy_groups = [] + + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + TextColumn("{task.completed}/{task.total} files"), + console=console, + ) as progress: + check_task = progress.add_task( + "[magenta]Analyzing files for duplicates...[/magenta]", + total=total_to_check, ) - progress.update(file_task, completed=total_files) - if file_duplicates: - total_duplicates = sum(len(group) - 1 for group in file_duplicates) - console.print( - Panel( - f"Found [cyan]{total_duplicates}[/cyan] duplicate files\n" - f"Including [cyan]{len(fuzzy_groups)}[/cyan] groups of similar files", - title="File Analysis Complete", + checked = 0 + for size, file_paths in potential_duplicates.items(): + if args.verbose: + console.print( + f"\nChecking {len(file_paths)} files of size {size} bytes..." ) + + # Group files by hash first + hash_groups = defaultdict(list) + for file_path in file_paths: + try: + file_hash = get_file_hash(file_path, fuzzy=False) + if args.ignore_names: + hash_groups[file_hash].append(file_path) + else: + name_key = file_path.stem.lower() + hash_groups[(name_key, file_hash)].append(file_path) + checked += 1 + progress.update(check_task, completed=checked) + except Exception as e: + console.print(f"[red]Error hashing file {file_path}: {e}[/red]") + + # Add exact matches to results + for group in hash_groups.values(): + if len(group) > 1: + file_duplicates.append(group) + + # Check for similar audio content if enabled + if args.use_fuzzy: + # Get unmatched files (not in any exact match group) + unmatched = [ + f for f in file_paths if not any(f in g for g in file_duplicates) + ] + + if len(unmatched) > 1: + fingerprints = {} + for file_path in unmatched: + fingerprint = get_audio_fingerprint(file_path) + if fingerprint is not None: + fingerprints[file_path] = fingerprint + + # Compare fingerprints + processed = set() + for file1 in fingerprints: + if file1 in processed: + continue + + similar_files = [file1] + for file2 in fingerprints: + if file2 != file1 and file2 not in processed: + similarity = compare_audio_similarity( + fingerprints[file1], fingerprints[file2] + ) + if similarity >= args.fuzzy_threshold: + similar_files.append(file2) + processed.add(file2) + + if len(similar_files) > 1: + fuzzy_groups.append(similar_files) + file_duplicates.append(similar_files) + processed.add(file1) + + # Report results and process duplicates + if file_duplicates: + total_duplicates = sum(len(group) - 1 for group in file_duplicates) + console.print( + Panel( + f"Found [cyan]{total_duplicates}[/cyan] duplicate files\n" + f"Including [cyan]{len(fuzzy_groups)}[/cyan] groups of similar files", + title="File Analysis Complete", ) - if not args.dry_run: - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TaskProgressColumn(), - TextColumn("{task.completed}/{task.total} duplicates"), - console=console, - ) as progress: - file_process_task = progress.add_task( - "[green]Processing files...", total=total_duplicates - ) - with ThreadPoolExecutor(max_workers=args.jobs) as executor: - futures = [] - for group in file_duplicates: - future = executor.submit( - process_file_group, - group, - fuzzy_groups, - args, - progress, + ) + + # Step 3: Process duplicates if not in dry run mode + if not args.dry_run: + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + TextColumn("{task.completed}/{task.total} duplicates"), + console=console, + ) as progress: + process_task = progress.add_task( + "[green]Processing duplicate files...", total=total_duplicates + ) + + with ThreadPoolExecutor(max_workers=args.jobs) as executor: + futures = [] + for group in file_duplicates: + future = executor.submit( + process_file_group, + group, + fuzzy_groups, + args, + progress, + ) + futures.append(future) + + for future in as_completed(futures): + try: + future.result() + progress.advance(process_task) + except Exception as e: + console.print( + f"[red]Error processing file group: {e}[/red]" ) - futures.append(future) - - for future in as_completed(futures): - try: - future.result() - progress.advance(file_process_task) - except Exception as e: - console.print( - f"[red]Error processing file group: {e}[/red]" - ) console.print("[green]Duplicate analysis and removal complete![/green]") From a2966c78a1f77fca29dbc8b5f9281761a418f937 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 08:15:01 +0000 Subject: [PATCH 59/66] Nov 18, 2024, 12:15 AM --- sample-shrinker-python/sample-shrinker.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 2f1f42e..ee04b5f 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -1403,13 +1403,21 @@ def process_directory_group(dir_name, file_count, total_size, paths, args, progr valid_paths = [] for path in paths: try: - if not path.exists(): + # Wait briefly for cloud storage to download if needed + retries = 3 + while retries > 0: + if path.exists(): + stat = path.stat() + valid_paths.append((path, stat.st_ctime)) + break + retries -= 1 + if retries > 0: + time.sleep(1) # Wait a second before retry + + if retries == 0: console.print( - f"[yellow]Warning: Directory not found: {path}[/yellow]" + f"[yellow]Warning: Directory not available after retries: {path}[/yellow]" ) - continue - stat = path.stat() - valid_paths.append((path, stat.st_ctime)) except (FileNotFoundError, OSError) as e: console.print( f"[yellow]Warning: Cannot access directory {path}: {e}[/yellow]" @@ -1432,9 +1440,10 @@ def process_directory_group(dir_name, file_count, total_size, paths, args, progr # Process newer copies for dir_path, ctime in valid_paths[1:]: try: + # Check again before processing as cloud storage might have changed if not dir_path.exists(): console.print( - f"[yellow]Warning: Directory disappeared: {dir_path}[/yellow]" + f"[yellow]Skipping unavailable directory: {dir_path}[/yellow]" ) continue From 7703ac3318468e2a40ad17b78be4372052ef3a3e Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 08:16:59 +0000 Subject: [PATCH 60/66] Nov 18, 2024, 12:16 AM --- sample-shrinker-python/sample-shrinker.py | 32 ++++++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index ee04b5f..5d59135 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -1440,7 +1440,7 @@ def process_directory_group(dir_name, file_count, total_size, paths, args, progr # Process newer copies for dir_path, ctime in valid_paths[1:]: try: - # Check again before processing as cloud storage might have changed + # First verify source exists if not dir_path.exists(): console.print( f"[yellow]Skipping unavailable directory: {dir_path}[/yellow]" @@ -1458,9 +1458,17 @@ def process_directory_group(dir_name, file_count, total_size, paths, args, progr rel_path = dir_path.relative_to(dir_path.parent.parent) backup_path = Path(args.backup_dir) / rel_path - # Ensure backup directory exists + # IMPORTANT: Create ALL parent directories first backup_path.parent.mkdir(parents=True, exist_ok=True) + # Verify the backup path is valid before attempting move + if not backup_path.parent.exists(): + console.print( + f"[red]Error: Backup directory could not be created: {backup_path.parent}[/red]" + ) + continue + + # Check if destination already exists if backup_path.exists(): console.print( f"[yellow]Warning: Backup path already exists: {backup_path}[/yellow]" @@ -1475,10 +1483,26 @@ def process_directory_group(dir_name, file_count, total_size, paths, args, progr f"[blue]Using alternate path: {backup_path}[/blue]" ) - shutil.move(str(dir_path), str(backup_path)) + # Do the move + try: + shutil.move(str(dir_path), str(backup_path)) + except Exception as move_error: + console.print( + f"[red]Error moving {dir_path} to {backup_path}: {move_error}[/red]" + ) + # Try to provide more context about the error + if not dir_path.exists(): + console.print( + "[red]Source directory no longer exists[/red]" + ) + if not backup_path.parent.exists(): + console.print( + "[red]Destination directory does not exist[/red]" + ) + except Exception as e: console.print( - f"[red]Error moving directory {dir_path}: {e}[/red]" + f"[red]Error setting up backup path for {dir_path}: {e}[/red]" ) except Exception as e: From 7feb4a25598f0a24c908dc0b49e9493a72d529e6 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 08:18:59 +0000 Subject: [PATCH 61/66] Nov 18, 2024, 12:18 AM --- sample-shrinker-python/sample-shrinker.py | 222 +++++++++++++++------- 1 file changed, 149 insertions(+), 73 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 5d59135..12fc519 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -930,13 +930,13 @@ def process_duplicate_directories(duplicates, args): def get_interactive_config(): """Get configuration through interactive questionary prompts.""" - # First, get the action type action = questionary.select( "What would you like to do?", choices=[ "Shrink samples (convert audio files)", "Remove duplicate directories", + "Restore from backup", "Exit", ], ).ask() @@ -971,46 +971,38 @@ def get_interactive_config(): args = argparse.Namespace() args.files = paths - # Set ALL default values (matching parse_args defaults) - args.backup_dir = "_backup" - args.dry_run = False - args.verbose = False - args.ext = "wav,mp3" - args.bitdepth = 16 - args.min_bitdepth = None - args.channels = 2 - args.samplerate = 44100 - args.min_samplerate = None - args.auto_mono = False - args.auto_mono_threshold = -95.5 - args.skip_spectrograms = False - args.pre_normalize = False - args.list = False - args.jobs = 1 - args.fuzzy_threshold = 90 # Add default fuzzy threshold - - if action == "Remove duplicate directories": - # For duplicate removal, get configuration options - duplicate_options = questionary.checkbox( - "Select duplicate removal options:", + if action == "Restore from backup": + # Get backup directory + args.backup_dir = questionary.path( + "Select backup directory to restore from:", + only_directories=True, + default="_backup", + ).ask() + + # Get file extensions to restore + args.restore_ext = questionary.text( + "Enter file extensions to restore (comma-separated, e.g., wav,mp3):", + default="wav,mp3", + ).ask() + + # Get restore options + restore_options = questionary.checkbox( + "Select restore options:", choices=[ - "Use fuzzy matching for similar files", - "Ignore filenames (match by content only)", "Preview changes (dry run)", "Show detailed progress", "Process files in parallel", + "Skip existing files", + "Overwrite existing files", ], ).ask() - args.use_fuzzy = "Use fuzzy matching for similar files" in duplicate_options - args.ignore_names = ( - "Ignore filenames (match by content only)" in duplicate_options - ) - args.dry_run = "Preview changes (dry run)" in duplicate_options - args.verbose = "Show detailed progress" in duplicate_options + args.dry_run = "Preview changes (dry run)" in restore_options + args.verbose = "Show detailed progress" in restore_options + args.skip_existing = "Skip existing files" in restore_options + args.overwrite = "Overwrite existing files" in restore_options - # Add parallel processing configuration - if "Process files in parallel" in duplicate_options: + if "Process files in parallel" in restore_options: args.jobs = questionary.select( "How many parallel jobs?", choices=["2", "4", "8", "16", "24", "32", "48", "64"], @@ -1020,45 +1012,7 @@ def get_interactive_config(): else: args.jobs = 1 - # Get backup options (modified text prompt) - backup_dir = questionary.text( - "Backup directory path (where duplicates will be moved):", - default="_backup", - ).ask() - - if backup_dir.strip(): # If not empty - args.backup_dir = backup_dir.strip() - else: - args.backup_dir = "_backup" # Fallback to default - - backup_choice = questionary.select( - "How should duplicates be handled?", - choices=[ - f"Move to {args.backup_dir} (safe)", - "Delete immediately (dangerous)", - "Preview only (no changes)", - ], - default=f"Move to {args.backup_dir} (safe)", - ).ask() - - args.delete_duplicates = "Delete" in backup_choice - args.dry_run = "Preview" in backup_choice - - if args.use_fuzzy: - # Get fuzzy matching configuration - threshold_choice = questionary.select( - "Select fuzzy matching threshold (higher = more strict):", - choices=[ - "95 - Nearly identical", - "90 - Very similar", - "85 - Similar", - "80 - Somewhat similar", - ], - default="90 - Very similar", - ).ask() - args.fuzzy_threshold = int(threshold_choice.split()[0]) - - return "duplicates", args + return "restore", args # For sample shrinking, get all the conversion options args.bitdepth = questionary.select( @@ -1591,6 +1545,126 @@ def process_file_group(group, fuzzy_groups, args, progress): raise +def restore_from_backup(args): + """Restore files from backup to their original locations.""" + console.print("\n[cyan]Starting Backup Restore Process[/cyan]") + + backup_path = Path(args.backup_dir) + if not backup_path.exists(): + console.print(f"[red]Error: Backup directory {backup_path} not found[/red]") + return + + # Get list of extensions to restore + extensions = [ext.strip().lower() for ext in args.restore_ext.split(",")] + + # Step 1: Scan backup directory + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + TextColumn("{task.completed}/{task.total} files"), + console=console, + ) as progress: + scan_task = progress.add_task( + "[magenta]Scanning backup directory...[/magenta]", total=None + ) + + # Collect all files to restore + restore_files = [] + for ext in extensions: + for file_path in backup_path.rglob(f"*.{ext}"): + try: + # Calculate original path + rel_path = file_path.relative_to(backup_path) + target_path = Path(args.files[0]) / rel_path + restore_files.append((file_path, target_path)) + except Exception as e: + console.print(f"[yellow]Error processing {file_path}: {e}[/yellow]") + + progress.update( + scan_task, total=len(restore_files), completed=len(restore_files) + ) + + # Report findings + console.print( + Panel( + f"Found [cyan]{len(restore_files)}[/cyan] files to restore", + title="Backup Scan Complete", + ) + ) + + if not restore_files: + return + + # Step 2: Restore files + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + TextColumn("{task.completed}/{task.total} files"), + console=console, + ) as progress: + restore_task = progress.add_task( + "[green]Restoring files...[/green]", total=len(restore_files) + ) + + with ThreadPoolExecutor(max_workers=args.jobs) as executor: + futures = [] + for backup_file, target_path in restore_files: + future = executor.submit( + restore_single_file, + backup_file, + target_path, + args, + ) + futures.append(future) + + for future in as_completed(futures): + try: + future.result() + progress.advance(restore_task) + except Exception as e: + console.print(f"[red]Error during restore: {e}[/red]") + + console.print("[green]Restore process complete![/green]") + + +def restore_single_file(backup_file, target_path, args): + """Restore a single file from backup to its original location.""" + try: + if args.verbose: + console.print(f"Processing: {backup_file} -> {target_path}") + + if target_path.exists(): + if args.skip_existing: + if args.verbose: + console.print( + f"[yellow]Skipping existing file: {target_path}[/yellow]" + ) + return + elif not args.overwrite: + console.print( + f"[yellow]Target exists (skipping): {target_path}[/yellow]" + ) + return + + if not args.dry_run: + # Create target directory if it doesn't exist + target_path.parent.mkdir(parents=True, exist_ok=True) + + # Copy the file with metadata preserved + shutil.copy2(backup_file, target_path) + + if args.verbose: + console.print(f"[green]Restored: {target_path}[/green]") + + except Exception as e: + console.print(f"[red]Error restoring {backup_file}: {e}[/red]") + raise + + def main(): # Check for ffmpeg first if not check_ffmpeg(): @@ -1607,7 +1681,9 @@ def main(): if not args: return - if action == "duplicates": + if action == "restore": + restore_from_backup(args) + elif action == "duplicates": process_duplicates(args) else: # Shrink samples # Delete all '._' files before processing anything From 72472a8079116eb6999bb1b55126262d92dcb301 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 08:22:24 +0000 Subject: [PATCH 62/66] Nov 18, 2024, 12:22 AM --- sample-shrinker-python/sample-shrinker.py | 226 ++++++++++++++-------- 1 file changed, 143 insertions(+), 83 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 12fc519..52980d8 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -935,7 +935,7 @@ def get_interactive_config(): "What would you like to do?", choices=[ "Shrink samples (convert audio files)", - "Remove duplicate directories", + "Remove duplicate files and directories", "Restore from backup", "Exit", ], @@ -971,7 +971,66 @@ def get_interactive_config(): args = argparse.Namespace() args.files = paths - if action == "Restore from backup": + # Set default values that all modes need + args.dry_run = False + args.verbose = False + args.jobs = 1 + + if action == "Remove duplicate files and directories": + # For duplicate removal, get configuration options + duplicate_options = questionary.checkbox( + "Select duplicate removal options:", + choices=[ + "Use fuzzy matching for similar files", + "Ignore filenames (match by content only)", + "Preview changes (dry run)", + "Show detailed progress", + "Process files in parallel", + ], + ).ask() + + args.use_fuzzy = "Use fuzzy matching for similar files" in duplicate_options + args.ignore_names = ( + "Ignore filenames (match by content only)" in duplicate_options + ) + args.dry_run = "Preview changes (dry run)" in duplicate_options + args.verbose = "Show detailed progress" in duplicate_options + + if "Process files in parallel" in duplicate_options: + args.jobs = questionary.select( + "How many parallel jobs?", + choices=["2", "4", "8", "16", "24", "32", "48", "64"], + default="4", + ).ask() + args.jobs = int(args.jobs) + + # Get backup options + args.backup_dir = questionary.text( + "Backup directory path (where duplicates will be moved):", + default="_backup", + ).ask() + + if args.backup_dir.strip(): # If not empty + args.backup_dir = args.backup_dir.strip() + else: + args.backup_dir = "_backup" # Fallback to default + + if args.use_fuzzy: + threshold_choice = questionary.select( + "Select fuzzy matching threshold (higher = more strict):", + choices=[ + "95 - Nearly identical", + "90 - Very similar", + "85 - Similar", + "80 - Somewhat similar", + ], + default="90 - Very similar", + ).ask() + args.fuzzy_threshold = int(threshold_choice.split()[0]) + + return "duplicates", args + + elif action == "Restore from backup": # Get backup directory args.backup_dir = questionary.path( "Select backup directory to restore from:", @@ -1009,99 +1068,100 @@ def get_interactive_config(): default="4", ).ask() args.jobs = int(args.jobs) - else: - args.jobs = 1 return "restore", args - # For sample shrinking, get all the conversion options - args.bitdepth = questionary.select( - "Select target bit depth:", choices=["8", "16", "24"], default="16" - ).ask() - args.bitdepth = int(args.bitdepth) - - args.channels = questionary.select( - "Select target channels:", - choices=["1 (mono)", "2 (stereo)"], - default="2 (stereo)", - ).ask() - args.channels = 1 if "1" in args.channels else 2 - - args.samplerate = questionary.select( - "Select target sample rate:", - choices=["22050", "44100", "48000"], - default="44100", - ).ask() - args.samplerate = int(args.samplerate) - - # Advanced options in a checkbox group - advanced_options = questionary.checkbox( - "Select additional options:", - choices=[ - "Auto-convert stereo to mono when possible", - "Pre-normalize before conversion", - "Skip generating spectrograms", - "Preview changes (dry run)", - "Process files in parallel", - "Set minimum sample rate", - "Set minimum bit depth", - "Convert in place (no backups)", - ], - ).ask() - - args.auto_mono = "Auto-convert stereo to mono when possible" in advanced_options - args.pre_normalize = "Pre-normalize before conversion" in advanced_options - args.skip_spectrograms = "Skip generating spectrograms" in advanced_options - args.dry_run = "Preview changes (dry run)" in advanced_options - convert_in_place = "Convert in place (no backups)" in advanced_options - - # Configure backup settings if not converting in place - if not convert_in_place: - args.backup_dir = questionary.text( - "Backup directory path:", - default="_backup", + elif action == "Shrink samples (convert audio files)": + # For sample shrinking, get all the conversion options + args.bitdepth = questionary.select( + "Select target bit depth:", choices=["8", "16", "24"], default="16" ).ask() - if args.backup_dir.strip(): # If not empty - args.backup_dir = args.backup_dir.strip() - # Only ask about spectrograms if they weren't explicitly skipped in advanced options - if not args.skip_spectrograms: - args.skip_spectrograms = not questionary.confirm( - "Generate spectrograms for backup comparison?", default=False - ).ask() - else: - args.backup_dir = "-" - args.skip_spectrograms = True - - if "Process files in parallel" in advanced_options: - args.jobs = questionary.select( - "How many parallel jobs? (higher values may improve speed but use more memory)", - choices=["2", "4", "8", "16", "24", "32", "48", "64"], - default="4", + args.bitdepth = int(args.bitdepth) + + args.channels = questionary.select( + "Select target channels:", + choices=["1 (mono)", "2 (stereo)"], + default="2 (stereo)", ).ask() - args.jobs = int(args.jobs) + args.channels = 1 if "1" in args.channels else 2 - if "Set minimum sample rate" in advanced_options: - args.min_samplerate = questionary.select( - "Select minimum sample rate:", + args.samplerate = questionary.select( + "Select target sample rate:", choices=["22050", "44100", "48000"], - default="22050", + default="44100", ).ask() - args.min_samplerate = int(args.min_samplerate) + args.samplerate = int(args.samplerate) - if "Set minimum bit depth" in advanced_options: - args.min_bitdepth = questionary.select( - "Select minimum bit depth:", choices=["8", "16", "24"], default="16" + # Advanced options in a checkbox group + advanced_options = questionary.checkbox( + "Select additional options:", + choices=[ + "Auto-convert stereo to mono when possible", + "Pre-normalize before conversion", + "Skip generating spectrograms", + "Preview changes (dry run)", + "Process files in parallel", + "Set minimum sample rate", + "Set minimum bit depth", + "Convert in place (no backups)", + ], ).ask() - args.min_bitdepth = int(args.min_bitdepth) - if args.auto_mono: - args.auto_mono_threshold = float( - questionary.text( - "Auto-mono threshold in dB (default: -95.5):", default="-95.5" + args.auto_mono = "Auto-convert stereo to mono when possible" in advanced_options + args.pre_normalize = "Pre-normalize before conversion" in advanced_options + args.skip_spectrograms = "Skip generating spectrograms" in advanced_options + args.dry_run = "Preview changes (dry run)" in advanced_options + convert_in_place = "Convert in place (no backups)" in advanced_options + + # Configure backup settings if not converting in place + if not convert_in_place: + args.backup_dir = questionary.text( + "Backup directory path:", + default="_backup", ).ask() - ) + if args.backup_dir.strip(): # If not empty + args.backup_dir = args.backup_dir.strip() + # Only ask about spectrograms if they weren't explicitly skipped in advanced options + if not args.skip_spectrograms: + args.skip_spectrograms = not questionary.confirm( + "Generate spectrograms for backup comparison?", default=False + ).ask() + else: + args.backup_dir = "-" + args.skip_spectrograms = True + + if "Process files in parallel" in advanced_options: + args.jobs = questionary.select( + "How many parallel jobs? (higher values may improve speed but use more memory)", + choices=["2", "4", "8", "16", "24", "32", "48", "64"], + default="4", + ).ask() + args.jobs = int(args.jobs) + + if "Set minimum sample rate" in advanced_options: + args.min_samplerate = questionary.select( + "Select minimum sample rate:", + choices=["22050", "44100", "48000"], + default="22050", + ).ask() + args.min_samplerate = int(args.min_samplerate) + + if "Set minimum bit depth" in advanced_options: + args.min_bitdepth = questionary.select( + "Select minimum bit depth:", choices=["8", "16", "24"], default="16" + ).ask() + args.min_bitdepth = int(args.min_bitdepth) + + if args.auto_mono: + args.auto_mono_threshold = float( + questionary.text( + "Auto-mono threshold in dB (default: -95.5):", default="-95.5" + ).ask() + ) + + return "shrink", args - return "shrink", args + return action.split()[0].lower(), args # 'shrink', 'duplicates', or 'restore' def process_duplicates(args): @@ -1685,7 +1745,7 @@ def main(): restore_from_backup(args) elif action == "duplicates": process_duplicates(args) - else: # Shrink samples + elif action == "shrink": # Delete all '._' files before processing anything for path in args.files: if os.path.isdir(path): From 16de3cc25007eee732408ad6ff6d5dac4f1cce8d Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 08:27:52 +0000 Subject: [PATCH 63/66] Nov 18, 2024, 12:27 AM --- sample-shrinker-python/sample-shrinker.py | 82 ++++++++++++++++++++--- 1 file changed, 73 insertions(+), 9 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 52980d8..31ce259 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -873,8 +873,38 @@ def find_duplicate_directories(paths, progress, task_id): dir_map = defaultdict(list) scanned = 0 + def get_directory_signature(dir_path): + """Generate a signature for a directory based on its contents.""" + try: + # Get all files and subdirectories recursively + all_items = list(dir_path.rglob("*")) + + # Count files and directories + files = [f for f in all_items if f.is_file()] + subdirs = [d for d in all_items if d.is_dir()] + + # Calculate total size of all files + total_size = sum(f.stat().st_size for f in files) + + # Get relative paths of all items for structure comparison + rel_paths = sorted(str(item.relative_to(dir_path)) for item in all_items) + + # Get file sizes in a deterministic order + file_sizes = sorted(f.stat().st_size for f in files) + + return { + "file_count": len(files), + "subdir_count": len(subdirs), + "total_size": total_size, + "structure": rel_paths, + "file_sizes": file_sizes, + } + except Exception as e: + console.print(f"[yellow]Error analyzing directory {dir_path}: {e}[/yellow]") + return None + for path_str in paths: - path = Path(path_str) # Convert string to Path + path = Path(path_str) if path.is_dir(): for dir_path in path.rglob("*"): if dir_path.is_dir(): @@ -882,16 +912,50 @@ def find_duplicate_directories(paths, progress, task_id): scanned += 1 progress.update(task_id, completed=scanned) - # Get directory name, file count, and total size - dir_name = dir_path.name.lower() # Case-insensitive comparison - files = list(dir_path.glob("*")) - file_count = len([f for f in files if f.is_file()]) - total_size = sum(f.stat().st_size for f in files if f.is_file()) - - dir_map[(dir_name, file_count, total_size)].append(dir_path) + # Get directory signature + signature = get_directory_signature(dir_path) + if signature: + # Create a unique key combining name and content signature + dir_name = dir_path.name.lower() # Case-insensitive comparison + key = ( + dir_name, + signature["file_count"], + signature["subdir_count"], + signature["total_size"], + tuple(signature["file_sizes"]), # Make hashable + tuple(signature["structure"]), # Make hashable + ) + dir_map[key].append(dir_path) # Return only directories that have duplicates - return {k: v for k, v in dir_map.items() if len(v) > 1} + duplicates = {k: v for k, v in dir_map.items() if len(v) > 1} + + if duplicates: + # Log detailed information about matches + for ( + name, + file_count, + subdir_count, + total_size, + sizes, + structure, + ), paths in duplicates.items(): + console.print( + f"\n[cyan]Found potential duplicates:[/cyan]\n" + f"Directory name: [yellow]{name}[/yellow]\n" + f"File count: {file_count}\n" + f"Subdirectory count: {subdir_count}\n" + f"Total size: {total_size} bytes\n" + f"Structure match: {len(structure)} items" + ) + if args.verbose: + console.print("Directory structure:") + for item in structure[:10]: # Show first 10 items + console.print(f" {item}") + if len(structure) > 10: + console.print(" ...") + + return duplicates def process_duplicate_directories(duplicates, args): From ab2eb22324181272a00c3d01c7e561f0ecf1cfd1 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 08:28:45 +0000 Subject: [PATCH 64/66] Nov 18, 2024, 12:28 AM --- sample-shrinker-python/sample-shrinker.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 31ce259..8484d56 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -868,7 +868,7 @@ def process_duplicate_files(duplicates, fuzzy_groups, args): print(f"Error moving file {file_path}: {e}") -def find_duplicate_directories(paths, progress, task_id): +def find_duplicate_directories(paths, progress, task_id, args): """Find directories with matching names and file counts.""" dir_map = defaultdict(list) scanned = 0 @@ -1252,8 +1252,10 @@ def process_duplicates(args): total=total_dirs, ) - # Modify find_duplicate_directories to update progress - dir_duplicates = find_duplicate_directories(args.files, progress, scan_task) + # Pass args to find_duplicate_directories + dir_duplicates = find_duplicate_directories( + args.files, progress, scan_task, args + ) progress.update(scan_task, completed=total_dirs) if dir_duplicates: From e8a218cc752153869d51a157a0a9e336addfded1 Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 08:30:09 +0000 Subject: [PATCH 65/66] Nov 18, 2024, 12:30 AM --- sample-shrinker-python/sample-shrinker.py | 93 ++++++++++++++++++----- 1 file changed, 73 insertions(+), 20 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 8484d56..85cc9e2 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -960,36 +960,89 @@ def get_directory_signature(dir_path): def process_duplicate_directories(duplicates, args): """Process duplicate directories, keeping the oldest copy.""" - for (dir_name, file_count, total_size), paths in duplicates.items(): - print( - f"\nFound duplicate directories named '{dir_name}' with {file_count} files ({total_size} bytes):" + for ( + dir_name, + file_count, + subdir_count, + total_size, + sizes, + structure, + ), paths in duplicates.items(): + console.print( + f"\nFound duplicate directories named '[cyan]{dir_name}[/cyan]' " + f"with {file_count} files, {subdir_count} subdirectories " + f"({total_size} bytes):" ) # Sort paths by creation time - paths_with_time = [(p, p.stat().st_ctime) for p in paths] - paths_with_time.sort(key=lambda x: x[1]) + valid_paths = [] + for path in paths: + try: + stat = path.stat() + valid_paths.append((path, stat.st_ctime)) + except FileNotFoundError: + console.print(f"[yellow]Warning: Directory not found: {path}[/yellow]") + continue + + if not valid_paths: + console.print("[red]No valid paths found in group[/red]") + return + + valid_paths.sort(key=lambda x: x[1]) # Keep the oldest directory - original_dir = paths_with_time[0][0] - print( - f"Keeping oldest copy: {original_dir} (created: {time.ctime(paths_with_time[0][1])})" + original_dir = valid_paths[0][0] + console.print( + f"Keeping oldest copy: [green]{original_dir}[/green] " + f"(created: {time.ctime(valid_paths[0][1])})" ) # Process newer copies - for dir_path, ctime in paths_with_time[1:]: - print(f"Moving duplicate: {dir_path} (created: {time.ctime(ctime)})") - if not args.dry_run: - # Create backup path - rel_path = dir_path.relative_to(dir_path.parent.parent) - backup_path = Path(args.backup_dir) / rel_path + for dir_path, ctime in valid_paths[1:]: + try: + if not dir_path.exists(): + console.print( + f"[yellow]Warning: Directory disappeared: {dir_path}[/yellow]" + ) + continue - # Ensure backup directory exists - backup_path.parent.mkdir(parents=True, exist_ok=True) + console.print( + f"Moving duplicate: [yellow]{dir_path}[/yellow] " + f"(created: {time.ctime(ctime)})" + ) - try: - shutil.move(str(dir_path), str(backup_path)) - except Exception as e: - print(f"Error moving directory {dir_path}: {e}") + if not args.dry_run: + try: + # Create backup path + rel_path = dir_path.relative_to(dir_path.parent.parent) + backup_path = Path(args.backup_dir) / rel_path + + # Ensure backup directory exists + backup_path.parent.mkdir(parents=True, exist_ok=True) + + if backup_path.exists(): + console.print( + f"[yellow]Warning: Backup path already exists: {backup_path}[/yellow]" + ) + # Create a unique name by appending a number + counter = 1 + while backup_path.exists(): + new_name = f"{backup_path.name}_{counter}" + backup_path = backup_path.parent / new_name + counter += 1 + console.print( + f"[blue]Using alternate path: {backup_path}[/blue]" + ) + + shutil.move(str(dir_path), str(backup_path)) + except Exception as e: + console.print( + f"[red]Error moving directory {dir_path}: {e}[/red]" + ) + + except Exception as e: + console.print(f"[red]Error processing directory {dir_path}: {e}[/red]") + continue def get_interactive_config(): From f9566177625a03248f56306e4e6270e866153dab Mon Sep 17 00:00:00 2001 From: Chris Farrell Date: Mon, 18 Nov 2024 08:34:36 +0000 Subject: [PATCH 66/66] Nov 18, 2024, 12:34 AM --- sample-shrinker-python/sample-shrinker.py | 26 ++--------------------- 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/sample-shrinker-python/sample-shrinker.py b/sample-shrinker-python/sample-shrinker.py index 85cc9e2..d39f7a8 100644 --- a/sample-shrinker-python/sample-shrinker.py +++ b/sample-shrinker-python/sample-shrinker.py @@ -1331,30 +1331,8 @@ def process_duplicates(args): dir_task = progress.add_task( "[green]Processing directories...", total=count ) - with ThreadPoolExecutor(max_workers=args.jobs) as executor: - futures = [] - for ( - dir_name, - file_count, - total_size, - ), paths in dir_duplicates.items(): - future = executor.submit( - process_directory_group, - dir_name, - file_count, - total_size, - paths, - args, - progress, - ) - futures.append(future) - - for future in as_completed(futures): - try: - future.result() - progress.advance(dir_task) - except Exception as e: - console.print(f"[red]Error processing directory: {e}[/red]") + process_duplicate_directories(dir_duplicates, args) + progress.update(dir_task, completed=count) # Phase 2: File scan - Compare individual files console.print("\n[cyan]Phase 2: Individual File Analysis[/cyan]")