diff --git a/.gitignore b/.gitignore index 71c7399..e4a401a 100644 --- a/.gitignore +++ b/.gitignore @@ -173,3 +173,4 @@ nvidia_dependencies_linux.zip 1.29.zip config.ini build_portable_example.py +.idea/ diff --git a/DemoMedia/demo1.png b/DemoMedia/demo1.png new file mode 100644 index 0000000..1c74316 Binary files /dev/null and b/DemoMedia/demo1.png differ diff --git a/README.md b/README.md index 8f98c91..3037bd9 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,22 @@ # System Captioner -Generates and shows real-time captions by listening to your Windows PC's audio. Makes digital content more accessible for those who are deaf or hard of hearing, aids language learning, and more. +This application generates and displays real-time captions/subtitles by capturing audio output from a Windows PC. It enhances accessibility for individuals who are deaf or hard of hearing, facilitates language learning, and offers other potential applications. For example, with translation turned on, it enables near real-time communication during foreign language voice calls and provides on-the-fly subtitling for foreign language media, such as anime, without the need for pre-processing video files, and is not limited to any one software boundary. https://github.com/user-attachments/assets/7315ab7c-fe30-4c37-91aa-60bb32979338 +![ Main User Interface](DemoMedia/demo1.png) ## How it works 1. Captures system audio in real-time through Windows audio loopback using PyAudioWPatch -3. Locally transcribes the recordings using faster-whisper -4. Displays the transcriptions as captions in a overlay window that remains always on top +2. Locally transcribes or translates the recordings using faster-whisper +3. Displays the transcriptions/translations as captions in an overlay window that remains always on top Language auto-detection, user-friendly GUI, draggable captions box, and intelligent mode that shows captions only when speech is detected. 
-By default, the app runs on and requires **nVidia CUDA** (dependencies included). The app should work with RTX 2000, 3000 and 4000 series cards. Turning off GPU mode will make the app run on CPU; start with the smallest model and settle with the model that's stable. +By default, the app runs on and requires **NVIDIA CUDA** (dependencies included). The app should work with RTX 2000, 3000 and 4000 series cards. Turning off GPU mode will make the app run on CPU; start with the smallest model and settle with the model that's stable. ## Installation (Windows) @@ -23,10 +24,12 @@ By default, the app runs on and requires **nVidia CUDA** (dependencies included) 2. Run SystemCaptioner.exe and follow the instructions. -Alternatively build the standalone executable yourself using build_portable.py. You will need the nvidia_dependencies folder from the standalone .zip (/SystemCaptioner/Controller/_internal/nvidia_dependencies) and install all the dependencies using requirements.txt inside a venv first. +Alternatively build the standalone executable yourself using `build_portable.py`. You will need the nvidia_dependencies folder from the standalone .zip (/SystemCaptioner/Controller/_internal/nvidia_dependencies) and install all the dependencies using requirements.txt inside a venv first. After building, an extra nvidia_dependencies folder is generated inside dist/SystemCaptioner/. Delete it. ## Limitations/Troubleshooting -‼️ Occasionally, the app can take a long time to start up/load a model. If there are no clear errors in console, wait for at least a few mins or try stopping and starting model again. +‼️ Occasionally, the app can take a long time to start up/load a model. If there are no clear errors in console, wait for at least a few minutes or try stopping and starting the model again. -If you experienced any issues with System Captioner, let me know in the 'Issues' page of this repo! Include the Console window log if possible. 
+⚠️ If you are getting `Library cublas64_12.dll is not found or cannot be loaded` error on console with no translation, copy `cublasLt64_11.dll` and rename to `cublasLt64_12.dll` in `\Controller\_internal\nvidia_dependencies` folder. + +If you experienced any issues with System Captioner, let us know in the 'Issues' page of this repo! Include the Console window log if possible. diff --git a/build_portable.py b/build_portable.py index 1b1a0f8..97983f3 100644 --- a/build_portable.py +++ b/build_portable.py @@ -6,22 +6,23 @@ def build_portable(): # Get the current directory current_dir = os.path.dirname(os.path.abspath(__file__)) - + # Define paths dist_path = os.path.join(current_dir, 'dist') build_path = os.path.join(current_dir, 'build') nvidia_deps_path = os.path.join(current_dir, 'nvidia_dependencies') icon_path = os.path.join(current_dir, 'icon.ico') - + hallucinations_file = os.path.join(current_dir, 'filter_hallucinations.txt') + # Get faster_whisper assets path faster_whisper_path = os.path.dirname(faster_whisper.__file__) assets_path = os.path.join(faster_whisper_path, 'assets') - + # Clean previous builds for path in [dist_path, build_path]: if os.path.exists(path): shutil.rmtree(path) - + # PyInstaller configuration for main.py PyInstaller.__main__.run([ 'main.py', @@ -60,7 +61,7 @@ def build_portable(): '--collect-all=faster_whisper', '--collect-all=customtkinter', ]) - + # PyInstaller configuration for controller.py PyInstaller.__main__.run([ 'controller.py', @@ -92,7 +93,7 @@ def build_portable(): '--collect-all=torch', '--collect-all=faster_whisper', ]) - + # Copy NVIDIA dependencies if they exist if os.path.exists(nvidia_deps_path): target_nvidia_path = os.path.join(dist_path, 'SystemCaptioner', 'nvidia_dependencies') @@ -100,16 +101,15 @@ def build_portable(): shutil.rmtree(target_nvidia_path) shutil.copytree(nvidia_deps_path, target_nvidia_path) print("NVIDIA dependencies copied successfully") - - + print("Build completed successfully!") - + # 
Post-build steps try: dist_system_captioner = os.path.join(dist_path, 'SystemCaptioner') dist_controller = os.path.join(dist_path, 'Controller') controller_internal = os.path.join(dist_system_captioner, 'Controller', '_internal') - + # Move Controller folder inside SystemCaptioner if os.path.exists(dist_controller): target_controller = os.path.join(dist_system_captioner, 'Controller') @@ -117,7 +117,7 @@ def build_portable(): shutil.rmtree(target_controller) shutil.move(dist_controller, target_controller) print("Controller folder moved successfully") - + # Copy NVIDIA dependencies to Controller/_internal nvidia_src = os.path.join(dist_system_captioner, 'nvidia_dependencies') if os.path.exists(nvidia_src): @@ -126,16 +126,22 @@ def build_portable(): shutil.rmtree(nvidia_dest) shutil.copytree(nvidia_src, nvidia_dest) print("NVIDIA dependencies copied to Controller/_internal successfully") - + # Copy icon.ico from _internal to root icon_src = os.path.join(dist_system_captioner, '_internal', 'icon.ico') icon_dest = os.path.join(dist_system_captioner, 'icon.ico') if os.path.exists(icon_src): shutil.copy2(icon_src, icon_dest) print("icon.ico copied to root successfully") - + + # Copy filter_hallucinations.txt to root folder of System Captioner + if os.path.exists(hallucinations_file): + hallucinations_dest = os.path.join(dist_system_captioner, 'filter_hallucinations.txt') + shutil.copy2(hallucinations_file, hallucinations_dest) + print("filter_hallucinations.txt copied to root successfully") + print("Post-build steps completed successfully!") - + except Exception as e: print(f"Error during post-build steps: {e}") diff --git a/console.py b/console.py index a8a5f6a..b0933ea 100644 --- a/console.py +++ b/console.py @@ -1,15 +1,14 @@ import customtkinter as ctk from tkinter import scrolledtext -import threading import queue -import sys + class ConsoleWindow(ctk.CTkToplevel): def __init__(self, console_queue, master=None, icon_path=None): super().__init__(master) 
self.title("Console Output") - self.geometry("600x400") - + self.geometry("900x450") + # Set the icon for the console window if icon_path: self.iconbitmap(icon_path) @@ -49,8 +48,10 @@ def display_message(self, message): self.text_area.configure(state='disabled') self.text_area.yview(ctk.END) + class QueueWriter: """A writer object that redirects writes to a queue.""" + def __init__(self, log_queue): self.log_queue = log_queue diff --git a/controller.py b/controller.py index 8caf11d..fe999f1 100644 --- a/controller.py +++ b/controller.py @@ -2,43 +2,42 @@ import sys import ctypes import threading -import recorder -import transcriber -from gui import SubtitleGUI -import queue import time import argparse import configparser -# Update the import statement for the GUI -from gui import SubtitleGUI # No change needed +# Import necessary modules from the project +import recorder +import transcriber +from gui import SubtitleGUI -# Change the hardcoded path to a relative path +# Setup CUDA DLL path cuda_dll_path = os.path.join(os.path.dirname(__file__), "nvidia_dependencies") os.environ['PATH'] = f"{cuda_dll_path}{os.pathsep}{os.environ['PATH']}" sys.path.append(cuda_dll_path) -# Explicitly add the DLL to the DLL search path +# Add the DLL to the DLL search path os.add_dll_directory(cuda_dll_path) +# Attempt to load the CUDA DLL try: ctypes.CDLL(os.path.join(cuda_dll_path, "cudnn_ops_infer64_8.dll")) print("Successfully loaded cudnn_ops_infer64_8.dll", flush=True) except Exception as e: print(f"Error loading cudnn_ops_infer64_8.dll: {e}", flush=True) -def start_recording(): +def start_recording(device_index=None): """Start the audio recording process.""" - device_index = args.device_index if hasattr(args, 'device_index') else None recorder.record_audio(device_index) -def start_transcription(device): +def start_transcription(device, args): """Start the audio transcription process.""" transcriber.monitor_audio_file( transcriber.AUDIO_INPUT_DIR, 
transcriber.TRANSCRIPTION_OUTPUT, check_interval=0.2, - device=device + device=device, + args=args ) def start_gui(update_queue, intelligent_mode): @@ -50,9 +49,17 @@ def start_gui(update_queue, intelligent_mode): parser = argparse.ArgumentParser(description="TranscriberX Application") parser.add_argument('--intelligent', action='store_true', help='Enable intelligent mode') parser.add_argument('--cuda', action='store_true', help='Enable CUDA for transcription') - parser.add_argument('--model', type=str, choices=['tiny', 'base', 'small', 'medium', 'large'], + parser.add_argument('--model', type=str, choices=['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', + 'large-v1', 'large-v2', 'large-v3', 'large', 'distil-large-v2', 'distil-medium.en', + 'distil-small.en', 'distil-large-v3'], help='Select the model size for transcription') parser.add_argument('--device-index', type=int, help='Audio device index for recording') + parser.add_argument('--transcription-timeout', type=int, default=5, help='Transcription timeout in seconds') + parser.add_argument('--workers', type=int, default=4, help='Number of worker threads') + parser.add_argument('--translation-enabled', action='store_true', help='Enable translation') + parser.add_argument('--source-language', type=str, default='en', help='Source language for transcription') + parser.add_argument('--filter-hallucinations', action='store_true', help='Filter hallucinations using filter_hallucinations.txt') + parser.add_argument('--store-output', action='store_true', help='Store transcription output in transcriptions.txt') args = parser.parse_args() # Update config with the selected model @@ -70,8 +77,8 @@ def start_gui(update_queue, intelligent_mode): device = "cuda" if args.cuda else "cpu" # Create threads for recording, transcription, and GUI - recording_thread = threading.Thread(target=start_recording, daemon=True) - transcription_thread = threading.Thread(target=start_transcription, 
args=(device,), daemon=True) + recording_thread = threading.Thread(target=start_recording, args=(args.device_index,), daemon=True) + transcription_thread = threading.Thread(target=start_transcription, args=(device, args), daemon=True) gui_thread = threading.Thread(target=start_gui, args=(transcription_queue, args.intelligent), daemon=True) # Start the threads @@ -85,3 +92,4 @@ def start_gui(update_queue, intelligent_mode): time.sleep(1) except KeyboardInterrupt: print("Exiting program.", flush=True) + diff --git a/filter_hallucinations.txt b/filter_hallucinations.txt new file mode 100644 index 0000000..d8212d3 --- /dev/null +++ b/filter_hallucinations.txt @@ -0,0 +1,158 @@ +! +, . +. , +.. +:) +:D +?? +??? +[applause] +[music] +[typing] +… +♪ +♪♪♪ +❤️ par SousTitreur.com +amara.org +bye +bye, ladies and gentlemen +Click like and subscribe +Click the bell icon +Click the like button +Cliquez-vous sur les sous-titres et abonnez-vous à la chaîne d'Amara.org +Copyright WDR 2019 +Copyright WDR 2020 +Copyright WDR 2021 +Don't forget to like +Don't forget to like and subscribe +don't forget to like, commentary, and subscribe to the channel +don't forget to subscribe +E aí +e se inscreva no canal +e se inscreva no canal e ative o sininho +Fast forward +for your viewing +he was gonna catch it +hello, +I am also looking forward to the next video +I hope you enjoy it. 
+I hope you enjoy the video +I hope you enjoy this video +I hope you enjoyed the video +I hope you enjoyed this video +I hope you have a good time +I will see you again in the next video +I would like to say thank you very much +I'll be back in 10 minutes +I'll be back in a few minutes +I'll be back in the next video +I'll be back soon +I'll be back soon, or I'll be back soon +I'll be right back +I'll be waiting for you in the evening +I'll be waiting for you in the next video +I'll see you again in the next video +i'll see you next time +i'll see you on the next video +I'm sorry +I'm sorry, but I am sorry +improvável probably a good show +Is it time? +it's no good to me +Legendas pela comunidade Amara.org +Legendas pela comunidade da Amara.org +Legendas pela comunidade das Amara.org +legendas pela comunidade de amara.org +legendas pela comunidade des amara.org +Legendas pela comunidade do Amara.org +legendas pela comunidade dos amara.org +like and subscribe +Like my video +Liking my video +Napisy robione przez społeczność Amara.org +Napisy stworzone przez społeczność Amara.org +Napisy stworzone przez społeczności Amara.org +Napisy wykonane przez społeczność Amara.org +Obrigado. +Oi! +Ondertiteld door de Amara.org gemeenschap +Ondertiteling door de Amara.org gemeenschap +Ondertitels ingediend door de Amara.org gemeenschap +Please give a thumbs up and subscribe to the channel +Please like and subscribe +Please subscribe +Please subscribe and like +Please subscribe to my channel +Please subscribe to my channel! 
+Please subscribe to the channel +See you again in the next video +See you again in tomorrow's video +see you in the next video +see you next time +see you next video +sorry, don't ask me if i asked about this question right, and i mean the lightening +Sous-titrage ST' 501 +Sous-titrage ST'501 +Sous-titres fait par la communauté d'Amara.org +Sous-titres fait par Sous-titres par Amara.org +Sous-Titres faits par la communauté d'Amara.org +Sous-titres par Amara.org +Sous-titres par la communauté d'Amara.org +Sous-titres par l'Amara.org +Sous-titres réalisés par la communauté d'Amara.org +Sous-titres réalisés par la communauté de l'Amara.org +Sous-titres réalisés par les SousTitres d'Amara.org +Sous-titres réalisés para la communauté d'Amara.org +Sous-titres réalisés pour la communauté d'Amara.org +Subscribe to my channel +Subtitles made by the community of Amara.org +SWR 2020 +SWR 2021 +Tchau tchau! +Tchau! +Tchau, galera! +thank you +thank you for watching +Thank you for your attention +Thank you for your hard work +Thank you for your liking +Thank you for your subscribing +Thank you for your support +Thank you for your time +thank you for your watching +thank you for your watching! +thank you very much +thank you very much for watching +thank you very much for watching! +thank you very much. thank you very much +Thank you. +thank you. bye bye +thanks for watching +Thanks for watching! +thanks mate +That's all for this video +That's all for today +That's all for today's video +That's it for today +Tłumaczenie i napisy stworzone przez społeczność Amara.org +Tłumaczenie stworzone przez społeczność Amara.org +Untertitel der Amara.org-Community +Untertitel im Auftrag des ZDF für funk, 2017 +Untertitel im Auftrag des ZDF, 2017 +Untertitel im Auftrag des ZDF, 2018 +Untertitel im Auftrag des ZDF, 2020 +Untertitel im Auftrag des ZDF, 2021 +Untertitel von Stephanie Geiges +Untertitelung aufgrund der Amara.org-Community +Untertitelung im Auftrag des ZDF, 2021 +Valeu. 
+Welcome back to my channel +Welcome back to my video +Welcome to a new video +Welcome to the channel +www.mooji.org +www.multi-moto.eu +Zdjęcia i napisy stworzone przez społeczność Amara.org +Редактор субтитров А.Синецкая Корректор А.Егорова +字幕由Amara.org社区提供 +小編字幕由Amara.org社區提供 \ No newline at end of file diff --git a/gui.py b/gui.py index 84bc60b..93a14fd 100644 --- a/gui.py +++ b/gui.py @@ -1,9 +1,9 @@ import tkinter as tk from tkinter import scrolledtext -import threading import queue import time + class SubtitleGUI: def __init__(self, update_queue, intelligent_mode=False): self.update_queue = update_queue @@ -12,10 +12,10 @@ def __init__(self, update_queue, intelligent_mode=False): self.should_show = False self.root = tk.Tk() - + # Remove window decorations (frameless window) self.root.overrideredirect(True) - + # Set window size and position window_width = 800 window_height = 120 @@ -24,16 +24,16 @@ def __init__(self, update_queue, intelligent_mode=False): x_position = (screen_width // 2) - (window_width // 2) y_position = screen_height - window_height - 50 # 50 pixels above the bottom self.root.geometry(f"{window_width}x{window_height}+{x_position}+{y_position}") - + # Make window semi-transparent - self.root.attributes("-alpha", 0.9) # Range: 0.0 (fully transparent) to 1.0 (fully opaque) - + self.root.attributes("-alpha", 0.8) # Range: 0.0 (fully transparent) to 1.0 (fully opaque) + # Set window to be always on top self.root.attributes("-topmost", True) - + # Set background color to dark grey self.root.configure(bg='#2e2e2e') # Dark grey color - + # ScrolledText widget for displaying subtitles self.text_area = scrolledtext.ScrolledText( self.root, @@ -46,12 +46,12 @@ def __init__(self, update_queue, intelligent_mode=False): ) self.text_area.pack(expand=True, fill='both') self.text_area.configure(state='disabled') - + # Bind mouse events for dragging the window self.text_area.bind("", self.start_move) self.text_area.bind("", self.stop_move) 
self.text_area.bind("", self.do_move) - + # Variables to keep track of dragging self.offset_x = 0 self.offset_y = 0 @@ -102,7 +102,8 @@ def update_subtitles(self): def display_transcription(self, transcription): """Insert the transcription into the text area.""" self.text_area.configure(state='normal') - self.text_area.insert(tk.END, transcription + "\n") + self.text_area.tag_configure("center", justify='center') + self.text_area.insert(tk.END, transcription + "\n", "center") self.text_area.configure(state='disabled') self.text_area.yview(tk.END) diff --git a/icon.ico b/icon.ico index 5331b18..346c442 100644 Binary files a/icon.ico and b/icon.ico differ diff --git a/main.py b/main.py index b4e04c4..9f1ab9f 100644 --- a/main.py +++ b/main.py @@ -3,32 +3,43 @@ import sys import os import threading -import queue # New import for queue -import time # New import for sleep -import configparser # New import for config handling -import webbrowser # Add this import at the top - -from console import ConsoleWindow, QueueWriter # Importing ConsoleWindow and QueueWriter from console.py -from setupGUI import run_setup # Add this import at the top +import queue +import time +import configparser +import webbrowser +from console import ConsoleWindow, QueueWriter +from setupGUI import run_setup ctk.set_appearance_mode("dark") ctk.set_default_color_theme("dark-blue") # Constants CONFIG_FILE = "config.ini" +TOOLTIP_WRAP_LENGTH = 150 +TOOLTIP_BG_COLOR = "#2e2e2e" +TOOLTIP_TEXT_COLOR = "white" +DEFAULT_SOURCE_LANGUAGE = "" +DEFAULT_TRANSCRIPTION_TIMEOUT = "5" +DEFAULT_WORKERS = "4" +FEEDBACK_LINK = "https://github.com/evermoving/SystemCaptioner/issues" +LANGUAGE_CODES_LINK = "https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes" +CONTROLLER_DIR = "Controller" if getattr(sys, 'frozen', False) else "." 
+CONTROLLER_EXECUTABLE = 'Controller.exe' if getattr(sys, 'frozen', False) else 'controller.py' + class ToolTip: """ - It creates a tooltip for a given widget as the mouse goes on it. + Creates a tooltip for a given widget as the mouse goes on it. """ + def __init__(self, widget, text): self.widget = widget self.text = text self.tooltip_window = None - self.widget.bind("", self.show_tooltip) - self.widget.bind("", self.hide_tooltip) + self.widget.bind("", self._show_tooltip) + self.widget.bind("", self._hide_tooltip) - def show_tooltip(self, event=None): + def _show_tooltip(self, event=None): if self.tooltip_window or not self.text: return x = self.widget.winfo_rootx() + 20 @@ -36,156 +47,227 @@ def show_tooltip(self, event=None): self.tooltip_window = tw = ctk.CTkToplevel(self.widget) tw.wm_overrideredirect(True) tw.wm_geometry(f"+{x}+{y}") - label = ctk.CTkLabel(tw, text=self.text, wraplength=150, bg_color="#2e2e2e", text_color="white") + label = ctk.CTkLabel( + tw, + text=self.text, + wraplength=TOOLTIP_WRAP_LENGTH, + bg_color=TOOLTIP_BG_COLOR, + text_color=TOOLTIP_TEXT_COLOR, + ) label.pack() - def hide_tooltip(self, event=None): + def _hide_tooltip(self, event=None): tw = self.tooltip_window self.tooltip_window = None if tw: tw.destroy() + def get_base_path(): - """Get the base path for the application in both dev and standalone environments""" + """Get the base path for the application in both dev and standalone environments.""" if getattr(sys, 'frozen', False): - # Running in a bundle (standalone) return os.path.dirname(sys.executable) - else: - # Running in normal Python environment - return os.path.dirname(os.path.abspath(__file__)) + return os.path.dirname(os.path.abspath(__file__)) + class App(ctk.CTk): def __init__(self): super().__init__() - self.title("System Captioner") - self.geometry("400x285") - self.resizable(False, False) + self.title("System Captioner v1.39 - Translation Update") + self.geometry("650x585") + self.resizable(True, True) - # Add 
icon to the main window + # Load the application icon icon_path = os.path.join(get_base_path(), "icon.ico") self.iconbitmap(icon_path) + # Initialize variables for app state and settings self.intelligent_mode = ctk.BooleanVar() self.gpu_enabled = ctk.BooleanVar() self.model_selection = ctk.StringVar() self.app_running = False self.process = None - - # Redirect stdout and stderr to the console queue + self.translation_enabled = ctk.BooleanVar() + self.source_language = ctk.StringVar(value=DEFAULT_SOURCE_LANGUAGE) + self.transcription_timeout = ctk.StringVar(value=DEFAULT_TRANSCRIPTION_TIMEOUT) + self.workers = ctk.StringVar(value=DEFAULT_WORKERS) + self.filter_hallucinations = ctk.BooleanVar() + self.store_output = ctk.BooleanVar() self.console_queue = queue.Queue() - sys.stdout = QueueWriter(self.console_queue) - sys.stderr = QueueWriter(self.console_queue) - - # Initialize the console window self.console_window = ConsoleWindow(self.console_queue, self) self.console_window.withdraw() # Start hidden - self.config = configparser.ConfigParser() - self.load_config() + self._load_config() - # Initialize variables with config values + # Set initial values from config self.intelligent_mode.set(self.config.getboolean('Settings', 'mode')) self.gpu_enabled.set(self.config.getboolean('Settings', 'cuda')) self.model_selection.set(self.config.get('Settings', 'model')) - self.start_button = ctk.CTkButton(self, text="Start", command=self.toggle_app, fg_color="green", hover_color="dark green") + # Redirect stdout and stderr + sys.stdout = QueueWriter(self.console_queue) + sys.stderr = QueueWriter(self.console_queue) + + # Initialize main UI elements + self._init_ui() + + # Setup timeout monitoring + self.TRANSCRIPTION_TIMEOUT = 5 # seconds + self.last_transcription_start = 0 + self.current_transcription_file = None + self.timeout_thread = None + self.stop_timeout = threading.Event() + + # Add this line after super().__init__() + self.protocol("WM_DELETE_WINDOW", self.on_closing) + 
+ def _init_ui(self): + """Initializes the main UI elements.""" + # Start/Stop button + self.start_button = ctk.CTkButton( + self, text="Start", command=self.toggle_app, fg_color="green", hover_color="dark green" + ) self.start_button.pack(pady=(25, 10)) - self.console_button = ctk.CTkButton(self, text="Console", command=self.open_console, fg_color="blue", hover_color="dark blue") + # Console button + self.console_button = ctk.CTkButton( + self, text="Console", command=self.open_console, fg_color="blue", hover_color="dark blue" + ) self.console_button.pack(pady=(0, 25)) + # Checkbox frame self.checkbox_frame = ctk.CTkFrame(self) self.checkbox_frame.pack(pady=(0, 10)) + # Inner checkbox frame for organization self.inner_checkbox_frame = ctk.CTkFrame(self.checkbox_frame) self.inner_checkbox_frame.pack() + # Intelligent mode checkbox self.intelligent_checkbox = ctk.CTkCheckBox( - self.inner_checkbox_frame, - text="Intelligent mode", + self.inner_checkbox_frame, + text="Intelligent Mode", variable=self.intelligent_mode, - command=self.save_config + command=self._save_config, ) self.intelligent_checkbox.grid(row=0, column=0, sticky="w", padx=(0, 10)) - self.intelligent_tooltip_button = ctk.CTkButton( + # Intelligent mode tooltip button + self._create_tooltip_button( self.inner_checkbox_frame, - text="?", - width=25, - height=25, - fg_color="transparent", - hover_color="grey", - command=None - ) - self.intelligent_tooltip_button.grid(row=0, column=1) - ToolTip( - self.intelligent_tooltip_button, - "In intelligent mode, subtitle window is shown only when speech is detected." + row=0, + column=1, + tooltip_text="In intelligent mode, subtitle window is shown only when speech is detected." 
) + # GPU checkbox self.gpu_checkbox = ctk.CTkCheckBox( self.inner_checkbox_frame, text="Run on GPU", variable=self.gpu_enabled, - command=self.save_config + command=self._save_config, ) self.gpu_checkbox.grid(row=1, column=0, sticky="w", padx=(0, 10), pady=(5, 0)) - self.gpu_tooltip_button = ctk.CTkButton( + # GPU tooltip button + self._create_tooltip_button( self.inner_checkbox_frame, - text="?", - width=25, - height=25, - fg_color="transparent", - hover_color="grey", - command=None + row=1, + column=1, + tooltip_text="Disabling this will run the app on CPU and result in much slower transcription." + ) + + # Translation checkbox + self.translation_checkbox = ctk.CTkCheckBox( + self.inner_checkbox_frame, + text="Enable Translation", + variable=self.translation_enabled, + command=self._save_config, + ) + self.translation_checkbox.grid(row=2, column=0, sticky="w", padx=(0, 10), pady=(5, 0)) + + # Translation tooltip button + self._create_tooltip_button( + self.inner_checkbox_frame, + row=2, + column=1, + tooltip_text="Enable this to translate the transcription to English." + ) + + # Filter hallucinations checkbox + self.filter_hallucinations_checkbox = ctk.CTkCheckBox( + self.inner_checkbox_frame, + text="Filter Hallucinations", + variable=self.filter_hallucinations, + command=self._save_config, + ) + self.filter_hallucinations_checkbox.grid(row=3, column=0, sticky="w", padx=(0, 10), pady=(5, 0)) + + # Filter hallucinations tooltip button + self._create_tooltip_button( + self.inner_checkbox_frame, + row=3, + column=1, + tooltip_text="Enable this to filter hallucinations using filter_hallucinations.txt file.", + command=lambda: self._open_file("filter_hallucinations.txt"), ) - self.gpu_tooltip_button.grid(row=1, column=1, pady=(5, 0)) - ToolTip( - self.gpu_tooltip_button, - "Disabling this will run the app on CPU and result in much slower transcription." 
+ + # Store output checkbox + self.store_output_checkbox = ctk.CTkCheckBox( + self.inner_checkbox_frame, + text="Store Output", + variable=self.store_output, + command=self._save_config, ) + self.store_output_checkbox.grid(row=4, column=0, sticky="w", padx=(0, 10), pady=(5, 0)) + # Store output tooltip button + self._create_tooltip_button( + self.inner_checkbox_frame, + row=4, + column=1, + tooltip_text="Enable this to store the transcription output in transcriptions.txt." + ) + + # Model selection frame self.model_frame = ctk.CTkFrame(self) self.model_frame.pack(pady=(0, 10)) + # Model label self.model_label = ctk.CTkLabel(self.model_frame, text="Model:") self.model_label.pack(side="left", padx=(0, 5)) + # Model dropdown self.model_dropdown = ctk.CTkOptionMenu( self.model_frame, - values=["tiny", "base", "small", "medium", "large"], + values=[ + 'tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', + 'large-v1', 'large-v2', 'large-v3', 'large', 'distil-large-v2', 'distil-medium.en', + 'distil-small.en', 'distil-large-v3' + ], variable=self.model_selection, - command=self.save_config # Save config on change + command=self._save_config, ) self.model_dropdown.pack(side="left") - self.model_tooltip_button = ctk.CTkButton( + # Model tooltip button + self._create_tooltip_button( self.model_frame, - text="?", - width=25, - height=25, - fg_color="transparent", - hover_color="grey", - command=None - ) - self.model_tooltip_button.pack(side="left") - ToolTip( - self.model_tooltip_button, - "Select the model to use for transcription. Larger models are more accurate but require more VRAM." + tooltip_text="Select the model to use for transcription. Larger models are more accurate but require more VRAM. 
.en are English only models" ) - # Add audio device selection frame + # Audio device frame self.device_frame = ctk.CTkFrame(self) self.device_frame.pack(pady=(0, 10)) + # Audio device label self.device_label = ctk.CTkLabel(self.device_frame, text="Audio Device:") self.device_label.pack(side="left", padx=(0, 5)) - self.devices = self.get_audio_devices() + # Audio devices dropdown + self.devices = self._get_audio_devices() self.device_names = [device['name'] for device in self.devices] self.device_selection = ctk.StringVar() - # Load saved device from config saved_device = self.config.get('Settings', 'audio_device', fallback='') if saved_device in self.device_names: @@ -197,71 +279,141 @@ def __init__(self): self.device_frame, values=self.device_names, variable=self.device_selection, - command=self.on_device_change # Call this method when device changes + command=self._on_device_change, ) self.device_dropdown.pack(side="left") - # Add this line after super().__init__() - self.protocol("WM_DELETE_WINDOW", self.on_closing) - - # Add these as class attributes - self.TRANSCRIPTION_TIMEOUT = 5 # seconds - self.last_transcription_start = 0 - self.current_transcription_file = None - self.timeout_thread = None - self.stop_timeout = threading.Event() - - # Add these after all other UI elements in __init__ + # Feedback label self.feedback_label = ctk.CTkLabel( self, - text="If the app didn't work or you had any issues, let me know!", + text="If the app didn't work or you had any issues, let us know!", text_color="light blue", cursor="hand2", - font=("", -13, "underline") # Added underline to the font + font=("", -13, "underline"), ) self.feedback_label.pack(side="bottom", pady=(0, 10)) - self.feedback_label.bind("", lambda e: self.open_feedback_link()) + self.feedback_label.bind("", lambda e: self._open_url(FEEDBACK_LINK)) - def load_config(self): - """Load the configuration from config.ini or create default if not exists.""" - if not os.path.exists(CONFIG_FILE): - # Run setup 
GUI to get initial audio device selection - run_setup() - self.config.read(CONFIG_FILE) + # Timeout frame + self.timeout_frame = ctk.CTkFrame(self) + self.timeout_frame.pack(pady=(0, 10)) + + # Timeout label + self.timeout_label = ctk.CTkLabel(self.timeout_frame, text="Transcription Timeout (seconds):") + self.timeout_label.pack(side="left", padx=(0, 5)) + + # Timeout entry + self.timeout_entry = ctk.CTkEntry(self.timeout_frame, textvariable=self.transcription_timeout) + self.timeout_entry.pack(side="left") - def save_config(self, *args): - """Save the current settings to config.ini.""" - self.config['Settings']['mode'] = str(self.intelligent_mode.get()) - self.config['Settings']['cuda'] = str(self.gpu_enabled.get()) - self.config['Settings']['model'] = self.model_selection.get() - self.config['Settings']['audio_device'] = self.device_selection.get() + # Workers frame + self.workers_frame = ctk.CTkFrame(self) + self.workers_frame.pack(pady=(0, 10)) + + # Workers label + self.workers_label = ctk.CTkLabel(self.workers_frame, text="Workers:") + self.workers_label.pack(side="left", padx=(0, 5)) + + # Workers entry + self.workers_entry = ctk.CTkEntry(self.workers_frame, textvariable=self.workers) + self.workers_entry.pack(side="left") + + # Workers tooltip + self._create_tooltip_button( + self.workers_frame, + tooltip_text="Number of worker threads for parallel processing." 
+ ) + + # Language frame + self.language_frame = ctk.CTkFrame(self) + self.language_frame.pack(pady=(0, 10)) + + # Language label + self.language_label = ctk.CTkLabel(self.language_frame, text="[Optional] Source Language:") + self.language_label.pack(side="left", padx=(0, 5)) + + # Language entry + self.language_entry = ctk.CTkEntry(self.language_frame, textvariable=self.source_language) + self.language_entry.pack(side="left") + + # Language tooltip + self._create_tooltip_button( + self.language_frame, + tooltip_text="Specify the language used by the source audio using ISO-639-1 format (e.g., 'en' for English, 'zh' for Chinese). Click on the tooltip for full list.", + command=lambda: self._open_url(LANGUAGE_CODES_LINK) + ) + + def _create_tooltip_button(self, parent, row=None, column=None, tooltip_text="", command=None): + """Creates a tooltip button with the specified tooltip text.""" + button = ctk.CTkButton( + parent, + text="?", + width=25, + height=25, + fg_color="transparent", + hover_color="grey", + command=command, + ) + if row is not None and column is not None: + button.grid(row=row, column=column, pady=(5, 0)) + else: + button.pack(side="left") + ToolTip(button, tooltip_text) + def _load_config(self): + """Loads configuration from config.ini or creates defaults if missing.""" + if not os.path.exists(CONFIG_FILE): + run_setup() # Run setup for initial device selection + self.config.read(CONFIG_FILE) + self.translation_enabled.set(self.config.getboolean('Settings', 'translation_enabled', fallback=False)) + self.source_language.set(self.config.get('Settings', 'source_language', fallback=DEFAULT_SOURCE_LANGUAGE)) + self.transcription_timeout.set( + self.config.get('Settings', 'transcription_timeout', fallback=DEFAULT_TRANSCRIPTION_TIMEOUT)) + self.workers.set(self.config.get('Settings', 'workers', fallback=DEFAULT_WORKERS)) + self.filter_hallucinations.set(self.config.getboolean('Settings', 'filter_hallucinations', fallback=True)) + 
self.store_output.set(self.config.getboolean('Settings', 'store_output', fallback=True)) + + def _save_config(self, *args): + """Saves current settings to config.ini.""" + settings = self.config['Settings'] + settings['mode'] = str(self.intelligent_mode.get()) + settings['cuda'] = str(self.gpu_enabled.get()) + settings['model'] = self.model_selection.get() + settings['audio_device'] = self.device_selection.get() + settings['transcription_timeout'] = self.transcription_timeout.get() + settings['workers'] = self.workers.get() + settings['translation_enabled'] = str(self.translation_enabled.get()) + settings['source_language'] = self.source_language.get() + settings['filter_hallucinations'] = str(self.filter_hallucinations.get()) + settings['store_output'] = str(self.store_output.get()) # Save the sample rate of the selected device selected_device = self.device_selection.get() device_info = next((device for device in self.devices if device['name'] == selected_device), None) if device_info: - self.config['Settings']['sample_rate'] = str(device_info['defaultSampleRate']) - + settings['sample_rate'] = str(device_info['defaultSampleRate']) with open(CONFIG_FILE, 'w') as configfile: self.config.write(configfile) def toggle_app(self): + """Toggles the application's start/stop state.""" if not self.app_running: - self.start_app() + self._start_app() self.start_button.configure(text="Stop", fg_color="red", hover_color="dark red") else: - self.stop_app() + self._stop_app() self.start_button.configure(text="Start", fg_color="green", hover_color="dark green") - def start_app(self): - # Reset the timeout tracking variables + def _start_app(self): + """Starts the captioning application.""" + # Reset timeout tracking variables self.last_transcription_start = 0 self.current_transcription_file = None - + base_dir = get_base_path() recordings_path = os.path.join(base_dir, "recordings") transcriptions_path = os.path.join(base_dir, "transcriptions.txt") + # Ensure recordings directory 
exists and clear existing files if os.path.exists(recordings_path): try: for filename in os.listdir(recordings_path): @@ -269,75 +421,107 @@ def start_app(self): if os.path.isfile(file_path): os.remove(file_path) print("Existing recordings have been deleted.", flush=True) - self.enqueue_console_message("Existing recordings have been deleted.") except Exception as e: print(f"Error deleting recordings: {e}", flush=True) - self.enqueue_console_message(f"Error deleting recordings: {e}") else: print("Recordings directory does not exist. Creating one.", flush=True) - self.enqueue_console_message("Recordings directory does not exist. Creating one.") os.makedirs(recordings_path) + # Clear the transcriptions file if it's not already empty try: - with open(transcriptions_path, 'w') as f: - pass # Truncate the file to empty it - print("transcriptions.txt has been emptied.", flush=True) - self.enqueue_console_message("transcriptions.txt has been emptied.") + if os.path.exists(transcriptions_path) and os.path.getsize(transcriptions_path) > 0: + with open(transcriptions_path, 'w') as f: + pass # Truncate the file + print("transcriptions.txt has been emptied.", flush=True) + elif os.path.exists(transcriptions_path) and os.path.getsize(transcriptions_path) == 0: + print("transcriptions.txt is already empty.", flush=True) + else: + print("transcriptions.txt does not exist.", flush=True) + except Exception as e: - print(f"Error emptying transcriptions.txt: {e}", flush=True) - self.enqueue_console_message(f"Error emptying transcriptions.txt: {e}") + print(f"Error handling transcriptions.txt: {e}", flush=True) + # Update the start button self.start_button.configure(text="Stop", fg_color="red", hover_color="dark red") - intelligent = self.intelligent_mode.get() - cuda = self.gpu_enabled.get() - model = self.model_selection.get() - - # Determine the path to the Controller executable - if getattr(sys, 'frozen', False): - controller_executable = os.path.join(base_dir, 'Controller', 
'Controller.exe') - else: - controller_executable = os.path.join(base_dir, 'controller.py') - + + # Construct the controller executable path + controller_executable = os.path.join(base_dir, CONTROLLER_DIR, CONTROLLER_EXECUTABLE) + + # Build the command arguments args = [controller_executable] - if intelligent: + if self.intelligent_mode.get(): args.append("--intelligent") - if cuda: + if self.gpu_enabled.get(): args.append("--cuda") - args.extend(["--model", model]) - - # Get the selected device index + args.extend(["--model", self.model_selection.get()]) + + # Get selected audio device index selected_device = self.device_selection.get() device_index = next((device['index'] for device in self.devices if device['name'] == selected_device), None) if device_index is not None: args.extend(["--device-index", str(device_index)]) - - # If running in a frozen state, ensure subprocess handles executable correctly + + # Add translation and filter settings + if self.translation_enabled.get(): + args.append("--translation-enabled") + + if self.filter_hallucinations.get(): + args.append("--filter-hallucinations") + + if self.store_output.get(): + args.append("--store-output") + + args.extend(["--source-language", self.source_language.get()]) + args.extend(["--transcription-timeout", self.transcription_timeout.get()]) + args.extend(["--workers", self.workers.get()]) + + # Launch the subprocess self.process = subprocess.Popen( args, stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, # Merge stderr into stdout + stderr=subprocess.STDOUT, text=True, bufsize=1, universal_newlines=True ) self.app_running = True + # Start monitoring threads self.stop_timeout.clear() - self.timeout_thread = threading.Thread(target=self.monitor_timeout, daemon=True) + self.timeout_thread = threading.Thread(target=self._monitor_timeout, daemon=True) self.timeout_thread.start() - threading.Thread(target=self.read_process_output, daemon=True).start() - threading.Thread(target=self.watch_console_queue, 
daemon=True).start() + threading.Thread(target=self._read_process_output, daemon=True).start() + threading.Thread(target=self._watch_console_queue, daemon=True).start() + + self.TRANSCRIPTION_TIMEOUT = int(self.transcription_timeout.get()) + workers = int(self.workers.get()) + translation_enabled = self.translation_enabled.get() + filter_hallucinations = self.filter_hallucinations.get() + store_output = self.store_output.get() + source_language = self.source_language.get() + + args.extend(["--transcription-timeout", str(self.TRANSCRIPTION_TIMEOUT)]) + args.extend(["--workers", str(workers)]) + if translation_enabled: + args.append("--translation-enabled") - def stop_app(self): + if filter_hallucinations: + args.append("--filter-hallucinations") + + if store_output: + args.append("--store-output") + + args.extend(["--source-language", source_language]) + + def _stop_app(self): + """Stops the captioning application.""" if self.process: try: self.process.terminate() - # Wait for a short time for graceful termination - self.process.wait(timeout=2) + self.process.wait(timeout=2) # Give a short time for graceful termination except subprocess.TimeoutExpired: - # Force kill if process doesn't terminate gracefully - self.process.kill() + self.process.kill() # Force kill if process doesn't terminate gracefully self.process = None self.start_button.configure(text="Start", fg_color="green", hover_color="dark green") self.app_running = False @@ -346,27 +530,27 @@ def stop_app(self): self.timeout_thread.join() self.timeout_thread = None - def monitor_timeout(self): + def _monitor_timeout(self): + """Monitors the transcription timeout and restarts the app if necessary.""" while self.app_running and not self.stop_timeout.is_set(): if self.last_transcription_start > 0: elapsed_time = time.time() - self.last_transcription_start if elapsed_time > self.TRANSCRIPTION_TIMEOUT: error_msg = f"Transcription timeout for {self.current_transcription_file} after {self.TRANSCRIPTION_TIMEOUT} 
seconds" - self.enqueue_console_message(f"controller.py ERROR: {error_msg}") - self.stop_app() - time.sleep(1) # Give it a moment to clean up - self.start_app() # Restart the application + self._enqueue_console_message(f"controller.py ERROR: {error_msg}") + self._stop_app() + time.sleep(1) + self._start_app() break time.sleep(1) - def read_process_output(self): - """Read and process lines from the subprocess's combined stdout and stderr.""" + def _read_process_output(self): + """Reads and processes output from the subprocess.""" if self.process.stdout: for line in iter(self.process.stdout.readline, ''): if not line: break line = line.strip() - # Check for transcription start if "Starting transcription for" in line: self.last_transcription_start = time.time() @@ -377,78 +561,80 @@ def read_process_output(self): self.last_transcription_start = 0 # Reset the timer self.current_transcription_file = None - # Determine if the line is an error message + # Send messages to the console queue if "ERROR" in line: - self.enqueue_console_message(f"controller.py ERROR: {line}") + self._enqueue_console_message(f"controller.py ERROR: {line}") else: - self.enqueue_console_message(f"controller.py: {line}") + self._enqueue_console_message(f"controller.py: {line}") - def enqueue_console_message(self, message): - """Helper method to enqueue messages to the console queue.""" + def _enqueue_console_message(self, message): + """Enqueues a message to the console queue.""" self.console_queue.put(message) def open_console(self): - """Open the console window.""" + """Opens the console window.""" if not self.console_window or not self.console_window.winfo_exists(): self.console_window = ConsoleWindow(self.console_queue, self) else: self.console_window.deiconify() self.console_window.focus() - def watch_console_queue(self): - """Continuously watch for console messages (if any additional handling is needed).""" + def _watch_console_queue(self): + """Monitors the console queue (placeholder for 
any additional console processing).""" while self.app_running: - time.sleep(1) # Adjust the sleep duration as needed + time.sleep(1) def run(self): - """Run the main application loop.""" + """Runs the main application loop.""" self.mainloop() - def get_audio_devices(self): - """Get list of available audio devices.""" + def _get_audio_devices(self): + """Gets a list of available audio devices.""" from recorder import get_audio_devices return get_audio_devices() - def on_device_change(self, selected_device_name): - """Handle changes in the selected audio device.""" - # Find the selected device info + def _on_device_change(self, selected_device_name): + """Handles changes in the selected audio device.""" device_info = next((device for device in self.devices if device['name'] == selected_device_name), None) if device_info: - # Update the config with the new sample rate self.config['Settings']['sample_rate'] = str(device_info['defaultSampleRate']) self.config['Settings']['audio_device'] = selected_device_name with open(CONFIG_FILE, 'w') as configfile: self.config.write(configfile) def on_closing(self): - """Handle cleanup when the window is closed.""" + """Handles cleanup when the window is closed.""" # Stop the application if it's running if self.app_running: - self.stop_app() - - # Ensure the process is terminated + self._stop_app() + + # Ensure process is terminated if self.process: try: self.process.terminate() - # Wait for a short time for graceful termination self.process.wait(timeout=2) except subprocess.TimeoutExpired: - # Force kill if process doesn't terminate gracefully self.process.kill() self.process = None - # Destroy the console window if it exists + # Destroy console window if self.console_window and self.console_window.winfo_exists(): self.console_window.destroy() - # Destroy the main window + # Destroy main window self.quit() self.destroy() - def open_feedback_link(self): - """Open the feedback link in default web browser""" - 
webbrowser.open("https://github.com/evermoving/SystemCaptioner/issues") + def _open_file(self, filename): + """Opens a file with the default application.""" + os.startfile(filename) + + def _open_url(self, url): + """Opens a URL in the default web browser.""" + webbrowser.open(url) + if __name__ == "__main__": app = App() app.run() + diff --git a/outdated_readme.md b/outdated_readme.md deleted file mode 100644 index 9dbf5de..0000000 --- a/outdated_readme.md +++ /dev/null @@ -1,23 +0,0 @@ - 0. Prerequisites: One of the following Python versions installed on your system: 3.{7,8,9,10,11,12}. - -1. Clone the repository (or download it as .zip from this page) and navigate into the folder: -```bash -git clone https://github.com/evermoving/SystemCaptioner -cd SystemCaptioner -``` -2. Create a virtual environment inside the cloned repo: -```bash -python -m venv venv -``` -3. Activate the virtual environment: -```bash -.\venv\Scripts\activate -``` -4. Install the dependencies: -```bash -pip install -r requirements.txt -``` -5. Download nvidia_dependencies zip from the releases section and extract it into folder where main.py is, i.e. `/SystemCaptioner/nvidia_dependencies/` -6. Launch main.py while in virtual environment: -```bash -python main.py \ No newline at end of file diff --git a/pyaudiowpatchdocu.txt b/pyaudiowpatchdocu.txt deleted file mode 100644 index d51acc5..0000000 --- a/pyaudiowpatchdocu.txt +++ /dev/null @@ -1,118 +0,0 @@ -PyAudioWPatch - -This fork will allow you to use the WASAPI device as loopback using PyAudio. -So you can use speakers to record audio ✨ - -Last Commit Wheels Downloads Py Version Latest release - - - -For whom? - -If you want to record sound from speakers in python, then this fork is for you. 
You can get recording from any device that supports WASAPI, for example, you can even record audio from Bluetooth headphones🎧 - - PyAudioW(indows|ASAPI)Patch come only with WMME, DirectX and WASAPI support if you need more -> create an issue - -How - -The Windows Audio Session API (WASAPI) allows you to use output devices (that support this API) in loopback mode. At the time of release, it was impossible to achieve this using the original version of PyAudio. - - Note: Now WASAPI loopback devices are duplicated at the end of the list as virtual devices. That is, to record from speakers, you need to use not just a WASAPI device, but its loopback analogue. All loopback devices are input devices. - -How to use - -Read -> Install -> Enjoy! ↣ Press ⭐ -Installation - -pip install PyAudioWPatch - - Wheels are available for Windows, Python 3.{7,8,9,10,11,12}. - All wheels support APIs: WMME, WASAPI, DirectX(DSound). - -In code - -With new features: - -import pyaudiowpatch as pyaudio - -with pyaudio.PyAudio() as p: - # Open PyA manager via context manager - with p.open(...) as stream: - # Open audio stream via context manager - # Do some stuff - ... - -Or in original PyAudio way: - -import pyaudiowpatch as pyaudio - -p = pyaudio.PyAudio() -stream = p.open(...) - -# Do some stuff -... 
- -stream.stop_stream() -stream.close() - -# close PyAudio -p.terminate() - -Difference with PyAudio - - The behavior of all standard methods is unchanged - Added several life-improving methods - Fixed problem with name encoding - Ability to record audio from WASAPI loopback devices (see example) - -More detailed - - new methods: - get_host_api_info_generator - Iterate over all Host APIs - get_device_info_generator - Iterate over all devices - get_device_info_generator_by_host_api - Iterate over all devices, by specific Host API(index/type) - get_loopback_device_info_generator - Iterate over all devices(with loopback mode) - print_detailed_system_info - Print some info about Host Api and devices - get_default_wasapi_loopback - Return loopback for default speakers - get_wasapi_loopback_analogue_by_index - Return loopback for device via index - get_wasapi_loopback_analogue_by_dict - Return loopback for device related to info_dict - get_default_wasapi_device - Return default (out/in)put device for WASAPI driver - - new features: - Context manager support, for PyAudio(manager) and Stream classes - Run python -m pyaudiowpatch to get list of devices(like print_detailed_system_info call) - -Examples: - - 🆕 Sequential recording from speakers - Play sine, using 'new context manager' - Record audio from default speakers - Simple recording app - Cross-platform concept (Not example) - -Sources - -The following were taken as a basis: - - PortAudio v19 [8b6d16f26ad660e68a97743842ac29b939f3c0c1] - PyAudio v0.2.12 - -How to build manually -Build PortAudio (using the instructions in the README) -Install python -run in the PyAudioWPatch directory: - -python setup.py install - - ??? - Profit. 
- -Also you can build wheels: - - pip install cibuildwheel - Run in Cygwin: - - ./cygwin_cibuildwheel_build.sh - - Get your wheels in the ./wheelhouse folder - diff --git a/recorder.py b/recorder.py index ce39820..e005b2e 100644 --- a/recorder.py +++ b/recorder.py @@ -14,25 +14,27 @@ config = configparser.ConfigParser() config.read("config.ini") -# Convert the sample rate to a float first, then to an integer -SAMPLE_RATE = int(float(config.get('Settings', 'sample_rate', fallback='44100'))) # Default to 44100 if not set +# Get the sample rate from the config, defaulting to 16000 +SAMPLE_RATE = int(float(config.get('Settings', 'sample_rate', fallback='16000'))) # Constants -CHUNK = 2048 # Number of frames per buffer -FORMAT = pyaudio.paInt16 # 16-bit resolution -CHANNELS = 2 # Stereo -RECORD_SECONDS = 3 # Record in 2-second intervals -OUTPUT_DIR = "recordings" # Directory to save recordings -MAX_FILES = 100 # Maximum number of files to keep - -def get_default_loopback_device(p): +CHUNK = 2048 +FORMAT = pyaudio.paInt16 +CHANNELS = 2 +RECORD_SECONDS = 3 +OUTPUT_DIR = "recordings" +MAX_FILES = 100 + + +def get_default_loopback_device(audio_interface): """Get the default loopback device.""" - return p.get_default_wasapi_loopback() + return audio_interface.get_default_wasapi_loopback() + def save_audio(frames, filename): """Save the recorded audio frames to a WAV file.""" - if not frames: # Check if frames is empty - print(f"Warning: No audio data to save for {filename}") + if not frames: + logger.warning(f"No audio data to save for {filename}") return with wave.open(filename, 'wb') as wf: wf.setnchannels(CHANNELS) @@ -40,25 +42,26 @@ def save_audio(frames, filename): wf.setframerate(SAMPLE_RATE) wf.writeframes(b''.join(frames)) + def cleanup_old_files(): """Delete old WAV files, keeping only the most recent MAX_FILES.""" files = [f for f in os.listdir(OUTPUT_DIR) if f.endswith('.wav')] files.sort(key=lambda x: os.path.getmtime(os.path.join(OUTPUT_DIR, x)), reverse=True) 
- + for old_file in files[MAX_FILES:]: os.remove(os.path.join(OUTPUT_DIR, old_file)) - print(f"Deleted old file: {old_file}") + logger.info(f"Deleted old file: {old_file}") + def get_audio_devices(): """Get all available WASAPI loopback devices.""" devices = [] try: - p = pyaudio.PyAudio() - wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI) - - for i in range(p.get_device_count()): - device_info = p.get_device_info_by_index(i) - # Check if the device is a loopback device + audio_interface = pyaudio.PyAudio() + wasapi_info = audio_interface.get_host_api_info_by_type(pyaudio.paWASAPI) + + for i in range(audio_interface.get_device_count()): + device_info = audio_interface.get_device_info_by_index(i) if device_info.get('hostApi') == wasapi_info['index'] and device_info.get('isLoopbackDevice', False): devices.append({ 'index': i, @@ -67,12 +70,13 @@ def get_audio_devices(): 'maxInputChannels': device_info.get('maxInputChannels', 2) }) logger.info(f"Found loopback audio device: {device_info.get('name')} (Index: {i})") - - p.terminate() + + audio_interface.terminate() except Exception as e: logger.error(f"Error getting audio devices: {e}") return devices + def record_audio(device_index=None): """Record audio from the specified or default speaker and save it to a file.""" if not os.path.exists(OUTPUT_DIR): @@ -80,30 +84,27 @@ def record_audio(device_index=None): logger.info(f"Created output directory: {OUTPUT_DIR}") try: - with pyaudio.PyAudio() as p: - # Get the specified or default loopback device + with pyaudio.PyAudio() as audio_interface: if device_index is not None: - device_info = p.get_device_info_by_index(device_index) + device_info = audio_interface.get_device_info_by_index(device_index) logger.info(f"Using selected device: {device_info.get('name')} (Index: {device_index})") else: - device_info = get_default_loopback_device(p) + device_info = get_default_loopback_device(audio_interface) device_index = device_info['index'] logger.info(f"Using default 
loopback device: {device_info.get('name')} (Index: {device_index})") - # Log device properties logger.info(f"Device properties: {device_info}") - + try: - # Open the stream with the detected sample rate - stream = p.open(format=FORMAT, - channels=CHANNELS, - rate=SAMPLE_RATE, - input=True, - frames_per_buffer=CHUNK, - input_device_index=device_index) - + stream = audio_interface.open(format=FORMAT, + channels=CHANNELS, + rate=SAMPLE_RATE, + input=True, + frames_per_buffer=CHUNK, + input_device_index=device_index) + logger.info("Audio stream opened successfully") - + while True: frames = [] for _ in range(0, int(SAMPLE_RATE / CHUNK * RECORD_SECONDS)): @@ -129,5 +130,7 @@ def record_audio(device_index=None): logger.error(f"Critical error in record_audio: {e}") raise + if __name__ == "__main__": record_audio() + diff --git a/requirements.txt b/requirements.txt index e7443dd..8f0846c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -pyaudiowpatch -faster-whisper -customtkinter -soundfile -pyaudio +pyaudiowpatch==0.2.12.6 +faster-whisper==1.1.0 +customtkinter==5.2.2 +soundfile==0.12.1 +pyaudio==0.2.14 diff --git a/transcriber.py b/transcriber.py index 7f96e4c..c0dc415 100644 --- a/transcriber.py +++ b/transcriber.py @@ -1,9 +1,9 @@ import time import os +import re import configparser from faster_whisper import WhisperModel import queue # New import -from gui import SubtitleGUI # New import import soundfile as sf import concurrent.futures @@ -31,43 +31,74 @@ def initialize_model(device): """ print(f"Loading model: {MODEL_SIZE} on {device}", flush=True) model = WhisperModel(MODEL_SIZE, device=device) - print("Model loaded.", flush=True) + print(f"Model {MODEL_SIZE} loaded.", flush=True) return model -def transcribe_audio(model, audio_path): +def transcribe_audio(model, audio_path, translation_enabled, source_language): """ Transcribe the given audio file using the preloaded Faster Whisper model. 
""" - print(f"Starting transcription for {audio_path}...", flush=True) + print(f"Transcribing {audio_path} with source language {source_language}...", flush=True) + try: with sf.SoundFile(audio_path) as sound_file: if sound_file.frames == 0: - print(f"Warning: Empty audio file: {audio_path}") + print(f"Warning: Empty audio file: {audio_path}", flush=True) return "" except Exception as e: - print(f"Error reading audio file {audio_path}: {e}") + print(f"Error reading audio file {audio_path}: {e}", flush=True) return "" - segments, _ = model.transcribe(audio_path, beam_size=1, vad_filter=True, word_timestamps=True) - transcription = " ".join(segment.text for segment in segments) - print("Transcription completed.", flush=True) - return transcription.strip() + try: + # print(f"Translation mode: {'enabled' if translation_enabled else 'disabled'}", flush=True) + # print(f"Source language: {source_language}", flush=True) + + # Build the arguments dictionary + transcribe_args = { + "task": "translate" if translation_enabled else "transcribe", + "beam_size": 1, + "vad_filter": True, + "word_timestamps": True, + # "initial_prompt": "", + # "suppress_tokens": [-1, 50363, 50364] + } + + # Add the language argument only if source_language is not an empty string + if source_language: + transcribe_args["language"] = source_language -def save_transcription(transcription, output_path): + # Pass audio_path as a positional argument and unpack the other arguments + segments, _ = model.transcribe(audio_path, **transcribe_args) + + transcription = " ".join(segment.text for segment in segments) + print("Whisper processing completed.", flush=True) + return transcription.strip() + except Exception as e: + print(f"Error during processing: {e}", flush=True) + return "" + +def save_transcription(transcription, output_path, filter_hallucinations, store_output): """ Save the transcription text to a file and send it to the GUI. Args: transcription (str): The transcribed text. 
output_path (str): Path to the output transcription file. + filter_hallucinations (bool): Whether to filter hallucinations. + store_output (bool): Whether to store the output in a file. """ - with open(output_path, "a") as f: - f.write(transcription + "\n") - print(f"Transcription saved to {output_path}", flush=True) + if filter_hallucinations: + transcription = filter_hallucination_content(transcription) + + if store_output: + with open(output_path, "a", encoding='utf-8') as f: + f.write(transcription + "\n") + print(f"Output saved to {output_path}", flush=True) + # Send transcription to GUI queue transcription_queue.put(transcription) -def monitor_audio_file(input_dir, output_path, check_interval=0.5, device="cuda"): +def monitor_audio_file(input_dir, output_path, check_interval=0.5, device="cuda", args=None): """ Continuously monitor the directory for new audio files and transcribe them. @@ -76,26 +107,87 @@ def monitor_audio_file(input_dir, output_path, check_interval=0.5, device="cuda" output_path (str): Path to save the transcriptions. check_interval (int): Time in seconds between checks. device (str): Device to use for transcription ('cuda' or 'cpu'). + args (argparse.Namespace): Parsed command-line arguments for translation toggle and source language. 
""" processed_files = set() model = initialize_model(device) - executor = concurrent.futures.ThreadPoolExecutor(max_workers=4) # Allows parallel processing + print(f"Using {args.workers} workers thread...", flush=True) + executor = concurrent.futures.ThreadPoolExecutor(max_workers=args.workers) # Allows parallel processing + + print(f"Starting transcribe_and_save with translation_enabled: {args.translation_enabled} | source_language: \"{args.source_language}\" | filter_hallucinations: {args.filter_hallucinations} | store_output: {args.store_output}...", flush=True) + while True: for filename in os.listdir(input_dir): file_path = os.path.join(input_dir, filename) if file_path not in processed_files: - executor.submit(transcribe_and_save, model, file_path, output_path) + executor.submit(transcribe_and_save, model, file_path, output_path, args.translation_enabled, args.source_language, args.filter_hallucinations, args.store_output) processed_files.add(file_path) time.sleep(check_interval) -def transcribe_and_save(model, file_path, output_path): + +def filter_hallucination_content(input_string): + """ + Filters out blacklisted words or sentences from an input string based on a blacklist file. + + Args: + input_string (str): The input string to be filtered. + + Returns: + str: The filtered string with blacklisted words/sentences removed and cleaned. 
+ """ + try: + # Read the blacklist from the file + with open('filter_hallucinations.txt', 'r', encoding='utf-8') as file: + blacklisted_lines = sorted( + [line.strip().lower() for line in file if line.strip()], + key=len, + reverse=True # Sort by length in descending order + ) + + filtered_string = input_string + + # Check for and remove blacklisted words/sentences recursively + while True: + initial_string = filtered_string + for blacklisted in blacklisted_lines: + pattern = re.compile(re.escape(blacklisted), re.IGNORECASE) + matches = pattern.findall(filtered_string) + if matches: + for match in matches: + print(f"Detected blacklisted text: '{match}'", flush=True) + filtered_string = pattern.sub('', filtered_string) + + # If no changes were made, exit the loop + if initial_string == filtered_string: + break + + # Remove awkward spaces (e.g., extra spaces between words) + filtered_string = re.sub(r'\s+', ' ', filtered_string).strip() + + # Remove unnecessary new lines + filtered_string = re.sub(r'\s*\n\s*', ' ', filtered_string).strip() + + if filtered_string == ".": + return "" + + # print(f"String '{input_string}' filtered as hallucination text detected", flush=True) + + return filtered_string + + except Exception as e: + print(f"Returning unfiltered string. 
Error encountered: {e}", flush=True) + return input_string + + +def transcribe_and_save(model, file_path, output_path, translation_enabled, source_language, filter_hallucinations, store_output): try: - print(f"Transcribing {file_path}...", flush=True) - transcription = transcribe_audio(model, file_path) + print(f"Transcribing/translating {file_path}...", flush=True) + transcription = transcribe_audio(model, file_path, translation_enabled, source_language) if transcription: - save_transcription(transcription, output_path) + save_transcription(transcription, output_path, filter_hallucinations, store_output) except Exception as e: - print(f"Can't transcribe audio chunk {file_path}: {e}", flush=True) + print(f"Can't transcribe/translate audio chunk {file_path}: {e}", flush=True) if __name__ == "__main__": monitor_audio_file(AUDIO_INPUT_DIR, TRANSCRIPTION_OUTPUT) +