#!/usr/bin/env python3
"""
Voice Recording and Transcription Script
Records audio from microphone and converts it to text using OpenAI Whisper
"""

import json
import os
import re
import sys
import tempfile
import threading
import tkinter as tk
import warnings
import wave
from datetime import datetime
from pathlib import Path
from tkinter import ttk, scrolledtext, messagebox

import numpy as np
import pyperclip
import pystray
import sounddevice as sd
import whisper
from PIL import Image, ImageDraw
from pynput import keyboard

# Suppress common PyTorch/Whisper warnings
warnings.filterwarnings("ignore", message="FP16 is not supported on CPU")
warnings.filterwarnings("ignore", message=".*FP16.*")
warnings.filterwarnings("ignore", category=UserWarning, module="whisper")

# Set environment variable to reduce PyTorch verbosity
os.environ["TOKENIZERS_PARALLELISM"] = "false"


class PromptProcessor:
    """Processes transcribed text to create better Claude Code prompts"""

    def __init__(self):
        # Ordered (regex, replacement) pairs. A replacement is either a
        # template string or a callable taking the match object. Patterns are
        # applied sequentially, so later patterns see earlier substitutions.
        self.patterns = [
            # Agent references
            (r'\buse agent ([\w-]+)\b', r'@agent \1'),
            (r'\blaunch agent ([\w-]+(?:\s+[\w-]+)*)\b',
             lambda m: f"@agent {m.group(1).replace(' ', '-')}"),
            (r'\bcall agent ([\w-]+(?:\s+[\w-]+)*)\b',
             lambda m: f"@agent {m.group(1).replace(' ', '-')}"),

            # Tool references
            (r'\brun tool (\w+)\b', r'@tool \1'),
            (r'\bcall the (\w+) tool\b', r'@tool \1'),
            (r'\buse the (\w+) tool\b', r'@tool \1'),

            # Directory references
            (r'\bdirectory ([\w/\\.-]+)\b', r'@dir \1/'),
            (r'\bfolder ([\w/\\.-]+)\b', r'@dir \1/'),
            (r'\bthe ([\w.-]+) directory\b', r'@dir \1/'),

            # File references
            (r'\bfile ([\w/\\.-]+\.[\w]+)\b', r'@file \1'),
            (r'\bthe ([\w.-]+\.[\w]+) file\b', r'@file \1'),
            (r'\breadme file\b', '@file README.md'),
            (r'\bpackage json\b', '@file package.json'),

            # Code elements
            (r'\bfunction ([\w_]+)\b', r'`\1()` function'),
            (r'\bclass ([\w_]+)\b', r'`\1` class'),
            (r'\bvariable ([\w_]+)\b', r'`\1` variable'),
            (r'\bmethod ([\w_]+)\b', r'`\1()` method'),

            # Task management
            (r'\badd to todo\b', 'add to todo:'),
            (r'\bnew task\b', 'new todo:'),
            (r'\bmark complete\b', 'mark todo complete'),
            (r'\bmark done\b', 'mark todo complete'),

            # Commands (matching is case-insensitive, so these normalize case)
            (r'\brun tests\b', 'run tests'),
            (r'\bcommit changes\b', 'commit changes'),
            (r'\bcreate pull request\b', 'create PR'),
            (r'\binstall dependencies\b', 'install dependencies'),
        ]

    def process(self, text):
        """Process raw transcription into a Claude Code prompt

        Args:
            text: Raw transcription text.

        Returns:
            The text with surrounding whitespace removed, every pattern in
            ``self.patterns`` applied case-insensitively, the first character
            upper-cased and a trailing '.' appended unless the text already
            ends in '.', '!', '?' or ':'. Empty input yields an empty string.
        """
        processed = text.strip()

        # re.sub accepts both template strings and callables as replacement
        for pattern, replacement in self.patterns:
            processed = re.sub(pattern, replacement, processed, flags=re.IGNORECASE)

        # Capitalize first letter and ensure proper punctuation
        if processed:
            processed = processed[0].upper() + processed[1:]
            if not processed.endswith(('.', '!', '?', ':')):
                processed += '.'

        return processed


class SettingsManager:
    """Manages application settings with JSON persistence"""

    def __init__(self):
        # Relative path: the file lives in the current working directory
        self.settings_file = Path('voice_to_text_settings.json')
        self.default_settings = {
            'hotkey': 'f1',
            'always_on_top': False,
            'minimize_to_tray': True,
            'whisper_model': 'base',
            'window_geometry': '600x500',
            'auto_copy_clipboard': True
        }
        self.settings = self.load_settings()

    def load_settings(self):
        """Load settings from JSON file or create defaults

        Returns:
            A new dict holding the defaults overlaid with the stored values.
            Falls back to a copy of the defaults when the file is missing,
            unreadable, not valid JSON, or does not contain a JSON object.
        """
        try:
            with open(self.settings_file, 'r', encoding='utf-8') as f:
                stored = json.load(f)
        except FileNotFoundError:
            return self.default_settings.copy()
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError
            print(f"Error loading settings: {e}")
            return self.default_settings.copy()

        if not isinstance(stored, dict):
            print("Error loading settings: settings file does not contain a JSON object")
            return self.default_settings.copy()

        # Merge with defaults to handle new settings
        return {**self.default_settings, **stored}

    def save_settings(self):
        """Save current settings to JSON file (best effort; errors are printed)"""
        try:
            with open(self.settings_file, 'w', encoding='utf-8') as f:
                json.dump(self.settings, f, indent=2)
        except (OSError, TypeError, ValueError) as e:
            print(f"Error saving settings: {e}")

    def get(self, key, default=None):
        """Get a setting value"""
        return self.settings.get(key, default)

    def set(self, key, value):
        """Set a setting value and save"""
        self.settings[key] = value
        self.save_settings()

    def update(self, values):
        """Set several settings from a mapping and save once"""
        self.settings.update(values)
        self.save_settings()


class VoiceRecorder:
    """Records microphone audio and transcribes it with Whisper"""

    def __init__(self, sample_rate=16000, channels=1, settings_manager=None):
        self.sample_rate = sample_rate
        self.channels = channels
        self.recording = False
        self.audio_data = []
        self.processor = PromptProcessor()
        self.settings = settings_manager or SettingsManager()

        # Loaded Whisper model, cached so repeated transcriptions do not
        # reload it; _model_name records which model is cached.
        self._model = None
        self._model_name = None

        # Ensure transcripts directory exists
        self.transcripts_dir = Path('transcripts')
        self.transcripts_dir.mkdir(exist_ok=True)

    def load_model(self, model_name=None):
        """Return the Whisper model, loading it only when needed

        Args:
            model_name: Whisper model name. If None, the 'whisper_model'
                setting is used (default 'base').

        Returns:
            The loaded model. It is reloaded only when the requested name
            differs from the cached one.
        """
        name = model_name or self.settings.get('whisper_model', 'base')
        if self._model is None or self._model_name != name:
            self._model = whisper.load_model(name)
            self._model_name = name
        return self._model

    def transcribe(self, audio, model=None):
        """Transcribe recorded audio

        Args:
            audio: Float audio samples as accepted by _save_to_temp_file.
            model: Whisper model to use. If None, load_model() is used.

        Returns:
            Tuple (raw_text, processed_text).
        """
        if model is None:
            model = self.load_model()

        temp_file = self._save_to_temp_file(audio)
        try:
            result = model.transcribe(temp_file)
        finally:
            # Clean up temporary file
            os.unlink(temp_file)

        raw_text = result["text"]
        return raw_text, self.processor.process(raw_text)

    def save_transcription(self, raw_text, processed_text):
        """Write a transcription to a timestamped file in transcripts_dir

        Returns:
            Path of the written file.

        Raises:
            OSError: If the directory or file cannot be written.
        """
        now = datetime.now()
        self.transcripts_dir.mkdir(parents=True, exist_ok=True)
        output_file = self.transcripts_dir / f"transcription_{now.strftime('%Y%m%d_%H%M%S')}.txt"

        separator = "=" * 60 + "\n"
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(f"Transcription - {now.strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(separator)
            f.write("RAW TRANSCRIPTION:\n")
            f.write(separator)
            f.write(raw_text + "\n\n")
            f.write(separator)
            f.write("PROCESSED PROMPT:\n")
            f.write(separator)
            f.write(processed_text)

        return output_file

    def record_audio(self, duration=None):
        """
        Record audio from microphone, transcribe it and print the result

        The processed prompt is copied to the clipboard and both texts are
        saved to a file in the transcripts directory.

        Args:
            duration: Recording duration in seconds. If None, records until Enter is pressed
        """
        # Load the model up front so transcription starts right after recording
        print("Loading Whisper model...")
        model = self.load_model()

        if duration:
            print(f"Recording for {duration} seconds...")
            audio = sd.rec(int(duration * self.sample_rate),
                           samplerate=self.sample_rate,
                           channels=self.channels,
                           dtype=np.float32)
            sd.wait()
            print("Recording complete!")
        else:
            audio = self._record_until_enter()
            if audio is None:
                print("No audio recorded.")
                return

        # Transcribe with Whisper
        print("Transcribing...")
        raw_text, processed_text = self.transcribe(audio, model=model)

        # Display results
        print("\n" + "=" * 50)
        print("RAW TRANSCRIPTION:")
        print("=" * 50)
        print(raw_text)
        print("\n" + "=" * 50)
        print("PROCESSED PROMPT:")
        print("=" * 50)
        print(processed_text)
        print("=" * 50)

        # Copy processed text to clipboard
        try:
            pyperclip.copy(processed_text)
            print("\n✓ Processed prompt copied to clipboard!")
            print("You can now paste it directly into Claude Code.")
        except Exception as e:
            print(f"\n⚠ Could not copy to clipboard: {e}")
            print("Please copy the processed text manually.")

        # Save to file in transcripts directory
        output_file = self.save_transcription(raw_text, processed_text)
        print(f"\nTranscription saved to: {output_file}")

    def _record_until_enter(self):
        """Record from the microphone until Enter is pressed

        Returns:
            The concatenated audio chunks, or None if nothing was captured.
        """
        print("Recording... Press Enter to stop.")
        self.recording = True
        self.audio_data = []

        def callback(indata, frames, time_info, status):
            if self.recording:
                self.audio_data.append(indata.copy())

        try:
            with sd.InputStream(callback=callback,
                                samplerate=self.sample_rate,
                                channels=self.channels,
                                dtype=np.float32):
                input()  # Wait for Enter key
        finally:
            self.recording = False

        if not self.audio_data:
            return None
        return np.concatenate(self.audio_data, axis=0)

    def _save_to_temp_file(self, audio_data):
        """Save audio data to temporary WAV file

        Args:
            audio_data: Float samples; values are expected in [-1.0, 1.0]
                and anything outside that range is clipped.

        Returns:
            Path of the 16-bit WAV file. The caller must delete it.
        """
        # mkstemp creates the file atomically; mktemp only returns a name,
        # which is race-prone and deprecated.
        fd, temp_file = tempfile.mkstemp(suffix=".wav")
        os.close(fd)  # reopened by name below (required on Windows)

        try:
            with wave.open(temp_file, 'wb') as wf:
                wf.setnchannels(self.channels)
                wf.setsampwidth(2)  # 16-bit
                wf.setframerate(self.sample_rate)

                # Convert float32 to int16; clip first so out-of-range
                # samples saturate instead of wrapping around.
                audio_int16 = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)
                wf.writeframes(audio_int16.tobytes())
        except Exception:
            os.unlink(temp_file)
            raise

        return temp_file


class SettingsDialog:
    """Settings dialog for configuring the voice recorder"""

    WIDTH = 400
    HEIGHT = 300

    def __init__(self, parent, settings_manager, apply_callback):
        self.settings = settings_manager
        self.apply_callback = apply_callback

        # Create dialog window
        self.dialog = tk.Toplevel(parent)
        self.dialog.title("Settings")
        self.dialog.geometry(f"{self.WIDTH}x{self.HEIGHT}")
        self.dialog.resizable(False, False)
        self.dialog.transient(parent)
        self.dialog.grab_set()

        # Center the dialog
        self.dialog.update_idletasks()
        x = (self.dialog.winfo_screenwidth() // 2) - (self.WIDTH // 2)
        y = (self.dialog.winfo_screenheight() // 2) - (self.HEIGHT // 2)
        self.dialog.geometry(f"{self.WIDTH}x{self.HEIGHT}+{x}+{y}")

        self.create_widgets()

    def create_widgets(self):
        """Create the settings widgets"""
        main_frame = ttk.Frame(self.dialog, padding="20")
        main_frame.pack(fill=tk.BOTH, expand=True)

        # Hotkey setting
        ttk.Label(main_frame, text="Global Hotkey:",
                  font=('Arial', 10, 'bold')).pack(anchor=tk.W, pady=(0, 5))
        hotkey_frame = ttk.Frame(main_frame)
        hotkey_frame.pack(fill=tk.X, pady=(0, 15))

        self.hotkey_var = tk.StringVar(value=self.settings.get('hotkey', 'f1'))
        hotkey_combo = ttk.Combobox(hotkey_frame, textvariable=self.hotkey_var,
                                    values=[f'f{n}' for n in range(1, 13)],
                                    state="readonly", width=10)
        hotkey_combo.pack(side=tk.LEFT)
        ttk.Label(hotkey_frame,
                  text="(Press this key anywhere to start/stop recording)").pack(
                      side=tk.LEFT, padx=(10, 0))

        # Whisper model setting
        ttk.Label(main_frame, text="Whisper Model:",
                  font=('Arial', 10, 'bold')).pack(anchor=tk.W, pady=(0, 5))
        model_frame = ttk.Frame(main_frame)
        model_frame.pack(fill=tk.X, pady=(0, 15))

        self.model_var = tk.StringVar(value=self.settings.get('whisper_model', 'base'))
        model_combo = ttk.Combobox(model_frame, textvariable=self.model_var,
                                   values=['tiny', 'base', 'small', 'medium', 'large', 'turbo'],
                                   state="readonly", width=10)
        model_combo.pack(side=tk.LEFT)
        ttk.Label(model_frame,
                  text="(tiny=fastest, large=most accurate)").pack(side=tk.LEFT, padx=(10, 0))

        # Boolean settings
        ttk.Label(main_frame, text="Options:",
                  font=('Arial', 10, 'bold')).pack(anchor=tk.W, pady=(15, 5))

        self.always_on_top_var = tk.BooleanVar(value=self.settings.get('always_on_top', False))
        ttk.Checkbutton(main_frame, text="Keep window always on top",
                        variable=self.always_on_top_var).pack(anchor=tk.W, pady=2)

        self.minimize_to_tray_var = tk.BooleanVar(
            value=self.settings.get('minimize_to_tray', True))
        ttk.Checkbutton(main_frame, text="Minimize to system tray when closed",
                        variable=self.minimize_to_tray_var).pack(anchor=tk.W, pady=2)

        self.auto_copy_var = tk.BooleanVar(value=self.settings.get('auto_copy_clipboard', True))
        ttk.Checkbutton(main_frame, text="Automatically copy processed text to clipboard",
                        variable=self.auto_copy_var).pack(anchor=tk.W, pady=2)

        # Buttons
        button_frame = ttk.Frame(main_frame)
        button_frame.pack(side=tk.BOTTOM, fill=tk.X, pady=(20, 0))

        ttk.Button(button_frame, text="Cancel",
                   command=self.cancel).pack(side=tk.RIGHT, padx=(5, 0))
        ttk.Button(button_frame, text="Apply", command=self.apply).pack(side=tk.RIGHT)

    def apply(self):
        """Apply the settings"""
        # Update all settings with a single write to disk
        self.settings.update({
            'hotkey': self.hotkey_var.get(),
            'whisper_model': self.model_var.get(),
            'always_on_top': self.always_on_top_var.get(),
            'minimize_to_tray': self.minimize_to_tray_var.get(),
            'auto_copy_clipboard': self.auto_copy_var.get(),
        })

        # Call the apply callback
        if self.apply_callback:
            self.apply_callback()

        self.dialog.destroy()

    def cancel(self):
        """Cancel the dialog"""
        self.dialog.destroy()


class VoiceRecorderGUI:
    """GUI version of the voice recorder with hotkey support"""

    # Delay before a resize/move is persisted, so dragging the window does
    # not rewrite the settings file on every intermediate event.
    GEOMETRY_SAVE_DELAY_MS = 500

    def __init__(self):
        self.settings = SettingsManager()
        self.recorder = VoiceRecorder(settings_manager=self.settings)
        self.is_recording = False
        self.hotkey_listener = None
        self.tray_icon = None
        self.is_closing = False

        self._tray_running = False            # True while tray_icon.run is active
        self._stop_event = threading.Event()  # signals the current recording to end
        self._geometry_save_job = None        # pending `after` id for geometry save

        # Create main window
        self.root = tk.Tk()
        self.root.title("Voice to Text Converter")
        geometry = self.settings.get('window_geometry', '600x500')
        self.root.geometry(geometry)
        self.root.resizable(True, True)

        # Set always on top if enabled
        if self.settings.get('always_on_top', False):
            self.root.wm_attributes('-topmost', True)

        # Set up tray first (before UI setup)
        if self.settings.get('minimize_to_tray', True):
            self.setup_tray()

        self.setup_ui()
        self.setup_hotkey()

    def _hotkey_label(self):
        """Return the configured hotkey in upper case for display"""
        return self.settings.get('hotkey', 'f1').upper()

    def _ready_text(self):
        """Return the idle status message"""
        return f"Ready to record (Press {self._hotkey_label()} or click Record)"

    def _tip_text(self):
        """Return the hotkey tip shown at the bottom of the window"""
        return f"💡 Tip: Press {self._hotkey_label()} anywhere to start/stop recording"

    def _set_status(self, text):
        """Schedule a status label update on the Tk main thread"""
        self.root.after(0, lambda: self.status_label.config(text=text))

    def setup_ui(self):
        """Set up the GUI elements"""
        # Main frame
        main_frame = ttk.Frame(self.root, padding="10")
        main_frame.grid(row=0, column=0, sticky="nsew")

        # Configure grid weights; row 3 holds the results frame, which is
        # the part that should absorb extra vertical space.
        self.root.columnconfigure(0, weight=1)
        self.root.rowconfigure(0, weight=1)
        main_frame.columnconfigure(1, weight=1)
        main_frame.rowconfigure(3, weight=1)

        # Configure button styles
        style = ttk.Style()
        style.configure("Record.TButton", font=('Arial', 12, 'bold'))
        style.configure("Stop.TButton", font=('Arial', 12, 'bold'), foreground="red")

        # Title
        title_label = ttk.Label(main_frame, text="Voice to Text Converter",
                                font=('Arial', 16, 'bold'))
        title_label.grid(row=0, column=0, columnspan=2, pady=(0, 20))

        # Record button
        self.record_button = ttk.Button(main_frame, text="🎤 Record",
                                        command=self.toggle_recording,
                                        style="Record.TButton")
        self.record_button.grid(row=1, column=0, columnspan=2, pady=10, sticky="ew")

        # Status label
        self.status_label = ttk.Label(main_frame, text=self._ready_text(),
                                      font=('Arial', 10))
        self.status_label.grid(row=2, column=0, columnspan=2, pady=(0, 10))

        # Results frame
        results_frame = ttk.LabelFrame(main_frame, text="Transcription Results", padding="10")
        results_frame.grid(row=3, column=0, columnspan=2, sticky="nsew", pady=10)
        results_frame.columnconfigure(0, weight=1)
        results_frame.rowconfigure(1, weight=1)
        results_frame.rowconfigure(3, weight=1)

        # Raw transcription
        ttk.Label(results_frame, text="Raw Transcription:",
                  font=('Arial', 10, 'bold')).grid(row=0, column=0, sticky=tk.W)
        self.raw_text = scrolledtext.ScrolledText(results_frame, height=6, width=70)
        self.raw_text.grid(row=1, column=0, sticky="nsew", pady=(5, 10))

        # Processed transcription
        ttk.Label(results_frame, text="Processed Prompt (Copied to Clipboard):",
                  font=('Arial', 10, 'bold')).grid(row=2, column=0, sticky=tk.W)
        self.processed_text = scrolledtext.ScrolledText(results_frame, height=6, width=70)
        self.processed_text.grid(row=3, column=0, sticky="nsew", pady=(5, 0))

        # Control buttons frame
        controls_frame = ttk.Frame(main_frame)
        controls_frame.grid(row=4, column=0, columnspan=2, pady=10, sticky="ew")
        for column in range(3):
            controls_frame.columnconfigure(column, weight=1)

        # Always on top toggle
        self.always_on_top_var = tk.BooleanVar(value=self.settings.get('always_on_top', False))
        always_on_top_cb = ttk.Checkbutton(controls_frame, text="Always on Top",
                                           variable=self.always_on_top_var,
                                           command=self.toggle_always_on_top)
        always_on_top_cb.grid(row=0, column=0, sticky="w")

        # Settings button
        settings_btn = ttk.Button(controls_frame, text="⚙️ Settings",
                                  command=self.open_settings)
        settings_btn.grid(row=0, column=1, padx=5)

        # Minimize to tray button (if tray enabled and available)
        if self.settings.get('minimize_to_tray', True) and self.tray_icon:
            tray_btn = ttk.Button(controls_frame, text="📌 Minimize to Tray",
                                  command=self.minimize_to_tray)
            tray_btn.grid(row=0, column=2, sticky="e")

        # Hotkey info; kept as an attribute so apply_settings can update it
        self.info_label = ttk.Label(main_frame, text=self._tip_text(),
                                    font=('Arial', 9), foreground="gray")
        self.info_label.grid(row=5, column=0, columnspan=2, pady=10)

    def setup_hotkey(self):
        """Set up global hotkey listener"""
        def on_hotkey():
            # Schedule the toggle in the main thread
            self.root.after(0, self.toggle_recording)

        # Get hotkey from settings
        hotkey = self.settings.get('hotkey', 'f1')

        # Start hotkey listener in background thread
        self.hotkey_listener = keyboard.GlobalHotKeys({
            f'<{hotkey}>': on_hotkey
        })
        self.hotkey_listener.start()

    def _on_tk_thread(self, func):
        """Wrap func as a pystray menu action that runs on the Tk main thread

        pystray invokes menu actions on the tray's own thread, while Tk
        widgets must only be touched from the thread running mainloop.
        """
        def handler(icon=None, item=None):
            self.root.after(0, func)
        return handler

    def _create_tray_icon(self):
        """Build a new pystray icon with the application menu"""
        # Create a simple icon (avoid emoji text which can cause issues)
        image = Image.new('RGB', (64, 64), color='blue')
        draw = ImageDraw.Draw(image)
        draw.ellipse([16, 16, 48, 48], fill='white')
        draw.ellipse([24, 24, 40, 40], fill='blue')  # Simple microphone representation

        # Create tray menu
        menu = pystray.Menu(
            pystray.MenuItem('Show', self._on_tk_thread(self.show_window)),
            pystray.MenuItem('Record', self._on_tk_thread(self.toggle_recording)),
            pystray.MenuItem('Settings', self._on_tk_thread(self.open_settings)),
            pystray.MenuItem('Quit', self._on_tk_thread(self.quit_app))
        )

        return pystray.Icon('VoiceToText', image, 'Voice to Text', menu)

    def setup_tray(self):
        """Set up system tray icon

        On failure the tray is disabled: tray_icon is set to None and the
        'minimize_to_tray' setting is turned off.
        """
        try:
            self.tray_icon = self._create_tray_icon()
        except Exception as e:
            print(f"Warning: Could not set up system tray: {e}")
            print("System tray features will be disabled.")
            self.tray_icon = None
            # Disable tray setting if it fails
            self.settings.set('minimize_to_tray', False)

    def _stop_tray(self, rebuild=True):
        """Stop the tray icon if it is running

        Args:
            rebuild: When True, create a fresh icon for the next minimize.
                pystray does not document re-running a stopped icon, so the
                stopped instance is not reused.
        """
        if not (self.tray_icon and self._tray_running):
            return

        self._tray_running = False
        self.tray_icon.stop()

        if rebuild:
            try:
                self.tray_icon = self._create_tray_icon()
            except Exception as e:
                print(f"Warning: Could not set up system tray: {e}")
                self.tray_icon = None

    def toggle_always_on_top(self):
        """Toggle always on top setting"""
        always_on_top = self.always_on_top_var.get()
        self.root.wm_attributes('-topmost', always_on_top)
        self.settings.set('always_on_top', always_on_top)

    def minimize_to_tray(self):
        """Minimize window to system tray"""
        if self.tray_icon:
            self.root.withdraw()
            if not self._tray_running:
                self._tray_running = True
                # Start tray icon in background thread
                threading.Thread(target=self.tray_icon.run, daemon=True).start()
        else:
            # If tray is not available, just minimize normally
            self.root.iconify()
            messagebox.showinfo("Minimized",
                                "Window minimized to taskbar (system tray not available)")

    def show_window(self, icon=None, item=None):
        """Show window from tray"""
        self.root.deiconify()
        self.root.lift()
        self._stop_tray()

    def open_settings(self, icon=None, item=None):
        """Open settings dialog"""
        SettingsDialog(self.root, self.settings, self.apply_settings)

    def apply_settings(self):
        """Apply new settings to the application"""
        # Update hotkey
        if self.hotkey_listener:
            self.hotkey_listener.stop()
        self.setup_hotkey()

        # Update always on top
        always_on_top = self.settings.get('always_on_top', False)
        self.always_on_top_var.set(always_on_top)
        self.root.wm_attributes('-topmost', always_on_top)

        # Update the texts that mention the hotkey
        self.info_label.config(text=self._tip_text())
        if str(self.status_label.cget('text')).startswith("Ready to record"):
            self.status_label.config(text=self._ready_text())

    def quit_app(self, icon=None, item=None):
        """Quit the application completely"""
        if self.is_closing:
            return  # already shutting down; destroying the root twice raises
        self.is_closing = True

        # Persist geometry while the window still exists
        self.save_window_geometry()

        self._stop_tray(rebuild=False)
        if self.hotkey_listener:
            self.hotkey_listener.stop()
        self.root.quit()
        self.root.destroy()

    def toggle_recording(self):
        """Toggle recording state"""
        if not self.is_recording:
            self.start_recording()
        else:
            self.stop_recording()

    def start_recording(self):
        """Start recording in background thread"""
        if self.is_recording:
            return

        self.is_recording = True
        # A fresh event per session keeps an earlier session's stop signal
        # from leaking into this one.
        self._stop_event = threading.Event()

        self.record_button.config(text="🛑 Stop Recording", style="Stop.TButton")
        self.status_label.config(
            text=f"🔴 Recording... Click Stop or press {self._hotkey_label()} to finish")

        # Clear previous results
        self.raw_text.delete(1.0, tk.END)
        self.processed_text.delete(1.0, tk.END)

        # Start recording in background thread
        threading.Thread(target=self._record_audio, args=(self._stop_event,),
                         daemon=True).start()

    def _record_audio(self, stop_event=None):
        """Background recording method

        Captures audio until stop_event is set, then transcribes it. Runs in
        a worker thread; all widget updates are scheduled via root.after.

        Args:
            stop_event: Event that ends the recording. Defaults to the
                current session's event.
        """
        if stop_event is None:
            stop_event = self._stop_event

        # Session-local buffer so a later recording cannot mutate the data
        # this session is still processing.
        chunks = []
        self.recorder.audio_data = chunks
        self.recorder.recording = True

        def callback(indata, frames, time_info, status):
            if not stop_event.is_set():
                chunks.append(indata.copy())

        try:
            # Update status in main thread
            self._set_status("🔴 Recording... Speak now!")

            with sd.InputStream(callback=callback,
                                samplerate=self.recorder.sample_rate,
                                channels=self.recorder.channels,
                                dtype=np.float32):
                # Wait until recording is stopped
                stop_event.wait()
        except Exception as e:
            # Pass the message as an argument: `e` is unbound once the
            # except block exits, so a deferred lambda could not read it.
            self.root.after(0, self._handle_recording_error, str(e))
            return
        finally:
            self.recorder.recording = False

        # The stream is closed here, so no more chunks can arrive
        self._process_audio(chunks)

    def stop_recording(self):
        """Stop recording and process audio"""
        if not self.is_recording:
            return

        self.is_recording = False
        self.recorder.recording = False
        self.record_button.config(text="🎤 Record", style="Record.TButton")
        self.status_label.config(text="⏳ Processing transcription...")

        # Wake the recording thread; it transcribes after closing the stream
        self._stop_event.set()

    def _process_audio(self, chunks=None):
        """Process recorded audio and update GUI

        Args:
            chunks: List of recorded audio arrays. Defaults to the recorder's
                current audio_data.
        """
        if chunks is None:
            chunks = self.recorder.audio_data

        try:
            if not chunks:
                self.root.after(0, self._handle_recording_error, "No audio recorded")
                return

            # Combine audio data
            audio = np.concatenate(chunks, axis=0)

            # Load model (cached by the recorder) and transcribe
            self._set_status("🤖 Loading Whisper model...")
            model = self.recorder.load_model()

            self._set_status("🤖 Transcribing audio...")
            raw_text, processed_text = self.recorder.transcribe(audio, model=model)

            # Update GUI in main thread
            self.root.after(0, self._update_results, raw_text, processed_text)

            # Save to file
            self._save_transcription(raw_text, processed_text)
        except Exception as e:
            self.root.after(0, self._handle_recording_error, str(e))

    def _update_results(self, raw_text, processed_text):
        """Update GUI with transcription results"""
        # Update text widgets
        self.raw_text.delete(1.0, tk.END)
        self.raw_text.insert(1.0, raw_text)

        self.processed_text.delete(1.0, tk.END)
        self.processed_text.insert(1.0, processed_text)

        # Copy to clipboard if enabled
        if not self.settings.get('auto_copy_clipboard', True):
            self.status_label.config(text="✅ Transcription complete!")
            return

        try:
            pyperclip.copy(processed_text)
        except Exception:
            self.status_label.config(text="✅ Transcription complete! (Clipboard copy failed)")
        else:
            self.status_label.config(
                text="✅ Transcription complete! Processed prompt copied to clipboard.")

    def _save_transcription(self, raw_text, processed_text):
        """Save transcription to file (best effort; errors are printed)"""
        try:
            self.recorder.save_transcription(raw_text, processed_text)
        except OSError as e:
            print(f"Error saving file: {e}")

    def _handle_recording_error(self, error_msg):
        """Handle recording errors"""
        self.is_recording = False
        self.recorder.recording = False
        self.record_button.config(text="🎤 Record", style="Record.TButton")
        self.status_label.config(text=f"❌ Error: {error_msg}")
        messagebox.showerror("Recording Error", error_msg)

    def _on_configure(self, event):
        """Schedule a debounced geometry save when the main window changes"""
        if event.widget is not self.root:
            return  # <Configure> also fires for child widgets

        if self._geometry_save_job is not None:
            self.root.after_cancel(self._geometry_save_job)
        self._geometry_save_job = self.root.after(self.GEOMETRY_SAVE_DELAY_MS,
                                                  self.save_window_geometry)

    def run(self):
        """Start the GUI application"""
        try:
            self.root.protocol("WM_DELETE_WINDOW", self.on_closing)
            self.root.bind('<Configure>', self._on_configure)
            self.root.mainloop()
        finally:
            # quit_app already saved the geometry and destroyed the window
            if not self.is_closing:
                self.save_window_geometry()
            if self.hotkey_listener:
                self.hotkey_listener.stop()
            self._stop_tray(rebuild=False)

    def on_closing(self):
        """Handle window closing"""
        if self.settings.get('minimize_to_tray', True) and not self.is_closing and self.tray_icon:
            # Minimize to tray instead of closing (only if tray is available)
            self.minimize_to_tray()
        else:
            # Actually close
            self.quit_app()

    def save_window_geometry(self):
        """Save current window geometry"""
        try:
            if self._geometry_save_job is not None:
                self.root.after_cancel(self._geometry_save_job)
                self._geometry_save_job = None
            geometry = self.root.geometry()
        except tk.TclError:
            # The window has already been destroyed; nothing to save
            return
        self.settings.set('window_geometry', geometry)


def main_terminal():
    """Terminal version of the voice recorder"""
    recorder = VoiceRecorder()

    print("Voice to Text Converter (Terminal Mode)")
    print("=======================================")

    while True:
        print("\n1. Record (Enter to stop)")
        print("2. Quit")

        choice = input("\nSelect option (1-2): ").strip()

        if choice == "1":
            recorder.record_audio()
        elif choice == "2":
            print("Goodbye!")
            break
        else:
            print("Invalid choice. Please select 1 or 2.")


def main():
    """Main entry point - check for UI argument"""
    if len(sys.argv) > 1 and sys.argv[1].lower() == 'ui':
        # Launch GUI version
        print("Starting Voice to Text Converter (GUI Mode)...")
        app = VoiceRecorderGUI()
        app.run()
    else:
        # Launch terminal version
        main_terminal()


if __name__ == "__main__":
    main()