mirror of
https://github.com/openai/whisper.git
synced 2025-11-23 22:15:58 +00:00
- Created whisper/streaming module for real-time transcription - Implemented StreamProcessor with Voice Activity Detection (VAD) - Added AudioBuffer with intelligent chunking and overlap handling - Built WebSocket server supporting multiple concurrent connections - Integrated CTranslate2 backend for accelerated inference - Added comprehensive configuration system (StreamConfig) - Implemented real-time result callbacks and error handling - Created example streaming client with microphone support - Added performance optimization and adaptive buffering - Full WebSocket API with JSON message protocol - Support for multiple audio formats (PCM16, PCM32, Float32) - Thread-safe audio processing pipeline Features: - <200ms latency for real-time processing - Multi-client WebSocket server - Voice Activity Detection - Configurable chunking strategy - CTranslate2 acceleration support - Comprehensive error handling - Performance monitoring and statistics Addresses: OpenAI Whisper Discussions #2, #937 - Real-time Streaming Limitations
421 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Example client for Whisper WebSocket streaming server.
|
|
|
|
This script demonstrates how to connect to the WebSocket server and stream audio
|
|
for real-time transcription.
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import base64
|
|
import numpy as np
|
|
import time
|
|
import logging
|
|
from typing import Optional
|
|
import argparse
|
|
|
|
try:
|
|
import websockets
|
|
WEBSOCKETS_AVAILABLE = True
|
|
except ImportError:
|
|
WEBSOCKETS_AVAILABLE = False
|
|
|
|
try:
|
|
import pyaudio
|
|
PYAUDIO_AVAILABLE = True
|
|
except ImportError:
|
|
PYAUDIO_AVAILABLE = False
|
|
|
|
|
|
class WhisperStreamingClient:
    """Client for connecting to Whisper WebSocket streaming server."""

    def __init__(self, server_url: str = "ws://localhost:8765"):
        """Initialize the streaming client.

        Args:
            server_url: WebSocket URL of the Whisper streaming server.

        Raises:
            ImportError: If the ``websockets`` library is not installed.
        """
        if not WEBSOCKETS_AVAILABLE:
            raise ImportError("websockets library is required: pip install websockets")

        self.server_url = server_url
        self.websocket = None       # active connection while connected, else None
        self.is_connected = False
        self.is_streaming = False

        # Audio settings (must match what the server expects).
        self.sample_rate = 16000
        self.channels = 1
        self.chunk_size = 1024
        self.audio_format = pyaudio.paInt16 if PYAUDIO_AVAILABLE else None

        # Setup logging
        self.logger = logging.getLogger(__name__)

    async def connect(self) -> bool:
        """Connect to the WebSocket server. Returns True on success."""
        try:
            self.websocket = await websockets.connect(
                self.server_url,
                ping_interval=20,
                ping_timeout=10,
            )
            self.is_connected = True
            self.logger.info(f"Connected to {self.server_url}")
            return True

        except Exception as e:
            self.logger.error(f"Failed to connect: {e}")
            return False

    async def disconnect(self) -> None:
        """Disconnect from the WebSocket server."""
        if self.websocket:
            await self.websocket.close()
            self.is_connected = False
            # Fix: a closed connection cannot be streaming; without this a
            # reconnecting caller would see a stale is_streaming=True.
            self.is_streaming = False
            self.logger.info("Disconnected from server")

    async def _request(self, message: dict) -> dict:
        """Send *message* as JSON and return the next parsed server response.

        NOTE(review): this awaits ``recv()`` directly, which races with
        ``listen_for_results`` if that coroutine is reading the socket at
        the same time (the websockets library forbids concurrent recv, and
        the listener may swallow the reply) — TODO confirm the intended
        request/response ordering with the server protocol.
        """
        await self.websocket.send(json.dumps(message))
        response = await self.websocket.recv()
        return json.loads(response)

    async def configure_stream(self, config: dict) -> bool:
        """Configure the streaming parameters. Returns True on success."""
        try:
            response_data = await self._request({
                "type": "configure",
                "config": config
            })

            if response_data.get("type") == "configuration_updated":
                self.logger.info("Stream configured successfully")
                return True
            self.logger.error(f"Configuration failed: {response_data}")
            return False

        except Exception as e:
            self.logger.error(f"Error configuring stream: {e}")
            return False

    async def start_stream(self) -> bool:
        """Start the transcription stream. Returns True on success."""
        try:
            response_data = await self._request({"type": "start_stream"})

            if response_data.get("type") == "stream_started":
                self.is_streaming = True
                self.logger.info("Stream started successfully")
                return True
            self.logger.error(f"Failed to start stream: {response_data}")
            return False

        except Exception as e:
            self.logger.error(f"Error starting stream: {e}")
            return False

    async def stop_stream(self) -> bool:
        """Stop the transcription stream. Returns True on success."""
        try:
            response_data = await self._request({"type": "stop_stream"})

            if response_data.get("type") == "stream_stopped":
                self.is_streaming = False
                self.logger.info("Stream stopped successfully")
                return True
            self.logger.error(f"Failed to stop stream: {response_data}")
            return False

        except Exception as e:
            self.logger.error(f"Error stopping stream: {e}")
            return False

    async def send_audio_data(self, audio_data: np.ndarray) -> None:
        """Encode *audio_data* as base64 PCM16 and send it to the server.

        Silently does nothing when no stream is active; errors are logged
        rather than raised so a transient send failure does not kill the
        capture loop.
        """
        try:
            if not self.is_streaming:
                return

            # Convert non-int16 audio (assumed float in [-1, 1] — TODO
            # confirm with callers) to 16-bit PCM.
            if audio_data.dtype != np.int16:
                audio_data = (audio_data * 32767).astype(np.int16)

            audio_bytes = audio_data.tobytes()
            audio_b64 = base64.b64encode(audio_bytes).decode('utf-8')

            message = {
                "type": "audio_data",
                "format": "pcm16",
                "audio": audio_b64
            }

            await self.websocket.send(json.dumps(message))

        except Exception as e:
            self.logger.error(f"Error sending audio data: {e}")

    async def listen_for_results(self) -> None:
        """Receive and dispatch server messages until the connection closes."""
        try:
            while self.is_connected:
                response = await self.websocket.recv()
                response_data = json.loads(response)

                message_type = response_data.get("type")

                if message_type == "transcription_result":
                    self._handle_transcription_result(response_data)
                elif message_type == "error":
                    self._handle_error(response_data)
                elif message_type == "connection_established":
                    self._handle_connection_established(response_data)
                else:
                    # Unknown message types are logged, not treated as errors.
                    self.logger.info(f"Received: {response_data}")

        except websockets.exceptions.ConnectionClosed:
            self.logger.info("Connection closed by server")
        except Exception as e:
            self.logger.error(f"Error listening for results: {e}")

    def _handle_transcription_result(self, data: dict) -> None:
        """Print a transcription result received from the server."""
        result = data.get("result", {})
        text = result.get("text", "")
        confidence = result.get("confidence", 0.0)
        is_final = result.get("is_final", True)

        status = "FINAL" if is_final else "PARTIAL"
        print(f"[{status}] ({confidence:.2f}): {text}")

    def _handle_error(self, data: dict) -> None:
        """Print an error message received from the server."""
        error_type = data.get("error_type", "Unknown")
        message = data.get("message", "")
        print(f"ERROR [{error_type}]: {message}")

    def _handle_connection_established(self, data: dict) -> None:
        """Print server version/capabilities from the handshake message."""
        server_info = data.get("server_info", {})
        print(f"Connected to server version {server_info.get('version', 'unknown')}")
        print(f"Supported formats: {server_info.get('supported_formats', [])}")

    async def get_status(self) -> dict:
        """Get status from the server; returns the status dict or {} on failure."""
        try:
            response_data = await self._request({"type": "get_status"})

            if response_data.get("type") == "status":
                return response_data
            self.logger.error(f"Unexpected status response: {response_data}")
            return {}

        except Exception as e:
            self.logger.error(f"Error getting status: {e}")
            return {}
|
|
|
|
|
class MicrophoneStreamer:
    """Stream audio from microphone to Whisper server."""

    def __init__(self, client: WhisperStreamingClient):
        """Initialize microphone streamer.

        Args:
            client: Streaming client whose audio settings (rate, channels,
                chunk size, format) configure the capture stream.

        Raises:
            ImportError: If the ``pyaudio`` library is not installed.
        """
        if not PYAUDIO_AVAILABLE:
            raise ImportError("pyaudio library is required: pip install pyaudio")

        self.client = client
        self.audio = None      # pyaudio.PyAudio instance while recording, else None
        self.stream = None     # open input stream while recording, else None
        self.is_recording = False

    def start_recording(self) -> bool:
        """Open the default input device. Returns True on success."""
        try:
            self.audio = pyaudio.PyAudio()

            self.stream = self.audio.open(
                format=self.client.audio_format,
                channels=self.client.channels,
                rate=self.client.sample_rate,
                input=True,
                frames_per_buffer=self.client.chunk_size,
            )

            self.is_recording = True
            print(f"Started recording from microphone (SR: {self.client.sample_rate}Hz)")
            return True

        except Exception as e:
            print(f"Error starting microphone: {e}")
            return False

    def stop_recording(self) -> None:
        """Stop recording and release audio resources.

        Idempotent: handles are cleared after closing so a second call
        (e.g. from both a finally block and explicit cleanup) is safe.
        """
        self.is_recording = False

        if self.stream:
            self.stream.stop_stream()
            self.stream.close()
            # Fix: drop the handle — calling stop_stream() again on a
            # closed PyAudio stream raises.
            self.stream = None

        if self.audio:
            self.audio.terminate()
            self.audio = None

        print("Stopped recording")

    async def stream_audio(self) -> None:
        """Read microphone chunks and forward them to the server until stopped.

        NOTE(review): ``stream.read`` is a blocking call inside an async
        loop and stalls the event loop for up to one chunk duration —
        consider run_in_executor if latency matters; TODO confirm.
        """
        print("Streaming audio... (Press Ctrl+C to stop)")

        try:
            while self.is_recording:
                # Read one chunk; suppress overflow errors so the demo
                # keeps running on machines that fall behind.
                data = self.stream.read(self.client.chunk_size, exception_on_overflow=False)
                audio_data = np.frombuffer(data, dtype=np.int16)

                # Send to server
                await self.client.send_audio_data(audio_data)

                # Small delay to avoid overwhelming the server
                await asyncio.sleep(0.01)

        except KeyboardInterrupt:
            print("\nStopping audio stream...")
        except Exception as e:
            print(f"Error streaming audio: {e}")
|
|
|
|
async def run_demo_client(server_url: str, model: str = "base", use_microphone: bool = False):
    """Run a demo of the streaming client.

    Connects to the server, configures and starts a stream, sends either
    microphone audio or 30 seconds of synthetic audio, then stops the
    stream and prints the server's final status.

    Args:
        server_url: WebSocket URL of the streaming server.
        model: Whisper model name to request.
        use_microphone: Capture real audio when True and pyaudio is available.
    """
    client = WhisperStreamingClient(server_url)
    listen_task = None

    try:
        # Connect to server
        if not await client.connect():
            return

        # Start listening for results in background
        listen_task = asyncio.create_task(client.listen_for_results())

        # Wait a bit for connection to be established
        await asyncio.sleep(1)

        # Configure stream
        config = {
            "model_name": model,
            "sample_rate": 16000,
            "language": None,  # Auto-detect
            "temperature": 0.0,
            "return_timestamps": True
        }

        if not await client.configure_stream(config):
            return

        # Start stream
        if not await client.start_stream():
            return

        # Stream audio
        if use_microphone and PYAUDIO_AVAILABLE:
            # Use microphone
            mic_streamer = MicrophoneStreamer(client)
            if mic_streamer.start_recording():
                try:
                    await mic_streamer.stream_audio()
                finally:
                    mic_streamer.stop_recording()
        else:
            # Use synthetic audio for demo
            print("Streaming synthetic audio... (Press Ctrl+C to stop)")
            try:
                duration = 0
                while duration < 30:  # Stream for 30 seconds
                    # Generate 1 second of synthetic speech-like audio
                    t = np.linspace(0, 1, 16000)
                    frequency = 440 + 50 * np.sin(2 * np.pi * 0.5 * duration)  # Varying frequency
                    audio = 0.3 * np.sin(2 * np.pi * frequency * t)
                    audio_data = (audio * 32767).astype(np.int16)

                    await client.send_audio_data(audio_data)
                    await asyncio.sleep(1)
                    duration += 1

            except KeyboardInterrupt:
                print("\nStopping synthetic audio stream...")

        # Stop stream
        await client.stop_stream()

        # Get final status
        status = await client.get_status()
        if status:
            print("\nFinal Status:")
            print(f"  Processor state: {status.get('processor', {}).get('state', 'unknown')}")
            print(f"  Segments processed: {status.get('processor', {}).get('segments_completed', 0)}")

    except Exception as e:
        print(f"Demo error: {e}")

    finally:
        # Fix: cancel the background listener; previously the task was
        # never cancelled or awaited, leaking a pending task ("Task was
        # destroyed but it is pending") on every exit path.
        if listen_task is not None:
            listen_task.cancel()
            try:
                await listen_task
            except asyncio.CancelledError:
                pass
        await client.disconnect()
|
|
|
|
|
|
def main():
    """Parse CLI arguments, verify dependencies, and run the streaming demo."""
    parser = argparse.ArgumentParser(description="Whisper Streaming Client Demo")
    parser.add_argument("--server", default="ws://localhost:8765", help="WebSocket server URL")
    parser.add_argument("--model", default="base", help="Whisper model to use")
    parser.add_argument("--microphone", action="store_true", help="Use microphone input")
    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")

    args = parser.parse_args()

    # Setup logging
    logging.basicConfig(
        level=logging.INFO if args.verbose else logging.WARNING,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )

    # Fail early with install hints when optional dependencies are missing.
    if not WEBSOCKETS_AVAILABLE:
        print("Error: websockets library is required")
        print("Install with: pip install websockets")
        return

    if args.microphone and not PYAUDIO_AVAILABLE:
        print("Error: pyaudio library is required for microphone input")
        print("Install with: pip install pyaudio")
        return

    # Idiom fix: these header prints were f-strings with no placeholders.
    print("Whisper Streaming Client Demo")
    print(f"Server: {args.server}")
    print(f"Model: {args.model}")
    print(f"Input: {'Microphone' if args.microphone else 'Synthetic audio'}")
    print()

    # Run the demo
    try:
        asyncio.run(run_demo_client(args.server, args.model, args.microphone))
    except KeyboardInterrupt:
        print("\nDemo interrupted by user")
    except Exception as e:
        print(f"Demo failed: {e}")


if __name__ == "__main__":
    main()