Intermediate

Step 1: Speech Recognition

Build the ears of your voice assistant. You will integrate OpenAI Whisper for speech-to-text, handle streaming audio from the browser, implement noise filtering and silence detection, and create a robust transcription pipeline that works in real-world conditions.

How Whisper Works

OpenAI Whisper is a transformer-based ASR model trained on 680,000 hours of multilingual audio. The API version (whisper-1) accepts audio files up to 25 MB and returns the transcript as text; timestamps are available on request through the verbose_json response format. Key features:

Multi-language: Supports 100+ languages with automatic language detection.
Noise robust: Trained on diverse audio including noisy environments, accents, and background music.
Formats: Accepts WAV, MP3, M4A, FLAC, OGG, and WebM audio files.
Cost: $0.006 per minute of audio — a 30-second utterance costs $0.003.

Audio Utilities Module

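First, create helper functions for audio format conversion. The browser streams raw PCM samples, but the Whisper API expects a complete audio file in a supported container format, so the raw samples must be wrapped in a WAV header before upload.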

# app/asr/audio_utils.py
"""Audio format conversion and processing utilities."""
import io
import struct
import numpy as np
from typing import Optional


def pcm_to_wav(pcm_data: bytes, sample_rate: int = 16000,
               channels: int = 1, sample_width: int = 2) -> bytes:
    """Wrap raw PCM samples in a minimal 44-byte RIFF/WAVE header.

    The header is written little-endian and declares uncompressed PCM
    (format tag 1). The sample data itself is appended unchanged.

    Args:
        pcm_data: Raw PCM audio bytes (16-bit signed integers by default)
        sample_rate: Sample rate in Hz (default 16000)
        channels: Number of audio channels (default 1 = mono)
        sample_width: Bytes per sample (default 2 = 16-bit)

    Returns:
        WAV file as bytes (header followed by ``pcm_data``)
    """
    payload_len = len(pcm_data)
    frame_bytes = channels * sample_width

    # A single packed record covering the RIFF, "fmt " and "data" headers.
    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF",
        payload_len + 36,           # bytes after this field: 44-byte header - 8
        b"WAVE",
        b"fmt ",
        16,                         # size of the fmt chunk body
        1,                          # format tag 1 = PCM
        channels,
        sample_rate,
        sample_rate * frame_bytes,  # byte rate
        frame_bytes,                # block align
        sample_width * 8,           # bits per sample
        b"data",
        payload_len,
    )

    return b"".join((header, pcm_data))


def compute_rms(audio_bytes: bytes, sample_width: int = 2) -> float:
    """Compute the Root Mean Square (RMS) energy of audio.

    Used for silence detection - if RMS is below a threshold,
    the audio segment is likely silence.

    A trailing partial sample (for example the last byte of an odd-length
    16-bit chunk) is ignored rather than raising, so arbitrarily sized
    network chunks can be measured safely.

    Args:
        audio_bytes: Raw PCM audio bytes (signed integers, native byte order)
        sample_width: Bytes per sample. 2 selects 16-bit samples; any other
            value is interpreted as 32-bit samples.

    Returns:
        RMS energy as a float (0.0 = silence, 1.0 = max). Returns 0.0 when
        the input contains no complete sample.
    """
    if not audio_bytes:
        return 0.0

    if sample_width == 2:
        dtype, full_scale = np.int16, 32768.0
    else:
        dtype, full_scale = np.int32, 2147483648.0

    # np.frombuffer refuses buffers that are not a whole number of items
    # unless an explicit count is given, so read only the complete samples.
    sample_count = len(audio_bytes) // np.dtype(dtype).itemsize
    if sample_count == 0:
        return 0.0

    samples = np.frombuffer(
        audio_bytes, dtype=dtype, count=sample_count
    ).astype(np.float64)

    rms = np.sqrt(np.mean(samples ** 2)) / full_scale
    return float(rms)


def is_silence(audio_bytes: bytes, threshold: float = 0.01,
               sample_width: int = 2) -> bool:
    """Report whether an audio segment's energy falls below a threshold.

    Args:
        audio_bytes: Raw PCM audio bytes
        threshold: RMS level; segments strictly below it count as silence
        sample_width: Bytes per sample

    Returns:
        True if the segment's RMS energy is below ``threshold``
    """
    energy = compute_rms(audio_bytes, sample_width)
    return energy < threshold


def resample_audio(audio_bytes: bytes, from_rate: int,
                   to_rate: int, sample_width: int = 2) -> bytes:
    """Resample audio from one sample rate to another.

    Uses plain linear interpolation between neighbouring samples. No
    low-pass filtering is applied before downsampling, and interpolated
    values are truncated (not rounded) when cast back to integers.

    Args:
        audio_bytes: Raw PCM audio bytes (signed integers, native byte order)
        from_rate: Source sample rate
        to_rate: Target sample rate
        sample_width: Bytes per sample. 2 selects 16-bit samples; any other
            value is interpreted as 32-bit samples.

    Returns:
        Resampled PCM audio bytes. The input is returned unchanged when the
        two rates are equal, and ``b""`` is returned when the input holds no
        complete sample.
    """
    if from_rate == to_rate:
        return audio_bytes

    dtype = np.int16 if sample_width == 2 else np.int32

    # Read only whole samples: np.frombuffer would otherwise raise on a
    # trailing partial sample, and np.interp raises on an empty sample set.
    sample_count = len(audio_bytes) // np.dtype(dtype).itemsize
    if sample_count == 0:
        return b""
    samples = np.frombuffer(audio_bytes, dtype=dtype, count=sample_count)

    # Calculate new length
    duration = sample_count / from_rate
    new_length = int(duration * to_rate)
    if new_length == 0:
        return b""

    # Linear interpolation: spread new_length positions evenly across the
    # original index range and read the signal at those positions.
    positions = np.linspace(0, sample_count - 1, new_length)
    resampled = np.interp(positions, np.arange(sample_count),
                          samples.astype(np.float64))

    return resampled.astype(dtype).tobytes()

Whisper Client

Now build the main Whisper client that handles transcription with buffering and silence detection:

# app/asr/whisper_client.py
"""OpenAI Whisper ASR client with streaming support."""
import io
import logging
import time
from typing import Optional, AsyncGenerator
from openai import AsyncOpenAI

from app.config import get_settings
from app.asr.audio_utils import pcm_to_wav, is_silence, compute_rms

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# get_settings() is called once here, when this module is imported.
settings = get_settings()


class WhisperClient:
    """Whisper speech-to-text client with audio buffering.

    Accumulates audio chunks from the WebSocket, detects when
    the user stops speaking (silence detection), and sends
    the complete utterance to Whisper for transcription.

    While no speech has been heard yet, only a short pre-roll of the most
    recent audio is retained, so an open microphone in a quiet room does
    not grow the buffer without bound.
    """

    def __init__(self):
        self.client = AsyncOpenAI(api_key=settings.openai_api_key)
        self.model = settings.whisper_model
        self.language = settings.whisper_language

        # Format of the incoming PCM stream
        self.sample_rate = 16000
        self.channels = 1
        self.sample_width = 2  # bytes per sample (16-bit)

        # Silence detection settings
        self.silence_threshold = 0.01
        self.silence_duration = 0.8  # seconds of silence to trigger end
        self.min_audio_duration = 0.5  # minimum seconds of audio to transcribe
        # Seconds of audio kept from before the first speech chunk, so the
        # onset of a quietly started utterance is not cut off.
        self.pre_roll_duration = 0.5

        # Buffer state
        self._buffer = bytearray()
        # Monotonic-clock reading taken when the current silent stretch began.
        self._silence_start: Optional[float] = None
        self._has_speech = False

    @property
    def _bytes_per_second(self) -> int:
        """Number of PCM bytes that make up one second of audio."""
        return self.sample_rate * self.channels * self.sample_width

    def reset(self):
        """Reset the audio buffer for a new utterance."""
        self._buffer = bytearray()
        self._silence_start = None
        self._has_speech = False

    def _trim_leading_silence(self) -> None:
        """Drop buffered audio older than the pre-roll window.

        Only whole frames are removed, so the remaining samples stay
        aligned with the start of the stream.
        """
        limit = int(self.pre_roll_duration * self._bytes_per_second)
        excess = len(self._buffer) - limit
        if excess <= 0:
            return
        frame_bytes = self.channels * self.sample_width
        excess -= excess % frame_bytes
        del self._buffer[:excess]

    def add_audio(self, chunk: bytes) -> Optional[str]:
        """Add an audio chunk to the buffer.

        Returns a signal string:
        - None: keep buffering
        - "silence_detected": user stopped speaking, ready to transcribe

        Silence is timed by chunk arrival (monotonic clock), not by the
        amount of audio received, so the signal is raised on the first
        silent chunk that arrives at least ``silence_duration`` seconds
        after the silent stretch began.

        Args:
            chunk: Raw PCM audio bytes (16-bit, 16kHz, mono)

        Returns:
            Signal string or None
        """
        self._buffer.extend(chunk)

        rms = compute_rms(chunk, self.sample_width)
        if rms > self.silence_threshold:
            # Speech: cancel any silent stretch that was being timed.
            self._has_speech = True
            self._silence_start = None
            return None

        if not self._has_speech:
            # Still waiting for the utterance to start; keep memory bounded.
            self._trim_leading_silence()
            return None

        # time.monotonic() cannot jump backwards the way the wall clock can.
        now = time.monotonic()
        if self._silence_start is None:
            self._silence_start = now
            return None

        if now - self._silence_start < self.silence_duration:
            return None

        # Long enough silence: only signal if the buffer is worth sending.
        audio_duration = len(self._buffer) / self._bytes_per_second
        if audio_duration >= self.min_audio_duration:
            return "silence_detected"

        return None

    async def transcribe(self, audio_bytes: Optional[bytes] = None) -> str:
        """Transcribe audio using Whisper API.

        Args:
            audio_bytes: Raw PCM audio bytes. If None, uses the
                        internal buffer.

        Returns:
            Transcribed text string, or "" when there is no audio to send.

        Raises:
            Exception: Any error from the API call is logged and re-raised.
        """
        if audio_bytes is None:
            audio_bytes = bytes(self._buffer)

        if not audio_bytes:
            return ""

        # Convert PCM to WAV for the API
        wav_data = pcm_to_wav(
            audio_bytes,
            sample_rate=self.sample_rate,
            channels=self.channels,
            sample_width=self.sample_width
        )

        # File-like upload object; the .wav name presumably tells the API
        # which format it is receiving - confirm against the SDK docs.
        audio_file = io.BytesIO(wav_data)
        audio_file.name = "audio.wav"

        start_time = time.monotonic()

        try:
            response = await self.client.audio.transcriptions.create(
                model=self.model,
                file=audio_file,
                language=self.language,
                response_format="text"
            )

            elapsed = time.monotonic() - start_time
            transcript = response.strip()

            # Duration uses the configured format rather than assuming
            # 16-bit mono.
            logger.info(
                f"Whisper transcription: '{transcript}' "
                f"({elapsed:.2f}s, "
                f"{len(audio_bytes) / self._bytes_per_second:.1f}s audio)"
            )

            return transcript

        except Exception as e:
            logger.error(f"Whisper transcription failed: {e}")
            raise

    async def transcribe_and_reset(self) -> str:
        """Transcribe the buffered audio and reset for next utterance.

        The buffer is reset only after a successful transcription; if
        transcribe() raises, the buffered audio is left in place.

        Returns:
            Transcribed text string
        """
        transcript = await self.transcribe()
        self.reset()
        return transcript

💡

Silence Detection Strategy: We use a simple RMS energy threshold. When the audio energy drops below the threshold for 0.8 seconds after speech was detected, we consider the utterance complete. This avoids sending incomplete sentences to the LLM. You can tune silence_threshold and silence_duration for your environment.

Handling Streaming Audio from the Browser

The browser captures audio using the Web Audio API and sends it over the WebSocket in chunks. Here is how the server-side handler processes incoming audio:

# app/asr/stream_handler.py
"""Handle streaming audio from WebSocket for ASR."""
import logging
from typing import AsyncGenerator, Callable, Awaitable

from app.asr.whisper_client import WhisperClient

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


class ASRStreamHandler:
    """Turns a stream of PCM chunks into transcript events.

    Chunks are fed one at a time to process_chunk(), which returns an
    event dict once an utterance has been transcribed and None while
    audio is still being buffered.

    Usage:
        handler = ASRStreamHandler()
        event = await handler.process_chunk(audio_chunk)
        if event and event["type"] == "transcript":
            print(event["text"])
    """

    def __init__(self):
        self.whisper = WhisperClient()
        # False until the first chunk of a new utterance arrives.
        self._is_listening = False

    async def process_chunk(self, audio_chunk: bytes) -> dict | None:
        """Buffer one audio chunk and transcribe when the client signals.

        Args:
            audio_chunk: Raw PCM audio bytes from WebSocket

        Returns:
            ``{"type": "transcript", "text": ...}`` for a non-empty
            transcript, ``{"type": "silence", "text": ""}`` for an empty
            one, or None if still buffering
        """
        # The first chunk of an utterance starts from a clean buffer.
        if not self._is_listening:
            self._is_listening = True
            self.whisper.reset()
            logger.debug("ASR: Started listening")

        if self.whisper.add_audio(audio_chunk) != "silence_detected":
            return None  # Still buffering

        # End of utterance: the next chunk begins a new listening cycle.
        self._is_listening = False
        transcript = await self.whisper.transcribe_and_reset()

        if not transcript:
            logger.debug("ASR: Empty transcript, ignoring")
            return {"type": "silence", "text": ""}

        logger.info(f"ASR transcript: '{transcript}'")
        return {"type": "transcript", "text": transcript}

    def stop(self):
        """Stop listening immediately and throw away buffered audio."""
        self._is_listening = False
        self.whisper.reset()
        logger.debug("ASR: Stopped listening")

Noise Handling Best Practices

Real-world audio is noisy. Here are practical techniques to improve transcription quality:

# app/asr/noise_filter.py
"""Audio preprocessing for noise reduction."""
import numpy as np
from typing import Optional


def apply_highpass_filter(audio_bytes: bytes, sample_rate: int = 16000,
                          cutoff_hz: int = 80) -> bytes:
    """Remove low-frequency rumble (AC hum, wind noise).

    A simple first-order high-pass filter that attenuates frequencies
    below the cutoff. Human speech starts around 85 Hz, so 80 Hz
    is a safe default.

    The filter output can briefly exceed the input range (a full-scale
    step nearly doubles in amplitude), so the result is clipped to the
    16-bit range before being converted back to integers.

    Args:
        audio_bytes: Raw PCM audio (16-bit signed)
        sample_rate: Audio sample rate
        cutoff_hz: Frequencies below this are attenuated

    Returns:
        Filtered audio bytes, the same length as the input. Empty input
        yields ``b""``.

    Raises:
        ValueError: If ``audio_bytes`` is not a whole number of 16-bit
            samples (raised by ``np.frombuffer``).
    """
    samples = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float64)
    if samples.size == 0:
        return b""

    # First-order high-pass filter coefficient
    rc = 1.0 / (2.0 * np.pi * cutoff_hz)
    dt = 1.0 / sample_rate
    alpha = rc / (rc + dt)

    # The recurrence depends on the previous output, so it runs as a loop.
    # Iterating over plain floats avoids indexing the array per element.
    values = samples.tolist()
    prev_in = prev_out = values[0]
    outputs = [prev_out]
    for current in values[1:]:
        prev_out = alpha * (prev_out + current - prev_in)
        prev_in = current
        outputs.append(prev_out)

    # Saturate instead of letting the int16 cast wrap out-of-range values.
    filtered = np.clip(np.asarray(outputs, dtype=np.float64), -32768, 32767)
    return filtered.astype(np.int16).tobytes()


def normalize_volume(audio_bytes: bytes,
                     target_rms: float = 0.1) -> bytes:
    """Scale audio so its RMS level matches a target.

    Prevents Whisper from struggling with very quiet or very loud audio.
    Near-silent input is returned untouched, and the applied gain never
    exceeds 10x.

    Args:
        audio_bytes: Raw PCM audio (16-bit signed)
        target_rms: Target RMS level (0.0 to 1.0)

    Returns:
        Volume-normalized audio bytes
    """
    pcm = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float64)
    if pcm.size == 0:
        return audio_bytes

    # RMS expressed as a fraction of 16-bit full scale.
    level = np.sqrt(np.mean(pcm ** 2)) / 32768.0
    if level < 0.001:
        # Too quiet, probably silence: boosting it would only raise noise.
        return audio_bytes

    # Cap gain to avoid amplifying noise
    gain = min(target_rms / level, 10.0)

    # Saturate at the int16 limits before converting back to integers.
    scaled = np.clip(pcm * gain, -32768, 32767)
    return scaled.astype(np.int16).tobytes()


def preprocess_audio(audio_bytes: bytes,
                     sample_rate: int = 16000) -> bytes:
    """Run the complete preprocessing chain over an audio buffer.

    The chain is the high-pass filter followed by volume normalization.

    Args:
        audio_bytes: Raw PCM audio
        sample_rate: Audio sample rate

    Returns:
        Preprocessed audio bytes
    """
    # Low-frequency noise goes first, so the volume is measured on the
    # filtered signal.
    rumble_free = apply_highpass_filter(audio_bytes, sample_rate)
    return normalize_volume(rumble_free)

📝

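When to Preprocess: Apply preprocess_audio() to the full audio buffer just before sending it to Whisper, for example inside WhisperClient.transcribe() ahead of the PCM-to-WAV conversion (the client shown above does not call it yet). Do not apply it to individual chunks during streaming, as this would distort the silence detection. The pipeline is cheap compared with the Whisper API round trip, but note that the high-pass filter loops over every sample in Python, so its cost grows linearly with the length of the utterance.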

Putting It All Together

Here is how the ASR module integrates with the WebSocket handler. We will flesh out the full WebSocket handler in the Web Interface lesson, but here is the ASR portion:

# Updated app/ws/handler.py (ASR portion)
from app.asr.stream_handler import ASRStreamHandler

async def handle_voice_session(websocket):
    """Handle a voice WebSocket session - ASR portion.

    Receives binary audio chunks until the connection fails, feeds each
    one through the ASR handler, and sends every transcript back to the
    client as a JSON message.

    Args:
        websocket: Connection object providing awaitable receive_bytes()
            and send_json() methods.

    Any exception ends the session. It is logged rather than silently
    discarded, and the ASR handler is always stopped, including when the
    task is cancelled.
    """
    import logging  # local import: this excerpt shows no module-level logger

    log = logging.getLogger(__name__)
    asr = ASRStreamHandler()

    try:
        while True:
            # Receive audio chunk from browser
            audio_data = await websocket.receive_bytes()

            # Process through ASR
            event = await asr.process_chunk(audio_data)

            if event and event["type"] == "transcript":
                transcript = event["text"]

                # Send transcript to client for display
                await websocket.send_json({
                    "type": "transcript",
                    "text": transcript
                })

                # TODO: Send transcript to LLM (next lesson)
                # response = await llm.generate(transcript)

    except Exception:
        # NOTE(review): a client disconnect presumably surfaces here too,
        # so this is logged at info level; the traceback is kept so real
        # failures are not lost.
        log.info("Voice session ended", exc_info=True)
    finally:
        # Runs on every exit path, including task cancellation.
        asr.stop()

Testing the ASR Module

# tests/test_asr.py
"""Test the ASR module with a real audio file."""
import asyncio
from app.asr.whisper_client import WhisperClient
from app.asr.audio_utils import compute_rms, is_silence, pcm_to_wav
from app.asr.noise_filter import preprocess_audio
import numpy as np


def test_pcm_to_wav():
    """Check that pcm_to_wav() output carries the RIFF and WAVE tags."""
    rate = 16000

    # One second of a 440 Hz sine at full 16-bit amplitude
    timeline = np.linspace(0, 1.0, rate, dtype=np.float32)
    tone = np.sin(2 * np.pi * 440 * timeline) * 32767
    pcm = tone.astype(np.int16).tobytes()

    wav = pcm_to_wav(pcm, rate)
    riff_tag, wave_tag = wav[:4], wav[8:12]
    assert riff_tag == b"RIFF"
    assert wave_tag == b"WAVE"
    print(f"WAV conversion OK - {len(wav)} bytes")


def test_silence_detection():
    """Test silence detection with known audio.

    All-zero samples must be classified as silence and a 440 Hz tone at
    roughly half of full scale must not.
    """
    # Silence
    silence = np.zeros(16000, dtype=np.int16).tobytes()
    assert is_silence(silence)

    # Loud tone
    t = np.linspace(0, 1.0, 16000, dtype=np.float32)
    tone = (np.sin(2 * np.pi * 440 * t) * 16000).astype(np.int16).tobytes()
    assert not is_silence(tone)
    print("Silence detection OK")


def test_noise_filter():
    """Check that preprocess_audio() preserves the buffer length.

    The input is a 300 Hz tone mixed with a 50 Hz hum; only the output
    length is verified.
    """
    rate = 16000
    timeline = np.linspace(0, 1.0, rate, dtype=np.float32)

    def tone(freq_hz, amplitude):
        # 16-bit sine of the given frequency and peak amplitude
        return (np.sin(2 * np.pi * freq_hz * timeline) * amplitude).astype(np.int16)

    # Sum in 32-bit so the mix cannot wrap before it is clipped to 16-bit.
    combined = tone(300, 10000).astype(np.int32) + tone(50, 5000).astype(np.int32)
    mixed = np.clip(combined, -32768, 32767).astype(np.int16).tobytes()

    filtered = preprocess_audio(mixed, rate)
    assert len(filtered) == len(mixed)
    print("Noise filter OK")


async def test_whisper_transcription():
    """Send two seconds of digital silence through WhisperClient.

    The transcript is only printed; nothing is asserted about it.
    """
    # 32000 zeroed 16-bit samples (the original intent: an empty result)
    silent_pcm = np.zeros(32000, dtype=np.int16).tobytes()

    whisper = WhisperClient()
    transcript = await whisper.transcribe(silent_pcm)
    print(f"Whisper silence test: '{transcript}'")


if __name__ == "__main__":
    # Synchronous checks run first, in order; the API-backed coroutine last.
    for check in (test_pcm_to_wav, test_silence_detection, test_noise_filter):
        check()
    asyncio.run(test_whisper_transcription())
    print("\nAll ASR tests passed!")

Key Takeaways

Whisper accepts complete audio files, not streaming chunks. We buffer audio and use silence detection to know when to send it.
The WhisperClient accumulates PCM chunks, monitors RMS energy, and triggers transcription after 0.8 seconds of silence.
Audio preprocessing (high-pass filter + volume normalization) significantly improves transcription quality in noisy environments.
The ASRStreamHandler wraps the client with a clean async interface for the WebSocket handler.

What Is Next

In the next lesson, you will build the LLM conversation engine — the brain that processes transcripts, maintains conversation context, supports tool calling, and generates streaming responses.

← Previous Project Setup Next → LLM Conversation Engine