Step 1: Speech Recognition
Build the ears of your voice assistant. You will integrate OpenAI Whisper for speech-to-text, handle streaming audio from the browser, implement noise filtering and silence detection, and create a robust transcription pipeline that works in real-world conditions.
How Whisper Works
OpenAI Whisper is a transformer-based ASR model trained on 680,000 hours of multilingual audio. The API version (whisper-1) accepts audio files up to 25 MB and returns text with timestamps. Key features:
- Multi-language: Supports 100+ languages with automatic language detection.
- Noise robust: Trained on diverse audio including noisy environments, accents, and background music.
- Formats: Accepts WAV, MP3, M4A, FLAC, OGG, and WebM audio files.
- Cost: $0.006 per minute of audio — a 30-second utterance costs $0.003.
Audio Utilities Module
First, create helper functions for audio format conversion. The browser sends audio in various formats, and Whisper expects specific input.
# app/asr/audio_utils.py
"""Audio format conversion and processing utilities."""
import io
import struct
import numpy as np
from typing import Optional
def pcm_to_wav(pcm_data: bytes, sample_rate: int = 16000,
               channels: int = 1, sample_width: int = 2) -> bytes:
    """Wrap raw PCM audio in a canonical 44-byte WAV (RIFF) header.

    Args:
        pcm_data: Raw PCM audio bytes (16-bit signed integers)
        sample_rate: Sample rate in Hz (default 16000)
        channels: Number of audio channels (default 1 = mono)
        sample_width: Bytes per sample (default 2 = 16-bit)

    Returns:
        WAV file as bytes (header followed by the unmodified PCM payload)
    """
    payload_size = len(pcm_data)
    frame_size = channels * sample_width  # bytes per multi-channel frame

    # All header fields are little-endian; the layout is
    # RIFF descriptor, "fmt " chunk, then the "data" chunk header.
    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF",
        payload_size + 36,          # total file size minus the 8-byte RIFF preamble
        b"WAVE",
        b"fmt ",
        16,                         # size of the fmt chunk body
        1,                          # audio format 1 = PCM
        channels,
        sample_rate,
        sample_rate * frame_size,   # byte rate
        frame_size,                 # block align
        sample_width * 8,           # bits per sample
        b"data",
        payload_size,
    )
    return header + bytes(pcm_data)
def compute_rms(audio_bytes: bytes, sample_width: int = 2) -> float:
    """Compute the Root Mean Square (RMS) energy of audio.

    Used for silence detection - if RMS is below a threshold,
    the audio segment is likely silence.

    A trailing partial sample (for example an odd number of bytes for
    16-bit audio) is ignored instead of raising, so chunks cut at an
    arbitrary byte boundary can still be measured.

    Args:
        audio_bytes: Raw PCM audio bytes
        sample_width: Bytes per sample (2 = 16-bit; any other value is
            interpreted as 32-bit signed samples)

    Returns:
        RMS energy as a float (0.0 = silence, 1.0 = max)
    """
    if not audio_bytes:
        return 0.0

    if sample_width == 2:
        dtype, max_val = np.int16, 32768.0
    else:
        dtype, max_val = np.int32, 2147483648.0

    # np.frombuffer requires a whole number of items, so cut the input
    # back to the last complete sample before decoding it.
    item_size = np.dtype(dtype).itemsize
    usable = len(audio_bytes) - (len(audio_bytes) % item_size)
    if usable == 0:
        return 0.0

    samples = np.frombuffer(
        memoryview(audio_bytes)[:usable], dtype=dtype
    ).astype(np.float64)
    rms = np.sqrt(np.mean(samples ** 2)) / max_val
    return float(rms)
def is_silence(audio_bytes: bytes, threshold: float = 0.01,
               sample_width: int = 2) -> bool:
    """Report whether an audio segment is quiet enough to count as silence.

    Args:
        audio_bytes: Raw PCM audio bytes
        threshold: RMS level; segments strictly below it are silence
        sample_width: Bytes per sample

    Returns:
        True if the segment's RMS energy is below ``threshold``
    """
    energy = compute_rms(audio_bytes, sample_width)
    return energy < threshold
def resample_audio(audio_bytes: bytes, from_rate: int,
                   to_rate: int, sample_width: int = 2) -> bytes:
    """Resample audio from one sample rate to another.

    Uses linear interpolation between neighbouring samples. Interpolated
    values are rounded to the nearest integer before being stored.

    Args:
        audio_bytes: Raw PCM audio bytes
        from_rate: Source sample rate
        to_rate: Target sample rate
        sample_width: Bytes per sample (2 = 16-bit; any other value is
            interpreted as 32-bit signed samples)

    Returns:
        Resampled PCM audio bytes (empty if the input holds no complete
        sample). The input is returned unchanged when the rates match.
    """
    if from_rate == to_rate:
        return audio_bytes

    dtype = np.int16 if sample_width == 2 else np.int32

    # Ignore a trailing partial sample; np.frombuffer would reject it.
    item_size = np.dtype(dtype).itemsize
    usable = len(audio_bytes) - (len(audio_bytes) % item_size)
    samples = np.frombuffer(memoryview(audio_bytes)[:usable], dtype=dtype)
    if samples.size == 0:
        # np.interp cannot interpolate over an empty set of sample points.
        return b""

    # Multiply before dividing so exact ratios (e.g. 8 kHz -> 16 kHz)
    # are not shortened by floating-point rounding of the duration.
    new_length = int(len(samples) * to_rate // from_rate)

    # Linear interpolation resampling
    positions = np.linspace(0, len(samples) - 1, new_length)
    resampled = np.interp(positions, np.arange(len(samples)),
                          samples.astype(np.float64))
    # Round rather than truncate toward zero to avoid a systematic bias.
    return np.rint(resampled).astype(dtype).tobytes()
Whisper Client
Now build the main Whisper client that handles transcription with buffering and silence detection:
# app/asr/whisper_client.py
"""OpenAI Whisper ASR client with streaming support."""
import io
import logging
import time
from typing import Optional, AsyncGenerator
from openai import AsyncOpenAI
from app.config import get_settings
from app.asr.audio_utils import pcm_to_wav, is_silence, compute_rms
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Settings are fetched once, when this module is imported.
settings = get_settings()
class WhisperClient:
    """Whisper speech-to-text client with audio buffering.

    Accumulates audio chunks from the WebSocket, detects when
    the user stops speaking (silence detection), and sends
    the complete utterance to Whisper for transcription.

    Audio received before any speech has been detected is capped at
    ``PREROLL_SECONDS`` so that a long quiet period neither grows the
    buffer without bound nor gets uploaded for transcription.
    """

    # Seconds of pre-speech audio kept so a soft speech onset is not cut off.
    PREROLL_SECONDS = 0.5

    def __init__(self):
        self.client = AsyncOpenAI(api_key=settings.openai_api_key)
        self.model = settings.whisper_model
        self.language = settings.whisper_language

        # Audio buffer settings
        self.sample_rate = 16000
        self.channels = 1
        self.sample_width = 2  # 16-bit

        # Silence detection settings
        self.silence_threshold = 0.01
        self.silence_duration = 0.8  # seconds of silence to trigger end
        self.min_audio_duration = 0.5  # minimum seconds of audio to transcribe

        # Buffer state
        self._buffer = bytearray()
        self._silence_start: Optional[float] = None
        self._has_speech = False

    def reset(self):
        """Reset the audio buffer for a new utterance."""
        self._buffer = bytearray()
        self._silence_start = None
        self._has_speech = False

    def _bytes_per_second(self) -> int:
        """Return how many buffer bytes represent one second of audio."""
        return self.sample_rate * self.channels * self.sample_width

    def _trim_leading_silence(self) -> None:
        """Drop the oldest pre-speech audio beyond the pre-roll window."""
        frame_size = self.channels * self.sample_width
        keep = int(self.PREROLL_SECONDS * self.sample_rate) * frame_size
        excess = len(self._buffer) - keep
        # Only remove whole frames so the remaining samples stay aligned.
        excess -= excess % frame_size
        if excess > 0:
            del self._buffer[:excess]

    def add_audio(self, chunk: bytes) -> Optional[str]:
        """Add an audio chunk to the buffer.

        Returns a signal string:
        - None: keep buffering
        - "silence_detected": user stopped speaking, ready to transcribe

        Args:
            chunk: Raw PCM audio bytes (16-bit, 16kHz, mono)

        Returns:
            Signal string or None
        """
        self._buffer.extend(chunk)

        # Check if this chunk contains speech
        rms = compute_rms(chunk, self.sample_width)
        if rms > self.silence_threshold:
            # Speech detected: any running silence timer is cancelled.
            self._has_speech = True
            self._silence_start = None
            return None

        if not self._has_speech:
            # Still waiting for the user to start talking.
            self._trim_leading_silence()
            return None

        # Monotonic clock: unaffected by system clock adjustments.
        now = time.monotonic()
        if self._silence_start is None:
            # First quiet chunk after speech starts the silence timer.
            self._silence_start = now
            return None

        if now - self._silence_start < self.silence_duration:
            return None

        # Check minimum audio duration
        audio_duration = len(self._buffer) / self._bytes_per_second()
        if audio_duration >= self.min_audio_duration:
            return "silence_detected"
        return None

    async def transcribe(self, audio_bytes: Optional[bytes] = None) -> str:
        """Transcribe audio using Whisper API.

        Args:
            audio_bytes: Raw PCM audio bytes. If None, uses the
                internal buffer.

        Returns:
            Transcribed text string ("" when there is no audio to send)

        Raises:
            Exception: Any error raised by the API call is logged and
                re-raised unchanged.
        """
        if audio_bytes is None:
            audio_bytes = bytes(self._buffer)
        if not audio_bytes:
            return ""

        # Convert PCM to WAV for the API
        wav_data = pcm_to_wav(
            audio_bytes,
            sample_rate=self.sample_rate,
            channels=self.channels,
            sample_width=self.sample_width,
        )

        # Create a file-like object; the name carries the file extension.
        audio_file = io.BytesIO(wav_data)
        audio_file.name = "audio.wav"

        start_time = time.perf_counter()
        try:
            response = await self.client.audio.transcriptions.create(
                model=self.model,
                file=audio_file,
                language=self.language,
                response_format="text"
            )
            elapsed = time.perf_counter() - start_time
            # The response is handled as a plain string.
            transcript = response.strip()
            logger.info(
                f"Whisper transcription: '{transcript}' "
                f"({elapsed:.2f}s, "
                f"{len(audio_bytes) / self._bytes_per_second():.1f}s audio)"
            )
            return transcript
        except Exception as e:
            # logger.exception keeps the traceback alongside the message.
            logger.exception(f"Whisper transcription failed: {e}")
            raise

    async def transcribe_and_reset(self) -> str:
        """Transcribe the buffered audio and reset for next utterance.

        The buffer is only reset after a successful transcription; if
        transcription raises, the buffered audio is left in place.

        Returns:
            Transcribed text string
        """
        transcript = await self.transcribe()
        self.reset()
        return transcript
Tip: Tune silence_threshold and silence_duration for your environment.
Handling Streaming Audio from the Browser
The browser captures audio using the Web Audio API and sends it over the WebSocket in chunks. Here is how the server-side handler processes incoming audio:
# app/asr/stream_handler.py
"""Handle streaming audio from WebSocket for ASR."""
import logging
from typing import AsyncGenerator, Callable, Awaitable
from app.asr.whisper_client import WhisperClient
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
class ASRStreamHandler:
    """Turns a stream of raw audio chunks into transcript events.

    Usage:
        handler = ASRStreamHandler()
        event = await handler.process_chunk(audio_chunk)
        if event and event["type"] == "transcript":
            print(event["text"])
    """

    def __init__(self):
        self.whisper = WhisperClient()
        self._is_listening = False

    async def process_chunk(self, audio_chunk: bytes) -> dict | None:
        """Feed one audio chunk into the recognizer.

        Args:
            audio_chunk: Raw PCM audio bytes from WebSocket

        Returns:
            None while the utterance is still being buffered. Once the
            client signals the end of an utterance, a dict with "type"
            ("transcript" or "silence") and "text".
        """
        # The first chunk of a new utterance starts from a clean buffer.
        if not self._is_listening:
            self._is_listening = True
            self.whisper.reset()
            logger.debug("ASR: Started listening")

        if self.whisper.add_audio(audio_chunk) != "silence_detected":
            return None  # Still buffering

        # End of utterance: transcribe what was buffered.
        self._is_listening = False
        transcript = await self.whisper.transcribe_and_reset()

        if not transcript:
            logger.debug("ASR: Empty transcript, ignoring")
            return {"type": "silence", "text": ""}

        logger.info(f"ASR transcript: '{transcript}'")
        return {"type": "transcript", "text": transcript}

    def stop(self):
        """Force-stop listening and discard buffer."""
        self._is_listening = False
        self.whisper.reset()
        logger.debug("ASR: Stopped listening")
Noise Handling Best Practices
Real-world audio is noisy. Here are practical techniques to improve transcription quality:
# app/asr/noise_filter.py
"""Audio preprocessing for noise reduction."""
import numpy as np
from typing import Optional
def apply_highpass_filter(audio_bytes: bytes, sample_rate: int = 16000,
                          cutoff_hz: int = 80) -> bytes:
    """Remove low-frequency rumble (AC hum, wind noise).

    A simple first-order high-pass filter that removes frequencies
    below the cutoff. Human speech starts around 85 Hz, so 80 Hz
    is a safe default.

    Args:
        audio_bytes: Raw PCM audio (16-bit signed)
        sample_rate: Audio sample rate
        cutoff_hz: Frequencies below this are removed

    Returns:
        Filtered audio bytes, the same length as the input. Empty input
        is returned unchanged.
    """
    samples = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float64)
    if samples.size == 0:
        return audio_bytes

    # First-order high-pass filter coefficient
    rc = 1.0 / (2.0 * np.pi * cutoff_hz)
    dt = 1.0 / sample_rate
    alpha = rc / (rc + dt)

    # The recurrence depends on the previous output, so it is evaluated
    # sequentially; plain Python floats avoid per-element NumPy indexing.
    inputs = samples.tolist()
    prev_in = prev_out = inputs[0]
    outputs = [prev_out]
    for current in inputs[1:]:
        prev_out = alpha * (prev_out + current - prev_in)
        outputs.append(prev_out)
        prev_in = current

    # A sharp jump can push the output past the 16-bit range; clip
    # before casting so it saturates instead of overflowing.
    filtered = np.clip(np.asarray(outputs, dtype=np.float64), -32768, 32767)
    return filtered.astype(np.int16).tobytes()
def normalize_volume(audio_bytes: bytes,
                     target_rms: float = 0.1) -> bytes:
    """Scale audio so its RMS level approaches a target value.

    Prevents Whisper from struggling with very quiet or very loud audio.

    Args:
        audio_bytes: Raw PCM audio (16-bit signed)
        target_rms: Target RMS level (0.0 to 1.0)

    Returns:
        Volume-normalized audio bytes. The input is returned unchanged
        when it is empty or its RMS level is below 0.001.
    """
    pcm = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float64)
    if pcm.size == 0:
        return audio_bytes

    level = np.sqrt(np.mean(pcm ** 2)) / 32768.0
    if level < 0.001:
        # Too quiet, probably silence
        return audio_bytes

    # Cap gain to avoid amplifying noise
    gain = min(target_rms / level, 10.0)

    # Keep the scaled signal inside the 16-bit range before casting.
    scaled = np.clip(pcm * gain, -32768, 32767)
    return scaled.astype(np.int16).tobytes()
def preprocess_audio(audio_bytes: bytes,
                     sample_rate: int = 16000) -> bytes:
    """Run the complete preprocessing chain on raw audio.

    The audio is high-pass filtered first and then volume-normalized.

    Args:
        audio_bytes: Raw PCM audio
        sample_rate: Audio sample rate

    Returns:
        Preprocessed audio bytes
    """
    # Low-frequency noise is removed before the level is measured.
    filtered = apply_highpass_filter(audio_bytes, sample_rate)
    return normalize_volume(filtered)
Tip: Apply preprocess_audio() to the full audio buffer just before sending it to Whisper. Do not apply it to individual chunks during streaming, as this would distort the silence detection. The preprocessing pipeline runs in under 10ms for typical utterances.
Putting It All Together
Here is how the ASR module integrates with the WebSocket handler. We will flesh out the full WebSocket handler in the Web Interface lesson, but here is the ASR portion:
# Updated app/ws/handler.py (ASR portion)
from app.asr.stream_handler import ASRStreamHandler
async def handle_voice_session(websocket):
    """Handle a voice WebSocket session - ASR portion.

    Reads binary audio chunks from the socket in a loop, feeds them to
    the ASR handler, and sends each finished transcript back to the
    client as JSON.

    Args:
        websocket: Connection object providing ``receive_bytes()`` and
            ``send_json()`` coroutines.

    Any exception raised inside the loop ends the session. It is logged
    rather than propagated, and the ASR handler is always stopped.
    """
    import logging

    session_logger = logging.getLogger(__name__)
    asr = ASRStreamHandler()
    try:
        while True:
            # Receive audio chunk from browser
            audio_data = await websocket.receive_bytes()

            # Process through ASR
            event = await asr.process_chunk(audio_data)
            if event and event["type"] == "transcript":
                transcript = event["text"]
                # Send transcript to client for display
                await websocket.send_json({
                    "type": "transcript",
                    "text": transcript
                })
                # TODO: Send transcript to LLM (next lesson)
                # response = await llm.generate(transcript)
    except Exception:
        # NOTE(review): presumably a client disconnect also surfaces here
        # as an exception - confirm against the WebSocket framework in use.
        # Logging keeps genuine failures visible instead of discarding them.
        session_logger.warning("Voice session ended", exc_info=True)
    finally:
        # Runs on every exit path, including task cancellation, which
        # `except Exception` does not catch.
        asr.stop()
Testing the ASR Module
# tests/test_asr.py
"""Test the ASR module with a real audio file."""
import asyncio
from app.asr.whisper_client import WhisperClient
from app.asr.audio_utils import compute_rms, is_silence, pcm_to_wav
from app.asr.noise_filter import preprocess_audio
import numpy as np
def test_pcm_to_wav():
    """Check that PCM wrapped by pcm_to_wav carries the RIFF/WAVE markers."""
    sample_rate = 16000

    # One second of a 440Hz sine wave at full 16-bit amplitude
    timeline = np.linspace(0, 1.0, sample_rate, dtype=np.float32)
    tone = np.sin(2 * np.pi * 440 * timeline) * 32767
    pcm = tone.astype(np.int16).tobytes()

    wav = pcm_to_wav(pcm, sample_rate)

    # Bytes 0-3 and 8-11 hold the container and format identifiers.
    assert wav[:4] == b"RIFF"
    assert wav[8:12] == b"WAVE"
    print(f"WAV conversion OK - {len(wav)} bytes")
def test_silence_detection():
    """Test silence detection with known audio.

    An all-zero buffer must be classified as silence and a 440Hz tone
    at roughly half of full scale must not.
    """
    # Silence
    silence = np.zeros(16000, dtype=np.int16).tobytes()
    assert is_silence(silence)

    # Loud tone
    t = np.linspace(0, 1.0, 16000, dtype=np.float32)
    tone = (np.sin(2 * np.pi * 440 * t) * 16000).astype(np.int16).tobytes()
    assert not is_silence(tone)

    print("Silence detection OK")
def test_noise_filter():
    """Check that preprocessing keeps the audio length unchanged."""
    sample_rate = 16000
    timeline = np.linspace(0, 1.0, sample_rate, dtype=np.float32)

    # A 300Hz speech-band tone plus a 50Hz hum
    speech = (np.sin(2 * np.pi * 300 * timeline) * 10000).astype(np.int16)
    hum = (np.sin(2 * np.pi * 50 * timeline) * 5000).astype(np.int16)

    # Sum in 32-bit so the addition cannot overflow, then clamp to 16-bit.
    combined = speech.astype(np.int32) + hum.astype(np.int32)
    mixed = np.clip(combined, -32768, 32767).astype(np.int16).tobytes()

    filtered = preprocess_audio(mixed, sample_rate)
    assert len(filtered) == len(mixed)
    print("Noise filter OK")
async def test_whisper_transcription():
    """Send synthetic silence to Whisper and print what comes back."""
    # 32000 zero samples = 2 seconds at 16kHz (should return empty)
    silence = np.zeros(32000, dtype=np.int16).tobytes()
    client = WhisperClient()
    transcript = await client.transcribe(silence)
    print(f"Whisper silence test: '{transcript}'")
if __name__ == "__main__":
    # Synchronous checks run first, in the order they are defined.
    for sync_check in (test_pcm_to_wav, test_silence_detection,
                       test_noise_filter):
        sync_check()
    # The Whisper check is a coroutine and needs an event loop.
    asyncio.run(test_whisper_transcription())
    print("\nAll ASR tests passed!")
Key Takeaways
- Whisper accepts complete audio files, not streaming chunks. We buffer audio and use silence detection to know when to send it.
- The WhisperClient accumulates PCM chunks, monitors RMS energy, and triggers transcription after 0.8 seconds of silence.
- Audio preprocessing (high-pass filter + volume normalization) significantly improves transcription quality in noisy environments.
- The ASRStreamHandler wraps the client with a clean async interface for the WebSocket handler.
What Is Next
In the next lesson, you will build the LLM conversation engine — the brain that processes transcripts, maintains conversation context, supports tool calling, and generates streaming responses.