Enhancements & Next Steps
Take your voice assistant to the next level. This lesson covers wake word detection for hands-free activation, multi-language support, telephony integration for phone-based access, and answers the most common questions voice assistant builders ask.
Enhancement 1: Wake Word Detection
Wake word detection lets users activate the assistant by saying a keyword (like "Hey Assistant") instead of pressing a button. This enables true hands-free operation.
# app/wakeword/detector.py
"""Wake word detection using Porcupine or a simple energy-based approach."""
import logging
import numpy as np
from typing import Optional
logger = logging.getLogger(__name__)
class SimpleWakeWordDetector:
    """Energy-based stand-in for wake word detection.

    This class does not recognise a keyword. It fires once when the
    normalised RMS energy of ``activation_frames`` consecutive frames
    exceeds ``energy_threshold``, i.e. when someone starts speaking
    after a period of silence. For production use, integrate
    Picovoice Porcupine for keyword detection.

    For Porcupine integration:
        pip install pvporcupine
        detector = pvporcupine.create(
            access_key="YOUR_KEY",
            keywords=["hey assistant"]
        )
    """

    def __init__(self, energy_threshold: float = 0.02,
                 activation_frames: int = 3):
        """Configure the detector.

        Args:
            energy_threshold: RMS level (samples scaled by 1/32768)
                a frame must exceed to count as active.
            activation_frames: Number of consecutive active frames
                required before the detector fires.
        """
        self.energy_threshold = energy_threshold
        self.activation_frames = activation_frames
        self._consecutive_active = 0
        self._is_activated = False

    def process_frame(self, audio_frame: bytes) -> bool:
        """Process an audio frame and check for wake word.

        Args:
            audio_frame: Raw PCM audio (16-bit, 16kHz)

        Returns:
            True if wake word detected (start listening). Fires only
            once until ``reset()`` is called.
        """
        samples = np.frombuffer(audio_frame, dtype=np.int16)
        if samples.size == 0:
            # The mean of an empty array is NaN and emits a
            # RuntimeWarning; an empty frame is simply silence.
            self._consecutive_active = 0
            return False

        # Widen to float64 before squaring so int16 values cannot overflow.
        energy = np.sqrt(np.mean(samples.astype(np.float64) ** 2)) / 32768.0
        if energy > self.energy_threshold:
            self._consecutive_active += 1
        else:
            self._consecutive_active = 0

        if (self._consecutive_active >= self.activation_frames
                and not self._is_activated):
            self._is_activated = True
            logger.info("Wake word detected - starting to listen")
            return True
        return False

    def reset(self):
        """Reset after an utterance is processed."""
        self._is_activated = False
        self._consecutive_active = 0
class PorcupineWakeWordDetector:
    """Production wake word detection using Picovoice Porcupine.

    Porcupine runs entirely on-device with no cloud dependency.
    It detects custom keywords with high accuracy and low
    false-positive rates.

    Setup:
        pip install pvporcupine
        # Get a free access key at picovoice.ai/console
    """

    def __init__(self, access_key: str,
                 keywords: Optional[list[str]] = None):
        """Create the Porcupine engine.

        Args:
            access_key: Picovoice access key.
            keywords: Keywords to listen for; falls back to
                ``["computer"]`` when omitted or empty.

        Raises:
            ImportError: If the ``pvporcupine`` package is missing.
        """
        # Only the import is guarded, so an error raised while creating
        # the engine is never mistaken for a missing package.
        try:
            import pvporcupine
        except ImportError as exc:
            raise ImportError(
                "Install pvporcupine: pip install pvporcupine"
            ) from exc

        active_keywords = keywords or ["computer"]
        self.porcupine = pvporcupine.create(
            access_key=access_key,
            keywords=active_keywords
        )
        self.frame_length = self.porcupine.frame_length
        self.sample_rate = self.porcupine.sample_rate
        # Log the keywords actually in use, not the raw (possibly None) argument.
        logger.info(
            f"Porcupine initialized: keywords={active_keywords}, "
            f"frame_length={self.frame_length}"
        )

    def process_frame(self, audio_frame: bytes) -> bool:
        """Process a frame through Porcupine.

        Args:
            audio_frame: PCM audio, must be exactly frame_length samples

        Returns:
            True if wake word detected
        """
        samples = np.frombuffer(audio_frame, dtype=np.int16)
        keyword_index = self.porcupine.process(samples)
        if keyword_index >= 0:
            logger.info(f"Porcupine: keyword {keyword_index} detected")
            return True
        return False

    def cleanup(self):
        """Release Porcupine resources.

        Safe to call more than once, and safe on an instance whose
        constructor failed before the engine was created.
        """
        engine = getattr(self, "porcupine", None)
        if engine is not None:
            engine.delete()
            # Drop the handle so a second cleanup() cannot delete it again.
            self.porcupine = None
Enhancement 2: Multi-Language Support
Adding multi-language support requires changes to all three pipeline stages:
# app/i18n/language_manager.py
"""Multi-language support for the voice assistant."""
import logging
from dataclasses import dataclass
logger = logging.getLogger(__name__)
@dataclass
class LanguageConfig:
    """Per-language settings for the voice assistant.

    Attributes:
        code: ISO 639-1 code.
        name: Display name.
        whisper_language: Whisper language code.
        tts_voice_elevenlabs: ElevenLabs voice ID.
        tts_voice_openai: OpenAI TTS voice.
        system_prompt_addition: Additional system prompt.
    """

    code: str
    name: str
    whisper_language: str
    tts_voice_elevenlabs: str
    tts_voice_openai: str
    system_prompt_addition: str
# Supported languages, keyed by each config's own ``code`` so the key
# and the config can never drift apart.
# NOTE(review): the voice labels in the trailing comments are taken on
# trust - confirm each ElevenLabs voice ID against your voice library.
LANGUAGES = {
    config.code: config
    for config in (
        LanguageConfig(
            code="en",
            name="English",
            whisper_language="en",
            tts_voice_elevenlabs="21m00Tcm4TlvDq8ikWAM",  # Rachel
            tts_voice_openai="nova",
            system_prompt_addition="",
        ),
        LanguageConfig(
            code="es",
            name="Spanish",
            whisper_language="es",
            tts_voice_elevenlabs="ThT5KcBeYPX3keUQqHPh",  # Spanish voice
            tts_voice_openai="nova",
            system_prompt_addition="Respond in Spanish. Use natural, conversational Spanish.",
        ),
        LanguageConfig(
            code="fr",
            name="French",
            whisper_language="fr",
            tts_voice_elevenlabs="XrExE9yKIg1WjnnlVkGX",  # French voice
            tts_voice_openai="nova",
            system_prompt_addition="Respond in French. Use natural, conversational French.",
        ),
        LanguageConfig(
            code="de",
            name="German",
            whisper_language="de",
            tts_voice_elevenlabs="pNInz6obpgDQGcFmaJgB",  # German voice
            tts_voice_openai="onyx",
            system_prompt_addition="Respond in German. Use natural, conversational German.",
        ),
        LanguageConfig(
            code="ja",
            name="Japanese",
            whisper_language="ja",
            tts_voice_elevenlabs="Xb7hH8MSUJpSbSDYk0k2",  # Japanese voice
            tts_voice_openai="shimmer",
            system_prompt_addition="Respond in Japanese. Use polite, conversational Japanese.",
        ),
    )
}
class LanguageManager:
    """Manage language selection and configuration.

    Supports automatic language detection from speech
    or manual language selection by the user.
    """

    def __init__(self, default_language: str = "en"):
        """Start with ``default_language`` as the active language.

        The code is stored as given; ``get_config()`` falls back to
        English if it is not a supported language.
        """
        self.current_language = default_language

    def get_config(self) -> LanguageConfig:
        """Get the current language configuration.

        Returns:
            The config for the active language, or the English config
            when the active code is not in ``LANGUAGES``.
        """
        return LANGUAGES.get(self.current_language, LANGUAGES["en"])

    def set_language(self, code: str) -> bool:
        """Set the active language.

        Args:
            code: ISO 639-1 language code. Case and surrounding
                whitespace are ignored, so "EN" and " en " select "en".

        Returns:
            True if language is supported; False otherwise, in which
            case the active language is left unchanged.
        """
        # Registry keys are lowercase; normalise so callers that pass
        # "EN" or " en " are not silently rejected.
        normalized = code.strip().lower() if isinstance(code, str) else code
        if normalized in LANGUAGES:
            self.current_language = normalized
            logger.info(f"Language set to: {LANGUAGES[normalized].name}")
            return True
        return False

    def detect_language_from_text(self, text: str) -> str:
        """Simple language detection heuristic.

        Placeholder: ``text`` is not inspected and the current language
        is returned. For production, use langdetect or the language
        returned by Whisper's auto-detection.

        Args:
            text: Input text

        Returns:
            Detected language code
        """
        # Whisper can auto-detect language when whisper_language
        # is not set. Use that detection result instead of this.
        return self.current_language

    def get_supported_languages(self) -> list[dict]:
        """List all supported languages as ``{"code", "name"}`` dicts."""
        return [
            {"code": lang.code, "name": lang.name}
            for lang in LANGUAGES.values()
        ]
Enhancement 3: Telephony Integration
Connect your voice assistant to phone calls using Twilio. Users can call a phone number and talk to your assistant.
# app/telephony/twilio_handler.py
"""Twilio telephony integration for phone-based voice assistant."""
import logging
import base64
from fastapi import APIRouter, Request
from fastapi.responses import Response
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/telephony", tags=["telephony"])
# Twilio TwiML template for answering calls: speaks a greeting, then
# connects the call audio to the /ws/twilio WebSocket stream.
# The {host} placeholder is filled in with str.format() by the
# incoming-call handler.
TWIML_CONNECT = """<?xml version="1.0" encoding="UTF-8"?>
<Response>
<Say>Hello! I am your AI voice assistant. How can I help you today?</Say>
<Connect>
<Stream url="wss://{host}/ws/twilio" />
</Connect>
</Response>"""
@router.post("/incoming")
async def handle_incoming_call(request: Request):
    """Handle incoming Twilio phone call.

    Twilio sends a POST request when someone calls your number.
    We respond with TwiML that streams audio to our WebSocket.

    Setup:
        1. pip install twilio
        2. Get a Twilio phone number
        3. Set the Voice webhook URL to https://your-domain.com/telephony/incoming

    Args:
        request: The incoming webhook request; only its ``Host``
            header is read.

    Returns:
        An ``application/xml`` response containing the TwiML document.
    """
    from html import escape

    host = request.headers.get("host", "your-domain.com")
    # The Host header is client-controlled and ends up inside an XML
    # attribute, so escape it to keep the TwiML well-formed and to stop
    # a crafted header from injecting markup.
    twiml = TWIML_CONNECT.format(host=escape(host, quote=True))
    return Response(
        content=twiml,
        media_type="application/xml"
    )
# Twilio Media Stream WebSocket handler
# Receives the phone call's audio so it can be fed through the same
# ASR -> LLM -> TTS pipeline as the browser client.
async def handle_twilio_stream(websocket):
    """Serve one Twilio Media Stream connection on /ws/twilio.

    Twilio sends audio as base64-encoded mulaw at 8kHz. The full
    round trip is:
        1. Decode from base64
        2. Convert from mulaw to PCM
        3. Upsample from 8kHz to 16kHz
        4. Send through ASR -> LLM -> TTS
        5. Convert TTS output back to mulaw
        6. Send back to Twilio

    Only step 1 is implemented here; the rest is marked TODO below.
    The loop ends on a "stop" event. Any exception is logged and
    ends the handler without propagating.

    Args:
        websocket: Connection object providing ``accept()`` and
            ``receive_text()``.
    """
    import json

    await websocket.accept()
    stream_sid = None
    logger.info("Twilio media stream connected")

    try:
        while True:
            data = json.loads(await websocket.receive_text())
            event = data.get("event")

            if event == "stop":
                logger.info(f"Twilio stream ended: {stream_sid}")
                break

            if event == "start":
                stream_sid = data["start"]["streamSid"]
                logger.info(f"Twilio stream started: {stream_sid}")
                continue

            if event == "media":
                # Audio from the phone call
                audio_bytes = base64.b64decode(data["media"]["payload"])
                # TODO: Convert mulaw 8kHz -> PCM 16kHz
                # TODO: Process through ASR -> LLM -> TTS
                # TODO: Convert response back to mulaw
                # TODO: Send back to Twilio via WebSocket
                # Send audio back to Twilio:
                # await websocket.send_json({
                #     "event": "media",
                #     "streamSid": stream_sid,
                #     "media": {
                #         "payload": base64.b64encode(response_audio).decode()
                #     }
                # })

            # Any other event type is ignored.
    except Exception as e:
        logger.error(f"Twilio stream error: {e}")
# Audio conversion helpers for telephony
#
# ``audioop`` was deprecated in Python 3.11 and removed in 3.13
# (PEP 594). Use it when it is available and fall back to the
# pure-Python helpers below otherwise, so the module keeps working on
# current interpreters.
try:
    import audioop as _audioop
except ImportError:
    _audioop = None

# Upper bound of each mu-law segment for 14-bit magnitudes (bias included).
_ULAW_SEGMENT_ENDS = (0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF)


def _ulaw_byte_to_sample(code: int) -> int:
    """Decode one G.711 mu-law byte to a signed 16-bit sample."""
    inverted = ~code & 0xFF  # mu-law bytes are stored bit-inverted
    magnitude = (((inverted & 0x0F) << 3) + 0x84) << ((inverted & 0x70) >> 4)
    return 0x84 - magnitude if inverted & 0x80 else magnitude - 0x84


def _sample_to_ulaw_byte(sample: int) -> int:
    """Encode one signed 16-bit sample as a G.711 mu-law byte."""
    value = sample >> 2  # mu-law quantises a 14-bit magnitude
    if value < 0:
        value = -value
        sign_mask = 0x7F
    else:
        sign_mask = 0xFF
    value += 0x84 >> 2  # standard bias, scaled to 14 bits
    for segment, segment_end in enumerate(_ULAW_SEGMENT_ENDS):
        if value <= segment_end:
            mantissa = (value >> (segment + 1)) & 0x0F
            return ((segment << 4) | mantissa) ^ sign_mask
    return 0x7F ^ sign_mask  # beyond the last segment: clip to maximum


def _resample_linear(samples, in_rate: int, out_rate: int) -> list:
    """Resample mono samples by linear interpolation between neighbours."""
    from math import gcd

    divisor = gcd(in_rate, out_rate)
    in_step = in_rate // divisor
    out_step = out_rate // divisor

    resampled = []
    previous = current = 0
    phase = -out_step
    for sample in samples:
        previous, current = current, sample
        phase += out_step
        # Emit every output sample that falls between previous and current.
        while phase >= 0:
            resampled.append(
                (previous * phase + current * (out_step - phase)) // out_step
            )
            phase -= in_step
    return resampled


def mulaw_to_pcm(mulaw_data: bytes) -> bytes:
    """Convert mu-law encoded audio to 16-bit PCM.

    Twilio sends audio as 8-bit mu-law at 8kHz.
    We need 16-bit PCM at 16kHz for Whisper.

    Args:
        mulaw_data: Mono mu-law bytes sampled at 8kHz.

    Returns:
        Mono 16-bit PCM (native byte order) at 16kHz; ``b""`` for
        empty input.
    """
    if not mulaw_data:
        return b""

    if _audioop is not None:
        # Decode mu-law to PCM (16-bit)
        pcm_8khz = _audioop.ulaw2lin(mulaw_data, 2)
        # Upsample from 8kHz to 16kHz
        return _audioop.ratecv(pcm_8khz, 2, 1, 8000, 16000, None)[0]

    from array import array

    decoded = [_ulaw_byte_to_sample(code) for code in mulaw_data]
    return array("h", _resample_linear(decoded, 8000, 16000)).tobytes()


def pcm_to_mulaw(pcm_data: bytes, sample_rate: int = 16000) -> bytes:
    """Convert 16-bit PCM to mu-law for Twilio.

    TTS output is typically 16-bit PCM at 16kHz or higher.
    Twilio expects 8-bit mu-law at 8kHz.

    Args:
        pcm_data: Mono 16-bit PCM in native byte order.
        sample_rate: Sample rate of ``pcm_data`` in Hz. Defaults to
            16000; pass the real rate (e.g. 24000) when the TTS
            provider uses a different one, otherwise the audio is
            resampled by the wrong ratio.

    Returns:
        Mono mu-law bytes at 8kHz; ``b""`` for empty input.

    Raises:
        ValueError: If ``sample_rate`` is not positive.
    """
    if sample_rate <= 0:
        raise ValueError("sample_rate must be a positive integer")
    if not pcm_data:
        return b""

    if _audioop is not None:
        # Downsample to 8kHz
        pcm_8khz = _audioop.ratecv(pcm_data, 2, 1, sample_rate, 8000, None)[0]
        # Encode as mu-law
        return _audioop.lin2ulaw(pcm_8khz, 2)

    from array import array

    samples = array("h")
    samples.frombytes(pcm_data)
    return bytes(
        _sample_to_ulaw_byte(sample)
        for sample in _resample_linear(samples, sample_rate, 8000)
    )
To test locally, expose your server with ngrok (ngrok http 8000). Set your Twilio number's Voice webhook URL to https://your-ngrok-url/telephony/incoming.
Enhancement 4: Interruption Handling
Let users interrupt the assistant while it is speaking, just like a natural conversation:
# app/ws/interruption.py
"""Handle user interruptions during assistant speech."""
import logging
import asyncio
from typing import Optional
logger = logging.getLogger(__name__)
class InterruptionHandler:
    """Detect and handle user interruptions.

    When the user starts speaking while the assistant is
    still talking, we should:
        1. Stop TTS playback immediately
        2. Cancel any pending TTS requests
        3. Transcribe the user's new input
        4. Generate a new response

    This creates a more natural conversational experience.
    """

    def __init__(self):
        self._is_speaking = False
        self._cancel_event = asyncio.Event()

    @property
    def is_speaking(self) -> bool:
        """Whether the assistant is currently marked as speaking."""
        return self._is_speaking

    def start_speaking(self):
        """Mark that the assistant started speaking."""
        self._is_speaking = True
        # Clear any interruption left over from the previous utterance.
        self._cancel_event.clear()

    def stop_speaking(self):
        """Mark that the assistant stopped speaking."""
        self._is_speaking = False

    def interrupt(self):
        """Signal that the user interrupted.

        Has no effect unless the assistant is currently speaking.
        """
        if self._is_speaking:
            logger.info("User interrupted - cancelling speech")
            self._cancel_event.set()
            self._is_speaking = False

    @property
    def should_cancel(self) -> bool:
        """Check if current speech should be cancelled."""
        return self._cancel_event.is_set()

    async def speak_with_interruption(
        self,
        audio_generator,
        websocket
    ):
        """Stream audio with interruption support.

        Args:
            audio_generator: Async generator of audio chunks
            websocket: WebSocket to send audio through

        Stops sending audio immediately if interrupted. The
        interruption flag is checked before each chunk is sent; on
        interruption a ``stop_playback`` message is sent and the
        generator is closed so upstream work can stop. An
        ``audio_done`` message is always sent at the end.
        """
        self.start_speaking()
        try:
            async for chunk in audio_generator:
                if self.should_cancel:
                    logger.info("Speech cancelled by interruption")
                    # Tell client to stop playback
                    await websocket.send_json({
                        "type": "stop_playback"
                    })
                    break
                await websocket.send_bytes(chunk)
        finally:
            self.stop_speaking()
            try:
                # Breaking out of ``async for`` leaves the generator
                # suspended; close it so its cleanup (and any pending
                # TTS request behind it) runs now rather than at GC.
                close_generator = getattr(audio_generator, "aclose", None)
                if close_generator is not None:
                    await close_generator()
            finally:
                await websocket.send_json({
                    "type": "audio_done"
                })
Enhancement 5: Conversation Analytics
# app/analytics/tracker.py
"""Track conversation analytics for insights."""
from dataclasses import dataclass, field
from datetime import datetime

# NOTE: ``typing`` exports no lowercase ``list`` (importing it raises
# ImportError); the built-in ``list[...]`` generic is used directly.


@dataclass
class TurnMetrics:
    """Metrics for a single conversation turn."""

    timestamp: datetime
    user_text: str
    assistant_text: str
    asr_latency_ms: float
    llm_latency_ms: float
    tts_latency_ms: float
    total_latency_ms: float
    # default_factory gives every turn its own list rather than a shared one.
    tools_used: list[str] = field(default_factory=list)
def _utc_now_naive() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Same value ``datetime.utcnow()`` would give, without relying on
    that call, which is deprecated since Python 3.12.
    """
    from datetime import timezone

    return datetime.now(timezone.utc).replace(tzinfo=None)


@dataclass
class ConversationAnalytics:
    """Analytics for a full conversation session."""

    session_id: str
    # Naive UTC. Turn timestamps must be naive too, because
    # ``duration_seconds`` subtracts this from the last turn's timestamp
    # and Python refuses to mix naive and aware datetimes.
    started_at: datetime = field(default_factory=_utc_now_naive)
    turns: list[TurnMetrics] = field(default_factory=list)

    @property
    def turn_count(self) -> int:
        """Number of turns recorded so far."""
        return len(self.turns)

    @property
    def avg_latency_ms(self) -> float:
        """Mean ``total_latency_ms`` over all turns; 0.0 with no turns."""
        if not self.turns:
            return 0.0
        return sum(t.total_latency_ms for t in self.turns) / len(self.turns)

    @property
    def duration_seconds(self) -> float:
        """Seconds from session start to the last turn; 0.0 with no turns."""
        if not self.turns:
            return 0.0
        return (self.turns[-1].timestamp - self.started_at).total_seconds()

    def get_summary(self) -> dict:
        """Return a summary of the session as a plain dict.

        ``tools_used`` holds each distinct tool name once, sorted so
        the summary is identical from run to run.
        """
        return {
            "session_id": self.session_id,
            "turns": self.turn_count,
            "duration_seconds": round(self.duration_seconds, 1),
            "avg_latency_ms": round(self.avg_latency_ms, 0),
            "tools_used": sorted({
                tool for t in self.turns for tool in t.tools_used
            })
        }
Frequently Asked Questions
Can I run Whisper locally instead of using the API?
Yes. Install openai-whisper or use faster-whisper (CTranslate2-based, 4x faster). Local inference eliminates the API call latency (~200ms) and cost, but requires a GPU for real-time performance. A T4 GPU can transcribe audio faster than real-time with faster-whisper.
# Local Whisper with faster-whisper
# pip install faster-whisper
from faster_whisper import WhisperModel

model = WhisperModel("base", device="cuda", compute_type="float16")
segments, info = model.transcribe("audio.wav", language="en")
# Strip each segment before joining: segment text may carry its own
# leading space, which would otherwise double up with the separator.
text = " ".join(segment.text.strip() for segment in segments)
How do I reduce end-to-end latency below 1 second?
The biggest wins come from:
- Local Whisper: Eliminates 200-500ms of API latency.
- GPT-4o-mini instead of GPT-4o: 50% faster first-token latency, 90% cheaper.
- ElevenLabs Turbo v2.5: ~200ms time-to-first-audio.
- Sentence-level streaming: Overlap LLM generation with TTS synthesis.
- Connection pooling: Reuse HTTPS connections to eliminate TLS handshakes.
- Shorter silence threshold: Reduce from 800ms to 500ms (but increases false triggers).
How do I handle background noise in production?
Layer these techniques:
- Enable browser-level noise suppression: `noiseSuppression: true` in `getUserMedia`.
- Apply the high-pass filter from our audio_utils module to remove low-frequency rumble.
- Increase the silence detection threshold in noisy environments.
- Use WebRTC VAD (Voice Activity Detection) for more reliable speech/silence detection than simple RMS energy.
- For very noisy environments, consider a noise suppression model like RNNoise or DTLN.
Can I use this with mobile apps?
Yes. The WebSocket protocol works from native mobile apps. For iOS, use URLSessionWebSocketTask. For Android, use OkHttp's WebSocket client. For React Native, use the react-native-websocket package. The audio capture code will differ (use AVAudioEngine on iOS, AudioRecord on Android) but the server-side pipeline is identical.
How much does it cost to run?
Typical costs per conversation turn (one user question + one response):
- Whisper ASR: $0.003 (30 seconds of audio at $0.006/min)
- GPT-4o: $0.005 (500 input tokens + 100 output tokens)
- ElevenLabs TTS: $0.00 (free tier: 10,000 chars/month)
- Total per turn: ~$0.008 or about $0.05 per 6-turn conversation
At 1,000 conversations per month, expect about $50 in API costs. Use GPT-4o-mini to cut LLM costs by 90%.
How do I add custom tools?
Add a new tool in three steps:
- Define the JSON schema in `TOOL_SCHEMAS` in `app/llm/tools.py`.
- Implement the function in the same file.
- Add a case to the `execute_tool()` dispatcher.
The LLM will automatically discover and use the new tool based on user intent.
Can I fine-tune the voice?
With ElevenLabs, you can clone a voice by uploading 1-3 minutes of sample audio. This creates a custom voice that sounds like a specific person. With OpenAI TTS, you are limited to the 6 built-in voices but can adjust speed. For ultimate control, consider running a local TTS model like XTTS or Bark.
Complete Architecture Diagram
+---------------------------+
| Browser / Mobile App |
| |
| [Mic] -> [AudioWorklet] |
| [Speaker] <- [AudioQueue] |
| [Waveform Canvas] |
| [Conversation UI] |
+-----------|---|-------------+
| |
WebSocket (wss://)
text + binary frames
| |
+-----------|---|-------------+
| Nginx (SSL) |
+-----------|---|-------------+
| |
+-----------|---|-------------+
| FastAPI Server |
| |
| [Session Manager] |
| [Interruption Handler] |
| |
| Audio In -> [ASR Module] |
| Whisper API |
| | |
| transcript |
| | |
| [LLM Engine] |
| GPT-4o + Tools |
| Memory Manager |
| | |
| response text |
| | |
| [TTS Module] |
| ElevenLabs / OpenAI TTS |
| | |
| Audio Out -> |
+-----------------------------+
What You Have Built
Congratulations! You have built a complete, production-ready voice assistant from scratch. Here is a recap of what each lesson delivered:
- Lesson 1 (Project Setup): ASR-LLM-TTS architecture, project structure, API configuration, and connectivity verification.
- Lesson 2 (Speech Recognition): Whisper integration with streaming audio, silence detection, noise filtering, and a clean async ASR interface.
- Lesson 3 (LLM Brain): Conversation engine with memory management, tool calling (time, weather, reminders), streaming generation, and voice-optimized text cleaning.
- Lesson 4 (TTS): Dual-provider TTS with ElevenLabs and OpenAI, streaming audio output, automatic fallback, and sentence-level streaming for low latency.
- Lesson 5 (Web Interface): Browser-based voice UI with WebSocket audio streaming, push-to-talk, waveform visualization, and TTS audio playback.
- Lesson 6 (Deployment): Docker containerization, nginx WebSocket proxy, latency optimization, concurrent session management, and cost monitoring.
- Lesson 7 (Enhancements): Wake word detection, multi-language support, telephony integration, interruption handling, and analytics.
The full codebase is modular, well-tested, and ready for customization. Pick the enhancements that matter most for your use case and build on the foundation you have created.