Step 3: Text-to-Speech
Build the voice of your assistant. You will integrate two TTS providers (ElevenLabs and OpenAI TTS), implement streaming audio output for low-latency playback, create a voice manager for voice selection and switching, and build a sentence-level streaming pipeline that starts speaking before the LLM finishes generating.
TTS Provider Comparison
We support two TTS providers. Each has different strengths:
| Feature | ElevenLabs | OpenAI TTS |
|---|---|---|
| Voice Quality | Premium, ultra-realistic | Good, natural-sounding |
| Streaming | Yes (WebSocket + HTTP) | Yes (HTTP streaming) |
| Latency | ~300ms (Turbo v2.5) | ~500ms |
| Cost | Free tier: 10k chars/mo | $15/1M characters |
| Voice Cloning | Yes | No |
| Best For | Production, premium UX | Development, cost-sensitive |
ElevenLabs Streaming TTS Client
The ElevenLabs client uses their streaming API to start receiving audio chunks within 300ms. We send text and receive MP3 audio chunks in real time.
# app/tts/elevenlabs_client.py
"""ElevenLabs streaming text-to-speech client."""
import logging
from typing import AsyncGenerator
import httpx
from app.config import get_settings
logger = logging.getLogger(__name__)
settings = get_settings()
class ElevenLabsClient:
"""ElevenLabs TTS client with streaming support.
Sends text to the ElevenLabs API and yields audio chunks
as they arrive. The streaming API starts returning audio
within ~300ms of the request.
"""
BASE_URL = "https://api.elevenlabs.io/v1"
def __init__(self):
self.api_key = settings.elevenlabs_api_key
self.voice_id = settings.elevenlabs_voice_id
self.model_id = settings.elevenlabs_model
async def synthesize_stream(
self,
text: str,
voice_id: str | None = None,
output_format: str = "mp3_44100_128"
) -> AsyncGenerator[bytes, None]:
"""Stream TTS audio from ElevenLabs.
Args:
text: Text to synthesize
voice_id: Override default voice ID
output_format: Audio format (mp3_44100_128, pcm_16000, etc.)
Yields:
Audio chunks as bytes (MP3 or PCM)
"""
vid = voice_id or self.voice_id
url = f"{self.BASE_URL}/text-to-speech/{vid}/stream"
headers = {
"xi-api-key": self.api_key,
"Content-Type": "application/json",
"Accept": "audio/mpeg"
}
payload = {
"text": text,
"model_id": self.model_id,
"voice_settings": {
"stability": 0.5,
"similarity_boost": 0.75,
"style": 0.0,
"use_speaker_boost": True
},
"output_format": output_format
}
logger.debug(
f"ElevenLabs TTS: '{text[:50]}...' "
f"(voice={vid}, model={self.model_id})"
)
async with httpx.AsyncClient(timeout=30.0) as client:
async with client.stream(
"POST", url, headers=headers, json=payload
) as response:
if response.status_code != 200:
error = await response.aread()
logger.error(
f"ElevenLabs error {response.status_code}: {error}"
)
raise Exception(
f"ElevenLabs API error: {response.status_code}"
)
chunk_count = 0
total_bytes = 0
async for chunk in response.aiter_bytes(chunk_size=4096):
chunk_count += 1
total_bytes += len(chunk)
yield chunk
logger.info(
f"ElevenLabs TTS complete: "
f"{chunk_count} chunks, {total_bytes} bytes"
)
async def synthesize_full(
self, text: str, voice_id: str | None = None
) -> bytes:
"""Synthesize text and return complete audio.
Args:
text: Text to synthesize
voice_id: Override default voice ID
Returns:
Complete MP3 audio as bytes
"""
chunks = []
async for chunk in self.synthesize_stream(text, voice_id):
chunks.append(chunk)
return b"".join(chunks)
async def list_voices(self) -> list[dict]:
"""List all available voices.
Returns:
List of voice dicts with id, name, and labels
"""
async with httpx.AsyncClient() as client:
response = await client.get(
f"{self.BASE_URL}/voices",
headers={"xi-api-key": self.api_key}
)
data = response.json()
voices = []
for v in data.get("voices", []):
voices.append({
"id": v["voice_id"],
"name": v["name"],
"category": v.get("category", ""),
"labels": v.get("labels", {})
})
return voices
OpenAI TTS Client (Fallback)
The OpenAI TTS client is simpler and cheaper. Use it for development or when ElevenLabs quota is exhausted.
# app/tts/openai_tts.py
"""OpenAI TTS client with streaming support."""
import logging
from typing import AsyncGenerator
from openai import AsyncOpenAI
from app.config import get_settings
logger = logging.getLogger(__name__)
settings = get_settings()
class OpenAITTSClient:
"""OpenAI Text-to-Speech client.
Uses the OpenAI TTS API which supports 6 built-in voices:
alloy, echo, fable, onyx, nova, shimmer.
Two models available:
- tts-1: Faster, lower quality ($15/1M chars)
- tts-1-hd: Slower, higher quality ($30/1M chars)
"""
# Available voices with descriptions
VOICES = {
"alloy": "Neutral, balanced",
"echo": "Warm, deep",
"fable": "British, expressive",
"onyx": "Deep, authoritative",
"nova": "Friendly, conversational",
"shimmer": "Clear, energetic"
}
def __init__(self):
self.client = AsyncOpenAI(api_key=settings.openai_api_key)
self.model = settings.openai_tts_model
self.voice = settings.openai_tts_voice
async def synthesize_stream(
self,
text: str,
voice: str | None = None,
response_format: str = "mp3",
speed: float = 1.0
) -> AsyncGenerator[bytes, None]:
"""Stream TTS audio from OpenAI.
Args:
text: Text to synthesize
voice: Override default voice
response_format: mp3, opus, aac, flac, wav, or pcm
speed: Playback speed (0.25 to 4.0)
Yields:
Audio chunks as bytes
"""
v = voice or self.voice
logger.debug(
f"OpenAI TTS: '{text[:50]}...' "
f"(voice={v}, model={self.model})"
)
async with self.client.audio.speech.with_streaming_response.create(
model=self.model,
voice=v,
input=text,
response_format=response_format,
speed=speed
) as response:
chunk_count = 0
total_bytes = 0
async for chunk in response.iter_bytes(chunk_size=4096):
chunk_count += 1
total_bytes += len(chunk)
yield chunk
logger.info(
f"OpenAI TTS complete: "
f"{chunk_count} chunks, {total_bytes} bytes"
)
async def synthesize_full(
self, text: str, voice: str | None = None
) -> bytes:
"""Synthesize text and return complete audio.
Args:
text: Text to synthesize
voice: Override default voice
Returns:
Complete audio as bytes
"""
chunks = []
async for chunk in self.synthesize_stream(text, voice):
chunks.append(chunk)
return b"".join(chunks)
Voice Manager
The voice manager abstracts the TTS provider selection and provides a unified interface:
# app/tts/voice_manager.py
"""Voice manager - unified TTS interface with provider switching."""
import logging
from typing import AsyncGenerator
from app.config import get_settings
from app.tts.elevenlabs_client import ElevenLabsClient
from app.tts.openai_tts import OpenAITTSClient
from app.llm.voice_optimizations import clean_for_speech, split_into_sentences
logger = logging.getLogger(__name__)
settings = get_settings()
class VoiceManager:
"""Unified TTS interface with automatic fallback.
Uses ElevenLabs as primary and OpenAI TTS as fallback.
Handles text cleaning, sentence splitting for natural
prosody, and audio streaming.
Usage:
manager = VoiceManager()
async for audio_chunk in manager.speak("Hello world!"):
send_audio(audio_chunk)
"""
def __init__(self):
self.provider = settings.tts_provider
self.elevenlabs = ElevenLabsClient()
self.openai_tts = OpenAITTSClient()
async def speak(
self,
text: str,
provider: str | None = None
) -> AsyncGenerator[bytes, None]:
"""Convert text to streaming audio.
Cleans the text for speech, selects the provider,
and yields audio chunks.
Args:
text: Text to speak
provider: Override default provider ("elevenlabs" or "openai")
Yields:
Audio chunks (MP3 bytes)
"""
# Clean text for natural speech
cleaned = clean_for_speech(text)
if not cleaned:
return
p = provider or self.provider
try:
if p == "elevenlabs":
async for chunk in self.elevenlabs.synthesize_stream(cleaned):
yield chunk
else:
async for chunk in self.openai_tts.synthesize_stream(cleaned):
yield chunk
except Exception as e:
logger.error(f"TTS error with {p}: {e}")
# Fallback to the other provider
fallback = "openai" if p == "elevenlabs" else "elevenlabs"
logger.info(f"Falling back to {fallback}")
try:
if fallback == "elevenlabs":
async for chunk in self.elevenlabs.synthesize_stream(cleaned):
yield chunk
else:
async for chunk in self.openai_tts.synthesize_stream(cleaned):
yield chunk
except Exception as e2:
logger.error(f"TTS fallback also failed: {e2}")
raise
async def speak_sentences(
self,
text: str,
provider: str | None = None
) -> AsyncGenerator[tuple[str, bytes], None]:
"""Speak text sentence by sentence for incremental playback.
Splits text into sentences and synthesizes each one
independently. This allows the frontend to start playing
the first sentence while later sentences are still being
synthesized.
Args:
text: Full text to speak
provider: Override default provider
Yields:
Tuples of (sentence_text, audio_bytes)
"""
cleaned = clean_for_speech(text)
sentences = split_into_sentences(cleaned)
for sentence in sentences:
if not sentence:
continue
p = provider or self.provider
try:
if p == "elevenlabs":
audio = await self.elevenlabs.synthesize_full(sentence)
else:
audio = await self.openai_tts.synthesize_full(sentence)
yield (sentence, audio)
except Exception as e:
logger.error(f"TTS error for sentence: {e}")
continue
async def speak_streaming_llm(
self,
text_generator: AsyncGenerator[str, None]
) -> AsyncGenerator[bytes, None]:
"""Speak text as it streams from the LLM.
Buffers LLM tokens until a complete sentence is formed,
then immediately sends it to TTS. This minimizes the
time between the LLM generating text and the user
hearing it.
Args:
text_generator: Async generator yielding text chunks from LLM
Yields:
Audio chunks (MP3 bytes)
"""
buffer = ""
sentence_enders = {'.', '!', '?'}
async for token in text_generator:
buffer += token
# Check if we have a complete sentence
if buffer and buffer[-1] in sentence_enders:
sentence = buffer.strip()
buffer = ""
if sentence:
logger.debug(f"TTS sentence: '{sentence}'")
async for audio_chunk in self.speak(sentence):
yield audio_chunk
# Handle remaining text
if buffer.strip():
async for audio_chunk in self.speak(buffer.strip()):
yield audio_chunk
The speak_streaming_llm() method is the key to low latency. As the LLM streams tokens, we buffer them until a sentence is complete (ending with ., !, or ?). The moment a sentence is ready, we send it to TTS and start streaming audio to the client. The user hears the first sentence while the LLM is still generating the second.
Testing TTS
# tests/test_tts.py
"""Test TTS providers."""
import asyncio
from app.tts.elevenlabs_client import ElevenLabsClient
from app.tts.openai_tts import OpenAITTSClient
from app.tts.voice_manager import VoiceManager
async def test_openai_tts():
    """Synthesize a short phrase with OpenAI TTS and save it as an MP3."""
    tts = OpenAITTSClient()
    print("Testing OpenAI TTS...")
    phrase = "Hello! I am your voice assistant."
    mp3_bytes = await tts.synthesize_full(phrase)
    print(f"OpenAI TTS OK - {len(mp3_bytes)} bytes of audio")

    # Keep the result on disk so it can be listened to manually.
    with open("test_openai.mp3", "wb") as out_file:
        out_file.write(mp3_bytes)
    print("Saved to test_openai.mp3")
async def test_elevenlabs_tts():
    """Synthesize a short phrase with ElevenLabs; report a skip on failure."""
    tts = ElevenLabsClient()
    print("\nTesting ElevenLabs TTS...")
    phrase = "Hello! I am your voice assistant."
    try:
        mp3_bytes = await tts.synthesize_full(phrase)
        print(f"ElevenLabs TTS OK - {len(mp3_bytes)} bytes of audio")
        # Keep the result on disk so it can be listened to manually.
        with open("test_elevenlabs.mp3", "wb") as out_file:
            out_file.write(mp3_bytes)
        print("Saved to test_elevenlabs.mp3")
    except Exception as err:
        # Any failure (API, network, file write) is reported as a skip.
        print(f"ElevenLabs SKIPPED: {err}")
async def test_streaming():
    """Stream a multi-sentence phrase through VoiceManager.speak()."""
    voice = VoiceManager()
    print("\nTesting streaming TTS...")
    phrase = "Hello! How are you doing today? I can help with many things."

    # Tally chunks as they arrive instead of keeping them in memory.
    received = 0
    byte_count = 0
    async for piece in voice.speak(phrase):
        received += 1
        byte_count += len(piece)
    print(f"Streaming OK - {received} chunks, {byte_count} bytes total")
async def test_voice_list():
    """Print the first few ElevenLabs voices; report a skip on failure."""
    tts = ElevenLabsClient()
    try:
        catalog = await tts.list_voices()
        print(f"\nAvailable voices ({len(catalog)}):")
        # Show at most five entries, with the voice ID truncated.
        preview = catalog[:5]
        for entry in preview:
            print(f" - {entry['name']} ({entry['id'][:8]}...)")
    except Exception as err:
        print(f"Voice list SKIPPED: {err}")
if __name__ == "__main__":
    # Run each check in its own event loop, in the listed order.
    for scenario in (
        test_openai_tts,
        test_elevenlabs_tts,
        test_streaming,
        test_voice_list,
    ):
        asyncio.run(scenario())
    print("\nAll TTS tests passed!")
VoiceManager provides a unified interface with automatic fallback. The sentence-level streaming pipeline ensures low latency by speaking as soon as a complete sentence is available. Play the generated MP3 files to verify audio quality.
Key Takeaways
- ElevenLabs offers premium voice quality with ~300ms latency; OpenAI TTS is cheaper and simpler for development.
- Streaming TTS starts returning audio chunks immediately — you do not need to wait for the full audio to be generated.
- The VoiceManager provides automatic fallback: if ElevenLabs fails, it falls back to OpenAI TTS seamlessly.
- Sentence-level streaming (buffering LLM tokens until a sentence is complete) is the key to minimizing perceived latency.
What Is Next
In the next lesson, you will build the web interface — a browser-based voice UI with WebSocket audio streaming, push-to-talk controls, waveform visualization, and real-time conversation display.