Step 4: Web Interface
Build the face of your voice assistant. You will create a browser-based UI with WebSocket audio streaming, a push-to-talk button, live waveform visualization, real-time transcript and response display, and audio playback of TTS output. Everything runs in vanilla HTML/CSS/JS with no framework.
The Complete WebSocket Handler (Server Side)
First, let us wire up the full server-side WebSocket handler that connects ASR, LLM, and TTS:
# app/ws/handler.py
"""WebSocket handler - orchestrates the full voice pipeline."""
import logging
import json
from fastapi import WebSocket, WebSocketDisconnect
from app.asr.stream_handler import ASRStreamHandler
from app.llm.engine import ConversationEngine
from app.tts.voice_manager import VoiceManager
from app.llm.voice_optimizations import clean_for_speech
logger = logging.getLogger(__name__)
class VoiceSessionHandler:
    """Handles a single voice WebSocket session.

    Orchestrates the full pipeline:
    1. Receive audio from client (binary WebSocket frames)
    2. Transcribe with Whisper (ASR)
    3. Generate response with GPT-4o (LLM)
    4. Synthesize speech (TTS)
    5. Send audio back to client (binary WebSocket frames)

    Text messages are sent as JSON for UI updates:
    - {"type": "transcript", "text": "..."} - user transcript
    - {"type": "response_text", "text": "..."} - LLM response chunk
    - {"type": "response_done", "text": "..."} - full response
    - {"type": "status", "message": "..."} - status updates
    - {"type": "error", "message": "..."} - error messages
    """

    def __init__(self) -> None:
        # Fresh pipeline components per session so conversation state
        # is isolated between concurrent clients.
        self.asr = ASRStreamHandler()
        self.llm = ConversationEngine()
        self.tts = VoiceManager()

    async def handle(self, websocket: WebSocket) -> None:
        """Main WebSocket handler loop for one client connection."""
        await websocket.accept()
        logger.info("Voice session started")
        await websocket.send_json({
            "type": "status",
            "message": "Connected! Hold the mic button to speak."
        })
        try:
            while True:
                # Receive the next raw ASGI message from the client.
                message = await websocket.receive()
                # Bugfix: raw receive() does NOT raise on disconnect - it
                # returns a {"type": "websocket.disconnect"} message. The
                # old code fell through, called receive() again on a dead
                # socket, and crashed with a RuntimeError.
                if message.get("type") == "websocket.disconnect":
                    raise WebSocketDisconnect(message.get("code") or 1000)
                # Bugfix: ASGI frames may carry BOTH "bytes" and "text"
                # keys with one set to None, so test the value, not
                # mere key presence.
                if message.get("bytes") is not None:
                    # Binary frame = audio data
                    await self._handle_audio(websocket, message["bytes"])
                elif message.get("text") is not None:
                    # Text frame = control message. Ignore malformed JSON
                    # instead of tearing down the whole session.
                    try:
                        data = json.loads(message["text"])
                    except json.JSONDecodeError:
                        logger.warning("Ignoring malformed control frame")
                        continue
                    await self._handle_control(websocket, data)
        except WebSocketDisconnect:
            logger.info("Voice session ended")
        except Exception as e:
            # Log the full traceback, then best-effort notify the client.
            logger.exception("Voice session error")
            try:
                await websocket.send_json({
                    "type": "error",
                    "message": str(e)
                })
            except Exception:
                # Socket may already be closed; nothing more we can do.
                pass

    async def _handle_audio(self, websocket: WebSocket,
                            audio_data: bytes) -> None:
        """Process incoming audio through ASR -> LLM -> TTS.

        Most chunks produce no ASR event; the full pipeline only runs
        once the ASR handler emits a "transcript" event.
        """
        # Step 1: ASR - feed the chunk into the streaming recognizer.
        event = await self.asr.process_chunk(audio_data)
        if not event or event.get("type") != "transcript":
            return
        transcript = event["text"]
        # Echo the transcript to the client for display.
        await websocket.send_json({
            "type": "transcript",
            "text": transcript
        })
        # Notify client we are thinking.
        await websocket.send_json({
            "type": "status",
            "message": "Thinking..."
        })
        # Step 2: LLM - stream response chunks to the client as they
        # arrive, accumulating the full text for TTS.
        full_response = ""
        async for chunk in self.llm.generate_response(transcript):
            full_response += chunk
            await websocket.send_json({
                "type": "response_text",
                "text": chunk
            })
        # Send the complete response so the UI can replace streamed text.
        await websocket.send_json({
            "type": "response_done",
            "text": full_response
        })
        # Step 3: TTS - synthesize and stream audio as binary frames.
        await websocket.send_json({
            "type": "status",
            "message": "Speaking..."
        })
        cleaned_text = clean_for_speech(full_response)
        async for audio_chunk in self.tts.speak(cleaned_text):
            await websocket.send_bytes(audio_chunk)
        # Signal audio stream complete.
        await websocket.send_json({
            "type": "audio_done"
        })

    async def _handle_control(self, websocket: WebSocket,
                              data: dict) -> None:
        """Handle control messages ({"action": ...}) from the client."""
        action = data.get("action")
        if action == "reset":
            # Drop conversation history and any in-flight ASR state.
            self.llm.reset()
            self.asr.stop()
            await websocket.send_json({
                "type": "status",
                "message": "Conversation reset."
            })
        elif action == "stop_listening":
            self.asr.stop()
        elif action == "start_listening":
            # Fresh ASR handler per utterance so stale buffers do not
            # bleed into the next recording.
            self.asr = ASRStreamHandler()
# Update app/main.py to use the handler:
# from app.ws.handler import VoiceSessionHandler
#
# @app.websocket("/ws/voice")
# async def voice_websocket(websocket: WebSocket):
# handler = VoiceSessionHandler()
# await handler.handle(websocket)
The Frontend: HTML Structure
Create the voice assistant UI. This is a single-page app with a microphone button, waveform canvas, and conversation log:
<!-- frontend/index.html -->
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Voice Assistant</title>
    <!-- Bugfix: app/main.py serves this page at "/" but mounts the
         frontend directory at "/static", so relative "style.css" and
         "app.js" resolved to /style.css and /app.js and returned 404.
         Absolute /static/ paths match the server's mount point. -->
    <link rel="stylesheet" href="/static/style.css">
</head>
<body>
    <div class="app">
        <header class="app-header">
            <h1>Voice Assistant</h1>
            <div class="status" id="status">Connecting...</div>
        </header>
        <!-- Waveform visualization -->
        <div class="waveform-container">
            <canvas id="waveform" width="600" height="100"></canvas>
        </div>
        <!-- Conversation log -->
        <div class="conversation" id="conversation">
            <div class="message system">
                Hold the microphone button and speak. Release to send.
            </div>
        </div>
        <!-- Controls: push-to-talk mic + reset.
             Bugfix: event.preventDefault() on the touch handlers stops
             the browser from firing synthetic mousedown/mouseup events
             after touchend, which re-triggered startRecording() right
             after the user released the button on touch devices. -->
        <div class="controls">
            <button class="mic-btn" id="micBtn"
                    onmousedown="startRecording()"
                    onmouseup="stopRecording()"
                    ontouchstart="event.preventDefault(); startRecording()"
                    ontouchend="event.preventDefault(); stopRecording()">
                <svg viewBox="0 0 24 24" width="32" height="32">
                    <path fill="currentColor"
                          d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9
                          3.34 9 5v6c0 1.66 1.34 3 3 3zm-1-9c0-.55.45-1
                          1-1s1 .45 1 1v6c0 .55-.45 1-1 1s-1-.45-1-1V5z"
                    />
                    <path fill="currentColor"
                          d="M17 11c0 2.76-2.24 5-5 5s-5-2.24-5-5H5c0
                          3.53 2.61 6.43 6 6.92V21h2v-3.08c3.39-.49 6-3.39
                          6-6.92h-2z"
                    />
                </svg>
            </button>
            <button class="reset-btn" id="resetBtn" onclick="resetConversation()">
                New Conversation
            </button>
        </div>
    </div>
    <script src="/static/app.js"></script>
</body>
</html>
The Frontend: CSS
/* frontend/style.css
   Dark-themed single-column layout: header on top, scrollable
   conversation in the middle, controls pinned at the bottom
   (flex column filling 100vh). */

/* Global reset: predictable box sizing everywhere. */
* {
    margin: 0;
    padding: 0;
    box-sizing: border-box;
}

body {
    font-family: 'Inter', -apple-system, sans-serif;
    background: #0f0f23;
    color: #e2e8f0;
    min-height: 100vh;
    display: flex;
    justify-content: center; /* center the .app column horizontally */
}

/* App shell: full-height column capped at a comfortable reading width. */
.app {
    width: 100%;
    max-width: 640px;
    display: flex;
    flex-direction: column;
    height: 100vh;
    padding: 1rem;
}

.app-header {
    text-align: center;
    padding: 1rem 0;
}

/* Gradient title text via background-clip. */
.app-header h1 {
    font-size: 1.5rem;
    background: linear-gradient(135deg, #6366f1, #a78bfa);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
}

/* Status line under the title; JS toggles .active / .error. */
.status {
    font-size: 0.85rem;
    color: #94a3b8;
    margin-top: 0.5rem;
}
.status.active { color: #4ade80; }
.status.error { color: #f87171; }

/* Waveform */
.waveform-container {
    background: #1a1a3e;
    border-radius: 12px;
    padding: 0.5rem;
    margin: 1rem 0;
}
/* CSS size differs from the canvas attribute size (600x100);
   the bitmap scales to fit the container. */
#waveform {
    width: 100%;
    height: 80px;
    display: block;
}

/* Conversation: the only flexible region, so it absorbs leftover
   height and scrolls independently. */
.conversation {
    flex: 1;
    overflow-y: auto;
    padding: 1rem 0;
    display: flex;
    flex-direction: column;
    gap: 0.75rem;
}

/* Chat bubbles: user right-aligned, assistant left, system centered. */
.message {
    padding: 0.75rem 1rem;
    border-radius: 12px;
    max-width: 85%;
    line-height: 1.5;
    font-size: 0.95rem;
}
.message.user {
    background: #6366f1;
    color: white;
    align-self: flex-end;
    border-bottom-right-radius: 4px;
}
.message.assistant {
    background: #1e293b;
    color: #e2e8f0;
    align-self: flex-start;
    border-bottom-left-radius: 4px;
}
.message.system {
    background: transparent;
    color: #64748b;
    text-align: center;
    font-size: 0.85rem;
    align-self: center;
}

/* Controls */
.controls {
    display: flex;
    align-items: center;
    justify-content: center;
    gap: 1rem;
    padding: 1.5rem 0;
}

/* Round push-to-talk button; JS adds .recording while held. */
.mic-btn {
    width: 72px;
    height: 72px;
    border-radius: 50%;
    border: none;
    background: linear-gradient(135deg, #6366f1, #8b5cf6);
    color: white;
    cursor: pointer;
    display: flex;
    align-items: center;
    justify-content: center;
    transition: all 0.2s;
    box-shadow: 0 4px 20px rgba(99, 102, 241, 0.3);
}
.mic-btn:hover {
    transform: scale(1.05);
    box-shadow: 0 6px 30px rgba(99, 102, 241, 0.5);
}
/* Recording state: red gradient + breathing pulse animation. */
.mic-btn.recording {
    background: linear-gradient(135deg, #ef4444, #f87171);
    box-shadow: 0 4px 20px rgba(239, 68, 68, 0.5);
    animation: pulse 1.5s ease-in-out infinite;
}
@keyframes pulse {
    0%, 100% { transform: scale(1); }
    50% { transform: scale(1.08); }
}

.reset-btn {
    padding: 0.5rem 1rem;
    border-radius: 8px;
    border: 1px solid #334155;
    background: transparent;
    color: #94a3b8;
    cursor: pointer;
    font-size: 0.85rem;
    transition: all 0.2s;
}
.reset-btn:hover {
    border-color: #6366f1;
    color: #e2e8f0;
}
The Frontend: JavaScript
The client-side JavaScript handles microphone capture, WebSocket communication, waveform visualization, and audio playback:
// frontend/app.js
// Voice Assistant - Client-Side Logic
// ============================================================
// WebSocket Connection
// ============================================================
// Bugfix: a hard-coded ws:// URL is blocked as mixed content when the
// page is served over HTTPS. Derive the scheme from the page protocol.
const WS_SCHEME = window.location.protocol === "https:" ? "wss" : "ws";
const WS_URL = `${WS_SCHEME}://${window.location.host}/ws/voice`;
let ws = null;            // active WebSocket, null until connected
let isConnected = false;  // true between onopen and onclose
function connectWebSocket() {
    // Open (or re-open) the voice WebSocket and install its handlers.
    ws = new WebSocket(WS_URL);
    ws.binaryType = "arraybuffer"; // TTS audio arrives as binary frames

    ws.onopen = () => {
        isConnected = true;
        setStatus("Ready. Hold the mic button to speak.", "active");
    };

    ws.onmessage = (event) => {
        const payload = event.data;
        if (payload instanceof ArrayBuffer) {
            // Binary frame = TTS audio from the server.
            handleAudioResponse(payload);
            return;
        }
        // Text frame = JSON control message for the UI.
        handleServerMessage(JSON.parse(payload));
    };

    ws.onclose = () => {
        isConnected = false;
        setStatus("Disconnected. Reconnecting...", "error");
        setTimeout(connectWebSocket, 2000); // simple auto-reconnect loop
    };

    ws.onerror = (err) => {
        console.error("WebSocket error:", err);
        setStatus("Connection error", "error");
    };
}
function handleServerMessage(data) {
    // Route a JSON control message from the server to the matching UI
    // update. Unknown message types are silently ignored.
    const handlers = {
        transcript: () => addMessage("user", data.text),
        response_text: () => appendToLastAssistant(data.text),
        response_done: () => finalizeAssistantMessage(data.text),
        audio_done: () =>
            setStatus("Ready. Hold the mic button to speak.", "active"),
        status: () => setStatus(data.message, "active"),
        error: () => setStatus("Error: " + data.message, "error"),
    };
    const handler = handlers[data.type];
    if (handler) handler();
}
// ============================================================
// Audio Recording (Microphone Capture)
// ============================================================
// Microphone capture state (torn down again in stopRecording()).
let audioContext = null;    // Web Audio context driving the capture graph
let mediaStream = null;     // raw stream from getUserMedia
let scriptProcessor = null; // PCM tap node (deprecated API - see note below)
let isRecording = false;

async function startRecording() {
    // Begin push-to-talk capture: mic -> Float32 -> Int16 PCM -> WebSocket.
    if (isRecording || !isConnected) return;
    try {
        // Request microphone access with browser-side DSP enabled.
        // NOTE(review): sampleRate/channelCount here are constraints the
        // browser may not honor exactly - the AudioContext below is what
        // pins the graph to 16 kHz. Confirm on target browsers.
        mediaStream = await navigator.mediaDevices.getUserMedia({
            audio: {
                sampleRate: 16000,
                channelCount: 1,
                echoCancellation: true,
                noiseSuppression: true,
                autoGainControl: true
            }
        });
        audioContext = new (window.AudioContext || window.webkitAudioContext)({
            sampleRate: 16000
        });
        const source = audioContext.createMediaStreamSource(mediaStream);
        // ScriptProcessorNode for capturing raw PCM data.
        // Buffer size of 4096 = ~256ms at 16kHz.
        // NOTE(review): ScriptProcessorNode is deprecated; AudioWorklet
        // is the modern replacement.
        scriptProcessor = audioContext.createScriptProcessor(4096, 1, 1);
        scriptProcessor.onaudioprocess = (event) => {
            if (!isRecording) return;
            const inputData = event.inputBuffer.getChannelData(0);
            // Convert Float32 [-1, 1] to Int16 PCM. Negatives scale by
            // 0x8000 and positives by 0x7FFF because the int16 range is
            // asymmetric (-32768..32767).
            const pcmData = new Int16Array(inputData.length);
            for (let i = 0; i < inputData.length; i++) {
                const s = Math.max(-1, Math.min(1, inputData[i]));
                pcmData[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
            }
            // Send PCM data over WebSocket as a binary frame.
            if (ws && ws.readyState === WebSocket.OPEN) {
                ws.send(pcmData.buffer);
            }
            // Update waveform visualization with the raw float samples.
            drawWaveform(inputData);
        };
        source.connect(scriptProcessor);
        // Connecting to destination keeps the processor pulling audio;
        // it does not echo the mic (the handler writes no output).
        scriptProcessor.connect(audioContext.destination);
        isRecording = true;
        document.getElementById("micBtn").classList.add("recording");
        setStatus("Listening...", "active");
        // Tell server we started recording (resets its ASR handler).
        ws.send(JSON.stringify({ action: "start_listening" }));
    } catch (err) {
        console.error("Microphone error:", err);
        setStatus("Microphone access denied", "error");
    }
}
function stopRecording() {
    // Tear down the capture graph and tell the server we are done.
    if (!isRecording) return;
    isRecording = false;

    document.getElementById("micBtn").classList.remove("recording");

    // Release audio resources in reverse order of creation.
    if (scriptProcessor) {
        scriptProcessor.disconnect();
        scriptProcessor = null;
    }
    if (mediaStream) {
        for (const track of mediaStream.getTracks()) {
            track.stop();
        }
        mediaStream = null;
    }
    if (audioContext) {
        audioContext.close();
        audioContext = null;
    }

    // Notify the server, if the socket is still up.
    const socketOpen = ws && ws.readyState === WebSocket.OPEN;
    if (socketOpen) {
        ws.send(JSON.stringify({ action: "stop_listening" }));
    }

    setStatus("Processing...", "active");
    clearWaveform();
}
// ============================================================
// Audio Playback (TTS Output)
// ============================================================
// Playback state: TTS chunks queue up and play back-to-back.
let audioQueue = [];
let isPlaying = false;
let playbackContext = null;

function handleAudioResponse(audioData) {
    // Enqueue an incoming TTS chunk; kick off playback if idle.
    audioQueue.push(audioData);
    if (isPlaying) return;
    playNextChunk();
}
async function playNextChunk() {
    // Dequeue and play one TTS chunk; chains to the next via onended.
    if (audioQueue.length === 0) {
        isPlaying = false;
        return;
    }
    isPlaying = true;
    // Lazily create the playback context (kept separate from the
    // 16 kHz capture context).
    if (!playbackContext) {
        playbackContext = new (window.AudioContext || window.webkitAudioContext)();
    }
    const chunk = audioQueue.shift();
    try {
        // Decode the compressed audio chunk. slice(0) copies the buffer
        // because decodeAudioData detaches its input ArrayBuffer.
        // NOTE(review): this assumes each WebSocket frame is an
        // independently decodable audio segment - confirm the TTS side
        // chunks on decodable boundaries.
        const audioBuffer = await playbackContext.decodeAudioData(chunk.slice(0));
        const source = playbackContext.createBufferSource();
        source.buffer = audioBuffer;
        source.connect(playbackContext.destination);
        source.onended = () => {
            playNextChunk();
        };
        source.start();
    } catch (err) {
        console.error("Audio playback error:", err);
        playNextChunk(); // Skip failed chunk
    }
}
// ============================================================
// Waveform Visualization
// ============================================================
const canvas = document.getElementById("waveform");
const canvasCtx = canvas.getContext("2d");

function drawWaveform(audioData) {
    // Render one buffer of Float32 samples ([-1, 1]) as a line plot.
    const width = canvas.width;
    const height = canvas.height;
    const sampleCount = audioData.length;
    const step = width / sampleCount;

    // Background fill.
    canvasCtx.fillStyle = "#1a1a3e";
    canvasCtx.fillRect(0, 0, width, height);

    // Waveform trace.
    canvasCtx.lineWidth = 2;
    canvasCtx.strokeStyle = "#6366f1";
    canvasCtx.beginPath();
    let x = 0;
    for (let i = 0; i < sampleCount; i++) {
        // Map sample value [-1, 1] onto the canvas height.
        const y = (audioData[i] * 0.5 + 0.5) * height;
        if (i === 0) {
            canvasCtx.moveTo(x, y);
        } else {
            canvasCtx.lineTo(x, y);
        }
        x += step;
    }
    // Finish the path at the vertical midpoint on the right edge.
    canvasCtx.lineTo(width, height / 2);
    canvasCtx.stroke();
}
function clearWaveform() {
    // Paint the idle state: solid background with a flat center line.
    const w = canvas.width;
    const h = canvas.height;

    canvasCtx.fillStyle = "#1a1a3e";
    canvasCtx.fillRect(0, 0, w, h);

    canvasCtx.strokeStyle = "#334155";
    canvasCtx.lineWidth = 1;
    canvasCtx.beginPath();
    canvasCtx.moveTo(0, h / 2);
    canvasCtx.lineTo(w, h / 2);
    canvasCtx.stroke();
}
// ============================================================
// UI Helpers
// ============================================================
const conversationEl = document.getElementById("conversation");
let currentAssistantEl = null; // assistant bubble currently being streamed into

function addMessage(role, text) {
    // Append a chat bubble ("user" | "assistant" | "system") and scroll down.
    const bubble = document.createElement("div");
    bubble.className = `message ${role}`;
    bubble.textContent = text;
    conversationEl.appendChild(bubble);
    conversationEl.scrollTop = conversationEl.scrollHeight;
    if (role === "assistant") {
        // Remember it so streamed chunks can be appended later.
        currentAssistantEl = bubble;
    }
    return bubble;
}
function appendToLastAssistant(text) {
    // Stream a response chunk into the open assistant bubble,
    // creating one on the first chunk.
    currentAssistantEl = currentAssistantEl || addMessage("assistant", "");
    currentAssistantEl.textContent += text;
    conversationEl.scrollTop = conversationEl.scrollHeight;
}
function finalizeAssistantMessage(fullText) {
    // Replace the streamed text with the server's canonical full response
    // and close out the current assistant bubble.
    if (currentAssistantEl !== null) {
        currentAssistantEl.textContent = fullText;
    }
    currentAssistantEl = null;
}
function setStatus(message, className) {
    // Update the header status line; className selects its color
    // ("active" = green, "error" = red, omitted = neutral).
    const el = document.getElementById("status");
    el.textContent = message;
    el.className = "status " + (className || "");
}
function resetConversation() {
    // Wipe the chat log and ask the server to drop conversation state.
    conversationEl.innerHTML = `
        <div class="message system">
            Conversation reset. Hold the mic button to speak.
        </div>
    `;
    currentAssistantEl = null;

    const socketOpen = ws && ws.readyState === WebSocket.OPEN;
    if (socketOpen) {
        ws.send(JSON.stringify({ action: "reset" }));
    }
}
// ============================================================
// Initialize
// ============================================================
// On load: paint the idle waveform, then open the server connection.
clearWaveform();
connectWebSocket();
Updating the FastAPI Entry Point
Update app/main.py to use the new WebSocket handler:
# app/main.py (updated)
import logging
from fastapi import FastAPI, WebSocket
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from app.config import get_settings
from app.ws.handler import VoiceSessionHandler
# Application configuration and setup.
settings = get_settings()

# Configure root logging once, at the level named in settings
# (e.g. "INFO", "DEBUG").
logging.basicConfig(
    level=getattr(logging, settings.log_level),
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

app = FastAPI(
    title="Voice Assistant API",
    description="End-to-end voice assistant with ASR, LLM, and TTS",
    version="1.0.0"
)

# NOTE(review): allow_origins=["*"] together with allow_credentials=True
# is rejected by browsers (the CORS spec forbids wildcard origins when
# credentials are allowed) - lock this down to explicit origins before
# production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Serve the vanilla HTML/CSS/JS frontend under /static.
app.mount("/static", StaticFiles(directory="frontend"), name="static")
@app.get("/")
async def root():
    """Serve the single-page frontend at the site root."""
    return FileResponse("frontend/index.html")
@app.get("/health")
async def health_check():
    """Liveness probe: report status and the configured model names."""
    return {
        "status": "healthy",
        "asr_model": settings.whisper_model,
        "llm_model": settings.llm_model,
        "tts_provider": settings.tts_provider,
    }
@app.websocket("/ws/voice")
async def voice_websocket(websocket: WebSocket):
    """WebSocket endpoint for real-time voice communication.

    A fresh VoiceSessionHandler is created per connection, so each
    client gets its own isolated ASR/LLM/TTS pipeline state.
    """
    handler = VoiceSessionHandler()
    await handler.handle(websocket)
Testing the Full Pipeline
# Start the server
uvicorn app.main:app --reload --port 8000
# Open http://localhost:8000 in your browser
# 1. Click and hold the microphone button
# 2. Say something like "What time is it?"
# 3. Release the button
# 4. Watch the transcript appear and hear the response
Key Takeaways
- The WebSocket handler orchestrates the full ASR → LLM → TTS pipeline in a single connection, enabling bidirectional real-time communication.
- The browser captures 16-bit PCM audio at 16 kHz using ScriptProcessorNode and sends it as binary WebSocket frames.
- Push-to-talk gives users explicit control over recording, avoiding VAD false triggers in noisy environments.
- Canvas-based waveform visualization provides real-time visual feedback during recording.
- Audio playback uses a queue system to handle streaming TTS chunks and play them sequentially.
What Is Next
In the next lesson, you will deploy the voice assistant — containerize everything with Docker, optimize latency, handle concurrent sessions, and set up monitoring.
Lilly Tech Systems