Intermediate

Step 4: Generation & Streaming

Now that we can retrieve relevant context, we need to generate answers. In this step, you will build the prompt engineering layer, stream responses token-by-token using FastAPI Server-Sent Events, and add a grounding check to prevent hallucinations.

Prompt Engineering for RAG

The system prompt is the most important part of the generation layer. It tells the model exactly how to use the retrieved context and when to say "I don't know."

# app/generation/prompts.py
"""Prompt templates for RAG generation."""

SYSTEM_PROMPT = """You are a helpful assistant that answers questions based on
the provided context documents. Follow these rules strictly:

1. ONLY use information from the provided context to answer the question.
2. If the context does not contain enough information to answer, say:
   "I don't have enough information in the available documents to answer
   this question."
3. When referencing information, cite the source using [Source N] markers
   that match the context labels.
4. Be concise and direct. Do not repeat the question.
5. If the question is ambiguous, ask for clarification.
6. Format your response using markdown when it improves readability
   (bullet points, bold, code blocks).

Context documents:
{context}"""

NO_CONTEXT_PROMPT = """You are a helpful assistant. The user asked a question
but no relevant documents were found in the knowledge base.

Respond by:
1. Acknowledging that you could not find relevant information
2. Suggesting the user rephrase their question or upload more documents
3. Do NOT make up an answer"""


def build_messages(
    question: str,
    context: str,
    chat_history: list[dict] | None = None,
) -> list[dict]:
    """Build the message list for the LLM.

    Args:
        question: The user's current question.
        context: Retrieved context with source markers.
        chat_history: Optional list of previous messages.

    Returns:
        List of message dicts for the OpenAI API.
    """
    messages = []

    # System prompt with context
    if context:
        messages.append({
            "role": "system",
            "content": SYSTEM_PROMPT.format(context=context)
        })
    else:
        messages.append({
            "role": "system",
            "content": NO_CONTEXT_PROMPT
        })

    # Add chat history (last 10 messages to stay within context window)
    if chat_history:
        for msg in chat_history[-10:]:
            messages.append({
                "role": msg["role"],
                "content": msg["content"]
            })

    # Add the current question
    messages.append({
        "role": "user",
        "content": question
    })

    return messages

Streaming Generator

The generator calls OpenAI with streaming enabled and yields tokens as they arrive. This lets the frontend display text progressively, just like ChatGPT.

# app/generation/generator.py
"""LLM generation with streaming support."""
import json
import logging
from collections.abc import Callable, Iterator
from typing import AsyncGenerator

from openai import OpenAI

from app.config import get_settings
from app.generation.prompts import build_messages

# Module-level singletons shared by all generation functions below.
logger = logging.getLogger(__name__)
settings = get_settings()  # application settings (API key, model names)
_client = OpenAI(api_key=settings.openai_api_key)  # one OpenAI client per process


def generate_response(
    question: str,
    context: str,
    chat_history: list[dict] | None = None,
) -> str:
    """Generate a complete (non-streaming) response.

    Args:
        question: The user's question.
        context: Retrieved context with source markers.
        chat_history: Optional conversation history.

    Returns:
        The complete response text ("" if the model returned no content).
    """
    messages = build_messages(question, context, chat_history)

    response = _client.chat.completions.create(
        model=settings.openai_chat_model,
        messages=messages,
        temperature=0.3,  # Low temperature for factual accuracy
        max_tokens=1024,
    )

    # content can be None (e.g. filtered/refused output); normalize to ""
    # so the declared -> str contract holds and len() below cannot crash.
    answer = response.choices[0].message.content or ""
    # Lazy %-style args avoid string formatting when INFO is disabled.
    logger.info(
        "Generated response: %d chars, %d tokens",
        len(answer),
        response.usage.total_tokens,
    )
    return answer


def generate_stream(
    question: str,
    context: str,
    citations: list[dict] | None = None,
    chat_history: list[dict] | None = None,
) -> Callable[[], Iterator[str]]:
    """Generate a streaming response.

    Returns a generator function that yields SSE-formatted events:
    "token" events as text arrives, then an optional "citations" event,
    then a final "done" event ("error" if the stream fails).

    Args:
        question: The user's question.
        context: Retrieved context with source markers.
        citations: Citation metadata to send at the end.
        chat_history: Optional conversation history.

    Returns:
        Zero-argument generator function that yields SSE event strings.
    """
    messages = build_messages(question, context, chat_history)

    def event_generator():
        """Yield Server-Sent Events."""
        try:
            stream = _client.chat.completions.create(
                model=settings.openai_chat_model,
                messages=messages,
                temperature=0.3,  # low temperature for factual, grounded answers
                max_tokens=1024,
                stream=True,
            )

            full_response = []

            for chunk in stream:
                # Some stream chunks carry no choices (e.g. trailing usage
                # chunks) and the final delta has content=None — skip both.
                if not chunk.choices:
                    continue
                token = chunk.choices[0].delta.content
                if token:
                    full_response.append(token)

                    # Send token event
                    yield f"data: {json.dumps({'type': 'token', 'content': token})}\n\n"

            # Send citations at the end
            if citations:
                yield f"data: {json.dumps({'type': 'citations', 'citations': citations})}\n\n"

            # Send completion event
            complete_text = "".join(full_response)
            yield f"data: {json.dumps({'type': 'done', 'total_length': len(complete_text)})}\n\n"

            logger.info("Streamed %d chars", len(complete_text))

        except Exception as e:
            # logger.exception keeps the traceback; the client still gets a
            # structured error event instead of a silently dropped stream.
            logger.exception("Streaming error")
            yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"

    return event_generator
💡
Why temperature 0.3? For RAG, you want the model to be factual and grounded in the provided context. Higher temperatures (0.7-1.0) make the model more creative, which increases hallucination risk. Use 0.3 for factual Q&A and only increase it if your use case requires creativity.

Add the Streaming Chat Endpoint

Now add the main chat endpoint to app/main.py that ties retrieval and generation together with streaming:

# Add to app/main.py
from fastapi.responses import StreamingResponse
from app.generation.generator import generate_stream, generate_response


class ChatRequest(BaseModel):
    """Request body for the /api/chat endpoint."""

    question: str  # the user's natural-language question
    stream: bool = True  # True -> SSE streaming; False -> single JSON response
    top_k: int = 5  # number of chunks to retrieve
    use_reranking: bool = True  # rerank retrieved chunks before generation
    chat_history: list[dict] = []  # prior turns; Pydantic copies mutable defaults per instance


@app.post("/api/chat")
async def chat(request: ChatRequest):
    """Chat with the RAG chatbot.

    Retrieves relevant context and generates an answer.
    Supports both streaming (SSE) and non-streaming modes.
    """
    # Step 1: Retrieve context
    retrieval_result = retrieve_with_context(
        question=request.question,
        store=vector_store,
        top_k=request.top_k,
        use_reranking=request.use_reranking,
    )

    context = retrieval_result["context"]
    citations = retrieval_result["citations"]

    if request.stream:
        # Step 2a: Stream the response
        event_gen = generate_stream(
            question=request.question,
            context=context,
            citations=citations,
            chat_history=request.chat_history,
        )
        return StreamingResponse(
            event_gen(),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no",
            },
        )
    else:
        # Step 2b: Return complete response
        answer = generate_response(
            question=request.question,
            context=context,
            chat_history=request.chat_history,
        )
        return {
            "answer": answer,
            "citations": citations,
            "num_sources": len(citations),
        }

Test the Chat Endpoint

# Non-streaming test
curl -X POST http://localhost:8000/api/chat \
  -H "Content-Type: application/json" \
  -d '{"question": "What is the RAG chatbot architecture?", "stream": false}'

# Expected:
# {
#   "answer": "Based on the available documents, the RAG chatbot
#              architecture consists of four main components [Source 1]:
#              an ingestion pipeline, a vector store, a retrieval engine,
#              and a generation layer...",
#   "citations": [...],
#   "num_sources": 2
# }

# Streaming test
curl -N -X POST http://localhost:8000/api/chat \
  -H "Content-Type: application/json" \
  -d '{"question": "What features does the chatbot support?", "stream": true}'

# Expected (SSE events):
# data: {"type": "token", "content": "Based"}
# data: {"type": "token", "content": " on"}
# data: {"type": "token", "content": " the"}
# ...
# data: {"type": "citations", "citations": [...]}
# data: {"type": "done", "total_length": 342}

Hallucination Prevention

The system prompt handles most hallucination prevention, but you can add a post-generation grounding check for critical applications:

# Add to app/generation/generator.py

# LLM-as-judge prompt: the model classifies whether a generated answer is
# supported by the retrieved context. Used by check_grounding() below.
GROUNDING_CHECK_PROMPT = """Given the following context and answer, determine
if the answer is fully supported by the context.

Context: {context}

Answer: {answer}

Respond with ONLY one of:
- "GROUNDED" if the answer is fully supported by the context
- "PARTIALLY_GROUNDED" if some claims are supported but others are not
- "NOT_GROUNDED" if the answer contains claims not found in the context"""


def check_grounding(context: str, answer: str) -> str:
    """Check if the answer is grounded in the context.

    Uses the chat model as a judge with temperature 0 so the verdict is
    as deterministic as possible.

    Args:
        context: The retrieved context.
        answer: The generated answer.

    Returns:
        One of: "GROUNDED", "PARTIALLY_GROUNDED", "NOT_GROUNDED"
    """
    response = _client.chat.completions.create(
        model=settings.openai_chat_model,
        messages=[{
            "role": "user",
            "content": GROUNDING_CHECK_PROMPT.format(
                context=context, answer=answer
            )
        }],
        temperature=0,  # deterministic classification
        max_tokens=20,  # the verdict is a single short label
    )

    # content may be None on filtered output; normalize so .strip() cannot crash.
    result = (response.choices[0].message.content or "").strip().upper()

    # Check NOT_GROUNDED first — "GROUNDED" is a substring of both other
    # labels, so the match order matters. Unrecognized output falls through
    # to "GROUNDED" (permissive default).
    if "NOT_GROUNDED" in result:
        return "NOT_GROUNDED"
    elif "PARTIALLY" in result:
        return "PARTIALLY_GROUNDED"
    else:
        return "GROUNDED"

Key Takeaways

  • The system prompt explicitly instructs the model to only use provided context and cite sources — this is the first line of defense against hallucinations.
  • Low temperature (0.3) keeps the model factual and reduces creative fabrication.
  • Streaming via Server-Sent Events makes the chatbot feel responsive — the first token appears in ~200ms instead of waiting 2-3 seconds for the complete response.
  • The SSE format sends structured events (token, citations, done, error) so the frontend knows exactly what to display.
  • The grounding check is an optional post-generation safety net for high-stakes applications where hallucinations are unacceptable.

What's Next

The backend is now complete. In the next lesson, you will build the chat UI — a clean HTML/JS interface with message history, typing indicators, and streaming display that connects to the /api/chat endpoint.