Intermediate

Knowledge Base

Ingest FAQ documents, help articles, and past support tickets into a Qdrant vector store. Build the retrieval layer that powers the bot's answers.

Document Ingestion Pipeline

# app/knowledge/ingester.py
import logging
from pathlib import Path
from langchain_community.document_loaders import TextLoader, CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import uuid

logger = logging.getLogger(__name__)


class KnowledgeIngester:
    """Ingest support content (text/markdown/CSV files and FAQ pairs) into
    a Qdrant collection used by the retrieval layer.

    All documents land in the "support_knowledge" collection with a
    consistent payload shape: {"text", "source", ...extra fields}.
    """

    def __init__(self, settings):
        # text-embedding-3-small emits 1536-dim vectors, which must match
        # the VectorParams size declared in _ensure_collection.
        self.embeddings = OpenAIEmbeddings(
            api_key=settings.openai_api_key,
            model="text-embedding-3-small",
        )
        self.client = QdrantClient(host=settings.qdrant_host, port=settings.qdrant_port)
        self.collection = "support_knowledge"
        # 50-char overlap preserves sentence context across chunk edges.
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=500, chunk_overlap=50,
            separators=["\n\n", "\n", ". ", " "],
        )
        self._ensure_collection()

    def _ensure_collection(self):
        """Create the target collection if it does not exist yet (idempotent)."""
        existing = {c.name for c in self.client.get_collections().collections}
        if self.collection not in existing:
            self.client.create_collection(
                self.collection,
                vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
            )

    def ingest_directory(self, dir_path: str) -> int:
        """Recursively ingest .txt/.md/.csv files under *dir_path*.

        Files with any other suffix are skipped. Returns the total number
        of chunks stored.
        """
        total = 0
        for file in Path(dir_path).glob("**/*"):
            if file.suffix in {".txt", ".md"}:
                # Explicit encoding avoids platform-default decode failures.
                docs = TextLoader(str(file), encoding="utf-8").load()
            elif file.suffix == ".csv":
                docs = CSVLoader(str(file)).load()
            else:
                continue  # unsupported file type
            chunks = self.splitter.split_documents(docs)
            self._store_chunks(chunks, source=file.name)
            total += len(chunks)
        # Lazy %-formatting: args are only rendered if the level is enabled.
        logger.info("Ingested %d chunks from %s", total, dir_path)
        return total

    def ingest_faq(self, faq_pairs: list[dict]) -> int:
        """Ingest pre-formatted Q&A pairs.

        Each dict must carry "question" and "answer" keys. Returns the
        number of points stored (0 for an empty input).
        """
        if not faq_pairs:
            return 0  # avoid an empty upsert call
        texts = [f"Q: {faq['question']}\nA: {faq['answer']}" for faq in faq_pairs]
        # Batch-embed in a single API call instead of one call per pair
        # (matches how _store_chunks already embeds).
        vectors = self.embeddings.embed_documents(texts)
        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=vec,
                payload={"text": text, "source": "faq",
                         "question": faq["question"], "answer": faq["answer"]},
            )
            for faq, text, vec in zip(faq_pairs, texts, vectors)
        ]
        self.client.upsert(self.collection, points)
        return len(points)

    def _store_chunks(self, chunks, source=""):
        """Embed *chunks* in one batch and upsert them; no-op on empty input."""
        if not chunks:
            return
        texts = [c.page_content for c in chunks]
        embeddings = self.embeddings.embed_documents(texts)
        points = [
            PointStruct(
                id=str(uuid.uuid4()), vector=emb,
                payload={"text": text, "source": source,
                         "metadata": chunk.metadata},
            )
            for chunk, text, emb in zip(chunks, texts, embeddings)
        ]
        self.client.upsert(self.collection, points)

Retriever

# app/knowledge/retriever.py
class KnowledgeRetriever:
    """Semantic search over the shared "support_knowledge" collection.

    NOTE(review): this snippet shows no import block for retriever.py —
    presumably it mirrors ingester.py's imports (OpenAIEmbeddings,
    QdrantClient); confirm against the real module.
    """

    def __init__(self, settings):
        # Model must match the one used at ingestion time, otherwise query
        # vectors live in a different embedding space than stored points.
        self.embeddings = OpenAIEmbeddings(
            api_key=settings.openai_api_key,
            model="text-embedding-3-small",
        )
        self.client = QdrantClient(host=settings.qdrant_host, port=settings.qdrant_port)
        self.collection = "support_knowledge"

    def search(self, query: str, top_k: int = 5) -> list[dict]:
        """Return up to *top_k* matches as {"text", "score", "source"} dicts.

        Both payload fields are read with .get so a point with a sparse
        payload degrades to an empty string instead of raising KeyError.
        """
        embedding = self.embeddings.embed_query(query)
        results = self.client.search(self.collection, query_vector=embedding, limit=top_k)
        return [
            {
                "text": r.payload.get("text", ""),
                "score": r.score,
                "source": r.payload.get("source", ""),
            }
            for r in results
        ]
💡
FAQ pairs are gold: Pre-formatted Q&A pairs have the highest retrieval accuracy because the question matches user queries directly. Always prioritize ingesting your existing FAQ before general documentation.

Key Takeaways

  • Chunk size of 500 characters balances context richness with retrieval precision.
  • FAQ pairs as Q&A format produce the best retrieval matches for common questions.
  • The ingestion pipeline handles text, markdown, and CSV files from a directory.