Intermediate

Knowledge Base

Ingest FAQ documents, help articles, and past support tickets into a Qdrant vector store. Build the retrieval layer that powers the bot's answers.

Document Ingestion Pipeline

# app/knowledge/ingester.py
import logging
from pathlib import Path
from langchain_community.document_loaders import TextLoader, CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import uuid

logger = logging.getLogger(__name__)


class KnowledgeIngester:
    """Ingest support content (text/markdown/CSV files and FAQ pairs) into
    a Qdrant collection used by the retrieval layer.

    All documents land in the "support_knowledge" collection with a
    consistent payload shape: {"text", "source", ...extra fields}.
    """

    def __init__(self, settings):
        # text-embedding-3-small emits 1536-dim vectors, which must match
        # the VectorParams size declared in _ensure_collection.
        self.embeddings = OpenAIEmbeddings(
            api_key=settings.openai_api_key,
            model="text-embedding-3-small",
        )
        self.client = QdrantClient(host=settings.qdrant_host, port=settings.qdrant_port)
        self.collection = "support_knowledge"
        # 50-char overlap preserves sentence context across chunk edges.
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=500, chunk_overlap=50,
            separators=["\n\n", "\n", ". ", " "],
        )
        self._ensure_collection()

    def _ensure_collection(self):
        """Create the target collection if it does not exist yet (idempotent)."""
        existing = {c.name for c in self.client.get_collections().collections}
        if self.collection not in existing:
            self.client.create_collection(
                self.collection,
                vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
            )

    def ingest_directory(self, dir_path: str) -> int:
        """Recursively ingest .txt/.md/.csv files under *dir_path*.

        Files with any other suffix are skipped. Returns the total number
        of chunks stored.
        """
        total = 0
        for file in Path(dir_path).glob("**/*"):
            if file.suffix in {".txt", ".md"}:
                # Explicit encoding avoids platform-default decode failures.
                docs = TextLoader(str(file), encoding="utf-8").load()
            elif file.suffix == ".csv":
                docs = CSVLoader(str(file)).load()
            else:
                continue  # unsupported file type
            chunks = self.splitter.split_documents(docs)
            self._store_chunks(chunks, source=file.name)
            total += len(chunks)
        # Lazy %-formatting: args are only rendered if the level is enabled.
        logger.info("Ingested %d chunks from %s", total, dir_path)
        return total

    def ingest_faq(self, faq_pairs: list[dict]) -> int:
        """Ingest pre-formatted Q&A pairs.

        Each dict must carry "question" and "answer" keys. Returns the
        number of points stored (0 for an empty input).
        """
        if not faq_pairs:
            return 0  # avoid an empty upsert call
        texts = [f"Q: {faq['question']}\nA: {faq['answer']}" for faq in faq_pairs]
        # Batch-embed in a single API call instead of one call per pair
        # (matches how _store_chunks already embeds).
        vectors = self.embeddings.embed_documents(texts)
        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=vec,
                payload={"text": text, "source": "faq",
                         "question": faq["question"], "answer": faq["answer"]},
            )
            for faq, text, vec in zip(faq_pairs, texts, vectors)
        ]
        self.client.upsert(self.collection, points)
        return len(points)

    def _store_chunks(self, chunks, source=""):
        """Embed *chunks* in one batch and upsert them; no-op on empty input."""
        if not chunks:
            return
        texts = [c.page_content for c in chunks]
        embeddings = self.embeddings.embed_documents(texts)
        points = [
            PointStruct(
                id=str(uuid.uuid4()), vector=emb,
                payload={"text": text, "source": source,
                         "metadata": chunk.metadata},
            )
            for chunk, text, emb in zip(chunks, texts, embeddings)
        ]
        self.client.upsert(self.collection, points)

Retriever

# app/knowledge/retriever.py
class KnowledgeRetriever:
    """Semantic search over the shared "support_knowledge" collection.

    NOTE(review): this snippet shows no import block for retriever.py —
    presumably it mirrors ingester.py's imports (OpenAIEmbeddings,
    QdrantClient); confirm against the real module.
    """

    def __init__(self, settings):
        # Model must match the one used at ingestion time, otherwise query
        # vectors live in a different embedding space than stored points.
        self.embeddings = OpenAIEmbeddings(
            api_key=settings.openai_api_key,
            model="text-embedding-3-small",
        )
        self.client = QdrantClient(host=settings.qdrant_host, port=settings.qdrant_port)
        self.collection = "support_knowledge"

    def search(self, query: str, top_k: int = 5) -> list[dict]:
        """Return up to *top_k* matches as {"text", "score", "source"} dicts.

        Both payload fields are read with .get so a point with a sparse
        payload degrades to an empty string instead of raising KeyError.
        """
        embedding = self.embeddings.embed_query(query)
        results = self.client.search(self.collection, query_vector=embedding, limit=top_k)
        return [
            {
                "text": r.payload.get("text", ""),
                "score": r.score,
                "source": r.payload.get("source", ""),
            }
            for r in results
        ]
💡
FAQ pairs are gold: Pre-formatted Q&A pairs have the highest retrieval accuracy because the question matches user queries directly. Always prioritize ingesting your existing FAQ before general documentation.

Key Takeaways

  • Chunk size of 500 characters balances context richness with retrieval precision.
  • FAQ pairs as Q&A format produce the best retrieval matches for common questions.
  • The ingestion pipeline handles text, markdown, and CSV files from a directory.