Intermediate
Knowledge Base
Ingest FAQ documents, help articles, and past support tickets into a Qdrant vector store. Build the retrieval layer that powers the bot's answers.
Document Ingestion Pipeline
# app/knowledge/ingester.py
import logging
from pathlib import Path
from langchain_community.document_loaders import TextLoader, CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import uuid
logger = logging.getLogger(__name__)
class KnowledgeIngester:
    """Ingest support documents and FAQ pairs into a Qdrant vector store.

    Files (.txt/.md/.csv) are split into overlapping chunks, embedded with
    OpenAI's text-embedding-3-small model, and upserted into the
    "support_knowledge" collection.
    """

    def __init__(self, settings):
        """Initialize embedding client, Qdrant connection, and text splitter.

        Args:
            settings: configuration object exposing ``openai_api_key``,
                ``qdrant_host`` and ``qdrant_port`` attributes.
        """
        self.embeddings = OpenAIEmbeddings(
            api_key=settings.openai_api_key,
            model="text-embedding-3-small",
        )
        self.client = QdrantClient(host=settings.qdrant_host, port=settings.qdrant_port)
        self.collection = "support_knowledge"
        # 500-char chunks with 50-char overlap balance context richness
        # against retrieval precision; split preferentially on paragraph,
        # line, then sentence boundaries.
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            separators=["\n\n", "\n", ". ", " "],
        )
        self._ensure_collection()

    def _ensure_collection(self):
        """Create the collection if it does not already exist.

        Vector size 1536 matches text-embedding-3-small's output dimension.
        """
        existing = {c.name for c in self.client.get_collections().collections}
        if self.collection not in existing:
            self.client.create_collection(
                self.collection,
                vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
            )

    def ingest_directory(self, dir_path: str) -> int:
        """Recursively ingest all .txt, .md and .csv files under ``dir_path``.

        Other file types are skipped. Returns the total number of chunks
        stored across all ingested files.
        """
        total = 0
        for file in Path(dir_path).rglob("*"):
            # rglob yields directories too; only regular files are loadable.
            if not file.is_file():
                continue
            if file.suffix in {".txt", ".md"}:
                # Explicit encoding avoids platform-default decode failures.
                docs = TextLoader(str(file), encoding="utf-8").load()
            elif file.suffix == ".csv":
                docs = CSVLoader(str(file)).load()
            else:
                continue
            chunks = self.splitter.split_documents(docs)
            self._store_chunks(chunks, source=file.name)
            total += len(chunks)
        logger.info("Ingested %d chunks from %s", total, dir_path)
        return total

    def ingest_faq(self, faq_pairs: list[dict]) -> int:
        """Ingest pre-formatted Q&A pairs as single "Q: ...\\nA: ..." chunks.

        Args:
            faq_pairs: dicts with ``question`` and ``answer`` keys.

        Returns:
            The number of points stored (0 for an empty input).
        """
        if not faq_pairs:
            return 0
        texts = [f"Q: {faq['question']}\nA: {faq['answer']}" for faq in faq_pairs]
        # Batch all pairs into one embeddings API call instead of one
        # round-trip per pair.
        vectors = self.embeddings.embed_documents(texts)
        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=vec,
                payload={"text": text, "source": "faq",
                         "question": faq["question"], "answer": faq["answer"]},
            )
            for faq, text, vec in zip(faq_pairs, texts, vectors)
        ]
        self.client.upsert(self.collection, points)
        return len(points)

    def _store_chunks(self, chunks, source=""):
        """Embed document chunks in one batch and upsert them into Qdrant.

        No-op for an empty chunk list, so callers never issue empty upserts.
        """
        if not chunks:
            return
        texts = [c.page_content for c in chunks]
        embeddings = self.embeddings.embed_documents(texts)
        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=emb,
                payload={"text": chunk.page_content, "source": source,
                         "metadata": chunk.metadata},
            )
            for chunk, emb in zip(chunks, embeddings)
        ]
        self.client.upsert(self.collection, points)
Retriever
# app/knowledge/retriever.py
class KnowledgeRetriever:
    """Retrieve the most relevant knowledge chunks for a user query from
    the "support_knowledge" Qdrant collection."""

    def __init__(self, settings):
        """Initialize the embedding client and Qdrant connection.

        Args:
            settings: configuration object exposing ``openai_api_key``,
                ``qdrant_host`` and ``qdrant_port`` attributes.
        """
        # Must match the model/collection used by the ingester so query
        # vectors live in the same embedding space as stored chunks.
        self.embeddings = OpenAIEmbeddings(
            api_key=settings.openai_api_key,
            model="text-embedding-3-small",
        )
        self.client = QdrantClient(host=settings.qdrant_host, port=settings.qdrant_port)
        self.collection = "support_knowledge"

    def search(self, query: str, top_k: int = 5) -> list[dict]:
        """Return up to ``top_k`` nearest chunks for ``query``.

        Each result dict has ``text``, ``score`` and ``source`` keys.
        Payload fields are read defensively: a point stored without
        ``text``/``source`` yields "" instead of raising KeyError.
        """
        embedding = self.embeddings.embed_query(query)
        results = self.client.search(self.collection, query_vector=embedding, limit=top_k)
        return [
            {
                "text": r.payload.get("text", ""),
                "score": r.score,
                "source": r.payload.get("source", ""),
            }
            for r in results
        ]
FAQ pairs are gold: Pre-formatted Q&A pairs have the highest retrieval accuracy because the question matches user queries directly. Always prioritize ingesting your existing FAQ before general documentation.
Key Takeaways
- Chunk size of 500 characters balances context richness with retrieval precision.
- FAQ pairs as Q&A format produce the best retrieval matches for common questions.
- The ingestion pipeline handles text, markdown, and CSV files from a directory.
Lilly Tech Systems