Intermediate
Structured Data Extraction
In this step, you will build the structured extraction layer that takes raw text and extracts specific fields into validated JSON using Pydantic models and OpenAI function calling.
Why Structured Extraction?
Raw text is useful for search, but downstream systems need structured data. An accounting system needs the invoice number, date, and line items as separate fields — not as a blob of text.
Step 1: Define Extraction Schemas
# app/structuring/schemas.py
from pydantic import BaseModel, Field
from typing import Optional
class Address(BaseModel):
    # Postal address. Only street and city are required; the remaining
    # parts fall back to an empty string when absent.
    # NOTE: documented with comments rather than a docstring, because
    # Pydantic copies a model docstring into the generated JSON schema.
    street: str = Field(description="Street address")
    city: str = Field(description="City name")
    state: str = Field("", description="State or province")
    zip_code: str = Field("", description="ZIP or postal code")
    country: str = Field("", description="Country")
class LineItem(BaseModel):
    # One billed row of an invoice or receipt. quantity defaults to 1;
    # description, unit_price and total are required.
    # NOTE: documented with comments rather than a docstring, because
    # Pydantic copies a model docstring into the generated JSON schema.
    description: str = Field(description="Item description")
    quantity: float = Field(1, description="Quantity")
    unit_price: float = Field(description="Price per unit")
    total: float = Field(description="Line total")
class InvoiceData(BaseModel):
    # Fields extracted from an invoice. invoice_number, vendor_name and
    # total are required; everything else is optional or has a default.
    # NOTE: documented with comments rather than a docstring, because
    # Pydantic copies a model docstring into the generated JSON schema.

    # Identification and dates (dates are kept as plain strings).
    invoice_number: str = Field(description="Invoice number or ID")
    invoice_date: Optional[str] = Field(None, description="Invoice date")
    due_date: Optional[str] = Field(None, description="Due date")

    # Seller and buyer.
    vendor_name: str = Field(description="Seller/vendor name")
    vendor_address: Optional[Address] = Field(default=None)
    customer_name: str = Field("", description="Buyer name")
    customer_address: Optional[Address] = Field(default=None)

    # Billed rows; a fresh empty list is created per instance.
    line_items: list[LineItem] = Field(default_factory=list)

    # Amounts.
    subtotal: Optional[float] = Field(default=None)
    tax: Optional[float] = Field(default=None)
    total: float = Field(description="Total amount due")
    currency: str = "USD"
class ReceiptData(BaseModel):
    # Fields extracted from a receipt. store_name and total are required.
    # store_address is a single free-form string and date is a plain
    # string; neither is parsed into a richer type.
    # NOTE: documented with comments rather than a docstring, because
    # Pydantic copies a model docstring into the generated JSON schema.
    store_name: str = Field(description="Store or merchant name")
    store_address: Optional[str] = Field(default=None)
    date: Optional[str] = Field(default=None)

    # Purchased rows; a fresh empty list is created per instance.
    items: list[LineItem] = Field(default_factory=list)

    subtotal: Optional[float] = Field(default=None)
    tax: Optional[float] = Field(default=None)
    total: float = Field(description="Total amount")
    payment_method: Optional[str] = Field(default=None)
class ContractData(BaseModel):
    # Fields extracted from a contract. title and parties are required;
    # dates are plain strings and default to None.
    # NOTE: documented with comments rather than a docstring, because
    # Pydantic copies a model docstring into the generated JSON schema.
    title: str = Field(description="Contract title")
    parties: list[str] = Field(description="Contracting parties")
    effective_date: Optional[str] = Field(default=None)
    expiration_date: Optional[str] = Field(default=None)

    # A fresh empty list is created per instance.
    key_terms: list[str] = Field(default_factory=list)

    total_value: Optional[float] = Field(default=None)
    currency: str = "USD"
Step 2: Build the Extraction Engine
# app/structuring/extractor.py
import json
import logging
from typing import Type
from openai import OpenAI
from pydantic import BaseModel
from app.config import get_settings
from app.structuring.schemas import InvoiceData, ReceiptData, ContractData
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Application settings, loaded once when this module is imported.
settings = get_settings()

# Maps each supported document-type label to the Pydantic model used to
# extract and validate it.
SCHEMA_MAP: dict[str, Type[BaseModel]] = {
    "invoice": InvoiceData,
    "receipt": ReceiptData,
    "contract": ContractData,
}
class StructuredExtractor:
    """Extract structured data using OpenAI function calling.

    A Pydantic model from ``SCHEMA_MAP`` is converted into a tool
    definition, the chat model is forced to call that tool, and the
    returned arguments are validated against the same Pydantic model.
    """

    # Label returned by detect_document_type when the reply is unusable.
    _FALLBACK_TYPE = "invoice"

    def __init__(self):
        """Create the OpenAI client and pick the chat model from settings."""
        self.client = OpenAI(api_key=settings.openai_api_key)
        self.model = settings.openai_chat_model

    def _schema_to_function(self, schema_class: Type[BaseModel]) -> dict:
        """Build the tool definition that describes *schema_class*.

        Args:
            schema_class: Pydantic model whose JSON schema becomes the
                tool's ``parameters``.

        Returns:
            A ``{"type": "function", "function": {...}}`` dict named
            ``extract_<lowercased class name>``.
        """
        model_name = schema_class.__name__
        return {
            "type": "function",
            "function": {
                "name": f"extract_{model_name.lower()}",
                "description": f"Extract {model_name} from document",
                "parameters": schema_class.model_json_schema(),
            },
        }

    @staticmethod
    def _tool_arguments(response) -> dict:
        """Decode the JSON arguments of the first tool call in *response*.

        Raises:
            ValueError: If the response has no choices or no tool call.
            json.JSONDecodeError: If the arguments are not valid JSON
                (for example because the reply was truncated).
        """
        if not response.choices:
            raise ValueError("Model response contained no choices")
        choice = response.choices[0]
        finish_reason = getattr(choice, "finish_reason", None)

        tool_calls = choice.message.tool_calls
        if not tool_calls:
            # Without this check the failure would surface as an opaque
            # "'NoneType' object is not subscriptable".
            raise ValueError(
                f"Model returned no tool call (finish_reason={finish_reason!r})"
            )

        try:
            return json.loads(tool_calls[0].function.arguments)
        except json.JSONDecodeError:
            # Same exception type as before; the log line adds the context
            # needed to diagnose truncated replies.
            logger.error(
                "Tool call arguments are not valid JSON (finish_reason=%r)",
                finish_reason,
            )
            raise

    def extract(self, text: str, document_type: str = "invoice") -> dict:
        """Extract structured data from document text.

        Args:
            text: Raw document text to extract from.
            document_type: Key of ``SCHEMA_MAP`` selecting the schema.

        Returns:
            A dict with ``document_type``, ``data`` (the validated model
            dumped to a plain dict) and ``tokens_used`` (0 when the
            response carries no usage information).

        Raises:
            ValueError: If *document_type* is unknown, *text* is empty or
                whitespace-only, or the model returned no usable tool
                call. Pydantic validation errors (``ValueError``
                subclasses in Pydantic v2) and JSON decode errors
                propagate unchanged.
        """
        schema_class = SCHEMA_MAP.get(document_type)
        if not schema_class:
            raise ValueError(f"Unknown type: {document_type}")
        if not text or not text.strip():
            # With a forced tool call and nothing to read, the model can
            # only invent the required fields; fail before spending tokens.
            raise ValueError("Cannot extract from empty document text")

        tool = self._schema_to_function(schema_class)
        tool_name = tool["function"]["name"]
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are a document extraction expert. Extract all fields precisely."},
                {"role": "user", "content": f"Extract from this {document_type}:\n\n{text}"},
            ],
            tools=[tool],
            # Force the model to answer through the extraction tool.
            tool_choice={"type": "function", "function": {"name": tool_name}},
            temperature=0.0,
        )

        extracted = self._tool_arguments(response)
        # model_validate also rejects non-object payloads with a
        # validation error instead of a TypeError from ``**`` unpacking.
        validated = schema_class.model_validate(extracted)
        return {
            "document_type": document_type,
            "data": validated.model_dump(),
            "tokens_used": response.usage.total_tokens if response.usage else 0,
        }

    def detect_document_type(self, text: str) -> str:
        """Auto-detect document type from text content.

        Only the first 2000 characters of *text* are sent to the model.

        Returns:
            The first word of the model's reply that is a key of
            ``SCHEMA_MAP``; ``"invoice"`` when the reply is empty or
            contains no known type.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "Classify document type. Reply: invoice, receipt, or contract."},
                {"role": "user", "content": f"Type?\n\n{text[:2000]}"},
            ],
            temperature=0.0,
            max_tokens=10,
        )
        # content may be None; treat that as an empty reply.
        reply = response.choices[0].message.content or ""

        # Keep only alphabetic words so replies such as "Invoice." or
        # "Type: receipt" still match a schema key.
        words = "".join(
            ch if ch.isalpha() else " " for ch in reply.lower()
        ).split()
        for word in words:
            if word in SCHEMA_MAP:
                return word

        logger.warning(
            "Unrecognized document type reply %r; defaulting to %r",
            reply,
            self._FALLBACK_TYPE,
        )
        return self._FALLBACK_TYPE
Step 3: Test Structured Extraction
from app.structuring.extractor import StructuredExtractor
import json
# Build the extractor; its constructor creates the OpenAI client.
extractor = StructuredExtractor()

# A small plain-text invoice used as the extraction input.
sample = """
INVOICE #INV-2024-0042
Date: January 15, 2024
Due: February 15, 2024
From: Acme Corp, 123 Business Ave, San Francisco, CA 94102
Bill To: Widget Inc, 456 Commerce St, New York, NY 10001
1. Web Development - 40 hrs @ $150/hr = $6,000.00
2. Cloud Hosting (Jan) - 1 @ $299.00 = $299.00
3. SSL Certificate - 1 yr @ $49.99 = $49.99
Subtotal: $6,348.99
Tax (8.5%): $539.66
Total: $6,888.65
"""

# Run the extraction against the invoice schema and pretty-print only the
# validated fields (the result also carries document_type and tokens_used).
result = extractor.extract(sample, document_type="invoice")
pretty = json.dumps(result["data"], indent=2)
print(pretty)
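OpenAI function calling steers the model toward JSON arguments shaped by your Pydantic schema, but on its own it does not guarantee that every field is present or correctly typed. Pydantic validation closes that gap: any response that does not conform is rejected, so only type-safe, structured data reaches the rest of your system.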
Key Takeaways
- Pydantic models define exact extraction fields with types and descriptions.
- OpenAI function calling makes the model return structured JSON arguments shaped by your schema; Pydantic then verifies that they actually conform.
- Auto-detection classifies documents before extraction for the right schema.
- Two-layer validation (OpenAI + Pydantic) catches errors before they reach your database.
Lilly Tech Systems