Intermediate

Structured Data Extraction

In this step, you will build the structured extraction layer that takes raw text and extracts specific fields into validated JSON using Pydantic models and OpenAI function calling.

Why Structured Extraction?

Raw text is useful for search, but downstream systems need structured data. An accounting system needs the invoice number, date, and line items as separate fields — not as a blob of text.

Step 1: Define Extraction Schemas

# app/structuring/schemas.py
from pydantic import BaseModel, Field
from typing import Optional


# Postal address model, used by InvoiceData for vendor_address and
# customer_address. Only street and city are required; state, zip_code and
# country fall back to an empty string when they are not supplied.
class Address(BaseModel):
    street: str = Field(description="Street address")
    city: str = Field(description="City name")
    state: str = Field(default="", description="State or province")
    zip_code: str = Field(default="", description="ZIP or postal code")
    country: str = Field(default="", description="Country")


# One billed line, shared by InvoiceData.line_items and ReceiptData.items.
# description, unit_price and total are required; quantity defaults to 1.
# Nothing in this model checks that total equals quantity * unit_price.
class LineItem(BaseModel):
    description: str = Field(description="Item description")
    quantity: float = Field(default=1, description="Quantity")
    unit_price: float = Field(description="Price per unit")
    total: float = Field(description="Line total")


# Extraction target for invoices (registered under the "invoice" key of
# SCHEMA_MAP in app/structuring/extractor.py).
# Required fields: invoice_number, vendor_name, total. Everything else has a
# default, so a sparse invoice still validates.
class InvoiceData(BaseModel):
    invoice_number: str = Field(description="Invoice number or ID")
    # Dates are kept as plain strings; no date parsing happens in this model.
    invoice_date: Optional[str] = Field(default=None, description="Invoice date")
    due_date: Optional[str] = Field(default=None, description="Due date")
    vendor_name: str = Field(description="Seller/vendor name")
    vendor_address: Optional[Address] = None
    customer_name: str = Field(default="", description="Buyer name")
    customer_address: Optional[Address] = None
    # default_factory gives each instance its own empty list.
    line_items: list[LineItem] = Field(default_factory=list)
    subtotal: Optional[float] = None
    tax: Optional[float] = None
    total: float = Field(description="Total amount due")
    currency: str = Field(default="USD")


# Extraction target for receipts (registered under the "receipt" key of
# SCHEMA_MAP in app/structuring/extractor.py).
# Required fields: store_name and total.
class ReceiptData(BaseModel):
    store_name: str = Field(description="Store or merchant name")
    # Unlike InvoiceData, the address here is a single free-form string.
    store_address: Optional[str] = None
    date: Optional[str] = None
    items: list[LineItem] = Field(default_factory=list)
    subtotal: Optional[float] = None
    tax: Optional[float] = None
    total: float = Field(description="Total amount")
    payment_method: Optional[str] = None


# Extraction target for contracts (registered under the "contract" key of
# SCHEMA_MAP in app/structuring/extractor.py).
# Required fields: title and parties; parties has no default, so it must be
# present in the input.
class ContractData(BaseModel):
    title: str = Field(description="Contract title")
    parties: list[str] = Field(description="Contracting parties")
    # Dates are kept as plain strings; no date parsing happens in this model.
    effective_date: Optional[str] = None
    expiration_date: Optional[str] = None
    key_terms: list[str] = Field(default_factory=list)
    total_value: Optional[float] = None
    currency: str = Field(default="USD")

Step 2: Build the Extraction Engine

# app/structuring/extractor.py
import json
import logging
from typing import Type
from openai import OpenAI
from pydantic import BaseModel
from app.config import get_settings
from app.structuring.schemas import InvoiceData, ReceiptData, ContractData

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Settings are read once, when this module is imported. StructuredExtractor
# reads openai_api_key and openai_chat_model from this object.
settings = get_settings()

# Maps the document_type strings accepted by StructuredExtractor.extract (and
# returned by detect_document_type) to the Pydantic model used for extraction.
SCHEMA_MAP = {
    "invoice": InvoiceData,
    "receipt": ReceiptData,
    "contract": ContractData,
}


class StructuredExtractor:
    """Extract structured data from document text using OpenAI function calling.

    The model is forced to call a single tool whose parameters are the JSON
    schema of the target Pydantic model. The returned arguments are then
    validated by that model before being handed back to the caller.
    """

    def __init__(self):
        """Create the OpenAI client and read the chat model name from settings."""
        self.client = OpenAI(api_key=settings.openai_api_key)
        self.model = settings.openai_chat_model

    def _schema_to_function(self, schema_class: Type[BaseModel]) -> dict:
        """Build the tool definition that describes ``schema_class`` to the API.

        Args:
            schema_class: Pydantic model whose JSON schema becomes the tool's
                parameters.

        Returns:
            A tool dict of type ``"function"`` whose function name is
            ``extract_`` followed by the lowercased class name.
        """
        return {
            "type": "function",
            "function": {
                "name": f"extract_{schema_class.__name__.lower()}",
                "description": f"Extract {schema_class.__name__} from document",
                "parameters": schema_class.model_json_schema(),
            },
        }

    def extract(self, text: str, document_type: str = "invoice") -> dict:
        """Extract structured data from document text.

        Args:
            text: Raw document text to extract from.
            document_type: Key of ``SCHEMA_MAP`` selecting the target schema.

        Returns:
            A dict with ``document_type``, the validated ``data`` (as produced
            by ``model_dump()``) and ``tokens_used`` (0 when the response
            carries no usage information).

        Raises:
            ValueError: If ``document_type`` is not a key of ``SCHEMA_MAP`` or
                the response contains no tool call. JSON decoding errors and
                Pydantic validation errors propagate unchanged.
        """
        schema_class = SCHEMA_MAP.get(document_type)
        if not schema_class:
            raise ValueError(f"Unknown type: {document_type}")

        tool = self._schema_to_function(schema_class)
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are a document extraction expert. Extract all fields precisely."},
                {"role": "user", "content": f"Extract from this {document_type}:\n\n{text}"},
            ],
            tools=[tool],
            tool_choice={"type": "function", "function": {"name": tool["function"]["name"]}},
            temperature=0.0,
        )

        choice = response.choices[0]
        tool_calls = choice.message.tool_calls
        # tool_calls can be None or empty; fail with a clear message instead of
        # an opaque "NoneType is not subscriptable".
        if not tool_calls:
            raise ValueError(
                f"Model returned no tool call for {document_type} "
                f"(finish_reason={choice.finish_reason})"
            )

        arguments = tool_calls[0].function.arguments
        try:
            extracted = json.loads(arguments)
        except json.JSONDecodeError:
            # Log the finish reason for diagnosis, then let the original
            # exception propagate so callers see the same error type as before.
            logger.error(
                "Tool arguments for %s are not valid JSON (finish_reason=%s)",
                document_type,
                choice.finish_reason,
            )
            raise

        # model_validate reports a non-object payload as a validation error,
        # where schema_class(**extracted) would raise a bare TypeError.
        validated = schema_class.model_validate(extracted)

        return {
            "document_type": document_type,
            "data": validated.model_dump(),
            "tokens_used": response.usage.total_tokens if response.usage else 0,
        }

    def detect_document_type(self, text: str) -> str:
        """Auto-detect document type from text content.

        Only the first 2000 characters of ``text`` are sent to the model.

        Args:
            text: Raw document text.

        Returns:
            A key of ``SCHEMA_MAP``; ``"invoice"`` when the reply names no
            known type.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "Classify document type. Reply: invoice, receipt, or contract."},
                {"role": "user", "content": f"Type?\n\n{text[:2000]}"},
            ],
            temperature=0.0, max_tokens=10,
        )
        # content may be None; treat that as an empty reply.
        reply = (response.choices[0].message.content or "").strip().lower()
        if reply in SCHEMA_MAP:
            return reply

        # The reply may be decorated ("Receipt.", "type: contract"); accept the
        # first alphabetic token that names a known schema.
        tokens = "".join(ch if ch.isalpha() else " " for ch in reply).split()
        for token in tokens:
            if token in SCHEMA_MAP:
                return token

        logger.warning(
            "Could not classify document (reply=%r); defaulting to invoice", reply
        )
        return "invoice"

Step 3: Test Structured Extraction

from app.structuring.extractor import StructuredExtractor
import json

# Constructing the extractor creates an OpenAI client from
# settings.openai_api_key, so the API key must be configured first.
extractor = StructuredExtractor()

# Sample invoice text used as the extraction input.
sample = """
INVOICE #INV-2024-0042
Date: January 15, 2024
Due: February 15, 2024

From: Acme Corp, 123 Business Ave, San Francisco, CA 94102
Bill To: Widget Inc, 456 Commerce St, New York, NY 10001

1. Web Development - 40 hrs @ $150/hr = $6,000.00
2. Cloud Hosting (Jan) - 1 @ $299.00 = $299.00
3. SSL Certificate - 1 yr @ $49.99 = $49.99

Subtotal: $6,348.99
Tax (8.5%): $539.66
Total: $6,888.65
"""

# This sends a chat completion request to the OpenAI API. The result is a dict
# with "document_type", "data" and "tokens_used"; only "data" is printed.
result = extractor.extract(sample, "invoice")
print(json.dumps(result["data"], indent=2))

💡

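OpenAI function calling steers the model toward your Pydantic schema, but on its own (without strict mode) it does not guarantee a match: the arguments can be malformed JSON or miss required fields. Pydantic validation is what catches those cases, so handle validation errors explicitly; together the two give you type-safe, structured data whenever extraction succeeds.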

Key Takeaways

Pydantic models define exact extraction fields with types and descriptions.
OpenAI function calling with a forced tool_choice makes the model return JSON arguments shaped by your schema.
Auto-detection classifies documents before extraction for the right schema.
Two layers (schema-guided generation by OpenAI, then Pydantic validation) catch errors before they reach your database.

← Previous Vision AI for Complex Documents Next → Batch Processing Pipeline