Intermediate

Vision AI for Complex Documents

In this step, you will integrate GPT-4 Vision to analyze documents that traditional text extraction cannot handle: handwritten notes, photographed receipts, charts, diagrams, and scanned documents.

When to Use Vision AI

Text extraction works for digital PDFs, but many real-world documents need visual understanding:

  • Scanned documents: PDFs from scanners with no embedded text layer.
  • Handwritten notes: Handwriting recognition requires visual AI.
  • Charts and diagrams: Extract data from bar charts, pie charts, flow diagrams.
  • Photos of documents: Receipts, whiteboards, business cards from phone cameras.

Step 1: Vision Analyzer Module

# app/vision/vision_analyzer.py
import base64
import logging
from pathlib import Path
from typing import Optional

from openai import OpenAI

from app.config import get_settings

logger = logging.getLogger(__name__)
settings = get_settings()


class VisionAnalyzer:
    """Analyze documents using GPT-4 Vision.

    Encodes an image file (or a rasterized PDF page) to base64 and sends it
    to the OpenAI chat completions API with a vision-capable model, returning
    the extracted text plus token-usage metadata.
    """

    # File-extension -> MIME type for building data: URLs.
    # Unknown extensions fall back to image/png.
    MEDIA_TYPES = {
        "png": "image/png",
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "tiff": "image/tiff",
        "bmp": "image/bmp",
    }

    def __init__(self):
        self.client = OpenAI(api_key=settings.openai_api_key)
        self.model = settings.openai_vision_model

    def _encode_image(self, image_path: str) -> str:
        """Return the raw bytes of the file at *image_path* as base64 text."""
        with open(image_path, "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")

    def _encode_pdf_page(self, pdf_path: str, page_num: int, dpi: int = 200) -> str:
        """Render one PDF page to a PNG image and return it base64-encoded.

        Args:
            pdf_path: Path to the PDF file.
            page_num: Zero-based page index to render.
            dpi: Render resolution; 200 DPI balances accuracy and payload size.
        """
        import fitz  # PyMuPDF; imported lazily so it stays an optional dependency
        doc = fitz.open(pdf_path)
        try:
            page = doc[page_num]
            # PDF coordinates are in 72-points-per-inch; scale to requested DPI.
            mat = fitz.Matrix(dpi / 72, dpi / 72)
            img_bytes = page.get_pixmap(matrix=mat).tobytes("png")
        finally:
            # Always release the document, even if page lookup/rendering fails.
            doc.close()
        return base64.b64encode(img_bytes).decode("utf-8")

    def _vision_request(self, prompt: str, base64_image: str, media_type: str):
        """Send a single prompt + image message to the vision model.

        Shared by all public analyze_* methods so the request shape is
        defined in exactly one place.
        """
        return self.client.chat.completions.create(
            model=self.model,
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {
                        "url": f"data:{media_type};base64,{base64_image}",
                        "detail": "high",
                    }},
                ],
            }],
            max_tokens=4096,
            temperature=0.1,  # low temperature: transcription, not creativity
        )

    def analyze_image(self, image_path: str, prompt: Optional[str] = None) -> dict:
        """Analyze a document image with GPT-4 Vision.

        Args:
            image_path: Path to a PNG/JPEG/TIFF/BMP image file.
            prompt: Instruction for the model; a generic extraction prompt
                is used when None.

        Returns:
            Dict with "text" (model output), "tokens_used", "model", "source".
        """
        if prompt is None:
            prompt = "Extract all text and data from this document. Preserve structure."

        base64_image = self._encode_image(image_path)
        ext = Path(image_path).suffix.lower().lstrip(".")
        media_type = self.MEDIA_TYPES.get(ext, "image/png")

        response = self._vision_request(prompt, base64_image, media_type)
        return {
            "text": response.choices[0].message.content,
            "tokens_used": response.usage.total_tokens if response.usage else 0,
            "model": self.model,
            "source": image_path,
        }

    def analyze_pdf_page(self, pdf_path: str, page_num: int = 0, prompt: Optional[str] = None) -> dict:
        """Analyze a specific PDF page using vision.

        Args:
            pdf_path: Path to the PDF file.
            page_num: Zero-based page index (reported 1-based in the result).
            prompt: Instruction for the model; a generic extraction prompt
                is used when None.

        Returns:
            Dict with "text", 1-based "page_number", and "tokens_used".
        """
        if prompt is None:
            prompt = "Extract all text, tables, and data from this document page."

        base64_image = self._encode_pdf_page(pdf_path, page_num)
        response = self._vision_request(prompt, base64_image, "image/png")
        return {
            "text": response.choices[0].message.content,
            "page_number": page_num + 1,
            "tokens_used": response.usage.total_tokens if response.usage else 0,
        }

    def extract_handwriting(self, image_path: str) -> dict:
        """Specialized extraction for handwritten documents."""
        return self.analyze_image(image_path,
            "Transcribe ALL handwritten text exactly as written. "
            "Preserve line breaks. Mark unclear text with [unclear].")

    def extract_chart_data(self, image_path: str) -> dict:
        """Extract data from charts and graphs."""
        return self.analyze_image(image_path,
            "Extract chart type, title, axis labels, all data points as a table, "
            "and any legends. Format as structured text.")

Step 2: Smart Routing

Automatically decide whether a document needs text extraction or vision analysis:

# app/vision/router.py
import fitz
import logging
from pathlib import Path

logger = logging.getLogger(__name__)


def needs_vision_analysis(file_path: str) -> bool:
    """Return True if the file needs vision AI instead of text extraction.

    Raster image files always need vision. PDFs need vision only when their
    first pages carry almost no extractable text (a scanned PDF without an
    embedded text layer). Any other extension defaults to False.

    Args:
        file_path: Path to the document to classify.
    """
    ext = Path(file_path).suffix.lower()

    # Raster image formats have no text layer at all.
    if ext in {".png", ".jpg", ".jpeg", ".tiff", ".bmp"}:
        return True

    if ext != ".pdf":
        return False

    # Sample up to the first three pages; ten extractable words is enough
    # evidence to trust the regular text-extraction pipeline.
    try:
        doc = fitz.open(file_path)
        try:
            text = "".join(doc[i].get_text("text") for i in range(min(3, len(doc))))
        finally:
            # Close the document even if page access/extraction raises.
            doc.close()
    except Exception as e:
        # Unreadable/corrupt PDF: fall back to vision rather than failing.
        logger.warning(f"Error checking PDF: {e}")
        return True

    words = text.split()
    if len(words) < 10:
        logger.info(f"Scanned PDF detected ({len(words)} words)")
        return True
    return False

Step 3: Test Vision Pipeline

from app.vision.vision_analyzer import VisionAnalyzer
from app.vision.router import needs_vision_analysis

analyzer = VisionAnalyzer()

# Receipt photo: itemized extraction with a targeted prompt.
receipt = analyzer.analyze_image(
    "data/receipt.jpg",
    "Extract all items, prices, and total from this receipt.",
)
print(receipt["text"])

# Scanned PDF: route through vision only when no usable text layer exists.
if needs_vision_analysis("data/scanned.pdf"):
    page = analyzer.analyze_pdf_page("data/scanned.pdf", 0)
    print(page["text"])

# Handwritten notes: specialized transcription prompt.
notes = analyzer.extract_handwriting("data/notes.jpg")
print(notes["text"])

# Chart image: structured data extraction.
chart = analyzer.extract_chart_data("data/chart.png")
print(chart["text"])
💡
Cost optimization: GPT-4 Vision with "detail": "high" costs roughly $0.01-0.03 per page (pricing varies by model version). For large batch jobs, run a first pass with "detail": "low" and re-process only the pages that need high detail.

Key Takeaways

  • GPT-4 Vision reads handwritten text, charts, and scanned documents that PyMuPDF cannot handle.
  • The smart router checks PDF text density to choose between text extraction and vision analysis.
  • Convert PDF pages to 200 DPI images for optimal vision model accuracy.
  • Specialized prompts for handwriting and charts produce better results than generic prompts.