Intermediate

Vision AI for Complex Documents

In this step, you will integrate GPT-4 Vision to analyze documents that traditional text extraction cannot handle: handwritten notes, photographed receipts, charts, diagrams, and scanned documents.

When to Use Vision AI

Text extraction works for digital PDFs, but many real-world documents need visual understanding:

  • Scanned documents: PDFs from scanners with no embedded text layer.
  • Handwritten notes: Handwriting recognition requires visual AI.
  • Charts and diagrams: Extract data from bar charts, pie charts, flow diagrams.
  • Photos of documents: Receipts, whiteboards, business cards from phone cameras.

Step 1: Vision Analyzer Module

# app/vision/vision_analyzer.py
import base64
import logging
from pathlib import Path
from typing import Optional

from openai import OpenAI

from app.config import get_settings

logger = logging.getLogger(__name__)
settings = get_settings()


class VisionAnalyzer:
    """Analyze documents using GPT-4 Vision.

    Encodes an image file (or a rasterized PDF page) to base64 and sends it
    to the OpenAI chat completions API with a vision-capable model, returning
    the extracted text plus token-usage metadata.
    """

    # File-extension -> MIME type for building data: URLs.
    # Unknown extensions fall back to image/png.
    MEDIA_TYPES = {
        "png": "image/png",
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "tiff": "image/tiff",
        "bmp": "image/bmp",
    }

    def __init__(self):
        self.client = OpenAI(api_key=settings.openai_api_key)
        self.model = settings.openai_vision_model

    def _encode_image(self, image_path: str) -> str:
        """Return the raw bytes of the file at *image_path* as base64 text."""
        with open(image_path, "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")

    def _encode_pdf_page(self, pdf_path: str, page_num: int, dpi: int = 200) -> str:
        """Render one PDF page to a PNG image and return it base64-encoded.

        Args:
            pdf_path: Path to the PDF file.
            page_num: Zero-based page index to render.
            dpi: Render resolution; 200 DPI balances accuracy and payload size.
        """
        import fitz  # PyMuPDF; imported lazily so it stays an optional dependency
        doc = fitz.open(pdf_path)
        try:
            page = doc[page_num]
            # PDF coordinates are in 72-points-per-inch; scale to requested DPI.
            mat = fitz.Matrix(dpi / 72, dpi / 72)
            img_bytes = page.get_pixmap(matrix=mat).tobytes("png")
        finally:
            # Always release the document, even if page lookup/rendering fails.
            doc.close()
        return base64.b64encode(img_bytes).decode("utf-8")

    def _vision_request(self, prompt: str, base64_image: str, media_type: str):
        """Send a single prompt + image message to the vision model.

        Shared by all public analyze_* methods so the request shape is
        defined in exactly one place.
        """
        return self.client.chat.completions.create(
            model=self.model,
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {
                        "url": f"data:{media_type};base64,{base64_image}",
                        "detail": "high",
                    }},
                ],
            }],
            max_tokens=4096,
            temperature=0.1,  # low temperature: transcription, not creativity
        )

    def analyze_image(self, image_path: str, prompt: Optional[str] = None) -> dict:
        """Analyze a document image with GPT-4 Vision.

        Args:
            image_path: Path to a PNG/JPEG/TIFF/BMP image file.
            prompt: Instruction for the model; a generic extraction prompt
                is used when None.

        Returns:
            Dict with "text" (model output), "tokens_used", "model", "source".
        """
        if prompt is None:
            prompt = "Extract all text and data from this document. Preserve structure."

        base64_image = self._encode_image(image_path)
        ext = Path(image_path).suffix.lower().lstrip(".")
        media_type = self.MEDIA_TYPES.get(ext, "image/png")

        response = self._vision_request(prompt, base64_image, media_type)
        return {
            "text": response.choices[0].message.content,
            "tokens_used": response.usage.total_tokens if response.usage else 0,
            "model": self.model,
            "source": image_path,
        }

    def analyze_pdf_page(self, pdf_path: str, page_num: int = 0, prompt: Optional[str] = None) -> dict:
        """Analyze a specific PDF page using vision.

        Args:
            pdf_path: Path to the PDF file.
            page_num: Zero-based page index (reported 1-based in the result).
            prompt: Instruction for the model; a generic extraction prompt
                is used when None.

        Returns:
            Dict with "text", 1-based "page_number", and "tokens_used".
        """
        if prompt is None:
            prompt = "Extract all text, tables, and data from this document page."

        base64_image = self._encode_pdf_page(pdf_path, page_num)
        response = self._vision_request(prompt, base64_image, "image/png")
        return {
            "text": response.choices[0].message.content,
            "page_number": page_num + 1,
            "tokens_used": response.usage.total_tokens if response.usage else 0,
        }

    def extract_handwriting(self, image_path: str) -> dict:
        """Specialized extraction for handwritten documents."""
        return self.analyze_image(image_path,
            "Transcribe ALL handwritten text exactly as written. "
            "Preserve line breaks. Mark unclear text with [unclear].")

    def extract_chart_data(self, image_path: str) -> dict:
        """Extract data from charts and graphs."""
        return self.analyze_image(image_path,
            "Extract chart type, title, axis labels, all data points as a table, "
            "and any legends. Format as structured text.")

Step 2: Smart Routing

Automatically decide whether a document needs text extraction or vision analysis:

# app/vision/router.py
import fitz
import logging
from pathlib import Path

logger = logging.getLogger(__name__)


def needs_vision_analysis(file_path: str) -> bool:
    """Return True if the file needs vision AI instead of text extraction.

    Raster image files always need vision. PDFs need vision only when their
    first pages carry almost no extractable text (a scanned PDF without an
    embedded text layer). Any other extension defaults to False.

    Args:
        file_path: Path to the document to classify.
    """
    ext = Path(file_path).suffix.lower()

    # Raster image formats have no text layer at all.
    if ext in {".png", ".jpg", ".jpeg", ".tiff", ".bmp"}:
        return True

    if ext != ".pdf":
        return False

    # Sample up to the first three pages; ten extractable words is enough
    # evidence to trust the regular text-extraction pipeline.
    try:
        doc = fitz.open(file_path)
        try:
            text = "".join(doc[i].get_text("text") for i in range(min(3, len(doc))))
        finally:
            # Close the document even if page access/extraction raises.
            doc.close()
    except Exception as e:
        # Unreadable/corrupt PDF: fall back to vision rather than failing.
        logger.warning(f"Error checking PDF: {e}")
        return True

    words = text.split()
    if len(words) < 10:
        logger.info(f"Scanned PDF detected ({len(words)} words)")
        return True
    return False

Step 3: Test Vision Pipeline

from app.vision.vision_analyzer import VisionAnalyzer
from app.vision.router import needs_vision_analysis

analyzer = VisionAnalyzer()

# Receipt photo: itemized extraction with a targeted prompt.
receipt = analyzer.analyze_image(
    "data/receipt.jpg",
    "Extract all items, prices, and total from this receipt.",
)
print(receipt["text"])

# Scanned PDF: route through vision only when no usable text layer exists.
if needs_vision_analysis("data/scanned.pdf"):
    page = analyzer.analyze_pdf_page("data/scanned.pdf", 0)
    print(page["text"])

# Handwritten notes: specialized transcription prompt.
notes = analyzer.extract_handwriting("data/notes.jpg")
print(notes["text"])

# Chart image: structured data extraction.
chart = analyzer.extract_chart_data("data/chart.png")
print(chart["text"])
💡
Cost optimization: GPT-4 Vision with "detail": "high" costs roughly $0.01-0.03 per page (pricing varies by model version). For large batch jobs, run a first pass with "detail": "low" and re-process only the pages that need high detail.

Key Takeaways

  • GPT-4 Vision reads handwritten text, charts, and scanned documents that PyMuPDF cannot handle.
  • The smart router checks PDF text density to choose between text extraction and vision analysis.
  • Convert PDF pages to 200 DPI images for optimal vision model accuracy.
  • Specialized prompts for handwriting and charts produce better results than generic prompts.