Intermediate
Vision AI for Complex Documents
In this step, you will integrate GPT-4 Vision to analyze documents that traditional text extraction cannot handle: handwritten notes, photographed receipts, charts, diagrams, and scanned documents.
When to Use Vision AI
Text extraction works for digital PDFs, but many real-world documents need visual understanding:
- Scanned documents: PDFs from scanners with no embedded text layer.
- Handwritten notes: Handwriting recognition requires visual AI.
- Charts and diagrams: Extract data from bar charts, pie charts, flow diagrams.
- Photos of documents: Receipts, whiteboards, business cards from phone cameras.
Step 1: Vision Analyzer Module
# app/vision/vision_analyzer.py
import base64
import logging
from pathlib import Path
from openai import OpenAI
from app.config import get_settings
logger = logging.getLogger(__name__)
settings = get_settings()
class VisionAnalyzer:
"""Analyze documents using GPT-4 Vision."""
def __init__(self):
self.client = OpenAI(api_key=settings.openai_api_key)
self.model = settings.openai_vision_model
def _encode_image(self, image_path: str) -> str:
"""Encode an image file to base64."""
with open(image_path, "rb") as f:
return base64.b64encode(f.read()).decode("utf-8")
def _encode_pdf_page(self, pdf_path: str, page_num: int, dpi: int = 200) -> str:
"""Convert a PDF page to a PNG image and encode as base64."""
import fitz
doc = fitz.open(pdf_path)
page = doc[page_num]
mat = fitz.Matrix(dpi / 72, dpi / 72)
pix = page.get_pixmap(matrix=mat)
img_bytes = pix.tobytes("png")
doc.close()
return base64.b64encode(img_bytes).decode("utf-8")
def analyze_image(self, image_path: str, prompt: str = None) -> dict:
"""Analyze a document image with GPT-4 Vision."""
if prompt is None:
prompt = "Extract all text and data from this document. Preserve structure."
base64_image = self._encode_image(image_path)
ext = Path(image_path).suffix.lower().lstrip(".")
media_map = {"png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg",
"tiff": "image/tiff", "bmp": "image/bmp"}
media_type = media_map.get(ext, "image/png")
response = self.client.chat.completions.create(
model=self.model,
messages=[{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {
"url": f"data:{media_type};base64,{base64_image}",
"detail": "high"
}}
]
}],
max_tokens=4096, temperature=0.1,
)
return {
"text": response.choices[0].message.content,
"tokens_used": response.usage.total_tokens if response.usage else 0,
"model": self.model, "source": image_path,
}
def analyze_pdf_page(self, pdf_path: str, page_num: int = 0, prompt: str = None) -> dict:
"""Analyze a specific PDF page using vision."""
if prompt is None:
prompt = "Extract all text, tables, and data from this document page."
base64_image = self._encode_pdf_page(pdf_path, page_num)
response = self.client.chat.completions.create(
model=self.model,
messages=[{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {
"url": f"data:image/png;base64,{base64_image}", "detail": "high"
}}
]
}],
max_tokens=4096, temperature=0.1,
)
return {
"text": response.choices[0].message.content,
"page_number": page_num + 1,
"tokens_used": response.usage.total_tokens if response.usage else 0,
}
def extract_handwriting(self, image_path: str) -> dict:
"""Specialized extraction for handwritten documents."""
return self.analyze_image(image_path,
"Transcribe ALL handwritten text exactly as written. "
"Preserve line breaks. Mark unclear text with [unclear].")
def extract_chart_data(self, image_path: str) -> dict:
"""Extract data from charts and graphs."""
return self.analyze_image(image_path,
"Extract chart type, title, axis labels, all data points as a table, "
"and any legends. Format as structured text.")
Step 2: Smart Routing
Automatically decide whether a document needs text extraction or vision analysis:
# app/vision/router.py
import fitz
import logging
from pathlib import Path
logger = logging.getLogger(__name__)
def needs_vision_analysis(file_path: str) -> bool:
    """Return True if the file needs vision AI instead of text extraction.

    Decision rules:
    - Raster image formats always need vision (no text layer).
    - A PDF needs vision when its first few pages contain almost no
      extractable text — the signature of a scanned/image-only PDF.
    - Any other extension is assumed to have an extractable text layer.
    """
    ext = Path(file_path).suffix.lower()

    # Image files always need vision.
    if ext in {".png", ".jpg", ".jpeg", ".tiff", ".bmp"}:
        return True

    if ext == ".pdf":
        try:
            doc = fitz.open(file_path)
            try:
                # Sample at most the first 3 pages: enough to detect a scan
                # without paying to read a large document in full.
                text = "".join(doc[i].get_text("text") for i in range(min(3, len(doc))))
            finally:
                # Fix: previously the document leaked when get_text raised.
                doc.close()
            words = text.split()
            if len(words) < 10:
                logger.info(f"Scanned PDF detected ({len(words)} words)")
                return True
            return False
        except Exception as e:
            # Unreadable/corrupt PDFs fall through to vision as a best effort.
            logger.warning(f"Error checking PDF: {e}")
            return True

    return False
Step 3: Test Vision Pipeline
from app.vision.vision_analyzer import VisionAnalyzer
from app.vision.router import needs_vision_analysis

# Exercise each capability of the vision pipeline in turn.
analyzer = VisionAnalyzer()

# Receipt photo: a targeted prompt pulls out line items and the total.
result = analyzer.analyze_image(
    "data/receipt.jpg",
    "Extract all items, prices, and total from this receipt.",
)
print(result["text"])

# Scanned PDF: only route through vision when the router reports
# that the document has no extractable text layer.
if needs_vision_analysis("data/scanned.pdf"):
    result = analyzer.analyze_pdf_page("data/scanned.pdf", 0)
    print(result["text"])

# Handwritten notes: the specialized handwriting prompt.
result = analyzer.extract_handwriting("data/notes.jpg")
print(result["text"])

# Chart image: structured data extraction from the plot.
result = analyzer.extract_chart_data("data/chart.png")
print(result["text"])
Cost optimization: GPT-4 Vision with "detail": "high" costs roughly $0.01–$0.03 per page. For batch jobs, start with "detail": "low" and upgrade to high detail only when the low-detail result is insufficient.
Key Takeaways
- GPT-4 Vision reads handwritten text, charts, and scanned documents that PyMuPDF cannot handle.
- The smart router checks PDF text density to choose between text extraction and vision analysis.
- Convert PDF pages to 200 DPI images for optimal vision model accuracy.
- Specialized prompts for handwriting and charts produce better results than generic prompts.
Lilly Tech Systems