Intermediate

PDF Text & Table Extraction

In this step, you will build the core PDF extraction module. Using PyMuPDF for text and layout analysis and tabula for table extraction, you will handle multi-column layouts, headers, footers, and embedded tables.

Understanding PDF Structure

PDFs are not like HTML. Text is positioned absolutely on a page with x,y coordinates. There are no semantic paragraphs or headings — just character sequences placed at specific positions. Our extractor must reconstruct the reading order from these coordinates.

Step 1: Basic Text Extraction with PyMuPDF

PyMuPDF (imported under the module name fitz) is one of the fastest Python PDF libraries. It extracts text together with position data, which we need for layout analysis.

# app/extraction/pdf_extractor.py
import fitz  # PyMuPDF
import logging
from dataclasses import dataclass
from pathlib import Path

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


@dataclass
class PageContent:
    """Extracted content from a single PDF page.

    Instances are built by PDFExtractor.extract, one per page.
    """
    # 1-based page number (the extractor stores the 0-based index + 1).
    page_number: int
    # Plain text of the page as returned by page.get_text("text", sort=True).
    text: str
    # Text blocks only; each dict has "type" ("text"), "bbox" and "lines",
    # where every line carries "text", "font", "size" and "bbox".
    blocks: list[dict]
    # Image blocks; each dict has "type" ("image"), "bbox", "width", "height".
    images: list[dict]
    # Page dimensions taken from page.rect.width / page.rect.height.
    width: float
    height: float


@dataclass
class PDFContent:
    """Extracted content from an entire PDF.

    Returned by PDFExtractor.extract.
    """
    # Final path component of the source file (Path(file_path).name).
    filename: str
    # Number of PageContent entries in `pages`.
    total_pages: int
    # One entry per page, in document order.
    pages: list[PageContent]
    # Document metadata as reported by PyMuPDF; an empty dict when absent.
    metadata: dict


class PDFExtractor:
    """Extract text and layout from PDF files using PyMuPDF."""

    def extract(self, file_path: str) -> PDFContent:
        """Extract all content from a PDF file.

        Args:
            file_path: Location of the PDF on disk.

        Returns:
            A PDFContent with one PageContent per page (page numbers are
            1-based) and the document metadata (empty dict when absent).

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF not found: {file_path}")

        # The context manager guarantees the document handle is released
        # even when a page fails to parse part-way through the loop.
        with fitz.open(file_path) as doc:
            pages = [
                self._extract_page(page, page_number)
                for page_number, page in enumerate(doc, start=1)
            ]
            # Metadata must be read while the document is still open.
            metadata = doc.metadata or {}

        logger.info("Extracted %d pages from %s", len(pages), path.name)

        return PDFContent(
            filename=path.name,
            total_pages=len(pages),
            pages=pages,
            metadata=metadata,
        )

    @staticmethod
    def _extract_page(page, page_number: int) -> PageContent:
        """Build the PageContent for one PyMuPDF page."""
        text_blocks = []
        image_blocks = []

        for block in page.get_text("dict", sort=True)["blocks"]:
            if block["type"] == 0:  # Text block
                text_blocks.append({
                    "type": "text",
                    "bbox": block["bbox"],
                    "lines": [
                        PDFExtractor._line_to_dict(line)
                        for line in block["lines"]
                    ],
                })
            elif block["type"] == 1:  # Image block
                image_blocks.append({
                    "type": "image",
                    "bbox": block["bbox"],
                    "width": block.get("width", 0),
                    "height": block.get("height", 0),
                })
            # Any other block type is ignored.

        return PageContent(
            page_number=page_number,
            text=page.get_text("text", sort=True),
            blocks=text_blocks,
            images=image_blocks,
            width=page.rect.width,
            height=page.rect.height,
        )

    @staticmethod
    def _line_to_dict(line: dict) -> dict:
        """Flatten one line of a text block into text, font, size and bbox."""
        spans = line["spans"]
        # Font and size are taken from the first span only; a line without
        # spans falls back to "" and 0.
        first = spans[0] if spans else None
        return {
            "text": " ".join(span["text"] for span in spans),
            "font": first["font"] if first is not None else "",
            "size": first["size"] if first is not None else 0,
            "bbox": line["bbox"],
        }

Step 2: Table Extraction with tabula

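tabula-py wraps the tabula-java library to extract tables from PDFs into pandas DataFrames. Because it shells out to tabula-java, a Java runtime must be installed on the machine that runs the extractor.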

# app/extraction/table_extractor.py
import tabula
import pandas as pd
import logging
from dataclasses import dataclass
from pathlib import Path

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


@dataclass
class ExtractedTable:
    """A table extracted from a PDF page.

    Built by TableExtractor.extract_tables, one per DataFrame returned by
    tabula.
    """
    # Currently always 0: the extractor does not track the source page.
    page_number: int
    # Position of the table in the list returned by tabula.read_pdf.
    table_index: int
    # Column names of the cleaned DataFrame, as stripped strings.
    headers: list[str]
    # Cell values row by row, each converted to str and stripped.
    rows: list[list[str]]
    # The cleaned DataFrame (all-empty rows dropped, NaN replaced by "").
    # NOTE(review): the generated __eq__ also compares this field; comparing
    # instances holding different DataFrame objects may raise -- confirm
    # before relying on equality.
    dataframe: pd.DataFrame


class TableExtractor:
    """Extract tables from PDF files using tabula-py."""

    def extract_tables(self, file_path: str, pages: str = "all") -> list[ExtractedTable]:
        """Extract all tables from a PDF.

        Lattice mode is tried first; if it raises, the failure is logged
        with its traceback and stream mode is used instead.

        Args:
            file_path: Location of the PDF on disk.
            pages: Page selection forwarded unchanged to tabula.read_pdf.

        Returns:
            One ExtractedTable per DataFrame returned by tabula, in the
            order tabula returned them.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            Exception: Whatever tabula raises when stream mode fails too.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF not found: {file_path}")

        read_args = {"pages": pages, "multiple_tables": True}
        try:
            dfs = tabula.read_pdf(str(path), lattice=True, **read_args)
        except Exception:
            # Best-effort fallback, but keep the traceback so the real cause
            # of the lattice failure is not lost.
            logger.warning("Lattice failed, trying stream mode", exc_info=True)
            dfs = tabula.read_pdf(str(path), stream=True, **read_args)

        tables = []
        for idx, df in enumerate(dfs):
            # Drop rows that are entirely empty, then blank out the
            # remaining missing cells.
            df = df.dropna(how="all").fillna("")
            df.columns = [str(c).strip() for c in df.columns]
            tables.append(ExtractedTable(
                # NOTE: placeholder -- the source page is not derived from
                # the DataFrames, so every table reports page 0.
                page_number=0,
                table_index=idx,
                headers=list(df.columns),
                rows=[
                    [str(cell).strip() for cell in row]
                    for row in df.values.tolist()
                ],
                dataframe=df,
            ))

        logger.info("Extracted %d tables from %s", len(tables), path.name)
        return tables

Step 3: Layout Analysis

Detect multi-column layouts, headers, footers, and section boundaries:

# app/extraction/layout_analyzer.py
import logging
from dataclasses import dataclass

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


@dataclass
class LayoutRegion:
    """A detected region on the page.

    Produced by LayoutAnalyzer.analyze_page, one per non-empty block.
    """
    type: str  # "header", "footer", "body"
    # Bounding box copied from the block's "bbox" entry; indices 1 and 3
    # are the y-values used for classification.
    bbox: tuple
    # The block's line texts joined with single spaces, then stripped.
    text: str


class LayoutAnalyzer:
    """Label page blocks as header/footer/body and estimate column count."""

    def __init__(self, header_threshold=0.08, footer_threshold=0.92):
        # Both values are fractions of the page height: blocks centred above
        # the first are headers, blocks centred below the second are footers.
        self.header_threshold = header_threshold
        self.footer_threshold = footer_threshold

    def analyze_page(self, blocks, page_width, page_height):
        """Return a LayoutRegion for every block that contains text.

        A block's vertical centre (mean of bbox[1] and bbox[3]) decides its
        label. Blocks whose joined line text is empty after stripping are
        skipped. ``page_width`` is accepted but not used.
        """
        top_limit = page_height * self.header_threshold
        bottom_limit = page_height * self.footer_threshold

        found = []
        for item in blocks:
            pieces = [ln.get("text", "") for ln in item.get("lines", [])]
            content = " ".join(pieces).strip()
            if content:
                box = item.get("bbox", (0, 0, 0, 0))
                mid_y = (box[1] + box[3]) / 2

                kind = "body"
                if mid_y < top_limit:
                    kind = "header"
                elif mid_y > bottom_limit:
                    kind = "footer"

                found.append(LayoutRegion(type=kind, bbox=box, text=content))
        return found

    def detect_columns(self, blocks, page_width):
        """Return 2 when blocks start on both sides of the page, else 1.

        A block counts as "left" when its first bbox value is below 80% of
        the half-width, and as "right" when it is above 120% of it. More
        than two blocks are required on each side to report two columns.
        """
        if not blocks:
            return 1

        half = page_width / 2
        left_count = 0
        right_count = 0
        for item in blocks:
            x0 = item.get("bbox", (0,))[0]
            # The two checks are independent on purpose; neither excludes
            # the other.
            if x0 < half * 0.8:
                left_count += 1
            if x0 > half * 1.2:
                right_count += 1

        if left_count > 2 and right_count > 2:
            return 2
        return 1

Step 4: Integration Test

# tests/test_extraction.py
from app.extraction.pdf_extractor import PDFExtractor
from app.extraction.table_extractor import TableExtractor
from app.extraction.layout_analyzer import LayoutAnalyzer


def test_full_extraction(pdf_path="data/sample-invoice.pdf"):
    """Run text, table and layout extraction on one PDF and print a summary.

    This is a smoke run: it prints results and makes no assertions.
    """
    # Text and layout blocks, page by page.
    content = PDFExtractor().extract(pdf_path)
    print(f"Pages: {content.total_pages}")
    for page in content.pages:
        print(f"Page {page.page_number}: {len(page.blocks)} blocks")

    # Tables across the whole document.
    for t in TableExtractor().extract_tables(pdf_path):
        print(f"Table: {t.headers}")

    # Column estimate for each page.
    count_columns = LayoutAnalyzer().detect_columns
    for page in content.pages:
        cols = count_columns(page.blocks, page.width)
        print(f"Page {page.page_number}: {cols} column(s)")


# When executed as a script, run the extraction with the default sample PDF.
if __name__ == "__main__":
    test_full_extraction()

💡

Pro tip: PyMuPDF returns font size with each text span. Use larger font sizes to detect headings and section titles. This gives you document structure without needing any AI.

Key Takeaways

PyMuPDF extracts text with position data, font information, and images from digital PDFs.
tabula-py handles table extraction with lattice (bordered) and stream (borderless) modes.
Layout analysis uses y-coordinates to classify blocks as headers, footers, or body content.
Column detection counts text blocks whose left edge falls clearly to the left or right of the page midpoint to identify two-column layouts.

← Previous Project Setup Next → Vision AI for Complex Documents