Intermediate

PDF Text & Table Extraction

In this step, you will build the core PDF extraction module. Using PyMuPDF for text and layout analysis and tabula for table extraction, you will handle multi-column layouts, headers, footers, and embedded tables.

Understanding PDF Structure

PDFs are not like HTML. Text is positioned absolutely on a page with x,y coordinates. There are no semantic paragraphs or headings — just character sequences placed at specific positions. Our extractor must reconstruct the reading order from these coordinates.

Step 1: Basic Text Extraction with PyMuPDF

PyMuPDF (imported as fitz) is one of the fastest Python PDF libraries. It extracts text together with position data, which we need for layout analysis.

# app/extraction/pdf_extractor.py
import fitz  # PyMuPDF
import logging
from dataclasses import dataclass
from pathlib import Path

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


@dataclass
class PageContent:
    """Container for everything pulled out of one page of a PDF.

    Attributes:
        page_number: Number identifying the page within the document.
        text: Plain text of the page as a single string.
        blocks: Text blocks found on the page, one dictionary per block.
        images: Image blocks found on the page, one dictionary per block.
        width: Width of the page.
        height: Height of the page.
    """

    page_number: int
    text: str
    blocks: list[dict]
    images: list[dict]
    width: float
    height: float


@dataclass
class PDFContent:
    """Container for the result of extracting a whole PDF document.

    Attributes:
        filename: Name of the source file.
        total_pages: Number of pages held in ``pages``.
        pages: Per-page extraction results, in document order.
        metadata: Document-level metadata as a dictionary.
    """

    filename: str
    total_pages: int
    pages: list[PageContent]
    metadata: dict


class PDFExtractor:
    """Extract text and layout from PDF files using PyMuPDF."""

    def extract(self, file_path: str) -> PDFContent:
        """Extract all content from a PDF file.

        Args:
            file_path: Location of the PDF on disk.

        Returns:
            A PDFContent holding one PageContent per page (page numbers
            start at 1) plus the document metadata, or an empty dict when
            the document reports none.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF not found: {file_path}")

        pages = []
        # The context manager closes the document even when a page fails to
        # parse, so the file handle is never leaked on error.
        with fitz.open(file_path) as doc:
            for page_number, page in enumerate(doc, start=1):
                blocks = page.get_text("dict", sort=True)["blocks"]
                pages.append(PageContent(
                    page_number=page_number,
                    text=page.get_text("text", sort=True),
                    blocks=[
                        self._text_block(block)
                        for block in blocks
                        if block["type"] == 0  # Text block
                    ],
                    images=[
                        self._image_block(block)
                        for block in blocks
                        if block["type"] == 1  # Image block
                    ],
                    width=page.rect.width,
                    height=page.rect.height,
                ))
            # Read metadata while the document is still open.
            metadata = doc.metadata or {}

        logger.info("Extracted %d pages from %s", len(pages), path.name)

        return PDFContent(
            filename=path.name,
            total_pages=len(pages),
            pages=pages,
            metadata=metadata,
        )

    @staticmethod
    def _line_summary(line: dict) -> dict:
        """Collapse one line of a text block into text, font, size and bbox."""
        spans = line["spans"]
        first = spans[0] if spans else None
        return {
            # Spans are joined with a single space between them.
            "text": " ".join(span["text"] for span in spans),
            # Font and size are taken from the first span of the line only.
            "font": first["font"] if first is not None else "",
            "size": first["size"] if first is not None else 0,
            "bbox": line["bbox"],
        }

    @staticmethod
    def _text_block(block: dict) -> dict:
        """Convert a raw text block into the simplified dict stored on the page."""
        return {
            "type": "text",
            "bbox": block["bbox"],
            "lines": [
                PDFExtractor._line_summary(line) for line in block["lines"]
            ],
        }

    @staticmethod
    def _image_block(block: dict) -> dict:
        """Convert a raw image block into a dict with its bbox and dimensions."""
        return {
            "type": "image",
            "bbox": block["bbox"],
            "width": block.get("width", 0),
            "height": block.get("height", 0),
        }

Step 2: Table Extraction with tabula

tabula-py wraps the tabula-java library to extract tables from PDFs into pandas DataFrames. Because it drives tabula-java under the hood, a Java runtime must be installed on the machine.

# app/extraction/table_extractor.py
import tabula
import pandas as pd
import logging
from dataclasses import dataclass
from pathlib import Path

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


@dataclass
class ExtractedTable:
    """One table pulled out of a PDF.

    Attributes:
        page_number: Page the table is attributed to.
        table_index: Position of the table among all tables returned.
        headers: Column names of the table.
        rows: Table body, one list of cell strings per row.
        dataframe: The table as a pandas DataFrame.
    """

    page_number: int
    table_index: int
    headers: list[str]
    rows: list[list[str]]
    dataframe: pd.DataFrame


class TableExtractor:
    """Extract tables from PDF files using tabula-py."""

    def extract_tables(self, file_path: str, pages: str = "all") -> list[ExtractedTable]:
        """Extract all tables from a PDF.

        Lattice mode is tried first; if it raises, stream mode is used
        instead.

        Args:
            file_path: Location of the PDF on disk.
            pages: Page selection forwarded unchanged to ``tabula.read_pdf``.

        Returns:
            One ExtractedTable per DataFrame returned by tabula, with fully
            empty rows dropped, missing cells replaced by "" and every
            header and cell converted to a stripped string.
            ``page_number`` is the requested page when ``pages`` names a
            single page (for example "3"); otherwise it is 0, because this
            method has no per-table page information to draw on.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF not found: {file_path}")

        try:
            dfs = tabula.read_pdf(str(path), pages=pages,
                multiple_tables=True, lattice=True)
        except Exception:
            # Deliberate best-effort fallback; exc_info keeps the reason the
            # lattice pass failed in the log instead of discarding it.
            logger.warning("Lattice failed, trying stream mode", exc_info=True)
            dfs = tabula.read_pdf(str(path), pages=pages,
                multiple_tables=True, stream=True)

        # When exactly one page was requested, every table comes from it.
        page_token = str(pages).strip()
        page_number = int(page_token) if page_token.isdecimal() else 0

        tables = []
        for idx, df in enumerate(dfs):
            df = df.dropna(how="all").fillna("")
            df.columns = [str(c).strip() for c in df.columns]
            tables.append(ExtractedTable(
                page_number=page_number, table_index=idx,
                headers=list(df.columns),
                rows=[[str(cell).strip() for cell in row] for row in df.values.tolist()],
                dataframe=df,
            ))

        logger.info("Extracted %d tables from %s", len(tables), path.name)
        return tables

Step 3: Layout Analysis

Detect two-column layouts and classify each text block as header, footer, or body content:

# app/extraction/layout_analyzer.py
import logging
from dataclasses import dataclass

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


@dataclass
class LayoutRegion:
    """A classified region of a page.

    Attributes:
        type: Region label: "header", "footer" or "body".
        bbox: Bounding box of the region.
        text: Text contained in the region.
    """

    type: str
    bbox: tuple
    text: str


class LayoutAnalyzer:
    """Heuristic classifier for the layout of a page's text blocks."""

    def __init__(self, header_threshold=0.08, footer_threshold=0.92):
        # Both thresholds are fractions of the page height: blocks centred
        # above the first count as header, below the second as footer.
        self.header_threshold = header_threshold
        self.footer_threshold = footer_threshold

    def analyze_page(self, blocks, page_width, page_height):
        """Label every non-empty block as "header", "footer" or "body".

        A block is labelled by the vertical centre of its bounding box
        relative to the two thresholds scaled by ``page_height``. Blocks
        whose joined line text is empty are left out. ``page_width`` is
        accepted but not used.

        Returns a list of LayoutRegion objects in input order.
        """
        header_limit = page_height * self.header_threshold
        footer_limit = page_height * self.footer_threshold

        def label_for(mid_y):
            if mid_y < header_limit:
                return "header"
            if mid_y > footer_limit:
                return "footer"
            return "body"

        found = []
        for entry in blocks:
            joined = " ".join(
                ln.get("text", "") for ln in entry.get("lines", [])
            ).strip()
            if joined:
                box = entry.get("bbox", (0, 0, 0, 0))
                mid_y = (box[1] + box[3]) / 2
                found.append(
                    LayoutRegion(type=label_for(mid_y), bbox=box, text=joined)
                )
        return found

    def detect_columns(self, blocks, page_width):
        """Return 2 when the blocks look like a two-column page, else 1.

        Counts blocks whose left edge lies below 80% of the page midpoint
        and blocks whose left edge lies above 120% of it; more than two on
        each side means two columns. A missing bbox counts as x = 0.
        """
        if not blocks:
            return 1

        half = page_width / 2
        left_count = 0
        right_count = 0
        for entry in blocks:
            x0 = entry.get("bbox", (0,))[0]
            # Two independent checks: the counts are tallied separately.
            if x0 < half * 0.8:
                left_count += 1
            if x0 > half * 1.2:
                right_count += 1

        if left_count > 2 and right_count > 2:
            return 2
        return 1

Step 4: Integration Test

# tests/test_extraction.py
from app.extraction.pdf_extractor import PDFExtractor
from app.extraction.table_extractor import TableExtractor
from app.extraction.layout_analyzer import LayoutAnalyzer


def test_full_extraction(pdf_path="data/sample-invoice.pdf"):
    """Run text, table and layout extraction on one PDF and print a summary.

    Prints the page count, the number of text blocks per page, the headers
    of every extracted table, and the detected column count per page.
    """
    document = PDFExtractor().extract(pdf_path)
    print(f"Pages: {document.total_pages}")
    for pg in document.pages:
        print(f"Page {pg.page_number}: {len(pg.blocks)} blocks")

    for table in TableExtractor().extract_tables(pdf_path):
        print(f"Table: {table.headers}")

    layout = LayoutAnalyzer()
    for pg in document.pages:
        column_count = layout.detect_columns(pg.blocks, pg.width)
        print(f"Page {pg.page_number}: {column_count} column(s)")


# Allow running this file directly as a script; uses the default sample PDF path.
if __name__ == "__main__":
    test_full_extraction()
💡
Pro tip: PyMuPDF returns font size with each text span. Use larger font sizes to detect headings and section titles. This gives you document structure without needing any AI.

Key Takeaways

  • PyMuPDF extracts text with position data, font information, and images from digital PDFs.
  • tabula-py handles table extraction with lattice (bordered) and stream (borderless) modes.
  • Layout analysis uses y-coordinates to classify blocks as headers, footers, or body content.
  • Column detection counts text blocks whose left edge falls clearly to the left or right of the page midpoint to flag two-column layouts.