PDF Text & Table Extraction
In this step, you will build the core PDF extraction module. Using PyMuPDF for text and layout analysis and tabula for table extraction, you will handle multi-column layouts, headers, footers, and embedded tables.
Understanding PDF Structure
PDFs are not like HTML. Text is positioned absolutely on a page with x,y coordinates. There are no semantic paragraphs or headings — just character sequences placed at specific positions. Our extractor must reconstruct the reading order from these coordinates.
Step 1: Basic Text Extraction with PyMuPDF
PyMuPDF (imported under the module name fitz) is one of the fastest Python PDF libraries. It extracts text together with position data, which we need for layout analysis.
# app/extraction/pdf_extractor.py
import fitz # PyMuPDF
import logging
from dataclasses import dataclass
from pathlib import Path
logger = logging.getLogger(__name__)
@dataclass
class PageContent:
    """Extracted content from a single PDF page."""

    # Page index as reported by the extractor.
    page_number: int
    # Plain text of the whole page.
    text: str
    # Text blocks; each dict carries "type", "bbox" and "lines".
    blocks: list[dict]
    # Image blocks; each dict carries "type", "bbox", "width" and "height".
    images: list[dict]
    # Page dimensions, taken from the page rectangle.
    width: float
    height: float
@dataclass
class PDFContent:
    """Extracted content from an entire PDF."""

    # Base name of the source file (no directory part).
    filename: str
    # Number of entries in ``pages``.
    total_pages: int
    # Per-page extraction results, in document order.
    pages: list[PageContent]
    # Document metadata as reported by the PDF library; may be empty.
    metadata: dict
class PDFExtractor:
    """Extract text and layout from PDF files using PyMuPDF."""

    def extract(self, file_path: str) -> PDFContent:
        """Extract all content from a PDF file.

        Args:
            file_path: Path to the PDF on disk.

        Returns:
            A ``PDFContent`` holding one ``PageContent`` per page (numbered
            from 1) plus the document metadata.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF not found: {file_path}")

        pages = []
        # The context manager closes the document even when a page fails to
        # parse; a trailing doc.close() would be skipped on any exception.
        with fitz.open(file_path) as doc:
            for page_number, page in enumerate(doc, start=1):
                text_blocks = []
                image_blocks = []
                for block in page.get_text("dict", sort=True)["blocks"]:
                    if block["type"] == 0:  # Text block
                        text_blocks.append({
                            "type": "text",
                            "bbox": block["bbox"],
                            "lines": [
                                self._line_record(line)
                                for line in block["lines"]
                            ],
                        })
                    elif block["type"] == 1:  # Image block
                        image_blocks.append({
                            "type": "image",
                            "bbox": block["bbox"],
                            "width": block.get("width", 0),
                            "height": block.get("height", 0),
                        })

                pages.append(PageContent(
                    page_number=page_number,
                    text=page.get_text("text", sort=True),
                    blocks=text_blocks,
                    images=image_blocks,
                    width=page.rect.width,
                    height=page.rect.height,
                ))

            # Read metadata while the document is still open.
            metadata = doc.metadata or {}

        logger.info("Extracted %d pages from %s", len(pages), path.name)
        return PDFContent(
            filename=path.name,
            total_pages=len(pages),
            pages=pages,
            metadata=metadata,
        )

    @staticmethod
    def _line_record(line: dict) -> dict:
        """Flatten one line of a text block into text, font, size and bbox."""
        spans = line["spans"]
        # Font and size are sampled from the first span only; a line with no
        # spans falls back to "" / 0.
        first = spans[0] if spans else None
        return {
            "text": " ".join(span["text"] for span in spans),
            "font": first["font"] if first else "",
            "size": first["size"] if first else 0,
            "bbox": line["bbox"],
        }
Step 2: Table Extraction with tabula
tabula-py wraps the tabula-java library to extract tables from PDFs into pandas DataFrames. Because it runs tabula-java under the hood, a Java runtime must be installed on the machine.
# app/extraction/table_extractor.py
import logging
from dataclasses import dataclass, field
from pathlib import Path

import pandas as pd
import tabula
logger = logging.getLogger(__name__)
@dataclass
class ExtractedTable:
    """A table extracted from a PDF page."""

    # Source page of the table; 0 when the page is not known.
    page_number: int
    # Position of this table in the extractor's result list.
    table_index: int
    # Column labels, as strings.
    headers: list[str]
    # Cell values, row by row, as strings.
    rows: list[list[str]]
    # Excluded from ==: comparing two DataFrames yields a DataFrame whose
    # truth value is ambiguous, so the generated __eq__ would raise
    # ValueError for any two tables holding distinct DataFrame objects.
    dataframe: pd.DataFrame = field(compare=False)
class TableExtractor:
    """Extract tables from PDF files using tabula-py."""

    def extract_tables(self, file_path: str, pages: str = "all") -> list[ExtractedTable]:
        """Extract all tables from a PDF.

        Args:
            file_path: Path to the PDF on disk.
            pages: Page selection forwarded unchanged to ``tabula.read_pdf``.

        Returns:
            One ``ExtractedTable`` per DataFrame returned by tabula, in the
            order tabula returned them. ``page_number`` is set only when
            ``pages`` names a single page; otherwise it is 0 (unknown).

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF not found: {file_path}")

        try:
            dfs = tabula.read_pdf(str(path), pages=pages,
                                  multiple_tables=True, lattice=True)
        except Exception:
            # Best-effort fallback; keep the traceback so the real cause
            # of the lattice failure is not lost.
            logger.warning("Lattice failed, trying stream mode", exc_info=True)
            dfs = tabula.read_pdf(str(path), pages=pages,
                                  multiple_tables=True, stream=True)
        # NOTE(review): the fallback only fires on an exception. If lattice
        # mode returns an empty list instead of raising, stream mode is
        # never tried — confirm whether that is intended.

        # The returned DataFrames carry no page information, so the page is
        # only known when the caller asked for exactly one page.
        page_spec = str(pages).strip()
        page_number = int(page_spec) if page_spec.isdecimal() else 0

        tables = []
        for idx, df in enumerate(dfs):
            # Drop fully empty rows, then blank out remaining NaN cells.
            df = df.dropna(how="all").fillna("")
            df.columns = [str(c).strip() for c in df.columns]
            tables.append(ExtractedTable(
                page_number=page_number,
                table_index=idx,
                headers=list(df.columns),
                rows=[
                    [str(cell).strip() for cell in row]
                    for row in df.values.tolist()
                ],
                dataframe=df,
            ))

        logger.info("Extracted %d tables from %s", len(tables), path.name)
        return tables
Step 3: Layout Analysis
Detect multi-column layouts, headers, footers, and section boundaries:
# app/extraction/layout_analyzer.py
import logging
from dataclasses import dataclass
logger = logging.getLogger(__name__)
@dataclass
class LayoutRegion:
    """A detected region on the page."""

    # Region category: "header", "footer" or "body".
    type: str
    # Bounding box of the source block.
    bbox: tuple
    # Text of the block, lines joined with single spaces.
    text: str
class LayoutAnalyzer:
    """Analyze page layout to detect columns, headers, and footers."""

    def __init__(self, header_threshold: float = 0.08, footer_threshold: float = 0.92):
        """Store the header/footer cut-offs.

        Both thresholds are fractions of the page height: blocks centred
        above ``header_threshold`` are headers, blocks centred below
        ``footer_threshold`` are footers.
        """
        self.header_threshold = header_threshold
        self.footer_threshold = footer_threshold

    def analyze_page(
        self, blocks: list[dict], page_width: float, page_height: float
    ) -> "list[LayoutRegion]":
        """Classify blocks into header, footer, and body regions.

        Args:
            blocks: Block dicts with a "bbox" and a list of "lines", each
                line holding a "text" entry. Missing keys are tolerated.
            page_width: Accepted for signature symmetry; not used here.
            page_height: Page height used to scale the two thresholds.

        Returns:
            One ``LayoutRegion`` per block that has non-blank text, in
            input order.
        """
        header_y = page_height * self.header_threshold
        footer_y = page_height * self.footer_threshold

        regions = []
        for block in blocks:
            bbox = block.get("bbox", (0, 0, 0, 0))
            text = " ".join(
                line.get("text", "") for line in block.get("lines", [])
            ).strip()
            if not text:
                continue

            # Classify by the block's vertical centre so a block that only
            # grazes the header/footer band is still treated as body.
            y_center = (bbox[1] + bbox[3]) / 2
            if y_center < header_y:
                region_type = "header"
            elif y_center > footer_y:
                region_type = "footer"
            else:
                region_type = "body"
            regions.append(LayoutRegion(type=region_type, bbox=bbox, text=text))
        return regions

    def detect_columns(
        self, blocks: list[dict], page_width: float, *, min_blocks_per_column: int = 3
    ) -> int:
        """Detect number of text columns on a page.

        Args:
            blocks: Block dicts; only the left edge (``bbox[0]``) is read.
            page_width: Page width used to locate the midpoint.
            min_blocks_per_column: Minimum number of blocks that must start
                on each side of the midpoint for the page to count as
                two-column.

        Returns:
            2 when both sides reach ``min_blocks_per_column``, otherwise 1.
        """
        if not blocks:
            return 1

        left_edges = [b.get("bbox", (0,))[0] for b in blocks]
        midpoint = page_width / 2
        # Blocks starting within 20% of the midpoint on either side are
        # ignored, so neither count includes them.
        left = sum(1 for x in left_edges if x < midpoint * 0.8)
        right = sum(1 for x in left_edges if x > midpoint * 1.2)
        if left >= min_blocks_per_column and right >= min_blocks_per_column:
            return 2
        return 1
Step 4: Integration Test
# tests/test_extraction.py
from app.extraction.pdf_extractor import PDFExtractor
from app.extraction.table_extractor import TableExtractor
from app.extraction.layout_analyzer import LayoutAnalyzer
def test_full_extraction(pdf_path="data/sample-invoice.pdf"):
    """Run text, table and layout extraction on one PDF and print a summary.

    Args:
        pdf_path: PDF to process; defaults to the bundled sample invoice.
    """
    extractor = PDFExtractor()
    content = extractor.extract(pdf_path)
    # Without any assertion this test could only fail by raising.
    assert content.total_pages == len(content.pages)
    print(f"Pages: {content.total_pages}")
    for page in content.pages:
        print(f"Page {page.page_number}: {len(page.blocks)} blocks")

    table_ext = TableExtractor()
    tables = table_ext.extract_tables(pdf_path)
    for t in tables:
        print(f"Table: {t.headers}")

    analyzer = LayoutAnalyzer()
    for page in content.pages:
        cols = analyzer.detect_columns(page.blocks, page.width)
        assert cols in (1, 2)
        print(f"Page {page.page_number}: {cols} column(s)")
# Allow running this file directly as a script, outside of pytest.
if __name__ == "__main__":
    test_full_extraction()
Key Takeaways
- PyMuPDF extracts text with position data, font information, and images from digital PDFs.
- tabula-py handles table extraction with lattice (bordered) and stream (borderless) modes.
- Layout analysis uses y-coordinates to classify blocks as headers, footers, or body content.
- Column detection counts the text blocks that start to the left and to the right of the page midpoint to tell one-column from two-column layouts.
Lilly Tech Systems