dhdaines · dhdaines · Jan 14, 2026 · Jan 14, 2026 · Jan 14, 2026 · Jan 14, 2026
diff --git a/benchmarks/tables.py b/benchmarks/tables.py
@@ -33,7 +33,9 @@ def benchmark_pages(detector, path: Path):
     parser.add_argument("pdf", type=Path)
     parser.add_argument("--over", choices=["doc", "page", "pagelist"], default="doc")
     parser.add_argument(
-        "--detector", choices=["structure", "tatr", "detr"], default="structure"
+        "--detector",
+        choices=["structure", "docling_heron", "table_transformer"],
+        default="structure",
     )
     args = parser.parse_args()
 

diff --git a/src/paves/image/converters.py b/src/paves/image/converters.py
@@ -3,7 +3,7 @@
 """
 
 from os import PathLike
-from typing import Any, Callable, Iterator, List, Protocol, Tuple, Union
+from typing import Callable, Iterator, List, Protocol, Tuple, Union
 from playa import Document, Page, PageList
 from PIL import Image
 

diff --git a/src/paves/image/poppler.py b/src/paves/image/poppler.py
@@ -3,6 +3,7 @@
 """
 
 import functools
+import shutil
 import subprocess
 import tempfile
 from os import PathLike
@@ -22,6 +23,8 @@
 from paves.exceptions import NotInstalledError
 from paves.image.converters import converter
 
+PATH_TO_POPPLER: Union[str, None] = shutil.which("pdftoppm")
+
 
 def make_poppler_args(dpi: int, width: int, height: int) -> List[str]:
     args = []
@@ -132,10 +135,11 @@ def _popple_pages(
             first = last = next_last
         else:
             last = next_last
+    assert PATH_TO_POPPLER is not None
     for first, last in spans:
         subprocess.run(
             [
-                "pdftoppm",
+                PATH_TO_POPPLER,
                 *args,
                 "-f",
                 str(first),

diff --git a/src/paves/tables/__init__.py b/src/paves/tables/__init__.py
@@ -4,8 +4,8 @@
 
 from paves.tables.detectors import lookup as detector
 from paves.tables.detectors import tables, tables_orelse
-from paves.tables.detr import detr as tables_detr
-from paves.tables.tatr import tatr as tables_tatr
+from paves.tables.detr import docling_heron as tables_detr
+from paves.tables.detr import table_transformer as tables_tatr
 from paves.tables.structure import structure as tables_structure
 
 __all__ = [

diff --git a/src/paves/tables/detectors.py b/src/paves/tables/detectors.py
@@ -15,8 +15,7 @@ class Detector(Protocol):
     def __call__(
         self,
         pdf: Union[str, PathLike, Document, Page, PageList],
-    ) -> Union[Iterator[TableObject], None]:
-        ...
+    ) -> Union[Iterator[TableObject], None]: ...
 
     __name__: str
 
@@ -46,7 +45,7 @@ def lookup(name: str) -> Union[Detector, None]:
 
 
 def tables_orelse(
-    pdf: Union[str, PathLike, Document, Page, PageList]
+    pdf: Union[str, PathLike, Document, Page, PageList],
 ) -> Union[Iterator[TableObject], None]:
     """Identify tables in a PDF or one of its pages, or fail.
 
@@ -72,17 +71,17 @@ def tables_orelse(
 
 
 def tables(
-    pdf: Union[str, PathLike, Document, Page, PageList]
+    pdf: Union[str, PathLike, Document, Page, PageList],
 ) -> Iterator[TableObject]:
     """Identify tables in a PDF or one of its pages.
 
     This will always try to use logical structure (via PLAYA-PDF)
     first to identify tables.
 
-    For the moment, this only works on tagged and accessible PDFs.
-    So, like `paves.image`, it can also use Machine Learning Models™
-    to do so, which involves nasty horrible dependencyses (we hates
-    them, they stole the precious) like `cudnn-10-gigabytes-of-c++`.
+    Of course, that only works on tagged and accessible PDFs.  So,
+    like `paves.image`, we can also use Machine Learning Models™ here,
+    which involves nasty horrible dependencyses (we hates them, they
+    stole the precious) like `cudnn-10-gigabytes-of-c++`.
 
     If you'd like to try that, then you can do so by installing the
     `transformers[torch]` package (if you don't have a GPU, try adding

diff --git a/src/paves/tables/detr.py b/src/paves/tables/detr.py
@@ -1,18 +1,18 @@
 """
-Detect tables using RT-DETR models from IBM Docling project.
+Detect tables using RT-DETR models from IBM Docling project, or
+Microsoft Table Transformer (which is also DETR).
 """
 
 import logging
 from functools import singledispatch
 from os import PathLike
-from typing import Iterable, Iterator, List, Tuple, Union, cast
-
-import playa
-from playa import Document, Page, PageList, Rect
+from typing import Any, Dict, Iterator, List, Tuple, Union
 
 import paves.image as pi
+import playa
 from paves.tables.detectors import detector
 from paves.tables.table import TableObject
+from playa import Document, Page, PageList, Rect
 
 LOGGER = logging.getLogger(__name__)
 
@@ -44,70 +44,123 @@ def _get_pages_page(page: Page) -> Iterator[Page]:
     yield page
 
 
-def table_bounds_to_objects(
+def make_rect(box: List[Union[int, float]]) -> Rect:
+    """Verify that box has 4 numbers and return them rounded to ints."""
+    if len(box) != 4:
+        raise TypeError(f"Rectangle does not have 4 corners: {box}")
+    x0, y0, x1, y1 = (round(x) for x in box)
+    return (x0, y0, x1, y1)  # explicit 4-tuple, not Tuple[int, ...]
+
+
+def detect_objects(
     pdf: Union[str, PathLike, Document, Page, PageList],
-    bounds: Iterable[Tuple[int, Iterable[Rect]]],
-) -> Iterator[TableObject]:
-    """Create TableObjects from detected bounding boxes."""
-    for page, (page_idx, tables) in zip(_get_pages(pdf), bounds):
-        assert page.page_idx == page_idx
-        for bbox in tables:
-            yield TableObject.from_bbox(page, bbox)
-
-
-def table_bounds(
-    pdf: Union[str, PathLike, Document, Page, PageList]
-) -> Iterator[Tuple[int, List[Rect]]]:
+    model_name: str,  # passed to from_pretrained() for processor and model
+    *,
+    model_kwargs: Union[Dict[str, Any], None] = None,  # model from_pretrained() kwargs
+    threshold: float = 0.5,  # passed to post_process_object_detection()
+) -> Iterator[Tuple[int, List[Tuple[str, Rect]]]]:
-    """Iterate over all text objects in a PDF, page, or pages"""
+    """Yield (page index, [(label, bbox), ...]) for each rendered page."""
     import torch
     from transformers import AutoImageProcessor, AutoModelForObjectDetection
 
-    processor = AutoImageProcessor.from_pretrained(
-        "ds4sd/docling-layout-old", use_fast=True
-    )
+    processor = AutoImageProcessor.from_pretrained(model_name, use_fast=True)
     # FIXME: sorry, AMD owners, and everybody else, this will get fixed
     device = "cuda" if torch.cuda.is_available() else "cpu"
     torch_device = torch.device(device)
-    model = AutoModelForObjectDetection.from_pretrained("ds4sd/docling-layout-old").to(
+    if model_kwargs is None:
+        model_kwargs = {}
+    model = AutoModelForObjectDetection.from_pretrained(model_name, **model_kwargs).to(
         torch_device
     )
-    width = processor.size["width"]
-    height = processor.size["height"]
-    # Labels are off-by-one for no good reason
-    table_label = int(model.config.label2id["Table"]) - 1
+
+    # Concoct some arguments for pi.convert (FIXME: should be able to
+    # pass it processor.size directly)
+    dpi = width = height = 0
+    if "width" in processor.size and "height" in processor.size:
+        width = processor.size["width"]
+        height = processor.size["height"]
+    elif "longest_edge" in processor.size:
+        # FIXME: This isn't really what it means, but it works anyway
+        width = height = processor.size["longest_edge"]
+    else:
+        # Render it big and let the model figure it out
+        dpi = 144
+
     # We could do this in a batch, but that easily runs out of memory
     with torch.inference_mode():
-        for image in pi.convert(pdf, width=width, height=height):
+        for image in pi.convert(pdf, dpi=dpi, width=width, height=height):
             inputs = processor(images=[image], return_tensors="pt").to(torch_device)
             outputs = model(**inputs)
             results = processor.post_process_object_detection(
                 outputs,
                 target_sizes=[(image.info["page_height"], image.info["page_width"])],
+                threshold=threshold,
             )
-            boxes: List[Rect] = []
+            boxes: List[Tuple[str, Rect]] = []
             for label, box in zip(results[0]["labels"], results[0]["boxes"]):
-                if label.item() != table_label:
-                    continue
-                bbox = tuple(round(x) for x in box.tolist())
-                assert len(bbox) == 4
-                boxes.append(cast(Rect, bbox))
+                name = model.config.id2label[label.item()]
+                bbox = make_rect(box.tolist())
+                boxes.append((name, bbox))
             yield image.info["page_index"], boxes
 
 
-@detector(priority=50)
-def detr(
+@detector(priority=10)
+def docling_heron(
+    pdf: Union[str, PathLike, Document, Page, PageList],
+) -> Union[Iterator[TableObject], None]:
+    """Identify tables in a PDF or its pages using Docling Project layout model.
+
+    Args:
+        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
+
+    Returns:
+        An iterator over `TableObject`, or `None`, if the model can't be used
+    """
+    try:  # detect_objects() is a lazy generator, so probe its imports here
+        import torch  # noqa: F401
+        import transformers  # noqa: F401
+    except ImportError:
+        return None
+    detected = detect_objects(pdf, "docling-project/docling-layout-heron")
+
+    def itor() -> Iterator[TableObject]:
+        for page, (page_idx, objects) in zip(_get_pages(pdf), detected):
+            assert page.page_idx == page_idx
+            for label, bbox in objects:
+                if label == "Table":
+                    yield TableObject.from_bbox(page, bbox)
+
+    return itor()
+
+
+@detector(priority=20)
+def table_transformer(
     pdf: Union[str, PathLike, Document, Page, PageList],
 ) -> Union[Iterator[TableObject], None]:
-    """Identify tables in a PDF or one of its pages using IBM's
-    RT-DETR layout detection model
+    """Identify tables in a PDF or its pages using Microsoft Table Transformer.
 
     Args:
         pdf: PLAYA-PDF document, page, pages, or path to a PDF.
 
     Returns:
       An iterator over `TableObject`, or `None`, if the model can't be used
     """
-    try:
-        return table_bounds_to_objects(pdf, table_bounds(pdf))
+    try:  # detect_objects() is a lazy generator, so probe its imports here
+        import torch  # noqa: F401
+        import transformers  # noqa: F401
     except ImportError:
         return None
+    detected = detect_objects(
+        pdf,
+        "microsoft/table-transformer-detection",
+        model_kwargs={"revision": "no_timm"},
+        threshold=0.9,
+    )
+
+    def itor() -> Iterator[TableObject]:
+        for page, (page_idx, objects) in zip(_get_pages(pdf), detected):
+            assert page.page_idx == page_idx
+            for _label, bbox in objects:
+                yield TableObject.from_bbox(page, bbox)
+
+    return itor()
diff --git a/src/paves/tables/structure.py b/src/paves/tables/structure.py
@@ -2,83 +2,22 @@
 Table detection using PDF logical structure.
 """
 
-from copy import copy
 from functools import singledispatch
 from itertools import groupby
-from typing import Iterable, Iterator, List, Tuple, Union
+from typing import Iterable, Iterator, Union
 from operator import attrgetter
 from os import PathLike
 
 import playa
 from playa import Document, Page, PageList
-from playa.content import GraphicState, MarkedContent
-from playa.page import Annotation
-from playa.pdftypes import Matrix, Rect
 from playa.structure import (
     Element,
-    ContentItem,
-    ContentObject as StructContentObject,
 )
-from playa.worker import _ref_page
 
 from paves.tables.detectors import detector
 from paves.tables.table import TableObject
 
 
-def _from_element(
-    el: Element,
-    page: Page,
-    contents: Union[Iterable[Union[ContentItem, StructContentObject]], None] = None,
-) -> Union["TableObject", None]:
-    if contents is None:
-        contents = el.contents
-    # Find a ContentObject so we can get a bbox, mcstack, ctm
-    # (they might not be *correct* of course, but oh well)
-    gstate: Union[GraphicState, None] = None
-    ctm: Union[Matrix, None] = None
-    mcstack: Union[Tuple[MarkedContent, ...], None] = None
-    bbox: Union[Rect, None] = None
-    for kid in contents:
-        # For multi-page tables, skip any contents on a different page
-        if kid.page != page:
-            continue
-        if isinstance(kid, StructContentObject):
-            obj = kid.obj
-            if obj is None:
-                continue
-            elif isinstance(obj, Annotation):
-                # FIXME: for the moment just ignore these
-                continue
-            else:
-                gstate = copy(obj.gstate)
-                ctm = obj.ctm
-                mcstack = obj.mcstack
-                bbox = obj.bbox
-                break
-        elif isinstance(kid, ContentItem):
-            # It's a ContentItem
-            try:
-                cobj = next(iter(kid))
-            except StopIteration:
-                continue
-            gstate = copy(cobj.gstate)
-            ctm = cobj.ctm
-            mcstack = cobj.mcstack
-            break
-    else:
-        # No contents, no table for you!
-        return None
-    return TableObject(
-        _pageref=_ref_page(page),
-        _parentkey=None,
-        gstate=gstate,
-        ctm=ctm,
-        mcstack=mcstack,
-        _bbox=bbox,
-        _parent=el,
-    )
-
-
 @singledispatch
 def table_elements(
     pdf: Union[str, PathLike, Document, Page, PageList],
@@ -135,7 +74,7 @@ def table_elements_to_objects(
                 continue
             if page is not None and kidpage is not page:
                 continue
-            table = _from_element(el, kidpage, kids)
+            table = TableObject.from_element(el, kidpage, kids)
             if table is not None:
                 yield table