Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion benchmarks/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ def benchmark_pages(detector, path: Path):
parser.add_argument("pdf", type=Path)
parser.add_argument("--over", choices=["doc", "page", "pagelist"], default="doc")
parser.add_argument(
"--detector", choices=["structure", "tatr", "detr"], default="structure"
"--detector",
choices=["structure", "docling_heron", "table_transformer"],
default="structure",
)
args = parser.parse_args()

Expand Down
2 changes: 1 addition & 1 deletion src/paves/image/converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""

from os import PathLike
from typing import Any, Callable, Iterator, List, Protocol, Tuple, Union
from typing import Callable, Iterator, List, Protocol, Tuple, Union
from playa import Document, Page, PageList
from PIL import Image

Expand Down
6 changes: 5 additions & 1 deletion src/paves/image/poppler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import functools
import shutil
import subprocess
import tempfile
from os import PathLike
Expand All @@ -22,6 +23,8 @@
from paves.exceptions import NotInstalledError
from paves.image.converters import converter

PATH_TO_POPPLER: Union[str, None] = shutil.which("pdftoppm")


def make_poppler_args(dpi: int, width: int, height: int) -> List[str]:
args = []
Expand Down Expand Up @@ -132,10 +135,11 @@ def _popple_pages(
first = last = next_last
else:
last = next_last
assert PATH_TO_POPPLER is not None
for first, last in spans:
subprocess.run(
[
"pdftoppm",
PATH_TO_POPPLER,
*args,
"-f",
str(first),
Expand Down
4 changes: 2 additions & 2 deletions src/paves/tables/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

from paves.tables.detectors import lookup as detector
from paves.tables.detectors import tables, tables_orelse
from paves.tables.detr import detr as tables_detr
from paves.tables.tatr import tatr as tables_tatr
from paves.tables.detr import docling_heron as tables_detr
from paves.tables.detr import table_transformer as tables_tatr
from paves.tables.structure import structure as tables_structure

__all__ = [
Expand Down
15 changes: 7 additions & 8 deletions src/paves/tables/detectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ class Detector(Protocol):
def __call__(
self,
pdf: Union[str, PathLike, Document, Page, PageList],
) -> Union[Iterator[TableObject], None]:
...
) -> Union[Iterator[TableObject], None]: ...

__name__: str

Expand Down Expand Up @@ -46,7 +45,7 @@ def lookup(name: str) -> Union[Detector, None]:


def tables_orelse(
pdf: Union[str, PathLike, Document, Page, PageList]
pdf: Union[str, PathLike, Document, Page, PageList],
) -> Union[Iterator[TableObject], None]:
"""Identify tables in a PDF or one of its pages, or fail.

Expand All @@ -72,17 +71,17 @@ def tables_orelse(


def tables(
pdf: Union[str, PathLike, Document, Page, PageList]
pdf: Union[str, PathLike, Document, Page, PageList],
) -> Iterator[TableObject]:
"""Identify tables in a PDF or one of its pages.

This will always try to use logical structure (via PLAYA-PDF)
first to identify tables.

For the moment, this only works on tagged and accessible PDFs.
So, like `paves.image`, it can also use Machine Learning Models™
to do so, which involves nasty horrible dependencyses (we hates
them, they stole the precious) like `cudnn-10-gigabytes-of-c++`.
Of course, that only works on tagged and accessible PDFs. So,
like `paves.image`, we can also use Machine Learning Models™ here,
which involves nasty horrible dependencyses (we hates them, they
stole the precious) like `cudnn-10-gigabytes-of-c++`.

If you'd like to try that, then you can do so by installing the
`transformers[torch]` package (if you don't have a GPU, try adding
Expand Down
129 changes: 91 additions & 38 deletions src/paves/tables/detr.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
"""
Detect tables using RT-DETR models from IBM Docling project.
Detect tables using RT-DETR models from IBM Docling project, or
Microsoft Table Transformer (which is also DETR).
"""

import logging
from functools import singledispatch
from os import PathLike
from typing import Iterable, Iterator, List, Tuple, Union, cast

import playa
from playa import Document, Page, PageList, Rect
from typing import Any, Iterator, List, Tuple, Union

import paves.image as pi
import playa
from paves.tables.detectors import detector
from paves.tables.table import TableObject
from playa import Document, Page, PageList, Rect

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -44,70 +44,123 @@ def _get_pages_page(page: Page) -> Iterator[Page]:
yield page


def table_bounds_to_objects(
def make_rect(box: List[Union[int, float]]) -> Rect:
    """Round the coordinates of a box and return them as a 4-tuple.

    Args:
        box: Coordinates of the box, as ints or floats.

    Returns:
        The coordinates, each passed through `round`, as a tuple.

    Raises:
        TypeError: If `box` does not contain exactly four values.
    """
    corners = tuple(map(round, box))
    if len(corners) == 4:
        return corners
    raise TypeError(f"Rectangle does not have 4 corners: {box}")


def detect_objects(
    pdf: Union[str, PathLike, Document, Page, PageList],
    model_name: str,
    *,
    model_kwargs: dict[str, Any] | None = None,
    threshold: float = 0.5,
) -> Iterator[Tuple[int, List[Tuple[str, Rect]]]]:
    """Run an object detection model over rendered images of a PDF.

    This is a generator: nothing (including the `torch` and
    `transformers` imports below) happens until it is first iterated.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        model_name: Model identifier passed to `from_pretrained` for
            both the image processor and the detection model.
        model_kwargs: Extra keyword arguments for the model's
            `from_pretrained` call (none if omitted).
        threshold: Threshold passed to the processor's
            `post_process_object_detection`.

    Yields:
        For each image produced by `pi.convert`, its page index and a
        list of `(label name, bounding box)` pairs.
    """
    import torch
    from transformers import AutoImageProcessor, AutoModelForObjectDetection

    processor = AutoImageProcessor.from_pretrained(model_name, use_fast=True)
    # FIXME: sorry, AMD owners, and everybody else, this will get fixed
    torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    extra_kwargs = {} if model_kwargs is None else model_kwargs
    model = AutoModelForObjectDetection.from_pretrained(
        model_name, **extra_kwargs
    ).to(torch_device)

    # Concoct some arguments for pi.convert (FIXME: should be able to
    # pass it processor.size directly)
    size = processor.size
    if "width" in size and "height" in size:
        dpi, width, height = 0, size["width"], size["height"]
    elif "longest_edge" in size:
        # FIXME: This isn't really what it means, but it works anyway
        edge = size["longest_edge"]
        dpi, width, height = 0, edge, edge
    else:
        # Render it big and let the model figure it out
        dpi, width, height = 144, 0, 0

    # We could do this in a batch, but that easily runs out of memory
    with torch.inference_mode():
        for image in pi.convert(pdf, dpi=dpi, width=width, height=height):
            inputs = processor(images=[image], return_tensors="pt").to(torch_device)
            outputs = model(**inputs)
            page_size = (image.info["page_height"], image.info["page_width"])
            detection = processor.post_process_object_detection(
                outputs,
                target_sizes=[page_size],
                threshold=threshold,
            )[0]
            boxes: List[Tuple[str, Rect]] = [
                (model.config.id2label[label.item()], make_rect(box.tolist()))
                for label, box in zip(detection["labels"], detection["boxes"])
            ]
            yield image.info["page_index"], boxes


@detector(priority=50)
def detr(
@detector(priority=10)
def docling_heron(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Union[Iterator[TableObject], None]:
    """Identify tables in a PDF or one of its pages using Docling Project
    layout model.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.

    Returns:
        An iterator over `TableObject`, or `None`, if the model can't be used

    """
    # `detect_objects` is a generator function, so merely calling it
    # runs none of its body: its lazy imports would only fail later, on
    # first iteration, where no `except ImportError` can catch them.
    # Probe the optional dependencies here so that a missing install
    # really does produce `None`.
    try:
        import torch  # noqa: F401
        from transformers import (  # noqa: F401
            AutoImageProcessor,
            AutoModelForObjectDetection,
        )
    except ImportError:
        return None

    detected = detect_objects(pdf, "docling-project/docling-layout-heron")

    def itor() -> Iterator[TableObject]:
        for page, (page_idx, objects) in zip(_get_pages(pdf), detected):
            # Detections are expected to arrive in the same order as pages
            assert page.page_idx == page_idx
            for label, bbox in objects:
                # This model detects other labels too; keep only tables
                if label == "Table":
                    yield TableObject.from_bbox(page, bbox)

    return itor()


@detector(priority=20)
def table_transformer(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Union[Iterator[TableObject], None]:
    """Identify tables in a PDF or one of its pages using Microsoft Table
    Transformer model.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.

    Returns:
        An iterator over `TableObject`, or `None`, if the model can't be used

    """
    # `detect_objects` is a generator function, so merely calling it
    # runs none of its body: its lazy imports would only fail later, on
    # first iteration, where no `except ImportError` can catch them.
    # Probe the optional dependencies here so that a missing install
    # really does produce `None`.
    try:
        import torch  # noqa: F401
        from transformers import (  # noqa: F401
            AutoImageProcessor,
            AutoModelForObjectDetection,
        )
    except ImportError:
        return None

    detected = detect_objects(
        pdf,
        "microsoft/table-transformer-detection",
        model_kwargs={"revision": "no_timm"},
        threshold=0.9,
    )

    def itor() -> Iterator[TableObject]:
        for page, (page_idx, objects) in zip(_get_pages(pdf), detected):
            # Detections are expected to arrive in the same order as pages
            assert page.page_idx == page_idx
            # Every detected object is kept, whatever its label
            for _label, bbox in objects:
                yield TableObject.from_bbox(page, bbox)

    return itor()
65 changes: 2 additions & 63 deletions src/paves/tables/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,83 +2,22 @@
Table detection using PDF logical structure.
"""

from copy import copy
from functools import singledispatch
from itertools import groupby
from typing import Iterable, Iterator, List, Tuple, Union
from typing import Iterable, Iterator, Union
from operator import attrgetter
from os import PathLike

import playa
from playa import Document, Page, PageList
from playa.content import GraphicState, MarkedContent
from playa.page import Annotation
from playa.pdftypes import Matrix, Rect
from playa.structure import (
Element,
ContentItem,
ContentObject as StructContentObject,
)
from playa.worker import _ref_page

from paves.tables.detectors import detector
from paves.tables.table import TableObject


def _from_element(
    el: Element,
    page: Page,
    contents: Union[Iterable[Union[ContentItem, StructContentObject]], None] = None,
) -> Union["TableObject", None]:
    """Build a `TableObject` for a structure element on a given page.

    The first usable content on `page` supplies the graphics state, CTM
    and marked-content stack (they might not be *correct* of course, but
    oh well).  A bounding box is only taken from structure content
    objects, not from content items.

    Args:
        el: Structure element to wrap.
        page: Page the table object should belong to.
        contents: Contents to search; defaults to `el.contents`.

    Returns:
        The new `TableObject`, or `None` if no usable content was found.
    """
    if contents is None:
        contents = el.contents
    bbox: Union[Rect, None] = None
    for item in contents:
        # For multi-page tables, skip any contents on a different page
        if item.page != page:
            continue
        if isinstance(item, StructContentObject):
            source = item.obj
            # FIXME: for the moment just ignore annotations
            if source is None or isinstance(source, Annotation):
                continue
            gstate: Union[GraphicState, None] = copy(source.gstate)
            ctm: Union[Matrix, None] = source.ctm
            mcstack: Union[Tuple[MarkedContent, ...], None] = source.mcstack
            bbox = source.bbox
        elif isinstance(item, ContentItem):
            # Use the first content object of the item, if it has one
            try:
                source = next(iter(item))
            except StopIteration:
                continue
            gstate = copy(source.gstate)
            ctm = source.ctm
            mcstack = source.mcstack
        else:
            continue
        return TableObject(
            _pageref=_ref_page(page),
            _parentkey=None,
            gstate=gstate,
            ctm=ctm,
            mcstack=mcstack,
            _bbox=bbox,
            _parent=el,
        )
    # No contents, no table for you!
    return None


@singledispatch
def table_elements(
pdf: Union[str, PathLike, Document, Page, PageList],
Expand Down Expand Up @@ -135,7 +74,7 @@ def table_elements_to_objects(
continue
if page is not None and kidpage is not page:
continue
table = _from_element(el, kidpage, kids)
table = TableObject.from_element(el, kidpage, kids)
if table is not None:
yield table

Expand Down
Loading
Loading