Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
__pycache__
/src/paves/__about__.py
/dist
/site
3 changes: 3 additions & 0 deletions src/paves/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
class NotInstalledError(RuntimeError):
    """Raised when a PDF-to-image backend cannot be used because its
    dependencies are not installed."""
343 changes: 4 additions & 339 deletions src/paves/image.py → src/paves/image/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,365 +3,30 @@
models and/or visualisation.
"""

import contextlib
import functools
import itertools
import subprocess
import tempfile
from io import BytesIO
from os import PathLike
from pathlib import Path
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
Iterator,
List,
Protocol,
Tuple,
Union,
cast,
)

import playa
from PIL import Image, ImageDraw, ImageFont
from playa.document import Document, PageList
from playa.page import ContentObject, Page, Annotation
from playa.structure import Element
from playa.utils import Rect, transform_bbox

if TYPE_CHECKING:
import pypdfium2 # types: ignore
from paves.image.converters import convert
from paves.image.poppler import popple
from paves.image.pdfium import pdfium


class NotInstalledError(RuntimeError):
    """Raised when a PDF-to-image backend cannot be used because its
    dependencies are not installed."""


def make_poppler_args(dpi: int, width: int, height: int) -> List[str]:
    """Build the size/resolution options for a `pdftoppm` command line.

    Args:
        dpi: Resolution to render at; only used when neither `width` nor
            `height` is given.  A falsy value means 72.
        width: Target width in pixels, or 0 to leave it unspecified.
        height: Target height in pixels, or 0 to leave it unspecified.
    Returns:
        The list of option strings to splice into the command.
    """
    if not (width or height):
        # No explicit pixel size requested, so fall back to a resolution
        return ["-r", str(dpi or 72)]
    # -1 means use aspect ratio for the dimension that was not given
    scale_x = str(width or -1)
    scale_y = str(height or -1)
    return ["-scale-to-x", scale_x, "-scale-to-y", scale_y]


@functools.singledispatch
def _popple(pdf, tempdir: Path, args: List[str]) -> List[Tuple[int, float, float]]:
raise NotImplementedError


@_popple.register(str)
@_popple.register(PathLike)
def _popple_path(
    pdf: Union[str, PathLike], tempdir: Path, args: List[str]
) -> List[Tuple[int, float, float]]:
    """Render a PDF file on disk with `pdftoppm`.

    Args:
        pdf: Path to the PDF file.
        tempdir: Directory receiving the output, written with the
            prefix `ppm`.
        args: Extra `pdftoppm` options.
    Returns:
        `(page index, page width, page height)` for every page of the
        document, as reported by PLAYA-PDF.
    Raises:
        subprocess.CalledProcessError: If `pdftoppm` exits with an error.
    """
    output_prefix = tempdir / "ppm"
    command = ["pdftoppm", *args, str(pdf), output_prefix]
    subprocess.run(command, check=True)
    # Re-open the file with PLAYA-PDF only to collect the page geometry
    with playa.open(pdf) as doc:
        page_sizes = [(page.page_idx, page.width, page.height) for page in doc.pages]
    return page_sizes


@_popple.register(Document)
def _popple_doc(
    pdf: Document, tempdir: Path, args: List[str]
) -> List[Tuple[int, float, float]]:
    """Render every page of an open PLAYA-PDF document with `pdftoppm`.

    The document's buffer is copied to a temporary file inside
    `tempdir` because `pdftoppm` is given a file path here.

    Args:
        pdf: Open document.
        tempdir: Directory receiving the output, written with the
            prefix `ppm`.
        args: Extra `pdftoppm` options.
    Returns:
        `(page index, page width, page height)` for every page.
    Raises:
        subprocess.CalledProcessError: If `pdftoppm` exits with an error.
    """
    scratch_pdf = tempdir / "pdf.pdf"
    # FIXME: This is... not great (can we popple in a pipeline please?)
    with open(scratch_pdf, "wb") as scratch:
        scratch.write(pdf.buffer)
    command = ["pdftoppm", *args, str(scratch_pdf), tempdir / "ppm"]
    subprocess.run(command, check=True)
    # The copy is no longer needed once rendering has finished
    scratch_pdf.unlink()
    page_sizes = [(page.page_idx, page.width, page.height) for page in pdf.pages]
    return page_sizes


@_popple.register(Page)
def _popple_page(
    pdf: Page, tempdir: Path, args: List[str]
) -> List[Tuple[int, float, float]]:
    """Render a single PLAYA-PDF page with `pdftoppm`.

    The whole parent document is copied to a temporary file and
    `pdftoppm` is restricted to the one page with `-f`/`-l`.

    Args:
        pdf: The page to render (despite the parameter name).
        tempdir: Directory receiving the output, written with the
            prefix `ppm`.
        args: Extra `pdftoppm` options.
    Returns:
        A one-element list holding `(page index, page width, page height)`.
    Raises:
        subprocess.CalledProcessError: If `pdftoppm` exits with an error.
    """
    parent = pdf.doc
    assert parent is not None  # bug in PLAYA-PDF, oops, it cannot be None
    scratch_pdf = tempdir / "pdf.pdf"
    with open(scratch_pdf, "wb") as scratch:
        scratch.write(parent.buffer)
    # page_idx is zero-based; the -f/-l values passed below are one higher
    number = str(pdf.page_idx + 1)
    command = [
        "pdftoppm",
        *args,
        "-f",
        number,
        "-l",
        number,
        str(scratch_pdf),
        tempdir / "ppm",
    ]
    subprocess.run(command, check=True)
    scratch_pdf.unlink()
    return [(pdf.page_idx, pdf.width, pdf.height)]


@_popple.register(PageList)
def _popple_pages(
    pdf: PageList, tempdir: Path, args: List[str]
) -> List[Tuple[int, float, float]]:
    """Render a list of PLAYA-PDF pages with `pdftoppm`.

    The parent document is copied to a temporary file, the requested
    pages are grouped into runs of consecutive page numbers, and
    `pdftoppm` is invoked once per run with `-f`/`-l`.

    Args:
        pdf: The pages to render (despite the parameter name).
        tempdir: Directory receiving the output, written with the
            prefix `ppm`.
        args: Extra `pdftoppm` options.
    Returns:
        `(page index, page width, page height)` tuples, one per distinct
        page, in ascending page-index order.  An empty page list yields
        an empty result without running `pdftoppm`.
    Raises:
        subprocess.CalledProcessError: If `pdftoppm` exits with an error.
    """
    pages = list(pdf)
    if not pages:
        # Nothing to render, and there is no first page to take the
        # parent document from
        return []
    doc = pages[0].doc
    assert doc is not None  # bug in PLAYA-PDF, oops, it cannot be None
    # `popple` pairs the returned tuples with the output files in sorted
    # filename order (presumably ascending page number), so report each
    # page once, ordered by index, rather than in page-list order.
    sizes = {page.page_idx: (page.width, page.height) for page in pages}
    indices = sorted(sizes)
    # Collapse the page numbers (index + 1) into inclusive (first, last)
    # runs of consecutive pages
    spans: List[Tuple[int, int]] = []
    for number in (idx + 1 for idx in indices):
        if spans and number == spans[-1][1] + 1:
            spans[-1] = (spans[-1][0], number)
        else:
            spans.append((number, number))
    pdfpdf = tempdir / "pdf.pdf"
    with open(pdfpdf, "wb") as outfh:
        outfh.write(doc.buffer)
    try:
        for first, last in spans:
            subprocess.run(
                [
                    "pdftoppm",
                    *args,
                    "-f",
                    str(first),
                    "-l",
                    str(last),
                    str(pdfpdf),
                    tempdir / "ppm",
                ],
                check=True,
            )
    finally:
        # Remove the copy even if pdftoppm failed part-way through
        pdfpdf.unlink()
    return [(idx, sizes[idx][0], sizes[idx][1]) for idx in indices]


def popple(
    pdf: Union[str, PathLike, Document, Page, PageList],
    *,
    dpi: int = 0,
    width: int = 0,
    height: int = 0,
) -> Iterator[Image.Image]:
    """Convert a PDF to images using Poppler's pdftoppm.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        dpi: Render to this resolution (default is 72 dpi).
        width: Render to this width in pixels.
        height: Render to this height in pixels.
    Yields:
        Pillow `Image.Image` objects, one per page, with their pixel data
        already loaded.  The page index, width and height are stored in
        the `info` property of the images as `page_index`, `page_width`
        and `page_height`.
    Raises:
        ValueError: Invalid arguments (e.g. both `dpi` and `width`/`height`)
        NotInstalledError: If Poppler is not installed.
    """
    if dpi and (width or height):
        raise ValueError("Cannot specify both `dpi` and `width` or `height`")
    try:
        # Only probing for the executable; the output is discarded
        subprocess.run(["pdftoppm", "-h"], capture_output=True)
    except FileNotFoundError as e:
        raise NotInstalledError("Poppler does not seem to be installed") from e
    args = make_poppler_args(dpi, width, height)
    with tempfile.TemporaryDirectory() as tempdir:
        temppath = Path(tempdir)
        # FIXME: Possible to Popple in a Parallel Pipeline
        page_sizes = _popple(pdf, temppath, args)
        ppms = sorted(path for path in temppath.iterdir() if path.suffix == ".ppm")
        for (page_idx, page_width, page_height), ppm in zip(page_sizes, ppms):
            img = Image.open(ppm)
            # Image.open is lazy: read the pixels now so the image stays
            # usable (and its file handle is released) after the temporary
            # directory is deleted when this generator finishes.
            img.load()
            img.info["page_index"] = page_idx
            img.info["page_width"] = page_width
            img.info["page_height"] = page_height
            yield img


@functools.singledispatch
def _get_pdfium_pages(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Iterator[Tuple[int, "pypdfium2.PdfPage"]]:
    """Iterate over the PyPDFium2 pages of `pdf`, dispatching on its type.

    This default implementation hands `pdf` directly to
    `pypdfium2.PdfDocument`; it is the one used for paths.

    Args:
        pdf: Path to a PDF (other types have registered implementations).
    Yields:
        `(page index, PyPDFium2 page)` pairs.  Each page is closed when
        the consumer asks for the next one, and the document is closed
        when iteration ends, including when the generator is closed
        early or the consumer raises.
    """
    import pypdfium2

    doc = pypdfium2.PdfDocument(pdf)
    try:
        for idx, page in enumerate(doc):
            try:
                yield idx, page
            finally:
                page.close()
    finally:
        doc.close()


@contextlib.contextmanager
def _get_pdfium_doc(pdf: Document) -> Iterator["pypdfium2.PdfDocument"]:
    """Open a PyPDFium2 document over the data of a PLAYA-PDF document.

    Args:
        pdf: Open PLAYA-PDF document.
    Yields:
        The PyPDFium2 document, which is closed on leaving the `with`
        block even if the body raises.
    """
    import pypdfium2

    if pdf._fp is None:
        # No file object to reuse, so read from the document's buffer.
        # Yes, you can actually wrap a BytesIO around an mmap!
        with BytesIO(pdf.buffer) as fp:
            doc = pypdfium2.PdfDocument(fp)
            try:
                yield doc
            finally:
                doc.close()
    else:
        doc = pypdfium2.PdfDocument(pdf._fp)
        try:
            yield doc
        finally:
            doc.close()


@_get_pdfium_pages.register(Document)
def _get_pdfium_pages_doc(pdf: Document) -> Iterator[Tuple[int, "pypdfium2.PdfPage"]]:
    """Iterate over all PyPDFium2 pages of a PLAYA-PDF document.

    Args:
        pdf: Open PLAYA-PDF document.
    Yields:
        `(page index, PyPDFium2 page)` pairs.  Each page is closed once
        the consumer is done with it, including when iteration stops
        early or the consumer raises.
    """
    with _get_pdfium_doc(pdf) as doc:
        for idx, page in enumerate(doc):
            try:
                yield idx, page
            finally:
                page.close()


@_get_pdfium_pages.register(Page)
def _get_pdfium_pages_page(page: Page) -> Iterator[Tuple[int, "pypdfium2.PdfPage"]]:
    """Yield the PyPDFium2 counterpart of a single PLAYA-PDF page.

    Args:
        page: PLAYA-PDF page; its parent document supplies the data.
    Yields:
        One `(page index, PyPDFium2 page)` pair.  The page is closed once
        the consumer is done with it, including when the generator is
        closed early or the consumer raises.
    """
    pdf = page.doc
    assert pdf is not None
    with _get_pdfium_doc(pdf) as doc:
        pdfium_page = doc[page.page_idx]
        try:
            yield page.page_idx, pdfium_page
        finally:
            pdfium_page.close()


@_get_pdfium_pages.register(PageList)
def _get_pdfium_pages_pagelist(
    pages: PageList,
) -> Iterator[Tuple[int, "pypdfium2.PdfPage"]]:
    """Iterate over the PyPDFium2 counterparts of a list of PLAYA-PDF pages.

    Args:
        pages: PLAYA-PDF pages; their parent document supplies the data.
    Yields:
        `(page index, PyPDFium2 page)` pairs in the order of `pages`.
        Each page is closed once the consumer is done with it, including
        when iteration stops early or the consumer raises.
    """
    pdf = pages.doc
    assert pdf is not None
    with _get_pdfium_doc(pdf) as doc:
        for page in pages:
            pdfium_page = doc[page.page_idx]
            try:
                yield page.page_idx, pdfium_page
            finally:
                pdfium_page.close()


def pdfium(
    pdf: Union[str, PathLike, Document, Page, PageList],
    *,
    dpi: int = 0,
    width: int = 0,
    height: int = 0,
) -> Iterator[Image.Image]:
    """Render a PDF to images with PyPDFium2.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        dpi: Render to this resolution (default is 72 dpi).
        width: Render to this width in pixels.
        height: Render to this height in pixels.
    Yields:
        Pillow `Image.Image` objects, one per page.  The page index,
        width and height are stored in the `info` property of the images
        as `page_index`, `page_width` and `page_height`.
    Raises:
        ValueError: Invalid arguments (e.g. both `dpi` and `width`/`height`)
        NotInstalledError: If PyPDFium2 is not installed.
    """
    if dpi and (width or height):
        raise ValueError("Cannot specify both `dpi` and `width` or `height`")
    try:
        import pypdfium2  # noqa: F401
    except ImportError as e:
        raise NotInstalledError("PyPDFium2 does not seem to be installed") from e
    exact_size = bool(width and height)
    for idx, page in _get_pdfium_pages(pdf):
        page_width = page.get_width()
        page_height = page.get_height()
        if exact_size:
            # Scale to longest side (since pypdfium2 doesn't
            # appear to allow non-1:1 aspect ratio)
            scale = max(width / page_width, height / page_height)
        elif width:
            scale = width / page_width
        elif height:
            scale = height / page_height
        else:
            scale = (dpi or 72) / 72
        img = page.render(scale=scale).to_pil()
        if exact_size:
            # Resize down to desired size
            img = img.resize(size=(width, height))
        img.info["page_index"] = idx
        img.info["page_width"] = page_width
        img.info["page_height"] = page_height
        yield img


# Rendering backends, in the order in which `convert` tries them.
METHODS = [popple, pdfium]


def convert(
    pdf: Union[str, PathLike, Document, Page, PageList],
    *,
    dpi: int = 0,
    width: int = 0,
    height: int = 0,
) -> Iterator[Image.Image]:
    """Convert a PDF to images.

    Each backend in `METHODS` is tried in order; the first one that does
    not raise `NotInstalledError` produces the images.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        dpi: Render to this resolution (default is 72 dpi).
        width: Render to this width in pixels (0 to keep aspect ratio).
        height: Render to this height in pixels (0 to keep aspect ratio).
    Yields:
        Pillow `Image.Image` objects, one per page.  The original page
        width and height in default user space units are available in
        the `info` property of these images as `page_width` and
        `page_height`
    Raises:
        ValueError: Invalid arguments (e.g. both `dpi` and `width`/`height`)
        NotInstalledError: If no renderer is available

    """
    for method in METHODS:
        try:
            # Delegate with `yield from` so that closing (or throwing into)
            # this generator is forwarded to the backend generator, letting
            # it release its resources promptly.
            yield from method(pdf, dpi=dpi, width=width, height=height)
        except NotInstalledError:
            continue
        # This backend worked, so the remaining ones are not tried
        return
    raise NotInstalledError(
        "No renderers available, tried: %s"
        % (", ".join(m.__name__ for m in METHODS))
    )
__all__ = ["convert", "popple", "pdfium", "show", "box", "mark"]


def show(page: Page, dpi: int = 72) -> Image.Image:
Expand Down
Loading
Loading