Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions .github/workflows/python-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,11 @@ jobs:
--health-timeout 5s
--health-retries 5
ports:
# Maps tcp port 5432 on service container to the host
- 5432:5432
unoserver:
image: ghcr.io/dochub-ulb/unoserver:latest
ports:
- 2003:2003


steps:
Expand All @@ -46,7 +49,10 @@ jobs:
run: sudo apt update

- name: Install apt packages
run: sudo apt install -y graphicsmagick mupdf-tools ghostscript unoconv python3-dev
run: sudo apt install -y graphicsmagick mupdf-tools ghostscript pipx python3-dev

- name: Install unoserver
run: pipx install unoserver --system-site-packages

- name: Install the project
run: uv sync --dev
Expand All @@ -64,10 +70,10 @@ jobs:
run: uv run ./manage.py collectstatic --noinput -v 0

- name: pytest with SQLite
run: uv run pytest -k "not unoconv"
run: uv run pytest

- name: pytest with PostgreSQL
run: uv run pytest -k "not unoconv" -m postgresql
run: uv run pytest -m postgresql
env:
DB_URL: postgres://postgres:postgres@localhost:5432/postgres

Expand Down
10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,17 @@ First, install uv and system dependencies:
curl -LsSf https://astral.sh/uv/install.sh | sh

# Ubuntu
sudo apt-get install unoconv python3-dev ruby libtiff5-dev libjpeg-dev zlib1g-dev libfreetype6-dev liblcms2-dev libwebp-dev tcl8.6-dev tk8.6-dev python-tk mupdf-tools redis-server
sudo apt-get install libreoffice pipx python3-dev ruby libtiff5-dev libjpeg-dev zlib1g-dev libfreetype6-dev liblcms2-dev libwebp-dev tcl8.6-dev tk8.6-dev python-tk mupdf-tools redis-server
# unoserver needs access to LibreOffice's 'uno' library from system packages
pipx install unoserver --system-site-packages
sudo systemctl enable --now redis-server
# Fedora
sudo dnf install unoconv python-devel ruby mupdf redis
sudo dnf install libreoffice pipx python-devel ruby mupdf redis
pipx install unoserver --system-site-packages
sudo systemctl enable --now redis
# Arch Linux
sudo pacman -S unoconv ruby python mupdf-tools redis
sudo pacman -S libreoffice python-pipx ruby python mupdf-tools redis
pipx install unoserver --system-site-packages
sudo systemctl enable --now redis
```

Expand Down
67 changes: 41 additions & 26 deletions documents/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
import re
import subprocess
import tempfile
import time
import uuid
from io import BytesIO

from django.conf import settings
from django.core.files.base import ContentFile, File

from celery import chain, shared_task
from celery.exceptions import SoftTimeLimitExceeded
from pypdf import PdfReader

from documents.models import Document, DocumentError
Expand Down Expand Up @@ -130,34 +130,49 @@ def checksum(self, document_id: int) -> int:

@short_doctask
def convert_office_to_pdf(self, document_id: int) -> int:
try:
document = Document.objects.get(pk=document_id)
document = Document.objects.get(pk=document_id)

with file_as_local(
document.original, prefix="dochub_unoconv_input_"
) as tmpfile:
if settings.DEBUG:
# Check if unoserver is running
ping_result = subprocess.run(
["unoping"], capture_output=True, timeout=5, check=False
)

if ping_result.returncode != 0:
# Server not running, start it as a daemon
# Here we want to use the system unoserver, as it needs access to LibreOffice
try:
sub = subprocess.check_output(
["unoconv", "-f", "pdf", "--stdout", tmpfile.name]
subprocess.Popen(
[
"unoserver",
"--daemon",
"--conversion-timeout",
"300",
],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
except OSError as e:
raise MissingBinary("unoconv") from e
except subprocess.CalledProcessError as e:
raise DocumentProcessingError(
document, exc=e, message='"unoconv" has failed: %s' % e.output[:800]
) from e

document.pdf.save(str(uuid.uuid4()) + ".pdf", ContentFile(sub))

return document_id

except SoftTimeLimitExceeded as e:
# If we timed out, kill the faulty openoffice daemon
# it will respawn at the next unoconv invocation
os.system("killall soffice.bin")
# Still raise the exception so the pipeline for this
# document is still stopped
raise e
except FileNotFoundError as e:
raise MissingBinary("unoserver") from e
# Give the server time to start up and be ready
time.sleep(2)

try:
result = subprocess.run(
["unoconvert", "-", "-", "--convert-to", "pdf"],
input=document.original.read(),
capture_output=True,
check=True,
)
sub = result.stdout
except subprocess.CalledProcessError as e:
raise DocumentProcessingError(
document, exc=e, message="unoconvert has failed: %s" % e.stderr[:2000]
) from e

document.pdf.save(str(uuid.uuid4()) + ".pdf", ContentFile(sub))

return document_id


@short_doctask
Expand Down
31 changes: 15 additions & 16 deletions documents/templates/documents/viewer.html
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ <h1 class="fs-4 mb-0 d-flex align-items-center gap-1">

<div class="d-sm-flex gap-2 align-items-center">
<div class="mt-2 d-flex gap-2 align-items-center">
{% if document.state == "DONE" %}
{% if document.state == "DONE" and not document.is_unconvertible %}
<a class="btn btn-primary btn-sm d-inline-flex align-items-center gap-1" data-turbo="false"
href="{% url 'document_pdf' document.pk %}">
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor"
Expand Down Expand Up @@ -140,7 +140,14 @@ <h1 class="fs-4 mb-0 d-flex align-items-center gap-1">
{% endblock header %}

{% block content %}
{% if document.state == "DONE" %}
{% if document.is_unconvertible %}
<div class="alert alert-primary" role="alert">
DocHub ne sait pas générer d'aperçu pour ce document.
Mais tu peux le
<a href="{% url 'document_original' document.pk %}">télécharger</a>
et l'ouvrir directement chez toi.
</div>
{% elif document.state == "DONE" %}
<div class="container-xl" data-controller="viewer"
data-viewer-src-value="{% url 'document_pdf' document.pk %}?embed">
<div data-viewer-target="sidebar">
Expand All @@ -165,20 +172,12 @@ <h1 class="fs-4 mb-0 d-flex align-items-center gap-1">

</div>
{% elif document.state == "ERROR" %}
{% if document.is_unconvertible %}
<div class="alert alert-primary" role="alert">
DocHub ne sait pas générer d'aperçu pour ce document.
Mais tu peux le
<a href="{% url 'document_original' document.pk %}">télécharger</a>
et l'ouvrir directement chez toi.
</div>
{% else %}
<div class="alert alert-primary" role="alert">
Ce document n'a pas pu être traité par DocHub à cause d'une erreur.
Mais tu peux <a href="{% url 'document_original' document.pk %}">télécharger
la version originale </a> si tu le désires.
</div>
{% endif %}

<div class="alert alert-primary" role="alert">
Ce document n'a pas pu être traité par DocHub à cause d'une erreur.
Mais tu peux <a href="{% url 'document_original' document.pk %}">télécharger
la version originale </a> si tu le désires.
</div>
{% else %}
<div class="alert alert-primary" role="alert">
Ce document est en cours de traitement par DocHub et il n'est donc pas encore
Expand Down
78 changes: 50 additions & 28 deletions documents/tests/celery_test.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import signal
from subprocess import call
import logging
import subprocess

from django.core.files import File

Expand All @@ -11,28 +11,9 @@
from documents.tasks import mutool_get_pages, process_document
from users.models import User

pytestmark = [pytest.mark.django_db, pytest.mark.celery]


class Alarm(Exception):
pass


def alarm_handler(signum, frame):
raise Alarm

logger = logging.getLogger(__name__)

def start_unoconv():
signal.signal(signal.SIGALRM, alarm_handler)
signal.alarm(1)
try:
call(["unoconv", "--listener"]) # workaround for a shitty unoconv
# Error: Unable to connect or start own listener. Aborting.
# Setting a timeout because if a listener already exists it hangs...
except Alarm:
pass

signal.alarm(0) # cancel alarm
pytestmark = [pytest.mark.django_db, pytest.mark.celery]


def create_doc(name, ext):
Expand Down Expand Up @@ -79,18 +60,59 @@ def test_send_duplicate():
assert Document.objects.filter(id=doc.id).count() == 0


# TODO : mock unoconv and provide a fake pdf instead
@pytest.mark.unoconv
@pytest.fixture
def unoserver():
    """Make an unoserver instance available while a test runs.

    If ``unoping`` already succeeds, the running server is reused and left
    untouched. Otherwise ``unoserver`` is spawned, the fixture polls
    ``unoping`` until it answers (or a deadline passes), and after the test
    the spawned process is killed and whatever it wrote to stdout/stderr is
    logged.
    """
    # Local import: the module's import header does not bring in ``time``.
    import time

    def _ping() -> bool:
        """Return True when ``unoping`` exits with status 0 within 5 seconds."""
        try:
            ping_result = subprocess.run(
                ["unoping"], capture_output=True, timeout=5, check=False
            )
        except subprocess.TimeoutExpired:
            return False
        return ping_result.returncode == 0

    # Check if unoserver is running
    ping_result = subprocess.run(
        ["unoping"], capture_output=True, timeout=5, check=False
    )

    if ping_result.returncode == 0:
        logger.debug("Unoserver is already running")
        yield
        return

    logger.debug("Unoserver is not running, starting it ourselves")
    sub = subprocess.Popen(
        [
            "unoserver",
            "--daemon",
            "--conversion-timeout",
            "300",
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    logger.debug("Unoserver started")

    try:
        # The server needs some time before it accepts connections; poll
        # instead of handing a not-yet-listening server to the test.
        deadline = time.monotonic() + 30
        while not _ping():
            if time.monotonic() >= deadline:
                # Keep going: the test itself will report the real failure.
                logger.warning("Unoserver did not answer unoping in time")
                break
            time.sleep(0.5)

        yield
    finally:
        logger.debug("Killing unoserver")
        sub.kill()

        # Get stdout and stderr
        try:
            stdout, stderr = sub.communicate(timeout=5)
        except subprocess.TimeoutExpired:
            # NOTE(review): presumably happens when ``--daemon`` re-spawns the
            # server and that descendant keeps our pipes open -- TODO confirm.
            # Do not let it turn a passing test into a teardown error.
            logger.warning("Timed out while collecting unoserver output")
            for pipe in (sub.stdout, sub.stderr):
                if pipe is not None:
                    pipe.close()
            stdout, stderr = b"", b""

        # Log them
        if stdout:
            logger.info(
                "unoserver stdout:\n%s", stdout.decode("utf-8", errors="replace")
            )
        if stderr:
            logger.error(
                "unoserver stderr:\n%s", stderr.decode("utf-8", errors="replace")
            )


# TODO : mock unoserver and provide a fake pdf instead
@pytest.mark.unoserver
@pytest.mark.slow
def test_send_office():
def test_send_office(unoserver):
doc = create_doc("My office doc", ".docx")

with open("documents/tests/files/2pages.docx", "rb") as fd:
f = File(fd)
doc.original.save("silly-unique-deadbeef-file.docx", f)

start_unoconv()

result = process_document.delay(doc.id)
assert result.status == celery.states.SUCCESS, result.traceback

Expand Down
7 changes: 5 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,16 @@ dev = [
]

[tool.pytest.ini_options]
norecursedirs = "ve ve3 static media .git node_modules"
norecursedirs = ".venv ve ve3 static media .git node_modules"
DJANGO_SETTINGS_MODULE="www.test_settings"
addopts = "--reuse-db"
filterwarnings = [
"ignore:builtin type.*has no __module__ attribute:DeprecationWarning"
]
markers = """
slow: marks tests as slow (deselect with '-m "not slow"')
network: marks tests using the network (deselect with '-m "not network"')
unoconv: uses unoconv (nondeterministic)
unoserver: uses unoserver for office document conversion
webtest: http queries against localhost
celery: uses celery tasks
postgresql: needs a postgresql database to run
Expand Down
37 changes: 37 additions & 0 deletions unoserver/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Mostly copied from
# https://github.com/unoconv/unoserver-docker/blob/main/Dockerfile
# Under MIT License Copyright (c) 2022 unoconv

FROM eclipse-temurin:24.0.1_9-jdk-alpine-3.21

LABEL org.opencontainers.image.source=https://github.com/DocHub-ULB/DocHub/unoserver

# Install LibreOffice, Python, and essential dependencies
RUN apk add --no-cache \
py3-pip \
libreoffice \
# Essential fonts for document rendering
font-noto \
font-noto-cjk \
ttf-dejavu \
ttf-liberation \
fontconfig && \
fc-cache -f && \
rm -rf /var/cache/apk/* /tmp/*

# Install unoserver
RUN pip install --break-system-packages unoserver==3.6

# Create non-root user
RUN addgroup -S worker && adduser -S worker -G worker
USER worker
WORKDIR /home/worker

# Expose unoserver port
EXPOSE 2003

HEALTHCHECK --interval=5s CMD unoping --host 127.0.0.1 --port 2003

# Run unoserver
CMD ["unoserver", "--interface", "0.0.0.0", "--port", "2003"]

5 changes: 5 additions & 0 deletions unoserver/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Unoserver Docker Image

This Docker image is used as a service in our GitHub Actions workflow for running tests (see `.github/workflows/python-tests.yml`).

To build and push the image: `docker buildx build --platform linux/amd64,linux/arm64 -t ghcr.io/dochub-ulb/unoserver:latest . --push`
Loading