Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [ '3.9', '3.10', '3.11', '3.12', '3.13' ]
python-version: [ '3.10', '3.11', '3.12', '3.13' ]

steps:
- uses: actions/checkout@v3
Expand All @@ -22,7 +22,8 @@ jobs:
- name: Install build environment
run: |
python -m pip install --upgrade pip
python -m pip install tox setuptools pytest pytest-cov codecov
- name: Build and test with tox.
python -m pip install uv
- name: Build and test with uv.
run: |
tox -vv -e flake8
uv run ruff check
uv build
74 changes: 22 additions & 52 deletions benchmarking/run_benchmarking.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
#!/usr/bin/env python3
"""
Runs a benchmarking suite to compare speed
and output of different implementations.
"""
"""Run a benchmarking suite to compare speed and output of different implementations."""

import argparse
import operator
Expand Down Expand Up @@ -53,23 +50,16 @@


class AbstractHtmlConverter:
"""
An abstract HTML convert class.
"""
"""An abstract HTML convert class."""

def get_text(self, html):
"""
Returns:
a text representation of the given HTML snippet.
"""
"""Return a text representation of the given HTML snippet."""
raise NotImplementedError

def benchmark(self, html):
"""
Benchmarks the classes HTML to text converter.
"""Benchmark the class's HTML to text converter.

Returns:
A tuple of the required time and the obtained text representation.
Return a tuple of the required time and the obtained text representation.
"""
start_time = time()
for _ in range(TRIES):
Expand All @@ -78,9 +68,7 @@ def benchmark(self, html):


class BeautifulSoupHtmlConverter(AbstractHtmlConverter):
"""
Converts HTML to text using BeautifulSoup.
"""
"""Converts HTML to text using BeautifulSoup."""

name = "BeautifulSoup"

Expand All @@ -100,9 +88,7 @@ def get_text(self, html):


class JustextConverter(AbstractHtmlConverter):
"""
Converts HTML to text using Justtext.
"""
"""Converts HTML to text using Justtext."""

name = "Justtext"

Expand All @@ -116,9 +102,7 @@ def get_text(self, html):


class Html2TextConverter(AbstractHtmlConverter):
"""
Converts HTML to text using Html2Text.
"""
"""Converts HTML to text using Html2Text."""

name = "Html2Text"

Expand All @@ -133,9 +117,7 @@ def get_text(self, html):


class LynxConverter(AbstractHtmlConverter):
"""
Converts HTML to text using lynx.
"""
"""Converts HTML to text using lynx."""

name = "Lynx"

Expand Down Expand Up @@ -166,9 +148,7 @@ def kill_lynx(pid):


class LinksConverter(AbstractHtmlConverter):
"""
Converts HTML to text using links.
"""
"""Converts HTML to text using links."""

name = "Links"

Expand Down Expand Up @@ -199,9 +179,7 @@ def kill_links(pid):


class InscriptisHtmlConverter(AbstractHtmlConverter):
"""
Converts HTML to text using Inscriptis.
"""
"""Converts HTML to text using Inscriptis."""

name = "Inscriptis"

Expand All @@ -217,18 +195,14 @@ def __init__(self):


def save_to_file(algorithm, url, data, benchmarking_results_dir):
"""
Saves a benchmarking result to the given file.
"""
"""Save the benchmarking result to the given file."""
result_file = os.path.join(benchmarking_results_dir, f"{algorithm}_{url}.txt")
with open(result_file, "w") as output_file:
output_file.write(data)


def get_speed_table(times):
"""
Provides the table which compares the conversion speed.
"""
"""Provide the table which compares the conversion speed."""
fastest = min((value for _, value in times.items()))
longest_key = max(len(key) for key, _ in times.items())
longest_value = max(len(str(value)) for _, value in times.items())
Expand All @@ -251,9 +225,7 @@ def get_speed_table(times):


def get_fname(url) -> str:
"""
Transforms a URL to a file name.
"""
"""Transform a URL to a file name."""
trash = (("http://", ""), ("https://", ""), ("/", "-"), (":", "-"), ("%", ""))

for key, value in trash:
Expand All @@ -272,9 +244,7 @@ def get_fname(url) -> str:


def parse_args():
"""
Parse optional benchmarking arguments.
"""
"""Parse optional benchmarking arguments."""
parser = argparse.ArgumentParser(description="Inscriptis benchmarking suite")
parser.add_argument(
"converter",
Expand Down Expand Up @@ -306,11 +276,11 @@ def parse_args():


def _setup_benchmarking_directories(args):
"""
Setup the benchmarking result and caching directories.
"""Set up the benchmarking result and caching directories.

Args:
args: command line arguments that provide the directory names.

"""
if not os.path.exists(args.benchmarking_results):
os.makedirs(args.benchmarking_results)
Expand All @@ -319,16 +289,17 @@ def _setup_benchmarking_directories(args):


def _fetch_url(url, cache_dir):
"""
Fetch the given URL either from the cache or from the Web.
"""Fetch the given URL either from the cache or from the Web.

URLs that are not yet cached are added to the cache.

Args:
url: the URL to fetch.
cache_dir: the cache directory.

Returns:
A tuple of the cache file name and the URL's content.

"""
source_name = get_fname(url)
source_cache_path = os.path.join(cache_dir, source_name)
Expand All @@ -349,14 +320,13 @@ def _fetch_url(url, cache_dir):


def benchmark(args, source_list):
"""
Run the benchmark.
"""Run the benchmark.

Args:
args: command line arguments
source_list: a list of URLs to benchmark.
"""

"""
Comment on lines 322 to +329
Copy link

Copilot AI Nov 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The closing triple quotes for the docstring are in the wrong position. They should come immediately after the documentation ends, not on a separate line after the Args section. Move the closing """ to line 328 (after 'source_list: a list of URLs to benchmark.').

Copilot uses AI. Check for mistakes.
_setup_benchmarking_directories(args)

output = []
Expand Down
4 changes: 2 additions & 2 deletions examples/custom-html-handling.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
#!/usr/bin/env python3

"""
Custom HTML tag handling example.
"""Custom HTML tag handling example.

Add a custom HTML handler for the bold <b> tag which encloses
bold text with "**".

Example:
"Welcome to <b>Chur</b>" is rendered as "Welcome to **Chur**".

"""

from lxml.html import fromstring
Expand Down
5 changes: 3 additions & 2 deletions publish.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@ case "$1" in
# cleanup dist
rm -rf ./dist

# build and publish packages
poetry publish --build
# build with hatchling and publish to PyPI
uv build
uv publish
;;
docker)
echo "Publishing ${IMAGE_NAME} in version ${VERSION}"
Expand Down
90 changes: 58 additions & 32 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ version = "2.7.0"
description = "inscriptis - HTML to text converter."
license = "Apache-2.0"
readme = "README.rst"
requires-python = ">=3.9,<3.14"
requires-python = ">=3.10,<3.15"

authors = [
{ name = "Albert Weichselbraun", email = "albert.weichselbraun@fhgr.ch" },
Expand All @@ -20,12 +20,13 @@ classifiers = [
"Topic :: Text Processing :: Markup :: HTML",
"Topic :: Utilities",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3.14",
]


dependencies = [
"requests>=2.32.3,<3.0.0",
Expand All @@ -49,14 +50,15 @@ web-service = [

[dependency-groups]
dev = [
"pytest>=8.3.5",
"pytest>=9.0.1",
"fastapi>=0.115.11,<1.0.0",
"ruff>=0.11.12",
"ruff>=0.14.5",
"httpx>=0.28.1",
"uvicorn>=0.34.2",
"ty>=0.0.1a7",
"pytest-cov>=6.1.1",
"safety>=3.5.1",
"uvicorn>=0.38.0",
"ty>=0.0.1a26",
"pytest-cov>=7.0.0",
"safety>=3.7.0",
"tox>=4.23.0",
]
[build-system]
requires = ["hatchling"]
Expand All @@ -77,32 +79,56 @@ quote-style = "double"

[tool.ruff.lint]
select = [
# pycodestyle
"E",
# Pyflakes
"F",
# pyupgrade
"UP",
# flake8-builtins
"A",
# flake8-bugbear
"B",
# flake8-comprehensions
"C4",
# flake8-errmsg
"EM",
# flake8-quotes
"Q",
# flake8-pyi
"PYI",
# flake8-simplify
"SIM",
# isort
"I",
"RSE", "RET", "SLOT", "TID", "TC", "C90", "N", "PERF", "E", "W",
"UP", "FURB", "RUF", "TRY", "YTT"
"A", # flake8-builtins
"B", # flake8-bugbear
"COM", # flake8-commas - trailing commas
"BLE", # flake8-blind-except - avoid bare except
"D", # flake8-docstrings
"C4", # flake8-comprehensions
"E", # pycodestyle
"EM", # flake8-errmsg
"F", # Pyflakes
"FA", # flake8-future-annotations - use modern annotations
"ICN", # flake8-import-conventions - standard import aliases
"PIE", # flake8-pie
"PLE", # pylint equivalents
"PLW", # pylint equivalents
"PTH", # flake8-use-pathlib - prefer pathlib over os.path
"PYI", # flake8-pyi
"Q", # flake8-quotes
"N", # flake8-naming
"SIM", # flake8-simplify
"I", # isort
"RET", # flake8-return
"RSE", "SLOT", "TID", "TC", "C90", "PERF", "E", "W",
"FURB", "RUF", "TRY", "YTT",
Comment on lines +103 to +104
Copy link

Copilot AI Nov 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Line 103 ends with a trailing comma but line 104 continues the list without proper formatting. Consider placing all items on line 103 on separate lines for better readability, or consolidate them properly on line 104.

Suggested change
"RSE", "SLOT", "TID", "TC", "C90", "PERF", "E", "W",
"FURB", "RUF", "TRY", "YTT",
"RSE", "SLOT", "TID", "TC", "C90", "PERF", "E", "W"
, "FURB", "RUF", "TRY", "YTT",

Copilot uses AI. Check for mistakes.
"TCH", # flake8-type-checking - optimize type checking imports
"S", # flake8-bandit (security) — replaces dlint/bandit
"UP", # pyupgrade
]

ignore = [
"D102", # missing docstring in public method
"D105", # missing docstring in magic method
"D107", # missing docstring in __init__
"D203", # incorrect-blank-line-before-class
"D213", # multi-line-summary-second-line
]

[tool.ruff.lint.per-file-ignores]
"tests/**/*.py" = [
"S101", # allow asserts
"D", # no docstring checks
"S310", # allow URLs
"PTH", # allow os.path (pathlib not enforced)
]
"benchmarking/*.py" = [
"S310", # allow URLs
"S603", # allow subprocess calls (rule checks for execution of untrusted input)
"PTH", # allow os.path (pathlib not enforced)
]


[tool.ty.src]
root="./src"

Expand Down
3 changes: 3 additions & 0 deletions src/inscriptis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def _get_html_tree(html_content: str) -> HtmlElement | None:

Returns:
The corresponding HTML parse tree.

"""
html_content = html_content.strip()
if not html_content:
Expand All @@ -107,6 +108,7 @@ def get_text(html_content: str, config: ParserConfig | None = None) -> str:

Returns:
The text representation of the HTML content.

"""
html_tree = _get_html_tree(html_content)
return Inscriptis(html_tree, config).get_text() if html_tree is not None else ""
Expand All @@ -128,6 +130,7 @@ def get_annotated_text(html_content: str, config: ParserConfig | None = None) ->

Returns:
A dictionary of text (key: 'text') and annotations (key: 'label')

"""
html_tree = _get_html_tree(html_content)
if html_tree is None:
Expand Down
Loading
Loading