diff --git a/CHANGELOG.md b/CHANGELOG.md index b1e429b..bd7b0a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,20 @@ # Changelog +## 8.0.17 - 2026-05-07 + +### Fixed + +- `kg_build` postprocessor contexts now inject + `Context.extensions["kg_build.iri_lookup"]` from the callback URL and + `existing_web_page_id`, allowing the trailing canonical ID postprocessor to + reuse existing URL-mapped IRIs before falling back to generated IDs. + +### Changed + +- Bump package version to `8.0.17`. +- Sync release-facing documentation and indexes (`README.md`, + `docs/INDEX.md`, `specs/INDEX.md`). + ## 8.0.16 - 2026-04-17 ### Fixed diff --git a/README.md b/README.md index 90c929c..643a8af 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ A Python toolkit for orchestrating WordLift imports: fetch URLs from sitemaps, Google Sheets, or explicit lists, filter out already imported pages, enqueue search console jobs, push RDF graphs, and call the WordLift APIs to import web pages. -Current release: `8.0.16` (2026-04-17). +Current release: `8.0.17` (2026-05-07). ## Features - URL sources: XML sitemaps, Google Sheets (`url` column), or Python lists, with global optional `URL_REGEX` filtering (also enforced in graph-sync source selection before `new_or_changed` GraphQL lookup). @@ -317,6 +317,8 @@ The SDK now includes a profile-driven cloud mapping module under `wordlift_sdk.k 3. otherwise no postprocessors - Callback canonicalization order: profile postprocessors run first, then built-in canonical ID generation runs on the postprocessed graph immediately before patching. - Built-in canonical IDs support optional lookup-based root IRI reuse via `Context.extensions["kg_build.iri_lookup"]` (`IriLookup.iri_for_subject(graph, subject)`), with default fallback to generated IDs when lookup misses. +- `kg_build` callback contexts populate that lookup from the callback URL and + existing URL-mapped IRI when the URL source provides one. - Lookup-based reuse is root-only: dependent nodes (for example `Offer`, `Answer`, `Action`) still follow canonical parent-nested rewrite rules. - Execution is manifest-based only (hard cutover): no legacy `.py` or `*.command.toml` discovery. - During callback patch preparation, the SDK annotates first-level URI-subject nodes in the generated graph with `seovoc:source "web-page-import"` where first-level is dataset ID depth `///` (for example `https://data.host/dataset/types/name`); deeper child IDs and blank nodes are not annotated. diff --git a/docs/INDEX.md b/docs/INDEX.md index ec8a695..d7697c6 100644 --- a/docs/INDEX.md +++ b/docs/INDEX.md @@ -1,6 +1,6 @@ # Docs Index -Release sync: this index is aligned with SDK `8.0.16` (see `CHANGELOG.md`). +Release sync: this index is aligned with SDK `8.0.17` (see `CHANGELOG.md`). - `docs/public_entry_points.md`: Task-oriented public API inventory for clients/agents. - `docs/packaging_slices_v7.md`: v7 packaging model for lean base installs plus optional feature extras under the same `wordlift_sdk` namespace. diff --git a/docs/canonical_id_policy.md b/docs/canonical_id_policy.md index 20cb2b6..34ac491 100644 --- a/docs/canonical_id_policy.md +++ b/docs/canonical_id_policy.md @@ -76,6 +76,8 @@ hook: - protocol: `IriLookup.iri_for_subject(graph, subject) -> str | None` - builtin dataframe implementation: `wordlift_sdk.kg_build.DataFrameUrlIriLookup` (`url`/`iri` columns) +- `kg_build` callback contexts populate this lookup from the callback URL and + `existing_web_page_id` when the URL source already resolved a URL-mapped IRI. Behavior: diff --git a/pyproject.toml b/pyproject.toml index 5a7b805..95449af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "wordlift-sdk" -version = "8.0.16" +version = "8.0.17" description = "Python toolkit for orchestrating WordLift imports and structured data workflows." authors = ["David Riccitelli "] readme = "README.md" diff --git a/specs/INDEX.md b/specs/INDEX.md index 25e431f..4c0bc31 100644 --- a/specs/INDEX.md +++ b/specs/INDEX.md @@ -1,6 +1,6 @@ # Specs Index -Release sync: this index is aligned with SDK `8.0.16` (see `CHANGELOG.md`). +Release sync: this index is aligned with SDK `8.0.17` (see `CHANGELOG.md`). - `specs/GSC_CANONICAL_SELECTION.md`: Client integration spec for canonical URL election from GSC impressions (`url,title` CSV input, OAuth/service-account credential handoff, interval/concurrency contract). - `specs/GRAPH_SYNC_API.md`: Graph-scoped REST API and persistence model for recording sync runs, live progress, aggregate KPIs, and per-URL SHACL reporting. diff --git a/specs/PIPELINE_ARCHITECTURE.md b/specs/PIPELINE_ARCHITECTURE.md index 8c801b4..75a6cd9 100644 --- a/specs/PIPELINE_ARCHITECTURE.md +++ b/specs/PIPELINE_ARCHITECTURE.md @@ -114,6 +114,9 @@ Current implementation status: (`IriLookup.iri_for_subject(graph, subject)`) resolved via `Context.extensions["kg_build.iri_lookup"]`; lookup applies only to root subjects and falls back to generated IDs on misses. +- callback postprocessor contexts inject that lookup from the callback URL and + `existing_web_page_id` when the source lookup has resolved an existing + URL-mapped IRI. - callback graph canonicalization includes a fallback subject pass so non-blank subject IRIs are rewritten to canonical dataset-rooted paths when they are not already under canonical root prefixes; static-template graph patching remains diff --git a/tests/kg_build/postprocessors/processors/test_id_generator.py b/tests/kg_build/postprocessors/processors/test_id_generator.py index b46acfa..232451f 100644 --- a/tests/kg_build/postprocessors/processors/test_id_generator.py +++ b/tests/kg_build/postprocessors/processors/test_id_generator.py @@ -616,7 +616,7 @@ def test_dependency_graph_strategy_reparents_faq_under_article_typed_as_faqpage( Expected transformation: old root : .../articles/article-1 - new root : .../articles/credit-card-debt-relief-freedom-debt-relief- + new root : .../articles/article-1- question : .../articles//questions/ answer : .../articles//questions//answers/answer """ @@ -659,8 +659,9 @@ def test_dependency_graph_strategy_reparents_faq_under_article_typed_as_faqpage( assert len(articles) == 1 article_iri = str(articles[0]) url_hash = generator._url_hash("https://example.com/articles/article-1") - assert article_iri == ( - f"{DATASET}/articles/credit-card-debt-relief-freedom-debt-relief-{url_hash}" + assert ( + article_iri + == f"{DATASET}/articles/credit-card-debt-relief-freedom-debt-relief-{url_hash}" ), article_iri questions = list(output.subjects(RDF.type, URIRef(f"{SCHEMA}Question"))) @@ -673,9 +674,9 @@ def test_dependency_graph_strategy_reparents_faq_under_article_typed_as_faqpage( answer_iri = str(answers[0]) assert answer_iri.startswith(f"{question_iri}/answers/"), answer_iri - # No stale article-1 path may remain anywhere in the graph + # No stale external article path may remain anywhere in the graph all_iris = {str(s) for s in output.subjects() if isinstance(s, URIRef)} - assert not any("article-1" in iri for iri in all_iris), all_iris + assert not any(iri.startswith("https://example.com/") for iri in all_iris), all_iris def test_dependency_graph_strategy_is_generic_product_offer_pricespec() -> None: diff --git a/tests/kg_build/test_protocol.py b/tests/kg_build/test_protocol.py index ea968a2..47f1369 100644 --- a/tests/kg_build/test_protocol.py +++ b/tests/kg_build/test_protocol.py @@ -107,6 +107,62 @@ def _make_validation_outcome( ) +def test_build_pp_context_injects_url_iri_lookup() -> None: + protocol = ProfileImportProtocol( + context=_make_context(), + profile=_make_profile(), + root_dir=Path("."), + ) + response = WebPageScrapeResponse( + web_page=WebPage(url="https://example.com/article", html="") + ) + + context = protocol._build_pp_context( + "https://example.com/article", + response, + "https://data.example.com/dataset/articles/existing-article", + "existing-hash", + ) + lookup = context.extensions.get("kg_build.iri_lookup") + + assert lookup is not None + + graph = Graph() + subject = URIRef("https://example.com/article#node") + faq_page = URIRef("https://example.com/article#faq") + graph.add((subject, RDF.type, URIRef("http://schema.org/Article"))) + graph.add( + ( + subject, + URIRef("http://schema.org/url"), + Literal("https://example.com/article"), + ) + ) + graph.add((faq_page, RDF.type, URIRef("http://schema.org/FAQPage"))) + graph.add((subject, URIRef("http://schema.org/subjectOf"), faq_page)) + graph.add((faq_page, URIRef("http://schema.org/about"), subject)) + assert ( + lookup.iri_for_subject(graph, subject) + == "https://data.example.com/dataset/articles/existing-article" + ) + + output = CanonicalIdsPostprocessor(strategy="dependency_graph").process_graph( + graph, context + ) + assert ( + URIRef("https://data.example.com/dataset/articles/existing-article"), + RDF.type, + URIRef("http://schema.org/Article"), + ) in output + assert ( + URIRef( + "https://data.example.com/dataset/articles/existing-article/faq-pages/faq-page" + ), + URIRef("http://schema.org/about"), + URIRef("https://data.example.com/dataset/articles/existing-article"), + ) in output + + def _passthrough_pp() -> AsyncMock: return AsyncMock( side_effect=lambda g, url, resp, ewi, eih: PostprocessorResult( diff --git a/wordlift_sdk/kg_build/postprocessors/types.py b/wordlift_sdk/kg_build/postprocessors/types.py index a9323bd..ca625d3 100644 --- a/wordlift_sdk/kg_build/postprocessors/types.py +++ b/wordlift_sdk/kg_build/postprocessors/types.py @@ -1,6 +1,6 @@ from __future__ import annotations -from dataclasses import dataclass +from dataclasses import dataclass, field from enum import Enum from pathlib import Path from typing import Any, Protocol, runtime_checkable @@ -27,6 +27,7 @@ class PostprocessorContext: existing_import_hash: str | None = None import_hash_mode: str = "on" ids: Any | None = None + extensions: dict[str, Any] = field(default_factory=dict) @dataclass(frozen=True) diff --git a/wordlift_sdk/kg_build/protocol.py b/wordlift_sdk/kg_build/protocol.py index 0c0a3c3..94a8bf9 100644 --- a/wordlift_sdk/kg_build/protocol.py +++ b/wordlift_sdk/kg_build/protocol.py @@ -11,6 +11,7 @@ from typing import Any from jinja2 import UndefinedError +import pandas as pd from rdflib import Graph, URIRef from wordlift_client.models.web_page_scrape_response import WebPageScrapeResponse from wordlift_sdk.protocol import Context @@ -27,6 +28,7 @@ from .cloud_flow import run_cloud_workflow as run_cloud_workflow # noqa: F401 from .config import ProfileDefinition from .entity_patcher import EntityPatcher +from .iri_lookup import DataFrameUrlIriLookup from .kpi import KgBuildKpiCollector from .postprocessors import ( LoadedPostprocessor, @@ -503,6 +505,11 @@ def _build_pp_context( profile_settings = dict(profile_payload.get("settings", {}) or {}) profile_settings.setdefault("api_url", "https://api.wordlift.io") profile_payload["settings"] = profile_settings + extensions: dict[str, Any] = {} + if url and existing_web_page_id: + extensions["kg_build.iri_lookup"] = DataFrameUrlIriLookup( + pd.DataFrame([{"url": url, "iri": existing_web_page_id}]) + ) return PostprocessorContext( profile_name=self.profile.name, profile=profile_payload, @@ -515,6 +522,7 @@ def _build_pp_context( existing_import_hash=existing_import_hash, import_hash_mode=self._import_hash_mode, ids=ids, + extensions=extensions, ) def _resolve_path(self, raw_path: str) -> Path: