Skip to content

Commit cd9bcf5

Browse files
Merge pull request #20 from Zipstack/fix/clone-dry-run-remap-chaining
[FIX] correct dry-run counts via planned-remap chaining
2 parents 0ceaf52 + 9e5da17 commit cd9bcf5

26 files changed

Lines changed: 316 additions & 82 deletions

src/unstract/clone/context.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import threading
1616
from dataclasses import dataclass, field
1717
from typing import TYPE_CHECKING, Any
18+
from uuid import NAMESPACE_URL, uuid5
1819

1920
if TYPE_CHECKING:
2021
from unstract.clone.client import PlatformClient
@@ -74,10 +75,30 @@ class RemapTable:
7475

7576
def __init__(self) -> None:
7677
self._table: dict[str, dict[str, str]] = {}
78+
# Synthetic target ids minted by record_planned() in dry-run. Tracked
79+
# so the report can mask them — they are not real target ids.
80+
self._planned: set[str] = set()
7781

7882
def record(self, entity: str, src_uuid: str, tgt_uuid: str) -> None:
7983
self._table.setdefault(entity, {})[src_uuid] = tgt_uuid
8084

85+
def record_planned(self, entity: str, src_uuid: str) -> str:
86+
"""Dry-run only: record a deterministic synthetic target id so
87+
dependent phases can resolve the FK and plan-count without writing.
88+
Never reaches the wire, so the fake id stays in-memory scaffolding.
89+
"""
90+
tgt_uuid = str(uuid5(NAMESPACE_URL, f"planned:{entity}:{src_uuid}"))
91+
self.record(entity, src_uuid, tgt_uuid)
92+
self._planned.add(tgt_uuid)
93+
return tgt_uuid
94+
95+
def is_planned(self, tgt_uuid: str) -> bool:
96+
"""True if ``tgt_uuid`` is a dry-run synthetic id (no real row on
97+
target). Callers use this to skip live target lookups that would
98+
query a non-existent id.
99+
"""
100+
return tgt_uuid in self._planned
101+
81102
def resolve(self, entity: str, src_uuid: str) -> str | None:
82103
return self._table.get(entity, {}).get(src_uuid)
83104

@@ -90,9 +111,19 @@ def resolve_any(self, src_uuid: str) -> str | None:
90111
return hit
91112
return None
92113

93-
def snapshot(self) -> dict[str, dict[str, str]]:
94-
"""Read-only snapshot for the post-run report."""
95-
return {entity: dict(m) for entity, m in self._table.items()}
114+
def snapshot(self, *, hide_planned: bool = False) -> dict[str, dict[str, str]]:
115+
"""Read-only snapshot for the post-run report. ``hide_planned`` masks
116+
dry-run synthetic ids (rendered as ``"(planned)"``) while keeping the
117+
per-entity counts intact.
118+
"""
119+
120+
def _val(tgt: str) -> str:
121+
return "(planned)" if hide_planned and tgt in self._planned else tgt
122+
123+
return {
124+
entity: {src: _val(tgt) for src, tgt in m.items()}
125+
for entity, m in self._table.items()
126+
}
96127

97128

98129
@dataclass

src/unstract/clone/orchestrator.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ def clone(
8484
target=Endpoint(
8585
base_url=target.base_url, organization_id=target.organization_id
8686
),
87+
dry_run=opts.dry_run,
8788
)
8889

8990
run_started = time.perf_counter()
@@ -112,7 +113,7 @@ def clone(
112113
)
113114

114115
report.total_duration_s = time.perf_counter() - run_started
115-
report.remap_snapshot = ctx.remap.snapshot()
116+
report.remap_snapshot = ctx.remap.snapshot(hide_planned=opts.dry_run)
116117
return report
117118
finally:
118119
src_client.close()

src/unstract/clone/phases/adapter.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,8 @@ def _clone_one(
9797
)
9898
elif self.ctx.options.dry_run:
9999
with lock:
100-
result.skipped += 1
100+
result.created += 1
101+
self.ctx.remap.record_planned("adapter", src_id)
101102
logger.info(
102103
"[dry-run] would create adapter '%s' [%s] src=%s", name, atype, src_id
103104
)

src/unstract/clone/phases/api_deployment.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,8 @@ def _clone_one(
112112
)
113113
elif self.ctx.options.dry_run:
114114
with lock:
115-
result.skipped += 1
115+
result.created += 1
116+
self.ctx.remap.record_planned("api_deployment", src_id)
116117
logger.info(
117118
"[dry-run] would create api_deployment '%s' src=%s", api_name, src_id
118119
)

src/unstract/clone/phases/connector.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,8 @@ def _clone_one(
134134
)
135135
elif self.ctx.options.dry_run:
136136
with lock:
137-
result.skipped += 1
137+
result.created += 1
138+
self.ctx.remap.record_planned("connector", src_id)
138139
logger.info("[dry-run] would create connector '%s' src=%s", name, src_id)
139140
return
140141
else:

src/unstract/clone/phases/custom_tool.py

Lines changed: 50 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -145,12 +145,12 @@ def _clone_one(
145145
"tool_name": tool_name,
146146
}
147147

148+
# Each sub-path (adopt / fresh / fresh-dry-run) owns its own
149+
# custom_tool remap, since only it knows whether the target id is
150+
# real or a planned synthetic.
148151
if tgt_tool_id is None:
149152
return
150153

151-
with lock:
152-
self.ctx.remap.record("custom_tool", src_tool_id, tgt_tool_id)
153-
154154
# Neither the export blob nor list rows carry share axes —
155155
# share state comes from the source detail.
156156
self.apply_share(
@@ -163,6 +163,10 @@ def _clone_one(
163163
)
164164

165165
if self.ctx.options.dry_run:
166+
# Can't republish the registry without writing, but ToolInstance
167+
# needs a prompt_studio_registry remap to plan-count. Mirror it
168+
# with a planned id derived from the source registry (read-only).
169+
self._record_planned_registry(src_tool_id, tool_name, lock)
166170
return
167171

168172
# Tools never exported on source (e.g. empty projects — backend
@@ -223,6 +227,30 @@ def _clone_one(
223227
tgt_regs[0]["prompt_registry_id"],
224228
)
225229

230+
def _record_planned_registry(
231+
self, src_tool_id: str, tool_name: str, lock: threading.Lock
232+
) -> None:
233+
"""Dry-run: record a planned prompt_studio_registry remap from the
234+
source registry id, so ToolInstancePhase can resolve tool_id and
235+
plan-count. No-op for tools never exported on source (no registry).
236+
"""
237+
try:
238+
src_regs = self.ctx.source.list_registries(custom_tool=src_tool_id)
239+
except Exception as e:
240+
logger.warning(
241+
"[dry-run] source registry lookup failed for tool '%s' "
242+
"(tool_instance plan may under-count): %s",
243+
tool_name,
244+
e,
245+
)
246+
return
247+
if not src_regs:
248+
return
249+
with lock:
250+
self.ctx.remap.record_planned(
251+
"prompt_studio_registry", src_regs[0]["prompt_registry_id"]
252+
)
253+
226254
def _adopt(
227255
self,
228256
match: dict[str, Any],
@@ -240,7 +268,8 @@ def _adopt(
240268
tgt_tool_id = match["tool_id"]
241269
if self.ctx.options.dry_run:
242270
with lock:
243-
result.skipped += 1
271+
result.adopted += 1
272+
self.ctx.remap.record("custom_tool", src_tool_id, tgt_tool_id)
244273
logger.info(
245274
"[dry-run] would sync prompts into adopted tool '%s' src=%s -> tgt=%s",
246275
tool_name,
@@ -260,6 +289,7 @@ def _adopt(
260289

261290
with lock:
262291
result.adopted += 1
292+
self.ctx.remap.record("custom_tool", src_tool_id, tgt_tool_id)
263293
logger.info(
264294
"adopted tool '%s' src=%s -> tgt=%s (prompts re-synced)",
265295
tool_name,
@@ -276,14 +306,9 @@ def _create_fresh(
276306
result: PhaseResult,
277307
lock: threading.Lock,
278308
) -> str | None:
279-
if self.ctx.options.dry_run:
280-
with lock:
281-
result.skipped += 1
282-
logger.info(
283-
"[dry-run] would import tool '%s' src=%s", tool_name, src_tool_id
284-
)
285-
return None
286-
309+
# Run the source-side validations even in dry-run — they decide
310+
# whether a real run would create or frictionless-skip, so the plan
311+
# counts must reflect them. Only the target-write steps are stubbed.
287312
default_profile = self._source_default_profile(src_tool_id, tool_name)
288313
if default_profile is None:
289314
with lock:
@@ -300,6 +325,18 @@ def _create_fresh(
300325
)
301326
return None
302327

328+
if self.ctx.options.dry_run:
329+
# Target adapter resolution is skipped: adapters this run would
330+
# create don't exist on target yet, so it can't resolve. The
331+
# frictionless check above already caught the real skip cases.
332+
with lock:
333+
result.created += 1
334+
tgt_tool_id = self.ctx.remap.record_planned("custom_tool", src_tool_id)
335+
logger.info(
336+
"[dry-run] would import tool '%s' src=%s", tool_name, src_tool_id
337+
)
338+
return tgt_tool_id
339+
303340
adapter_ids = self._resolve_target_adapter_ids(default_profile, tool_name)
304341
if adapter_ids is None:
305342
with lock:
@@ -321,6 +358,7 @@ def _create_fresh(
321358
tgt_tool_id = tgt["tool_id"]
322359
with lock:
323360
result.created += 1
361+
self.ctx.remap.record("custom_tool", src_tool_id, tgt_tool_id)
324362
logger.info(
325363
"created tool '%s' src=%s -> tgt=%s (needs_adapter_config=%s)",
326364
tool_name,

src/unstract/clone/phases/files.py

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -126,18 +126,23 @@ def _build_tool_tasks(
126126
report: CloneReport,
127127
result: PhaseResult,
128128
) -> list[_FileTask]:
129-
try:
130-
tgt_docs = self.ctx.target.list_prompt_documents(tgt_tool_id)
131-
except Exception as e:
132-
logger.exception(
133-
"files: failed to list target DM rows for tool %s: %s",
134-
tool_name,
135-
e,
136-
)
137-
result.failed += 1
138-
result.errors.append(f"list target docs {tool_name}: {e}")
139-
return []
140-
target_names = {d["document_name"] for d in tgt_docs}
129+
# A planned (dry-run) tool id has no row on target; skip the live
130+
# lookup and treat every source file as missing → predicted upload.
131+
if self.ctx.options.dry_run and self.ctx.remap.is_planned(tgt_tool_id):
132+
target_names: set[str] = set()
133+
else:
134+
try:
135+
tgt_docs = self.ctx.target.list_prompt_documents(tgt_tool_id)
136+
except Exception as e:
137+
logger.exception(
138+
"files: failed to list target DM rows for tool %s: %s",
139+
tool_name,
140+
e,
141+
)
142+
result.failed += 1
143+
result.errors.append(f"list target docs {tool_name}: {e}")
144+
return []
145+
target_names = {d["document_name"] for d in tgt_docs}
141146

142147
tasks: list[_FileTask] = []
143148
for doc in src_docs:
@@ -163,7 +168,7 @@ def _build_tool_tasks(
163168
)
164169
continue
165170
if self.ctx.options.dry_run:
166-
result.skipped += 1
171+
result.created += 1
167172
logger.info(
168173
"[dry-run] files: would clone tool=%s file=%s",
169174
tool_name,

src/unstract/clone/phases/group.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,8 @@ def _clone_one(
7575
logger.info("reusing group '%s' src=%s -> tgt=%s", name, src_id, tgt["id"])
7676
elif self.ctx.options.dry_run:
7777
with lock:
78-
result.skipped += 1
78+
result.created += 1
79+
self.ctx.remap.record_planned("group", str(src_id))
7980
logger.info("[dry-run] would create group '%s' src=%s", name, src_id)
8081
if self.ctx.options.clone_group_members:
8182
# Still computed so would-skip members show up in the report.

src/unstract/clone/phases/pipeline.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,8 @@ def _clone_one(
116116
)
117117
elif self.ctx.options.dry_run:
118118
with lock:
119-
result.skipped += 1
119+
result.created += 1
120+
self.ctx.remap.record_planned("pipeline", src_id)
120121
logger.info("[dry-run] would create pipeline '%s' src=%s", name, src_id)
121122
return
122123
else:

src/unstract/clone/phases/tag.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,8 @@ def _clone_one(
7676
logger.info("adopted tag '%s' src=%s -> tgt=%s", name, src_id, tgt["id"])
7777
elif self.ctx.options.dry_run:
7878
with lock:
79-
result.skipped += 1
79+
result.created += 1
80+
self.ctx.remap.record_planned("tag", src_id)
8081
logger.info("[dry-run] would create tag '%s' src=%s", name, src_id)
8182
return
8283
else:

0 commit comments

Comments
 (0)