Skip to content

Commit d769942

Browse files
Merge pull request #23 from Zipstack/fix/clone-agentic-docs-and-tool
fix(clone): clone agentic project documents + resolve exported agentic tools
2 parents d22180a + e35b564 commit d769942

5 files changed

Lines changed: 519 additions & 4 deletions

File tree

src/unstract/clone/client.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -921,3 +921,62 @@ def list_agentic_registries(
921921
params["agentic_project"] = agentic_project
922922
result = self._request("GET", "agentic-studio-registry/", params=params)
923923
return result if isinstance(result, list) else (result or {}).get("results", [])
924+
925+
def list_agentic_documents(self, project_id: str) -> list[dict[str, Any]]:
926+
"""List a project's uploaded documents. Rows carry ``id`` and
927+
``original_filename``. Agentic docs are a store of their own, distinct
928+
from Prompt Studio ``prompt-document`` rows.
929+
"""
930+
result = self._request(
931+
"GET", "agentic/documents/", params={"project_id": project_id}
932+
)
933+
return result if isinstance(result, list) else (result or {}).get("results", [])
934+
935+
def download_agentic_document(self, document_id: str) -> bytes:
936+
"""Download an agentic document's original bytes.
937+
938+
The ``file`` route serves the raw binary (not a JSON envelope), so this
939+
bypasses the JSON-decoding request path.
940+
"""
941+
url = self._url(f"agentic/documents/{document_id}/file/")
942+
logger.debug("GET %s", url)
943+
resp = self._session.get(url, timeout=self.timeout, verify=self.verify)
944+
if not 200 <= resp.status_code < 300:
945+
raise PlatformAPIError(
946+
f"GET agentic/documents/{document_id}/file/ "
947+
f"returned {resp.status_code}",
948+
status_code=resp.status_code,
949+
body=resp.text[:2000],
950+
)
951+
return resp.content
952+
953+
def upload_agentic_document(
954+
self, project_id: str, file_name: str, data: bytes, mime_type: str
955+
) -> dict[str, Any]:
956+
"""Upload a document into a target agentic project.
957+
958+
Creates the ``AgenticDocument`` row; extraction/summary stay a UI step,
959+
as with Prompt Studio uploads. Callers pre-check filenames to avoid
960+
duplicates on re-runs.
961+
"""
962+
files = {"file": (file_name, data, mime_type)}
963+
return self._request(
964+
"POST", f"agentic/projects/{project_id}/documents/upload/", files=files
965+
)
966+
967+
def list_agentic_verified_data(self, project_id: str) -> list[dict[str, Any]]:
968+
"""List a project's verified (ground-truth) data rows. Each carries
969+
``document_name``, ``document``, and ``data`` (the curated JSON).
970+
"""
971+
result = self._request(
972+
"GET", "agentic/verified-data/", params={"project_id": project_id}
973+
)
974+
return result if isinstance(result, list) else (result or {}).get("results", [])
975+
976+
def create_agentic_verified_data(self, payload: dict[str, Any]) -> dict[str, Any]:
977+
"""Create a verified-data row (``project``, ``document``, ``data``).
978+
979+
One row per (project, document); callers pre-check to avoid a
980+
uniqueness clash on re-runs.
981+
"""
982+
return self._request("POST", "agentic/verified-data/", json=payload)

src/unstract/clone/phases/agentic_studio.py

Lines changed: 222 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@
2222
per-run ``agentic_prompt_version`` table and the child's ``parent_version``
2323
resolves through it. ``project`` is bound to the target id.
2424
- **schemas**: bound to the target ``project`` and created.
25+
- **documents**: source uploads re-uploaded to the target project (skipping
26+
filenames already present); they live in their own store, not the Prompt
27+
Studio document table.
28+
- **verified data**: ground-truth rows re-pointed to the cloned document by
29+
filename. Extracted/comparison data is regenerable and not cloned.
2530
3. **Registry**: if the project has an active schema + prompt, republish its
2631
registry entry via the ``export`` action (mirror of custom_tool) and
2732
record an ``agentic_studio_registry`` remap. Projects with no source
@@ -35,6 +40,7 @@
3540
from __future__ import annotations
3641

3742
import logging
43+
import mimetypes
3844
import threading
3945
from typing import Any
4046

@@ -170,6 +176,8 @@ def _clone_project(
170176
self._replicate_share(src, name, tgt_project_id, result, lock)
171177
self._clone_prompt_versions(name, src_project_id, tgt_project_id, result, lock)
172178
self._clone_schemas(name, src_project_id, tgt_project_id, result, lock)
179+
self._clone_documents(name, src_project_id, tgt_project_id, result, lock)
180+
self._clone_verified_data(name, src_project_id, tgt_project_id, result, lock)
173181
self._republish_registry(name, src_project_id, tgt_project_id, result, lock)
174182

175183
def _build_project_payload(
@@ -422,6 +430,201 @@ def _clone_schemas(
422430
with lock:
423431
result.created += 1
424432

433+
# ----- documents -----
434+
435+
def _clone_documents(
436+
self,
437+
name: str,
438+
src_project_id: str,
439+
tgt_project_id: str,
440+
result: PhaseResult,
441+
lock: threading.Lock,
442+
) -> None:
443+
try:
444+
src_docs = self.ctx.source.list_agentic_documents(src_project_id)
445+
except Exception as e:
446+
logger.exception("agentic '%s': document listing failed: %s", name, e)
447+
with lock:
448+
result.failed += 1
449+
result.errors.append(f"agentic {name} list documents: {e}")
450+
return
451+
if not src_docs:
452+
return
453+
454+
# Honour the file strategy: 'skip' means no binary transfer, like the
455+
# files and lookups phases. Operator re-uploads on target.
456+
if self.ctx.options.file_strategy == "skip":
457+
with lock:
458+
result.skipped += len(src_docs)
459+
result.warnings.append(
460+
f"agentic {name}: {len(src_docs)} document(s) not copied "
461+
"(file_strategy=skip) — upload them manually on target"
462+
)
463+
return
464+
465+
try:
466+
tgt_docs = self.ctx.target.list_agentic_documents(tgt_project_id)
467+
except Exception as e:
468+
logger.warning(
469+
"agentic '%s': target document listing failed "
470+
"(re-run may duplicate): %s",
471+
name,
472+
e,
473+
)
474+
tgt_docs = []
475+
target_names = {d.get("original_filename") for d in tgt_docs}
476+
477+
for src in src_docs:
478+
file_name = src.get("original_filename")
479+
src_doc_id = src.get("id")
480+
if not file_name or not src_doc_id:
481+
continue
482+
if file_name in target_names:
483+
with lock:
484+
result.skipped += 1
485+
logger.info(
486+
"agentic '%s': document '%s' already on target — skipping",
487+
name,
488+
file_name,
489+
)
490+
continue
491+
self._clone_one_document(
492+
name, tgt_project_id, src_doc_id, file_name, result, lock
493+
)
494+
495+
def _clone_one_document(
496+
self,
497+
name: str,
498+
tgt_project_id: str,
499+
src_doc_id: str,
500+
file_name: str,
501+
result: PhaseResult,
502+
lock: threading.Lock,
503+
) -> None:
504+
try:
505+
raw = self.ctx.source.download_agentic_document(src_doc_id)
506+
except Exception as e:
507+
logger.exception(
508+
"agentic '%s': document '%s' download failed: %s", name, file_name, e
509+
)
510+
with lock:
511+
result.failed += 1
512+
result.errors.append(f"agentic {name} download {file_name}: {e}")
513+
return
514+
515+
if len(raw) > self.ctx.options.max_file_size:
516+
with lock:
517+
result.skipped += 1
518+
result.warnings.append(
519+
f"agentic {name}: document {file_name} exceeds size cap — "
520+
"upload it manually on target"
521+
)
522+
return
523+
524+
mime = mimetypes.guess_type(file_name)[0] or "application/pdf"
525+
try:
526+
self.ctx.target.upload_agentic_document(
527+
tgt_project_id, file_name, raw, mime
528+
)
529+
except Exception as e:
530+
logger.exception(
531+
"agentic '%s': document '%s' upload failed: %s", name, file_name, e
532+
)
533+
with lock:
534+
result.failed += 1
535+
result.errors.append(f"agentic {name} upload {file_name}: {e}")
536+
return
537+
with lock:
538+
result.created += 1
539+
logger.info("agentic '%s': uploaded document '%s'", name, file_name)
540+
541+
# ----- verified (ground-truth) data -----
542+
543+
def _clone_verified_data(
544+
self,
545+
name: str,
546+
src_project_id: str,
547+
tgt_project_id: str,
548+
result: PhaseResult,
549+
lock: threading.Lock,
550+
) -> None:
551+
try:
552+
src_rows = self.ctx.source.list_agentic_verified_data(src_project_id)
553+
except Exception as e:
554+
logger.exception("agentic '%s': verified-data listing failed: %s", name, e)
555+
with lock:
556+
result.failed += 1
557+
result.errors.append(f"agentic {name} list verified-data: {e}")
558+
return
559+
if not src_rows:
560+
return
561+
562+
# Verified data FKs a document; under 'skip' no documents are copied,
563+
# so skip the rows too — matching the plan and _clone_documents.
564+
if self.ctx.options.file_strategy == "skip":
565+
with lock:
566+
result.skipped += len(src_rows)
567+
result.warnings.append(
568+
f"agentic {name}: {len(src_rows)} verified-data row(s) not "
569+
"copied (file_strategy=skip)"
570+
)
571+
return
572+
573+
# Verified data FKs a document; map source rows to target docs by
574+
# filename, the only identity stable across orgs.
575+
try:
576+
tgt_docs = self.ctx.target.list_agentic_documents(tgt_project_id)
577+
tgt_existing = self.ctx.target.list_agentic_verified_data(tgt_project_id)
578+
except Exception as e:
579+
logger.warning(
580+
"agentic '%s': target verified-data lookup failed "
581+
"(re-run may duplicate): %s",
582+
name,
583+
e,
584+
)
585+
tgt_docs, tgt_existing = [], []
586+
doc_id_by_name = {d.get("original_filename"): d.get("id") for d in tgt_docs}
587+
verified_doc_ids = {r.get("document") for r in tgt_existing}
588+
589+
for src in src_rows:
590+
file_name = src.get("document_name")
591+
tgt_doc_id = doc_id_by_name.get(file_name)
592+
if not tgt_doc_id:
593+
with lock:
594+
result.skipped += 1
595+
result.warnings.append(
596+
f"agentic {name}: verified data for '{file_name}' skipped — "
597+
"document not on target"
598+
)
599+
continue
600+
if tgt_doc_id in verified_doc_ids:
601+
with lock:
602+
result.skipped += 1
603+
continue
604+
payload = {
605+
"project": tgt_project_id,
606+
"document": tgt_doc_id,
607+
"data": src.get("data"),
608+
}
609+
try:
610+
self.ctx.target.create_agentic_verified_data(payload)
611+
except Exception as e:
612+
logger.exception(
613+
"agentic '%s': verified data for '%s' create failed: %s",
614+
name,
615+
file_name,
616+
e,
617+
)
618+
with lock:
619+
result.failed += 1
620+
result.errors.append(
621+
f"agentic {name} create verified-data {file_name}: {e}"
622+
)
623+
continue
624+
with lock:
625+
result.created += 1
626+
logger.info("agentic '%s': cloned verified data for '%s'", name, file_name)
627+
425628
# ----- registry -----
426629

427630
def _republish_registry(
@@ -507,8 +710,9 @@ def _plan_children(
507710
result: PhaseResult,
508711
lock: threading.Lock,
509712
) -> None:
510-
"""Dry-run: count source prompt versions + schemas as planned and
511-
record planned prompt-version ids so downstream resolves don't miss.
713+
"""Dry-run: count source prompt versions + schemas + documents as
714+
planned and record planned prompt-version ids so downstream resolves
715+
don't miss.
512716
"""
513717
try:
514718
src_versions = self.ctx.source.list_agentic_prompt_versions(
@@ -522,11 +726,27 @@ def _plan_children(
522726
)
523727
except Exception:
524728
src_schemas = []
729+
try:
730+
src_docs = self.ctx.source.list_agentic_documents(src_project_id)
731+
except Exception:
732+
src_docs = []
733+
try:
734+
src_verified = self.ctx.source.list_agentic_verified_data(src_project_id)
735+
except Exception:
736+
src_verified = []
525737
with lock:
526738
for v in src_versions:
527739
self.ctx.remap.record_planned("agentic_prompt_version", v["id"])
528740
result.created += 1
529741
result.created += len(src_schemas)
742+
# Documents move only when the file strategy copies binaries.
743+
# Verified data FKs a document, so skipping files strands it too.
744+
if self.ctx.options.file_strategy == "skip":
745+
result.skipped += len(src_docs)
746+
result.skipped += len(src_verified)
747+
else:
748+
result.created += len(src_docs)
749+
result.created += len(src_verified)
530750

531751
# ----- settings -----
532752

src/unstract/clone/phases/tool_instance.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,12 +108,17 @@ def _clone_workflow_tools(
108108
src_ti_id = src_ti["id"]
109109
src_tool_id = src_ti["tool_id"]
110110

111+
# tool_id is a registry id: Prompt Studio tools register under
112+
# prompt_studio_registry, exported agentic projects under
113+
# agentic_studio_registry. Resolve against both.
111114
with lock:
112-
tgt_tool_id = self.ctx.remap.resolve("prompt_studio_registry", src_tool_id)
115+
tgt_tool_id = self.ctx.remap.resolve(
116+
"prompt_studio_registry", src_tool_id
117+
) or self.ctx.remap.resolve("agentic_studio_registry", src_tool_id)
113118
if not tgt_tool_id:
114119
logger.warning(
115120
"skipping tool_instance %s — no registry remap for tool_id %s "
116-
"(custom tool likely unpublished on source)",
121+
"(tool likely unpublished on source)",
117122
src_ti_id,
118123
src_tool_id,
119124
)

0 commit comments

Comments
 (0)