From 54ccd90d0d96ee09eb54373c3c13f6beea40cf7a Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Wed, 25 Mar 2026 15:18:24 -0700 Subject: [PATCH 1/9] feat: add reassign_deployments management command Management command to move deployments and all related data between projects. Handles all direct and indirect relationships including Events, SourceImages, Occurrences, Jobs, Detections, Classifications, Identifications, SourceImageCollections (with mixed-collection splitting), pipeline configs, processing services, and taxa M2M links. Features: - Dry-run mode by default, --execute to commit - Per-deployment before/after snapshots with row counts - Conservation checks (source + target = original) - FK integrity and indirect access validation - Shared resource handling (clone vs reassign devices, sites, S3 sources) - Raw SQL for ProcessingService M2M to avoid ORM column mismatch Co-Authored-By: Claude --- .../commands/reassign_deployments.py | 624 ++++++++++++++++++ 1 file changed, 624 insertions(+) create mode 100644 ami/main/management/commands/reassign_deployments.py diff --git a/ami/main/management/commands/reassign_deployments.py b/ami/main/management/commands/reassign_deployments.py new file mode 100644 index 000000000..56132f82a --- /dev/null +++ b/ami/main/management/commands/reassign_deployments.py @@ -0,0 +1,624 @@ +""" +Management command to move deployments (and all related data) from one project to another. + +Usage: + # Dry run (default) — shows what would happen + python manage.py reassign_deployments --source-project 20 --target-project 99 --deployment-ids 60,61 + + # Execute the move + python manage.py reassign_deployments --source-project 20 --target-project 99 --deployment-ids 60,61 --execute + + # Create a new target project + python manage.py reassign_deployments --source-project 20 \ + --create-project "Nunavik" --deployment-ids 60,61 --execute + +See docs/claude/planning/deployment-reassignment-guide.md for the full relationship map. 
+""" + +import logging + +from django.core.management.base import BaseCommand, CommandError +from django.db import connection, transaction + +from ami.jobs.models import Job +from ami.main.models import ( + Classification, + Deployment, + Detection, + Event, + Identification, + Occurrence, + Project, + SourceImage, + SourceImageCollection, + Taxon, +) +from ami.ml.models import ProjectPipelineConfig + +logger = logging.getLogger(__name__) + + +def collect_deployment_snapshot(dep_id: int) -> dict: + """Capture row counts for a single deployment.""" + return { + "events": Event.objects.filter(deployment_id=dep_id).count(), + "source_images": SourceImage.objects.filter(deployment_id=dep_id).count(), + "occurrences": Occurrence.objects.filter(deployment_id=dep_id).count(), + "detections": Detection.objects.filter(source_image__deployment_id=dep_id).count(), + "classifications": Classification.objects.filter(detection__source_image__deployment_id=dep_id).count(), + "identifications": Identification.objects.filter(occurrence__deployment_id=dep_id).count(), + "jobs": Job.objects.filter(deployment_id=dep_id).count(), + } + + +def collect_aggregate_snapshot(deployment_ids: list[int]) -> dict: + """Capture aggregate row counts for all related models.""" + return { + "deployments": Deployment.objects.filter(pk__in=deployment_ids).count(), + "events": Event.objects.filter(deployment_id__in=deployment_ids).count(), + "source_images": SourceImage.objects.filter(deployment_id__in=deployment_ids).count(), + "occurrences": Occurrence.objects.filter(deployment_id__in=deployment_ids).count(), + "detections": Detection.objects.filter(source_image__deployment_id__in=deployment_ids).count(), + "classifications": Classification.objects.filter( + detection__source_image__deployment_id__in=deployment_ids + ).count(), + "identifications": Identification.objects.filter(occurrence__deployment_id__in=deployment_ids).count(), + "jobs": 
Job.objects.filter(deployment_id__in=deployment_ids).count(), + } + + +def collect_project_counts(project_id: int) -> dict: + """Capture high-level project counts.""" + return { + "deployments": Deployment.objects.filter(project_id=project_id).count(), + "events": Event.objects.filter(project_id=project_id).count(), + "source_images": SourceImage.objects.filter(project_id=project_id).count(), + "occurrences": Occurrence.objects.filter(project_id=project_id).count(), + "jobs": Job.objects.filter(project_id=project_id).count(), + } + + +def link_processing_services_raw(source_project_id: int, target_project_id: int) -> int: + """Link ProcessingServices via raw SQL to avoid ORM column mismatch issues.""" + with connection.cursor() as cursor: + cursor.execute( + """ + INSERT INTO ml_processingservice_projects (processingservice_id, project_id) + SELECT processingservice_id, %s + FROM ml_processingservice_projects + WHERE project_id = %s + AND processingservice_id NOT IN ( + SELECT processingservice_id FROM ml_processingservice_projects WHERE project_id = %s + ) + """, + [target_project_id, source_project_id, target_project_id], + ) + return cursor.rowcount + + +class Command(BaseCommand): + help = "Move deployments and all related data from one project to another." 
+ + def add_arguments(self, parser): + parser.add_argument("--source-project", type=int, required=True, help="Source project ID") + parser.add_argument("--target-project", type=int, help="Target project ID (must already exist)") + parser.add_argument( + "--create-project", + type=str, + help="Create a new target project with this name (instead of --target-project)", + ) + parser.add_argument("--deployment-ids", type=str, required=True, help="Comma-separated deployment IDs to move") + parser.add_argument( + "--no-clone-pipelines", + action="store_true", + default=False, + help="Skip cloning pipeline configs to target project", + ) + parser.add_argument( + "--no-clone-collections", + action="store_true", + default=False, + help="Skip cloning mixed SourceImageCollections (images will just be removed from source collections)", + ) + parser.add_argument( + "--execute", + action="store_true", + default=False, + help="Actually execute the move (default is dry run)", + ) + + def log(self, msg, style=None): + """Write to stdout and logger.""" + if style: + self.stdout.write(style(msg)) + else: + self.stdout.write(msg) + logger.info(msg) + + def handle(self, *args, **options): + source_project_id = options["source_project"] + deployment_ids = [int(x.strip()) for x in options["deployment_ids"].split(",")] + execute = options["execute"] + clone_pipelines = not options["no_clone_pipelines"] + clone_collections = not options["no_clone_collections"] + + mode = "EXECUTE" if execute else "DRY RUN" + self.log(f"\n{'=' * 60}") + self.log(f" DEPLOYMENT REASSIGNMENT — {mode}") + self.log(f"{'=' * 60}") + + # --- Validate inputs --- + try: + source_project = Project.objects.get(pk=source_project_id) + except Project.DoesNotExist: + raise CommandError(f"Source project {source_project_id} does not exist") + + self.log(f"\nSource project: {source_project.name} (id={source_project.pk})") + + deployments = Deployment.objects.filter(pk__in=deployment_ids) + if deployments.count() != 
len(deployment_ids): + found = set(deployments.values_list("pk", flat=True)) + missing = set(deployment_ids) - found + raise CommandError(f"Deployments not found: {missing}") + + wrong_project = deployments.exclude(project_id=source_project_id) + if wrong_project.exists(): + wrong = {d.pk: d.project_id for d in wrong_project} + raise CommandError(f"Deployments not in source project {source_project_id}: {wrong}") + + # Target project resolution + create_project_name = options.get("create_project") + if create_project_name: + self.log(f"Target project: NEW — '{create_project_name}'") + target_project = None + elif options.get("target_project"): + try: + target_project = Project.objects.get(pk=options["target_project"]) + except Project.DoesNotExist: + raise CommandError(f"Target project {options['target_project']} does not exist") + create_project_name = None + self.log(f"Target project: {target_project.name} (id={target_project.pk})") + else: + raise CommandError("Must specify either --target-project or --create-project") + + self.log(f"Deployments to move: {deployment_ids}") + + # --- Per-deployment before snapshot --- + self.log(f"\n{'─' * 60}") + self.log(" BEFORE — Per-deployment breakdown") + self.log(f"{'─' * 60}") + + per_dep_snapshots = {} + for dep in deployments: + snap = collect_deployment_snapshot(dep.pk) + per_dep_snapshots[dep.pk] = snap + self.log(f"\n {dep.name} (id={dep.pk}):") + self.log(f" Project: {dep.project.name} (id={dep.project_id})") + dev_name = dep.device.name if dep.device else "None" + site_name = dep.research_site.name if dep.research_site else "None" + self.log(f" Device: {dev_name} (id={dep.device_id})") + self.log(f" Site: {site_name} (id={dep.research_site_id})") + self.log(f" S3 Source: id={dep.data_source_id}") + for model_name, count in snap.items(): + self.log(f" {model_name:20s} {count:>10,}") + + # Aggregate snapshot + self.log(f"\n{'─' * 60}") + self.log(" BEFORE — Aggregate totals") + self.log(f"{'─' * 60}") + + 
pre_snapshot = collect_aggregate_snapshot(deployment_ids) + for model_name, count in pre_snapshot.items(): + self.log(f" {model_name:20s} {count:>10,}") + + source_pre = collect_project_counts(source_project_id) + self.log(f"\n Source project totals ({source_project.name}):") + for model_name, count in source_pre.items(): + self.log(f" {model_name:20s} {count:>10,}") + + if target_project: + target_pre = collect_project_counts(target_project.pk) + self.log(f"\n Target project totals ({target_project.name}):") + for model_name, count in target_pre.items(): + self.log(f" {model_name:20s} {count:>10,}") + + # --- Shared resource analysis --- + self.log(f"\n{'─' * 60}") + self.log(" SHARED RESOURCE ANALYSIS") + self.log(f"{'─' * 60}") + + # S3StorageSources + s3_sources = {} + for dep in deployments: + if dep.data_source_id: + s3_sources[dep.data_source_id] = dep.data_source + for s3_id, s3 in s3_sources.items(): + other_deps = Deployment.objects.filter(data_source_id=s3_id).exclude(pk__in=deployment_ids) + if other_deps.exists(): + others = ", ".join(f"{d.name}(id={d.pk})" for d in other_deps[:5]) + n = other_deps.count() + self.log(f" S3StorageSource {s3_id} (project={s3.project_id}):" f" SHARED with {n} others [{others}]") + else: + self.log(f" S3StorageSource {s3_id} (project={s3.project_id}): exclusive to moved deployments") + + # Devices + devices = {} + for dep in deployments: + if dep.device_id: + devices[dep.device_id] = dep.device + for dev_id, dev in devices.items(): + other_deps = Deployment.objects.filter(device_id=dev_id).exclude(pk__in=deployment_ids) + action = ( + "no change needed (project=NULL)" + if dev.project_id is None + else ( + f"will CLONE (owned by source project {dev.project_id})" + if dev.project_id == source_project_id and other_deps.exists() + else f"will REASSIGN (owned by source project {dev.project_id})" + if dev.project_id == source_project_id + else f"no change needed (owned by project {dev.project_id})" + ) + ) + self.log(f" Device 
'{dev.name}' (id={dev_id}): {action}") + + # Sites + sites = {} + for dep in deployments: + if dep.research_site_id: + sites[dep.research_site_id] = dep.research_site + for site_id, site in sites.items(): + other_deps = Deployment.objects.filter(research_site_id=site_id).exclude(pk__in=deployment_ids) + action = ( + "no change needed (project=NULL)" + if site.project_id is None + else ( + f"will CLONE (owned by source project {site.project_id})" + if site.project_id == source_project_id and other_deps.exists() + else f"will REASSIGN (owned by source project {site.project_id})" + if site.project_id == source_project_id + else f"no change needed (owned by project {site.project_id})" + ) + ) + self.log(f" Site '{site.name}' (id={site_id}): {action}") + + # Collections + mixed_collections = [] + exclusive_collections = [] + collections = SourceImageCollection.objects.filter( + project_id=source_project_id, + images__deployment_id__in=deployment_ids, + ).distinct() + for coll in collections: + target_count = coll.images.filter(deployment_id__in=deployment_ids).count() + other_count = coll.images.exclude(deployment_id__in=deployment_ids).count() + if other_count > 0: + mixed_collections.append((coll, target_count, other_count)) + action = f"MIXED — {target_count} moving, {other_count} staying" + if clone_collections: + action += " → will SPLIT (clone to target, remove from source)" + else: + action += " → will REMOVE moved images from source only" + else: + exclusive_collections.append(coll) + action = f"EXCLUSIVE — all {target_count} images moving → will REASSIGN" + self.log(f" Collection '{coll.name}' (id={coll.pk}): {action}") + + # Taxa + taxa_ids = set( + Occurrence.objects.filter(deployment_id__in=deployment_ids) + .exclude(determination__isnull=True) + .values_list("determination_id", flat=True) + .distinct() + ) + self.log(f"\n Taxa referenced by moved occurrences: {len(taxa_ids)}") + + if not execute: + self.log( + f"\n{'=' * 60}\n DRY RUN COMPLETE — no changes 
made.\n" + f" Re-run with --execute to proceed.\n{'=' * 60}", + style=self.style.WARNING, + ) + return + + # === EXECUTE === + self.log(f"\n{'─' * 60}") + self.log(" EXECUTING MOVE") + self.log(f"{'─' * 60}") + + with transaction.atomic(): + # 0. Create target project inside transaction + if create_project_name: + target_project = Project(name=create_project_name, owner=source_project.owner) + target_project.save() + for membership in source_project.project_memberships.all(): + target_project.members.add(membership.user) + self.log(f" [1/12] Created project '{target_project.name}' (id={target_project.pk})") + target_pre = collect_project_counts(target_project.pk) + else: + self.log(f" [1/12] Using existing project '{target_project.name}' (id={target_project.pk})") + + target_id = target_project.pk + + # 1. Clone or reassign S3StorageSources + s3_clone_map = {} + for s3_id, s3 in s3_sources.items(): + if s3.project_id == source_project_id: + other_deps = Deployment.objects.filter(data_source_id=s3_id).exclude(pk__in=deployment_ids) + if other_deps.exists(): + old_pk = s3.pk + s3.pk = None + s3.project_id = target_id + s3.save() + s3_clone_map[old_pk] = s3.pk + self.log(f" [2/12] Cloned S3StorageSource {old_pk} → {s3.pk}") + else: + s3.project_id = target_id + s3.save() + self.log(f" [2/12] Reassigned S3StorageSource {s3_id}") + + # 2. Clone or reassign Devices + device_clone_map = {} + for dev_id, dev in devices.items(): + if dev.project_id == source_project_id: + other_deps = Deployment.objects.filter(device_id=dev_id).exclude(pk__in=deployment_ids) + if other_deps.exists(): + old_pk = dev.pk + dev.pk = None + dev.project_id = target_id + dev.save() + device_clone_map[old_pk] = dev.pk + self.log(f" [3/12] Cloned Device '{dev.name}' {old_pk} → {dev.pk}") + else: + dev.project_id = target_id + dev.save() + self.log(f" [3/12] Reassigned Device '{dev.name}' {dev_id}") + + # 3. 
Clone or reassign Sites + site_clone_map = {} + for site_id, site in sites.items(): + if site.project_id == source_project_id: + other_deps = Deployment.objects.filter(research_site_id=site_id).exclude(pk__in=deployment_ids) + if other_deps.exists(): + old_pk = site.pk + site.pk = None + site.project_id = target_id + site.save() + site_clone_map[old_pk] = site.pk + self.log(f" [4/12] Cloned Site '{site.name}' {old_pk} → {site.pk}") + else: + site.project_id = target_id + site.save() + self.log(f" [4/12] Reassigned Site '{site.name}' {site_id}") + + # 4. Update Deployments + for dep in deployments: + old_project = dep.project_id + dep.project_id = target_id + if dep.data_source_id in s3_clone_map: + dep.data_source_id = s3_clone_map[dep.data_source_id] + if dep.device_id in device_clone_map: + dep.device_id = device_clone_map[dep.device_id] + if dep.research_site_id in site_clone_map: + dep.research_site_id = site_clone_map[dep.research_site_id] + dep.save(update_calculated_fields=False, regroup_async=False) + self.log( + f" [5/12] Moved Deployment '{dep.name}' (id={dep.pk}) " f"project {old_project} → {target_id}" + ) + + # 5. Bulk update Events + event_count = Event.objects.filter(deployment_id__in=deployment_ids).update(project_id=target_id) + self.log(f" [6/12] Updated {event_count:,} Events") + + # 6. Bulk update SourceImages + img_count = SourceImage.objects.filter(deployment_id__in=deployment_ids).update(project_id=target_id) + self.log(f" [7/12] Updated {img_count:,} SourceImages") + + # 7. Bulk update Occurrences + occ_count = Occurrence.objects.filter(deployment_id__in=deployment_ids).update(project_id=target_id) + self.log(f" [8/12] Updated {occ_count:,} Occurrences") + + # 8. Bulk update Jobs + job_count = Job.objects.filter(deployment_id__in=deployment_ids).update(project_id=target_id) + self.log(f" [9/12] Updated {job_count:,} Jobs") + + # 9. 
Handle collections + for coll, target_count, other_count in mixed_collections: + moved_images = coll.images.filter(deployment_id__in=deployment_ids) + moved_image_ids = list(moved_images.values_list("pk", flat=True)) + + if clone_collections: + new_coll = SourceImageCollection.objects.create( + name=coll.name, + project_id=target_id, + description=coll.description or "", + ) + new_coll.images.set(moved_image_ids) + self.log( + f" [10/12] Split collection '{coll.name}': " + f"cloned {len(moved_image_ids):,} images → target collection id={new_coll.pk}" + ) + + coll.images.remove(*moved_image_ids) + self.log(f" [10/12] Removed {len(moved_image_ids):,} images from source collection '{coll.name}'") + + for coll in exclusive_collections: + coll.project_id = target_id + coll.save() + self.log(f" [10/12] Reassigned collection '{coll.name}' (id={coll.pk})") + + # 10. Clone pipeline configs + if clone_pipelines: + existing_pipelines = set( + ProjectPipelineConfig.objects.filter(project_id=target_id).values_list("pipeline_id", flat=True) + ) + cloned_count = 0 + for config in ProjectPipelineConfig.objects.filter(project_id=source_project_id): + if config.pipeline_id not in existing_pipelines: + ProjectPipelineConfig.objects.create(project_id=target_id, pipeline_id=config.pipeline_id) + cloned_count += 1 + total = ProjectPipelineConfig.objects.filter(project_id=target_id).count() + self.log(f" [11/12] Pipeline configs: cloned {cloned_count}, target now has {total}") + + # 11. Link ProcessingServices (raw SQL to avoid ORM column mismatch) + linked = link_processing_services_raw(source_project_id, target_id) + self.log(f" [11/12] Linked {linked} ProcessingService(s) to target project") + + # 12. 
Link taxa to target project + if taxa_ids: + for taxon in Taxon.objects.filter(pk__in=taxa_ids): + taxon.projects.add(target_project) + self.log(f" [12/12] Linked {len(taxa_ids):,} taxa to target project") + else: + self.log(" [12/12] No taxa to link (no occurrences with determinations)") + + # --- Post-move: update cached fields (outside transaction) --- + self.log(f"\n{'─' * 60}") + self.log(" UPDATING CACHED FIELDS") + self.log(f"{'─' * 60}") + for dep in Deployment.objects.filter(pk__in=deployment_ids): + dep.update_calculated_fields(save=True) + self.log(f" Updated cached fields for {dep.name} (id={dep.pk})") + + # --- Post-move: per-deployment after snapshot --- + self.log(f"\n{'─' * 60}") + self.log(" AFTER — Per-deployment breakdown") + self.log(f"{'─' * 60}") + + all_ok = True + for dep in Deployment.objects.filter(pk__in=deployment_ids).select_related( + "project", "device", "research_site" + ): + snap_after = collect_deployment_snapshot(dep.pk) + snap_before = per_dep_snapshots[dep.pk] + self.log(f"\n {dep.name} (id={dep.pk}):") + self.log(f" Project: {dep.project.name} (id={dep.project_id})") + dev_name = dep.device.name if dep.device else "None" + site_name = dep.research_site.name if dep.research_site else "None" + self.log(f" Device: {dev_name} (id={dep.device_id})") + self.log(f" Site: {site_name} (id={dep.research_site_id})") + self.log(f" S3 Source: id={dep.data_source_id}") + for model_name in snap_after: + before = snap_before[model_name] + after = snap_after[model_name] + status = "OK" if before == after else "MISMATCH" + if status != "OK": + all_ok = False + self.log(f" {model_name:20s} before={before:>10,} after={after:>10,} {status}") + + # --- Post-move: aggregate snapshot --- + self.log(f"\n{'─' * 60}") + self.log(" AFTER — Aggregate totals") + self.log(f"{'─' * 60}") + + post_snapshot = collect_aggregate_snapshot(deployment_ids) + for model_name, count in post_snapshot.items(): + pre_count = pre_snapshot[model_name] + status = "OK" if 
count == pre_count else f"MISMATCH (was {pre_count})" + if count != pre_count: + all_ok = False + self.log(f" {model_name:20s} {count:>10,} {status}") + + source_post = collect_project_counts(source_project_id) + self.log(f"\n Source project ({source_project.name}) after move:") + for model_name, count in source_post.items(): + diff = source_pre[model_name] - count + self.log(f" {model_name:20s} {count:>10,} (moved {diff:,})") + + target_post = collect_project_counts(target_id) + self.log(f"\n Target project (id={target_id}) after move:") + for model_name, count in target_post.items(): + self.log(f" {model_name:20s} {count:>10,}") + + # --- Validation --- + self.log(f"\n{'─' * 60}") + self.log(" VALIDATION") + self.log(f"{'─' * 60}") + errors = [] + + # FK integrity: all moved data points to target project + for model_name, model_cls, filter_field in [ + ("Events", Event, "deployment_id__in"), + ("SourceImages", SourceImage, "deployment_id__in"), + ("Occurrences", Occurrence, "deployment_id__in"), + ("Jobs", Job, "deployment_id__in"), + ]: + bad = model_cls.objects.filter(**{filter_field: deployment_ids}).exclude(project_id=target_id).count() + if bad: + errors.append(f"{bad} {model_name} still pointing to wrong project") + self.log(f" FAIL: {bad} {model_name} still pointing to wrong project") + else: + self.log(f" OK: All {model_name} point to target project") + + # Indirect access consistency + dets_via_project = Detection.objects.filter(source_image__project_id=target_id).count() + dets_via_dep = Detection.objects.filter(source_image__deployment_id__in=deployment_ids).count() + if dets_via_project != dets_via_dep: + errors.append(f"Detection count mismatch: via project={dets_via_project}, via deployment={dets_via_dep}") + self.log( + f" FAIL: Detection count mismatch: via project={dets_via_project}, via deployment={dets_via_dep}" + ) + else: + self.log( + f" OK: Detections consistent ({dets_via_project:,} via project, {dets_via_dep:,} via deployment)" + ) + + 
cls_via_project = Classification.objects.filter(detection__source_image__project_id=target_id).count() + cls_via_dep = Classification.objects.filter(detection__source_image__deployment_id__in=deployment_ids).count() + if cls_via_project != cls_via_dep: + errors.append( + f"Classification count mismatch: via project={cls_via_project}, via deployment={cls_via_dep}" + ) + else: + self.log(f" OK: Classifications consistent ({cls_via_project:,})") + + idents_via_project = Identification.objects.filter(occurrence__project_id=target_id).count() + idents_via_dep = Identification.objects.filter(occurrence__deployment_id__in=deployment_ids).count() + if idents_via_project != idents_via_dep: + errors.append( + f"Identification count mismatch: via project={idents_via_project}, via deployment={idents_via_dep}" + ) + else: + self.log(f" OK: Identifications consistent ({idents_via_project:,})") + + # Source project has no leaked data from moved deployments + for model_name, model_cls in [("Events", Event), ("SourceImages", SourceImage), ("Occurrences", Occurrence)]: + leaked = model_cls.objects.filter(project_id=source_project_id, deployment_id__in=deployment_ids).count() + if leaked: + errors.append(f"{leaked} {model_name} leaked in source project") + else: + self.log(f" OK: No {model_name} leaked in source project") + + # Collection integrity + source_colls = SourceImageCollection.objects.filter(project_id=source_project_id) + for coll in source_colls: + leaked = coll.images.filter(deployment_id__in=deployment_ids).count() + if leaked: + errors.append(f"Source collection '{coll.name}' still has {leaked} moved images") + self.log(f" OK: No moved images in source collections" if not any("collection" in e for e in errors) else "") + + # Conservation: source + target = original totals + for model_name in source_pre: + combined = source_post[model_name] + target_post[model_name] + original = source_pre[model_name] + target_pre.get(model_name, 0) + if combined != original: + 
errors.append( + f"Conservation failed for {model_name}: " + f"source({source_post[model_name]}) + target({target_post[model_name]}) = {combined} " + f"!= original({original})" + ) + else: + self.log(f" OK: Conservation check passed for {model_name} ({combined:,} = {original:,})") + + # Per-deployment row count integrity + if not all_ok: + errors.append("Per-deployment row counts changed (see breakdown above)") + + # --- Final verdict --- + self.log(f"\n{'=' * 60}") + if errors: + self.log(" VALIDATION FAILED", style=self.style.ERROR) + for err in errors: + self.log(f" ✗ {err}", style=self.style.ERROR) + else: + self.log(" ALL VALIDATION CHECKS PASSED", style=self.style.SUCCESS) + self.log(f"{'=' * 60}") From 40173dd210b307985a0d44b78b13a852015841f1 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Wed, 25 Mar 2026 15:18:56 -0700 Subject: [PATCH 2/9] docs: add deployment reassignment guide Documents the full relationship map, edge cases, and validation checklist for moving deployments between projects. Co-Authored-By: Claude --- .../planning/deployment-reassignment-guide.md | 144 ++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 docs/claude/planning/deployment-reassignment-guide.md diff --git a/docs/claude/planning/deployment-reassignment-guide.md b/docs/claude/planning/deployment-reassignment-guide.md new file mode 100644 index 000000000..062778570 --- /dev/null +++ b/docs/claude/planning/deployment-reassignment-guide.md @@ -0,0 +1,144 @@ +# Deployment Reassignment Guide + +Moving deployments (stations) and all their associated data from one project to another. + +## Overview + +A "deployment reassignment" transfers one or more Deployment records and every piece of data hanging off them to a different Project. The physical data in S3 does not move — only database references change. + +This guide covers the full relationship map, edge cases, and a validation checklist. 
An automated management command (`reassign_deployments`) implements these steps. + +## Complete Relationship Map + +### Tier 1 — Models with direct `project` ForeignKey (must update `project_id`) + +| Model | on_delete | Notes | +|---|---|---| +| **Deployment** | SET_NULL | Primary target of the move | +| **Event** | SET_NULL | Linked to deployment; also has own project FK | +| **SourceImage** | SET_NULL | Linked to deployment; also has own project FK | +| **Occurrence** | SET_NULL | Linked to deployment; also has own project FK | +| **Job** | CASCADE | Linked to deployment; also has own project FK | + +### Tier 2 — Models with indirect project access (no `project_id` column, no update needed) + +| Model | Access path | Notes | +|---|---|---| +| **Detection** | `source_image.project` | Follows SourceImage automatically | +| **Classification** | `detection.source_image.project` | Follows Detection chain | +| **Identification** | `occurrence.project` | Follows Occurrence automatically | + +### Tier 3 — Shared/linked resources (may need cloning or re-linking) + +| Resource | Relationship | Reassignment strategy | +|---|---|---| +| **S3StorageSource** | FK on Deployment (`data_source`) | Clone if `project_id` points to source project; update deployment FK | +| **Device** | FK on Deployment | Clone if `project_id` = source project; or set NULL (shared) | +| **Site** | FK on Deployment (`research_site`) | Clone if `project_id` = source project; or set NULL (shared) | +| **SourceImageCollection** | M2M with SourceImage, FK to Project | Split: remove moved images from source collection; optionally create mirror collection in target project | +| **Tag** | FK to Project (CASCADE) | Not deployment-scoped — usually not moved | +| **TaxaList** | M2M to Project | Add target project to M2M if relevant lists exist | +| **Taxon** | M2M to Project (`projects`) | Add target project to M2M for taxa referenced by moved occurrences | +| **ProcessingService** | M2M to Project | Add 
target project to M2M | +| **ProjectPipelineConfig** | FK to Project (through model for Pipeline↔Project) | Clone configs for target project | + +### Tier 4 — Target project setup (create if new) + +| Resource | Action | +|---|---| +| **Project** | Create with owner, name, description | +| **UserProjectMembership** | Copy relevant members | +| **ProjectPipelineConfig** | Clone from source project | +| **ProcessingService** links | Add to target project M2M | + +## Edge Cases + +### Mixed SourceImageCollections + +Collections can contain images from multiple deployments. When moving deployments: + +1. **Identify mixed collections** — collections in the source project containing images from both moving and staying deployments. +2. **Strategy options:** + - **Remove moved images** from source collection (images lose collection membership). + - **Clone collection** in target project containing only the moved images. + - **Both** — remove from source AND create in target (recommended). +3. Collections with `project_id` pointing to source project that ONLY contain moved images can be reassigned directly. + +### Shared Devices, Sites, S3StorageSources + +These resources have a nullable `project` FK: +- If `project_id IS NULL` → already shared, no action needed. +- If `project_id = source_project` → either clone for target project or set NULL to share. +- If `project_id = some_other_project` → leave as-is. + +**Recommendation:** Clone rather than nullify, to maintain clear ownership. + +### Cached/Denormalized Fields + +After moving, these must be recalculated: +- `Deployment.events_count`, `captures_count`, `occurrences_count`, `detections_count`, `taxa_count` +- `Project` summary statistics +- `Event` cached counts +- Call `update_calculated_fields()` on affected deployments and both projects. + +### Taxa M2M + +Occurrences reference Taxa via `determination`. The Taxon↔Project M2M (`taxon.projects`) controls which taxa appear in a project's taxonomy browser. 
After moving occurrences, add their referenced taxa to the target project's M2M. + +### Jobs + +Jobs have both `project` and `deployment` FKs. Historical jobs should be moved with their deployment to maintain audit trail. Pipeline references within jobs don't need changing (pipelines are shared objects). + +## Validation Checklist + +After a reassignment, verify: + +### Row Count Integrity +- [ ] Total rows across source + target = original total (no data lost or duplicated) +- [ ] Per-deployment counts match pre-move snapshot for: Events, SourceImages, Occurrences, Detections, Classifications, Identifications, Jobs + +### FK Integrity +- [ ] All Events for moved deployments have `project_id` = target project +- [ ] All SourceImages for moved deployments have `project_id` = target project +- [ ] All Occurrences for moved deployments have `project_id` = target project +- [ ] All Jobs for moved deployments have `project_id` = target project +- [ ] No orphaned records (events/images/occurrences with NULL project that shouldn't be) + +### Indirect Relationships +- [ ] Detections accessible via `source_image__project` = target project +- [ ] Classifications accessible via `detection__source_image__project` = target project +- [ ] Identifications accessible via `occurrence__project` = target project + +### Shared Resources +- [ ] Moved deployments point to correct S3StorageSource (cloned or shared) +- [ ] Moved deployments point to correct Device (cloned or shared) +- [ ] Moved deployments point to correct Site (cloned or shared) +- [ ] Source project's remaining deployments still point to valid resources + +### Collections +- [ ] No SourceImageCollection in source project contains images from moved deployments +- [ ] If collections were cloned to target, they contain the correct images + +### ML Configuration +- [ ] Target project has appropriate ProjectPipelineConfigs +- [ ] Target project linked to appropriate ProcessingServices + +### Cached Fields +- [ ] 
`update_calculated_fields()` called on all moved deployments +- [ ] Both source and target project stats are accurate + +### Taxa +- [ ] All taxa referenced by moved occurrences' `determination` are linked to target project M2M + +## Future: Organization-Level Object + +When an org-level model is added, deployment reassignment becomes simpler: +- Projects under the same org can share devices, sites, S3 sources, taxa, and pipeline configs natively. +- The reassignment reduces to updating `project_id` on the core models. +- Cross-org moves would still require the full cloning strategy. + +## Related Files + +- Management command: `ami/main/management/commands/reassign_deployments.py` +- Models: `ami/main/models.py`, `ami/ml/models/`, `ami/jobs/models.py` +- Filters: `ami/main/models_future/filters.py` From 3b02b5293d45804ad4c230e6717c4e8d7ae1c3c5 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Wed, 25 Mar 2026 15:22:28 -0700 Subject: [PATCH 3/9] fix: update event and project cached fields after deployment transfer The reassign_deployments command now also recalculates: - Event cached counts (captures_count, detections_count, occurrences_count) - Both source and target project related calculated fields Co-Authored-By: Claude --- .../commands/reassign_deployments.py | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/ami/main/management/commands/reassign_deployments.py b/ami/main/management/commands/reassign_deployments.py index 56132f82a..dd990434f 100644 --- a/ami/main/management/commands/reassign_deployments.py +++ b/ami/main/management/commands/reassign_deployments.py @@ -475,9 +475,28 @@ def handle(self, *args, **options): self.log(f"\n{'─' * 60}") self.log(" UPDATING CACHED FIELDS") self.log(f"{'─' * 60}") + + # Update deployment cached counts for dep in Deployment.objects.filter(pk__in=deployment_ids): dep.update_calculated_fields(save=True) - self.log(f" Updated cached fields for {dep.name} (id={dep.pk})") + self.log(f" 
Deployment '{dep.name}' (id={dep.pk}): cached fields updated") + + # Update event cached counts for moved events + from ami.main.models import update_calculated_fields_for_events + + moved_event_pks = list(Event.objects.filter(deployment_id__in=deployment_ids).values_list("pk", flat=True)) + if moved_event_pks: + update_calculated_fields_for_events(pks=moved_event_pks) + self.log(f" Updated cached fields for {len(moved_event_pks)} events") + + # Update both projects' related calculated fields (events + deployments) + self.log(f" Updating source project cached fields...") + source_project.update_related_calculated_fields() + self.log(f" Source project '{source_project.name}': related fields updated") + + self.log(f" Updating target project cached fields...") + target_project.update_related_calculated_fields() + self.log(f" Target project (id={target_id}): related fields updated") # --- Post-move: per-deployment after snapshot --- self.log(f"\n{'─' * 60}") From bf1c690263bf161cd81673aeff87f7c3d04bd9b5 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Wed, 25 Mar 2026 15:33:42 -0700 Subject: [PATCH 4/9] refactor: rename to move_project_data with scope warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Renamed from reassign_deployments to move_project_data to better communicate the scope of the operation (all occurrences, identifications, classifications, etc. 
— not just deployment records) - Added conditional scope warning that shows full data weight when processed data exists, or a simple note for unprocessed transfers - Added identifier membership: users who made identifications on moved data are auto-added to the target project with Identifier role - Added default filter config copy (score threshold, include/exclude taxa, default processing pipeline) Co-Authored-By: Claude --- ...gn_deployments.py => move_project_data.py} | 109 +++++++++++++++++- 1 file changed, 103 insertions(+), 6 deletions(-) rename ami/main/management/commands/{reassign_deployments.py => move_project_data.py} (83%) diff --git a/ami/main/management/commands/reassign_deployments.py b/ami/main/management/commands/move_project_data.py similarity index 83% rename from ami/main/management/commands/reassign_deployments.py rename to ami/main/management/commands/move_project_data.py index dd990434f..b70d7fec3 100644 --- a/ami/main/management/commands/reassign_deployments.py +++ b/ami/main/management/commands/move_project_data.py @@ -1,15 +1,19 @@ """ -Management command to move deployments (and all related data) from one project to another. +Move deployments and ALL associated data from one project to another. + +This is a comprehensive data transfer: deployments, source images, events, +occurrences, detections, classifications, identifications, jobs, collection +memberships, taxa links, pipeline configs, and processing service links. 
Usage: # Dry run (default) — shows what would happen - python manage.py reassign_deployments --source-project 20 --target-project 99 --deployment-ids 60,61 + python manage.py move_project_data --source-project 20 --target-project 99 --deployment-ids 60,61 # Execute the move - python manage.py reassign_deployments --source-project 20 --target-project 99 --deployment-ids 60,61 --execute + python manage.py move_project_data --source-project 20 --target-project 99 --deployment-ids 60,61 --execute # Create a new target project - python manage.py reassign_deployments --source-project 20 \ + python manage.py move_project_data --source-project 20 \ --create-project "Nunavik" --deployment-ids 60,61 --execute See docs/claude/planning/deployment-reassignment-guide.md for the full relationship map. @@ -97,7 +101,7 @@ def link_processing_services_raw(source_project_id: int, target_project_id: int) class Command(BaseCommand): - help = "Move deployments and all related data from one project to another." + help = "Move deployments and all associated data (images, occurrences, identifications, etc.) between projects." 
def add_arguments(self, parser): parser.add_argument("--source-project", type=int, required=True, help="Source project ID") @@ -144,7 +148,7 @@ def handle(self, *args, **options): mode = "EXECUTE" if execute else "DRY RUN" self.log(f"\n{'=' * 60}") - self.log(f" DEPLOYMENT REASSIGNMENT — {mode}") + self.log(f" MOVE PROJECT DATA — {mode}") self.log(f"{'=' * 60}") # --- Validate inputs --- @@ -312,6 +316,64 @@ def handle(self, *args, **options): ) self.log(f"\n Taxa referenced by moved occurrences: {len(taxa_ids)}") + # Identifiers: users who made identifications on moved data + identifier_users = set( + Identification.objects.filter(occurrence__deployment_id__in=deployment_ids) + .exclude(user__isnull=True) + .values_list("user_id", flat=True) + .distinct() + ) + source_member_ids = set(source_project.members.values_list("pk", flat=True)) + if identifier_users: + from ami.users.models import User + + self.log(f"\n Identifiers (users with identifications on moved data):") + for uid in identifier_users: + user = User.objects.get(pk=uid) + is_source_member = uid in source_member_ids + self.log( + f" {user.email}: source project member={is_source_member}" f" → will add to target project" + ) + + # Default filter config + self.log(f"\n Source project default filters:") + include_taxa = list(source_project.default_filters_include_taxa.values_list("name", flat=True)) + exclude_taxa = list(source_project.default_filters_exclude_taxa.values_list("name", flat=True)) + self.log(f" Score threshold: {source_project.default_filters_score_threshold}") + self.log(f" Include taxa: {include_taxa or '(none)'}") + self.log(f" Exclude taxa: {exclude_taxa or '(none)'}") + if include_taxa or exclude_taxa: + self.log(" → Will copy default filter config to target project") + + # --- Scope warning (conditional) --- + has_processed_data = pre_snapshot["detections"] > 0 + has_identifications = pre_snapshot["identifications"] > 0 + has_classifications = pre_snapshot["classifications"] > 0 + + 
self.log(f"\n{'─' * 60}") + if has_processed_data: + self.log(" SCOPE WARNING — This is a significant data transfer:") + self.log(f" {pre_snapshot['source_images']:>10,} source images") + if has_processed_data: + self.log(f" {pre_snapshot['detections']:>10,} detections (ML predictions)") + if has_classifications: + self.log(f" {pre_snapshot['classifications']:>10,} classifications") + if has_identifications: + self.log(f" {pre_snapshot['identifications']:>10,} identifications (human reviews)") + self.log(f" {pre_snapshot['occurrences']:>10,} occurrences") + self.log(f" {pre_snapshot['jobs']:>10,} job records") + self.log(f" {len(taxa_ids):>10,} taxa references") + if identifier_users: + self.log(f" {len(identifier_users):>10,} identifier user(s)") + self.log("") + self.log(" All of this data will be moved to the target project.") + self.log(" The source project will no longer contain this data.") + else: + self.log(" This is a lightweight transfer (unprocessed image data).") + self.log(f" {pre_snapshot['source_images']:>10,} source images") + self.log(f" {pre_snapshot['events']:>10,} events") + self.log(f"{'─' * 60}") + if not execute: self.log( f"\n{'=' * 60}\n DRY RUN COMPLETE — no changes made.\n" @@ -471,6 +533,41 @@ def handle(self, *args, **options): else: self.log(" [12/12] No taxa to link (no occurrences with determinations)") + # 13. 
Add identifier users to target project with Identifier role + if identifier_users: + from ami.users.models import User + from ami.users.roles import Identifier + + target_member_ids = set(target_project.members.values_list("pk", flat=True)) + added_count = 0 + for uid in identifier_users: + if uid not in target_member_ids: + user = User.objects.get(pk=uid) + target_project.members.add(user) + # Assign Identifier role (can make identifications) + Identifier.assign_user(user, target_project) + added_count += 1 + self.log(f" [13/14] Added {user.email} as Identifier") + if added_count == 0: + self.log(" [13/14] All identifiers already members") + else: + self.log(f" [13/14] Added {added_count} identifier(s) to target") + else: + self.log(" [13/14] No identifier users to add") + + # 14. Copy default filter config to target project + if include_taxa or exclude_taxa: + for t in source_project.default_filters_include_taxa.all(): + target_project.default_filters_include_taxa.add(t) + for t in source_project.default_filters_exclude_taxa.all(): + target_project.default_filters_exclude_taxa.add(t) + self.log(" [14/14] Copied default filter taxa config") + target_project.default_filters_score_threshold = source_project.default_filters_score_threshold + if source_project.default_processing_pipeline: + target_project.default_processing_pipeline = source_project.default_processing_pipeline + target_project.save() + self.log(f" [14/14] Score threshold: " f"{target_project.default_filters_score_threshold}") + # --- Post-move: update cached fields (outside transaction) --- self.log(f"\n{'─' * 60}") self.log(" UPDATING CACHED FIELDS") From 375f81ab4ff2b75c28c87bbc773c8d516e800d36 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Wed, 25 Mar 2026 15:33:54 -0700 Subject: [PATCH 5/9] docs: update guide to reference move_project_data command Co-Authored-By: Claude --- docs/claude/planning/deployment-reassignment-guide.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/docs/claude/planning/deployment-reassignment-guide.md b/docs/claude/planning/deployment-reassignment-guide.md index 062778570..c03c740f4 100644 --- a/docs/claude/planning/deployment-reassignment-guide.md +++ b/docs/claude/planning/deployment-reassignment-guide.md @@ -6,7 +6,7 @@ Moving deployments (stations) and all their associated data from one project to A "deployment reassignment" transfers one or more Deployment records and every piece of data hanging off them to a different Project. The physical data in S3 does not move — only database references change. -This guide covers the full relationship map, edge cases, and a validation checklist. An automated management command (`reassign_deployments`) implements these steps. +This guide covers the full relationship map, edge cases, and a validation checklist. An automated management command (`move_project_data`) implements these steps. ## Complete Relationship Map @@ -139,6 +139,6 @@ When an org-level model is added, deployment reassignment becomes simpler: ## Related Files -- Management command: `ami/main/management/commands/reassign_deployments.py` +- Management command: `ami/main/management/commands/move_project_data.py` - Models: `ami/main/models.py`, `ami/ml/models/`, `ami/jobs/models.py` - Filters: `ami/main/models_future/filters.py` From c02cad02fb72f0d51c3d84e4b87eaaa7b832d740 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Wed, 25 Mar 2026 15:40:17 -0700 Subject: [PATCH 6/9] feat: preserve identifier roles, link TaxaLists, add TaxaList analysis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Identifier users now get their existing source project role preserved (e.g. 
ProjectManager stays ProjectManager), falling back to Identifier role only for users who aren't source project members - TaxaLists linked to source project are now also linked to target project (M2M add, not move — TaxaLists can be shared) - Dry-run output shows TaxaLists and role assignments that will be made Co-Authored-By: Claude --- .../management/commands/move_project_data.py | 62 +++++++++++++------ 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/ami/main/management/commands/move_project_data.py b/ami/main/management/commands/move_project_data.py index b70d7fec3..23eb91fc9 100644 --- a/ami/main/management/commands/move_project_data.py +++ b/ami/main/management/commands/move_project_data.py @@ -35,6 +35,7 @@ Project, SourceImage, SourceImageCollection, + TaxaList, Taxon, ) from ami.ml.models import ProjectPipelineConfig @@ -323,17 +324,24 @@ def handle(self, *args, **options): .values_list("user_id", flat=True) .distinct() ) - source_member_ids = set(source_project.members.values_list("pk", flat=True)) + # Map each identifier to the role they should get in the target project + identifier_role_map = {} # user_id -> role_class if identifier_users: from ami.users.models import User + from ami.users.roles import Identifier, Role self.log(f"\n Identifiers (users with identifications on moved data):") for uid in identifier_users: user = User.objects.get(pk=uid) - is_source_member = uid in source_member_ids - self.log( - f" {user.email}: source project member={is_source_member}" f" → will add to target project" - ) + source_role = Role.get_primary_role(source_project, user) + if source_role: + role_to_assign = source_role + role_source = "source project role" + else: + role_to_assign = Identifier + role_source = "default (not a source member)" + identifier_role_map[uid] = role_to_assign + self.log(f" {user.email}: " f"{role_to_assign.display_name} ({role_source})") # Default filter config self.log(f"\n Source project default filters:") @@ -345,6 
+353,17 @@ def handle(self, *args, **options): if include_taxa or exclude_taxa: self.log(" → Will copy default filter config to target project") + # TaxaLists + source_taxa_lists = TaxaList.objects.filter(projects=source_project) + if source_taxa_lists.exists(): + self.log(f"\n TaxaLists linked to source project:") + for tl in source_taxa_lists: + shared = tl.projects.count() + self.log( + f" '{tl.name}' (id={tl.pk}): {tl.taxa.count()} taxa," + f" shared with {shared} project(s) → will link to target" + ) + # --- Scope warning (conditional) --- has_processed_data = pre_snapshot["detections"] > 0 has_identifications = pre_snapshot["identifications"] > 0 @@ -533,40 +552,47 @@ def handle(self, *args, **options): else: self.log(" [12/12] No taxa to link (no occurrences with determinations)") - # 13. Add identifier users to target project with Identifier role - if identifier_users: + # 13. Add identifier users to target project, preserving roles + if identifier_role_map: from ami.users.models import User - from ami.users.roles import Identifier target_member_ids = set(target_project.members.values_list("pk", flat=True)) added_count = 0 - for uid in identifier_users: + for uid, role_cls in identifier_role_map.items(): if uid not in target_member_ids: user = User.objects.get(pk=uid) target_project.members.add(user) - # Assign Identifier role (can make identifications) - Identifier.assign_user(user, target_project) + role_cls.assign_user(user, target_project) added_count += 1 - self.log(f" [13/14] Added {user.email} as Identifier") + self.log(f" [13/16] Added {user.email}" f" as {role_cls.display_name}") if added_count == 0: - self.log(" [13/14] All identifiers already members") + self.log(" [13/16] All identifiers already members") else: - self.log(f" [13/14] Added {added_count} identifier(s) to target") + self.log(f" [13/16] Added {added_count} identifier(s)") + else: + self.log(" [13/16] No identifier users to add") + + # 14. 
Link TaxaLists to target project + source_taxa_lists = TaxaList.objects.filter(projects=source_project) + if source_taxa_lists.exists(): + for tl in source_taxa_lists: + tl.projects.add(target_project) + self.log(f" [14/16] Linked {source_taxa_lists.count()}" f" TaxaList(s) to target project") else: - self.log(" [13/14] No identifier users to add") + self.log(" [14/16] No TaxaLists to link") - # 14. Copy default filter config to target project + # 15. Copy default filter config to target project if include_taxa or exclude_taxa: for t in source_project.default_filters_include_taxa.all(): target_project.default_filters_include_taxa.add(t) for t in source_project.default_filters_exclude_taxa.all(): target_project.default_filters_exclude_taxa.add(t) - self.log(" [14/14] Copied default filter taxa config") + self.log(" [15/16] Copied default filter taxa config") target_project.default_filters_score_threshold = source_project.default_filters_score_threshold if source_project.default_processing_pipeline: target_project.default_processing_pipeline = source_project.default_processing_pipeline target_project.save() - self.log(f" [14/14] Score threshold: " f"{target_project.default_filters_score_threshold}") + self.log(f" [16/16] Score threshold:" f" {target_project.default_filters_score_threshold}") # --- Post-move: update cached fields (outside transaction) --- self.log(f"\n{'─' * 60}") From 881e9132773e04a1bb68f64761b5de10e4fd1fba Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Thu, 26 Mar 2026 12:21:12 -0700 Subject: [PATCH 7/9] docs: add project portability spec (UUIDs, export/import, natural keys) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Design spec for making projects portable between Antenna instances. Covers UUID fields, Organization model, natural keys, export/import commands, and Darwin Core integration. 
Includes 21 research areas spanning internal data validation, biodiversity standards (GBIF, iNaturalist, BOLD, Camtrap DP), and patterns from non-biodiversity applications (GitLab, WordPress, Notion, Jira). Status: draft — pending research and validation. Co-Authored-By: Claude --- .../planning/project-portability-spec.md | 470 ++++++++++++++++++ 1 file changed, 470 insertions(+) create mode 100644 docs/claude/planning/project-portability-spec.md diff --git a/docs/claude/planning/project-portability-spec.md b/docs/claude/planning/project-portability-spec.md new file mode 100644 index 000000000..dead1a4dd --- /dev/null +++ b/docs/claude/planning/project-portability-spec.md @@ -0,0 +1,470 @@ +# Project Portability: Export/Import with UUIDs and Natural Keys + +**Date:** 2026-03-26 +**Status:** Draft — pending research and validation +**Authors:** Michael, Claude + +## Motivation + +Antenna projects need to be portable between Django environments for two primary use cases: + +1. **Instance migration** — Moving projects from one Antenna deployment to another (e.g., self-hosted → managed, or between collaborating institutions). ID clashes between instances must be handled. +2. **Production → dev/staging** — Cloning real project data into development environments for testing. The target DB can be wiped first, so ID clashes are less of a concern. + +A secondary goal is to improve user-facing exports (CSV, JSON, Darwin Core) by establishing stable, human-readable identifiers across the system. + +## Design Overview + +Four interconnected changes: + +1. **UUID fields** on all models — stable cross-instance identifiers +2. **Organization model** — lightweight namespace for projects +3. **Export/import management commands** — serialize and restore full project graphs +4. **Natural key methods** — for Django serialization and human-readable exports + +## 1. 
UUID Fields + +### Principle: integer PKs stay internal, UUIDs are external identity + +Every model in the export graph gets: + +```python +uuid = models.UUIDField(default=uuid.uuid4, unique=True, editable=False) +``` + +Integer PKs remain the primary key. This is deliberate: + +- Django's FK system, joins, and indexing are optimized for integers +- All existing queries, admin URLs, Celery task arguments, and internal references continue unchanged +- Integer PKs are 8 bytes; UUIDs are 16 bytes — this matters on tables with millions of rows and FK-heavy joins (Detection, Classification) +- Changing PK type on large tables with existing FK references is a dangerous, multi-step migration + +### When to use which + +| Context | Use | +|---|---| +| Database ForeignKey fields | Integer PK | +| Database joins, indexes, internal queries | Integer PK | +| Django admin URLs | Integer PK | +| Celery task arguments | Integer PK | +| Export/import serialization | UUID | +| User-facing CSV/JSON exports | UUID | +| Darwin Core `occurrenceID`, `eventID`, etc. | UUID | +| Cross-instance data matching | UUID | +| Future external API identifiers | UUID (via DRF `lookup_field`) | + +### DB constraints + +- `UNIQUE` constraint on `uuid` — enforced at database level +- `db_index=True` — implicit from `unique=True` +- `default=uuid.uuid4` — auto-generated on creation +- `editable=False` — immutable once set +- Backfill migration generates UUIDs for all existing rows + +### Models receiving UUID fields + +**Project-scoped (included in project exports):** + +| Model | App | Has UUID today? 
| +|---|---|---| +| Organization (new) | main | New model | +| Project | main | No → Add | +| Deployment | main | No → Add | +| Event | main | No → Add | +| SourceImage | main | No → Add | +| Detection | main | No → Add | +| Classification | main | No → Add | +| Occurrence | main | No → Add | +| Identification | main | No → Add | +| Device | main | No → Add | +| Site | main | No → Add | +| S3StorageSource | main | No → Add | +| SourceImageCollection | main | No → Add | +| Tag | main | No → Add | +| TaxaList | main | No → Add | +| Job | jobs | No → Add | +| DataExport | exports | No → Add | +| Pipeline | ml | No → Add | +| Algorithm | ml | No → Add | +| ProcessingService | ml | No → Add | +| ProjectPipelineConfig | ml | No → Add | + +**Shared/global (referenced in exports but not project-scoped):** + +| Model | App | Notes | +|---|---|---| +| Taxon | main | Shared across projects; matched by `name` on import | +| User | users | Shared; matched by `email` on import | + +## 2. Organization Model + +Lightweight namespace for projects. No permissions model yet — that's a separate design. + +```python +class Organization(BaseModel): + name = models.CharField(max_length=255) + slug = models.SlugField(max_length=255, unique=True) + description = models.TextField(blank=True, default="") + uuid = models.UUIDField(default=uuid.uuid4, unique=True, editable=False) + +# On Project: +organization = models.ForeignKey( + Organization, null=True, blank=True, on_delete=models.SET_NULL, + related_name="projects", +) +``` + +- Projects without an org continue to work (nullable FK) +- The org `slug` participates in the project's natural key for export: `(org_slug, project_name)` +- Single migration: add Organization model + add FK to Project + +## 3. Natural Keys + +### Purpose + +Natural keys serve three roles: +1. **Django serialization** (`dumpdata --natural-foreign`) — FK references as human-readable tuples instead of opaque integers +2. 
**User-facing exports** — meaningful identifiers in CSV/JSON output +3. **Import deduplication** — matching existing records on the target instance + +### Natural key definitions + +Each model implements `natural_key()` (instance method) and `get_by_natural_key()` (manager classmethod). + +**Models with strong natural keys (existing unique constraints):** + +| Model | Natural key | DB constraint | +|---|---|---| +| Organization | `(slug,)` | `UNIQUE(slug)` | +| Project | `(org_slug_or_none, name)` | Needs `UNIQUE(organization, name)` | +| Taxon | `(name,)` | `UNIQUE(name)` | +| Event | `(deployment_uuid, group_by)` | `UNIQUE(deployment, group_by)` | +| SourceImage | `(deployment_uuid, path)` | `UNIQUE(deployment, path)` | +| Pipeline | `(name, version)` | `UNIQUE(name, version)` | +| Algorithm | `(name, version)` | `UNIQUE(name, version)` | +| User | `(email,)` | `UNIQUE(email)` | +| Tag | `(name, project_uuid)` | `UNIQUE(name, project)` | + +**Models with domain-level natural keys (application-enforced, need DB constraints):** + +| Model | Natural key | Current enforcement | +|---|---|---| +| Detection | `(source_image, bbox)` for real detections; `(source_image, algorithm, bbox=NULL)` for null detections | App-level `.filter().first()` in `pipeline.py:454-464` | +| Classification | `(detection, algorithm, taxon, score)` | App-level `.filter().first()` in `pipeline.py:681-686` | +| Deployment | `(project, name)` | No constraint (names can collide within a project) | +| Device | `(project, name)` | No constraint | +| Site | `(project, name)` | No constraint | + +**Models without natural keys (UUID is the identity):** + +| Model | Why no natural key | +|---|---| +| Occurrence | Grouped by event + determination, but multiple occurrences of the same species in the same event are valid | +| Identification | A user can identify the same occurrence multiple times (withdraw and re-identify) | +| SourceImageCollection | Name is not unique | +| TaxaList | Name is not unique 
| +| Job | Transient processing records | +| S3StorageSource | Configuration record, no unique domain identifier | +| DataExport | Transient output records | +| ProcessingService | Configuration record | +| ProjectPipelineConfig | Through model | + +For these, the `natural_key()` method returns `(uuid,)` and `get_by_natural_key()` looks up by UUID. + +### Missing unique constraints to add + +These should be added as part of this work to align the DB with the domain logic: + +1. `Project`: `UNIQUE(organization, name)` — prevents duplicate project names within an org +2. `Deployment`: `UNIQUE(project, name)` — prevents duplicate deployment names within a project +3. `Device`: `UNIQUE(project, name)` — same +4. `Site`: `UNIQUE(project, name)` — same +5. `Detection`: Composite unique constraint for deduplication (needs research — see below) +6. `Classification`: Composite unique constraint (needs research — see below) + +## 4. Export Command: `export_project` + +### Usage + +```bash +# Export a single project +python manage.py export_project --project 23 --output nunavik-export.json + +# Export with options +python manage.py export_project --project 23 --output nunavik/ --split-models +``` + +### Output format + +Single JSON file (or directory with one file per model type for large projects): + +```json +{ + "format_version": "1.0", + "exported_at": "2026-03-26T12:00:00Z", + "antenna_version": "x.y.z", + "source_instance": "antenna.example.org", + "organization": { + "uuid": "...", "name": "...", "slug": "..." + }, + "project": { + "uuid": "...", "name": "...", "fields": { ... } + }, + "shared_references": { + "taxa": [ + { "natural_key": ["Lepidoptera"], "uuid": "..." }, + ... + ], + "users": [ + { "natural_key": ["user@example.org"], "uuid": "..." }, + ... + ], + "pipelines": [ + { "natural_key": ["Quebec & Vermont moths", "2023"], "uuid": "..." }, + ... + ], + "algorithms": [ + { "natural_key": ["moth_detector", "1.0"], "uuid": "..." }, + ... 
+ ] + }, + "models": { + "main.deployment": [ + { "uuid": "...", "fields": { "name": "Kuujjuaq", "project": "", ... } }, + ... + ], + "main.event": [ ... ], + "main.sourceimage": [ ... ], + "main.detection": [ ... ], + "main.classification": [ ... ], + "main.occurrence": [ ... ], + "main.identification": [ ... ], + ... + } +} +``` + +### FK serialization rules + +- **Project-scoped FKs** (Deployment → Project, Event → Deployment, etc.): Serialized as target object's UUID +- **Shared/global references** (Classification → Taxon, Identification → User, Detection → Algorithm): Serialized as natural key tuple. On import, matched by natural key; if not found, created or errored depending on type. +- **Self-referential FKs** (Taxon → parent Taxon): Natural key `(name,)` +- **Nullable FKs**: Serialized as `null` + +### Export ordering (FK dependency graph) + +Same ordering as `move_project_data`: + +1. Organization +2. Project (+ members, default filter taxa) +3. S3StorageSource +4. Device, Site +5. Deployment +6. Event +7. SourceImage +8. Detection +9. Classification +10. Occurrence +11. Identification +12. SourceImageCollection (+ M2M image links) +13. Job +14. Tag, TaxaList +15. ProjectPipelineConfig +16. DataExport + +### What is NOT exported + +- **Source image files** — remain in S3/MinIO. SourceImage records preserve their `path` field; the target instance must configure its own S3StorageSource to access the same (or copied) bucket. +- **Detection crop images** — can be regenerated from source images + bounding boxes. +- **Celery task state** — transient; jobs are exported as historical records only. +- **User passwords/tokens** — users are referenced by email; authentication is instance-local. +- **Django permissions/groups** — recreated by `create_roles_for_project()` on import. + +## 5. 
Import Command: `import_project` + +### Usage + +```bash +# Import into clean database (prod → dev) +python manage.py import_project nunavik-export.json + +# Import with conflict handling (instance migration) +python manage.py import_project nunavik-export.json --on-conflict=skip + +# Dry run +python manage.py import_project nunavik-export.json --dry-run +``` + +### Import process + +1. **Parse and validate** — check format_version, antenna_version compatibility +2. **Resolve shared references** by natural key: + - Taxon → `(name,)` — match existing or create + - User → `(email,)` — match existing only (do not create users) + - Pipeline → `(name, version)` — match existing or create + - Algorithm → `(name, version)` — match existing or create +3. **Create Organization** — match by slug or create +4. **Create Project** — match by `(org_slug, name)` or create +5. **Create project-scoped objects** in FK dependency order, building a `uuid → new_pk` mapping as each object is created +6. **Wire FK references** — all FKs in the export are UUIDs; resolve each to the new PK via the mapping +7. **Restore M2M relationships** — project members, collection images, taxa lists, etc. +8. 
**Post-import**: call `update_calculated_fields()` on deployments, events; call `update_related_calculated_fields()` on project; call `create_roles_for_project()` for permission groups + +### Conflict handling (`--on-conflict`) + +When a UUID from the export already exists in the target database: + +| Mode | Behavior | +|---|---| +| `error` (default) | Abort import with an error message listing conflicting UUIDs | +| `skip` | Skip the conflicting record; use the existing object's PK in the FK mapping | +| `update` | Update the existing record's fields from the export data | + +### Validation and reporting + +Same pattern as `move_project_data`: +- Pre-import: dry-run mode showing what would be created +- Post-import: row counts per model type, FK integrity checks, cached field verification + +## 6. Integration with Existing Export Framework + +The existing `ami/exports/` app produces occurrence CSVs and JSONs for end users. These are enhanced to use the new UUID and natural key infrastructure: + +### CSV export additions + +New columns in `OccurrenceTabularSerializer`: + +| Column | Source | +|---|---| +| `occurrence_uuid` | `occurrence.uuid` | +| `event_uuid` | `occurrence.event.uuid` | +| `deployment_uuid` | `occurrence.deployment.uuid` | +| `detection_uuid` | Best detection's UUID | +| `determination_uuid` | `occurrence.determination.uuid` (Taxon) | + +Existing human-readable columns (`deployment_name`, `taxon_name`, etc.) remain unchanged — these are already natural-key-like. 
+ +### Darwin Core mapping + +UUID fields map directly to Darwin Core terms: + +| Antenna field | Darwin Core term | +|---|---| +| `occurrence.uuid` | `occurrenceID` | +| `event.uuid` | `eventID` | +| `taxon.name` | `scientificName` | +| `taxon.rank` | `taxonRank` | +| `deployment.name` | `locationID` or `locality` | +| `detection.bbox` | Could map to annotation extensions | +| `identification.uuid` | `identificationID` | +| `identification.user.email` | `identifiedBy` | + +### No structural changes to the export framework + +The `BaseExporter`, `ExportRegistry`, and `DataExport` model remain unchanged. Only the serializers get new fields. New export formats (Darwin Core Archive) can be added as new registered exporters. + +--- + +## Areas Requiring Further Research and Validation + +### Internal: Existing Data and Models + +1. **Detection uniqueness constraint feasibility** + - The current deduplication logic uses `(source_image, bbox)` for real detections and `(source_image, algorithm, bbox=NULL)` for null detections. This is two different composite keys depending on whether `bbox` is null. + - **Research needed:** Can this be expressed as a single DB-level constraint? PostgreSQL supports partial unique indexes (`CREATE UNIQUE INDEX ... WHERE bbox IS NOT NULL` and a separate one `WHERE bbox IS NULL`), but Django's `UniqueConstraint` with `condition` may be needed. Audit existing data for violations before adding constraints. + - **Validate:** Run a query to check for duplicate `(source_image_id, bbox)` pairs in the Detection table. If violations exist, they need to be cleaned up first. + +2. **Classification uniqueness constraint feasibility** + - Current dedup uses `(detection, taxon, algorithm, score)`. Including `score` (a float) in a unique constraint is unusual and may have precision issues. + - **Research needed:** Is `score` truly part of the identity, or is it `(detection, algorithm, taxon)` with the score being an attribute? 
If the same algorithm classifies the same detection as the same taxon twice with different scores, is that a duplicate or two valid records? + - **Validate:** Query for duplicate `(detection_id, algorithm_id, taxon_id)` triples to understand the data. + +3. **Deployment name uniqueness** + - We want `UNIQUE(project, name)` on Deployment, but names may already collide within projects. + - **Validate:** `SELECT project_id, name, COUNT(*) FROM main_deployment GROUP BY project_id, name HAVING COUNT(*) > 1` + - Same validation needed for Device and Site. + +4. **Project name uniqueness within organization** + - Adding `UNIQUE(organization, name)` requires handling the case where `organization` is NULL (multiple NULL org projects with the same name are allowed by PostgreSQL's unique constraint semantics, which treats NULLs as distinct). + - **Research needed:** Do we want a `UNIQUE(name) WHERE organization IS NULL` partial constraint too? + +5. **UUID backfill migration performance** + - Adding a UUID column with `default=uuid4` to tables with millions of rows (SourceImage: ~10M, Detection: ~1M, Classification: ~1M) will lock the table during migration. + - **Research needed:** Test migration time on a staging copy. Consider a phased approach: add nullable column → backfill in batches → set NOT NULL + unique. + +6. **Occurrence identity** + - Occurrences are currently created by grouping detections within an event. The same species can have multiple occurrences in the same event. + - **Research needed:** Is there any domain-level identity for occurrences beyond the auto-generated PK? How do other platforms handle this? (See GBIF section below.) + +7. **Existing `DataExport` output consumers** + - Adding UUID columns to CSV exports changes the schema that downstream consumers (researchers' scripts, R/Python notebooks) expect. + - **Research needed:** Are there documented consumers? Should UUID columns be opt-in via a parameter? 
+ +### External: Biodiversity Data Standards and Platforms + +8. **GBIF (Global Biodiversity Information Facility)** + - GBIF uses Darwin Core Archive (DwC-A) as its interchange format — a zip of CSV files with a `meta.xml` descriptor. + - `occurrenceID` must be a globally unique, persistent identifier. GBIF recommends URIs (e.g., `urn:catalog:institution:collection:id`). UUIDs satisfy this but are not human-friendly. + - **Research needed:** Review GBIF's [Publishing Data Guide](https://www.gbif.org/publishing-data) and [Darwin Core Archive specification](https://dwc.tdwg.org/text/). Understand how they handle: dataset versioning, record updates/deletions, identifier persistence across re-publications, and the relationship between `occurrenceID`, `catalogNumber`, and `institutionCode`. + - **Specific question:** GBIF assigns its own `gbifID` to every record. How does that interact with the publisher's `occurrenceID`? Is the publisher's ID used for deduplication on re-upload? + +9. **iNaturalist** + - iNaturalist uses integer IDs in its API (`observation_id`, `taxon_id`) but also exposes UUIDs for observations (`observation.uuid`). + - Their export format (DwC-A for GBIF) maps `observation.uuid` to `occurrenceID`. + - **Research needed:** How does iNaturalist handle observation identity across their API, their GBIF export, and their CSV export? Do they use UUIDs internally or only externally? What does their identifier lifecycle look like when an observation is merged or deleted? + +10. **BOLD (Barcode of Life Data System)** + - BOLD uses `processID` as the primary identifier for specimen records. + - **Research needed:** How does BOLD handle data portability between institutions? What identifier scheme do they use for cross-system references? How do they handle specimen records that appear in both BOLD and GBIF? + +11. 
**Darwin Core identifier conventions** + - DwC defines `occurrenceID`, `eventID`, `locationID`, `identificationID`, `taxonID` — all expected to be persistent, unique identifiers. + - **Research needed:** What is the recommended format? Plain UUIDs? URIs with a namespace prefix (e.g., `urn:antenna:occurrence:uuid`)? How do major publishers format these? + - **Specific question:** Should we mint URIs like `https://antenna.example.org/occurrences/{uuid}` that could theoretically resolve to the occurrence, or just use plain UUIDs? + +12. **Camera trap data standards (Camtrap DP)** + - [Camtrap DP](https://camtrap-dp.tdwg.org/) is a community-developed exchange format for camera trap data, built on Frictionless Data Packages. + - It defines `deploymentID`, `mediaID`, `observationID` fields. + - **Research needed:** How does Camtrap DP handle identifiers? Is it compatible with our model structure? Could Antenna export Camtrap DP format as an additional export type? + +### External: Non-Biodiversity Applications with Portable Data + +13. **Notion / Confluence (knowledge bases)** + - Notion exports as Markdown + CSV with internal UUIDs as page identifiers. Import reconstructs the link graph from UUIDs. + - **Relevant pattern:** They solve the same FK-remapping problem when importing into a different workspace. Their approach: every block has a UUID, all references use UUIDs, import creates fresh internal IDs but preserves UUID-based cross-references. + +14. **GitLab / GitHub (project migration)** + - GitLab's project export (`gitlab-export.tar.gz`) contains JSON files per model type (issues, merge requests, notes, labels) with internal IDs. Import remaps all IDs to fresh sequences. + - **Relevant pattern:** They handle the "shared references" problem (users, labels) by matching on natural keys (username, label name) and creating placeholders when matches fail. + - **Research needed:** How does GitLab handle cross-project references in imported data? 
How do they version their export format for backwards compatibility? + +15. **Basecamp / Linear / Jira (project management)** + - Jira's project export uses an XML format with internal IDs. Import into a different instance requires an ID remapping step. Jira also supports "project key" as a natural key for cross-references. + - **Relevant pattern:** Jira distinguishes between "project-scoped" data (issues, comments) and "global" data (users, custom field definitions). Only project-scoped data is exported; global data must pre-exist on the target instance. + +16. **WordPress (site migration)** + - WordPress export (WXR format) uses slugs and GUIDs (`<guid>` tags) for post identity. Import matches by GUID to detect duplicates. + - **Relevant pattern:** Media files are referenced by URL in the export. The importer can optionally download and re-host media, or leave the URLs pointing to the original site. This is analogous to our S3 source image handling. + +17. **Django's own `dumpdata`/`loaddata` with natural keys** + - Django's contrib apps (auth, contenttypes, sites) are the canonical example of natural keys in Django. + - `ContentType` natural key: `(app_label, model)`. `Permission` natural key: `(codename, app_label, model)`. + - **Research needed:** Review how Django handles natural key ordering in `dumpdata`. The `--natural-primary` flag serializes the PK itself as a natural key, while `--natural-foreign` serializes FK references. We want `--natural-foreign` but NOT `--natural-primary` (we want UUIDs for PKs in the export, natural keys for FK references to shared objects). + +### Implementation Risks and Open Questions + +18. **Schema evolution between Antenna versions** + - If the export format includes field-level data, schema changes between versions break import. + - **Decision needed:** How strict is version compatibility?
Options: (a) exact version match required, (b) format_version-based compatibility with documented breaking changes, (c) field-level schema included in export for self-describing archives. + +19. **Large project export performance** + - Projects with millions of source images and detections will produce multi-GB JSON files. + - **Decision needed:** Stream-based export (JSONL) vs. single JSON blob? Split by model type into separate files? Compression? + +20. **Taxonomy divergence between instances** + - Taxon natural key is `(name,)`, but two instances might have the same taxon name with different parent hierarchies or ranks. + - **Decision needed:** On import, if a taxon name matches but the parent chain differs, is that a conflict? Should we include parent info in the natural key? + +21. **Partial re-import / incremental sync** + - The `--on-conflict=skip` mode enables incremental sync, but what about records that were deleted on the source after the previous export? + - **Decision needed:** Is deletion sync in scope? Probably not for v1, but worth noting. From b859a99d21b7b8d4c5033230f3b514979b99d1ba Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Thu, 26 Mar 2026 12:22:26 -0700 Subject: [PATCH 8/9] docs: add per-model UUID use cases to portability spec Documents concrete use cases for each model's UUID beyond export/import: Darwin Core field mappings (occurrenceID, eventID, etc.), ML pipeline coordination (Pipeline/Algorithm slug collision risk), device tracking across institutions, and scientific reproducibility requirements. 
Co-Authored-By: Claude --- .../planning/project-portability-spec.md | 58 ++++++++++--------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/docs/claude/planning/project-portability-spec.md b/docs/claude/planning/project-portability-spec.md index dead1a4dd..628ad5d0b 100644 --- a/docs/claude/planning/project-portability-spec.md +++ b/docs/claude/planning/project-portability-spec.md @@ -61,40 +61,42 @@ Integer PKs remain the primary key. This is deliberate: - `editable=False` — immutable once set - Backfill migration generates UUIDs for all existing rows -### Models receiving UUID fields +### Models receiving UUID fields, with known use cases + +Each model's UUID serves export/import portability. The table below documents additional known use cases beyond that baseline. **Project-scoped (included in project exports):** -| Model | App | Has UUID today? | -|---|---|---| -| Organization (new) | main | New model | -| Project | main | No → Add | -| Deployment | main | No → Add | -| Event | main | No → Add | -| SourceImage | main | No → Add | -| Detection | main | No → Add | -| Classification | main | No → Add | -| Occurrence | main | No → Add | -| Identification | main | No → Add | -| Device | main | No → Add | -| Site | main | No → Add | -| S3StorageSource | main | No → Add | -| SourceImageCollection | main | No → Add | -| Tag | main | No → Add | -| TaxaList | main | No → Add | -| Job | jobs | No → Add | -| DataExport | exports | No → Add | -| Pipeline | ml | No → Add | -| Algorithm | ml | No → Add | -| ProcessingService | ml | No → Add | -| ProjectPipelineConfig | ml | No → Add | +| Model | Known UUID use cases | +|---|---| +| **Organization** (new) | Namespace for projects across instances. Future: org-level API access, multi-tenant routing. | +| **Project** | Cross-instance project identity. Enables "this is the same project on staging and prod" matching. Future: project-level API tokens scoped to UUID. 
| +| **Deployment** | Deployment identity across instances and external systems. Deployments represent physical monitoring stations — the same station may be referenced in publications, field logs, and partner databases. A UUID gives it a stable external reference. Darwin Core: maps to `locationID`. | +| **Event** | Temporal capture session identity. Darwin Core: maps to `eventID`. Events can be referenced in publications ("the June 14-15 2023 capture session at Kuujjuaq"). | +| **SourceImage** | Image identity across instances. When the same S3 bucket is accessible from multiple Antenna instances, UUID prevents duplicate processing. Also useful for cross-referencing images in external annotation tools (Label Studio, CVAT). | +| **Detection** | Detection identity for ML pipeline reproducibility. When comparing results across pipeline versions or instances, UUID lets you track "this exact bounding box crop" across systems. Crop images can be regenerated from `(source_image, bbox)` but the UUID tracks the detection record itself. | +| **Classification** | Classification identity for audit trails. Each classification is an immutable record of "algorithm X said taxon Y with score Z for detection D." UUID enables cross-referencing classification provenance across instances. | +| **Occurrence** | **Primary external-facing identifier.** Darwin Core: maps to `occurrenceID` — must be globally unique and persistent. Published to GBIF. Referenced in research papers, datasets, and partner databases. This is the most important UUID in the system. | +| **Identification** | Human review audit trail. Darwin Core: maps to `identificationID`. Each identification records a human expert's taxonomic opinion. UUID enables tracking identification history across instance migrations. | +| **Device** | Hardware identity. The same physical AMI trap may be deployed across projects and tracked across institutions. 
UUID lets you say "this is the same physical device" even when it moves between projects. | +| **Site** | Research site identity. Same site may appear in multiple projects and external databases. Darwin Core: related to `locationID` and `locality`. | +| **S3StorageSource** | Storage configuration identity. When migrating instances, the UUID confirms "this export references the same S3 bucket configuration." | +| **SourceImageCollection** | Collection identity for reproducible subsets. Collections define curated image sets for specific processing jobs. UUID enables referencing "process this exact collection" across instances. | +| **Tag** | Tag identity within a project. Tags are project-scoped labels applied to taxa. UUID ensures tag references survive export/import. | +| **TaxaList** | Curated taxa list identity. TaxaLists are shared across projects and define taxa subsets for filtering. UUID enables consistent list references. | +| **Job** | Job identity for audit and reproducibility. Historical record of "pipeline X was run on deployment Y at time Z." UUID enables cross-referencing job provenance across instances. | +| **DataExport** | Export record identity. Less critical — primarily for audit trail of what was exported when. | +| **Pipeline** | **Critical for ML service coordination.** Pipelines are currently matched between Antenna and external processing services by **slug** (`get_or_create(slug=results.pipeline)`). This creates collision risk: two different pipelines with the same slug (e.g., `panama_moths_2024`) on different processing services can clash, causing half of a job's images to be processed by the wrong pipeline. UUID provides unambiguous pipeline identity across Antenna instances and processing services. The slug remains the human-readable label; the UUID becomes the coordination identifier. | +| **Algorithm** | **Critical for ML reproducibility.** Algorithms are currently matched by `(name, version)` from processing service `/info` responses. 
Same collision risk as pipelines — two different model checkpoints shipped as the same `(name, version)` by different services. UUID provides unambiguous algorithm identity for tracking which exact model produced which detections/classifications. Essential for scientific reproducibility. | +| **ProcessingService** | Service endpoint identity. Less critical since services are instance-local, but UUID helps when migrating processing service configurations between instances. | +| **ProjectPipelineConfig** | Through-model for Pipeline↔Project M2M. UUID useful for config identity in export/import. | **Shared/global (referenced in exports but not project-scoped):** -| Model | App | Notes | -|---|---|---| -| Taxon | main | Shared across projects; matched by `name` on import | -| User | users | Shared; matched by `email` on import | +| Model | Known UUID use cases | +|---|---| +| **Taxon** | Taxonomy identity. Matched by `name` on import (taxa are shared across projects). UUID useful for Darwin Core `taxonID` and cross-referencing with external taxonomic databases. Note: taxonomy names can change (synonymization, reclassification), so `name` as natural key has known limitations. | +| **User** | User identity. Matched by `email` on import. UUID useful for anonymized exports where email should not be exposed but user identity needs to be preserved. | ## 2. 
Organization Model From ae41cb20faa8b02d871ab22a4ce7e2d8553f9c70 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Fri, 3 Apr 2026 10:12:45 -0700 Subject: [PATCH 9/9] fix: address PR review feedback and add tests for move_project_data Fixes from Copilot and CodeRabbit review: - Mutual exclusion check for --target-project / --create-project - Pipeline config clone preserves enabled and config fields - Collection clone preserves method and kwargs fields - Validation queries scoped to moved deployments (not entire target) - Validation failure raises CommandError (triggers transaction rollback) - Project creation uses ProjectManager (create_defaults=True) - Bulk taxa linking via target_project.taxa.add(*taxa_ids) - Remove unused Taxon import and f-prefix on plain strings Add 37 automated tests covering: - Basic move, dry run, --create-project - All 6 error handling paths - Shared resource clone-vs-reassign (Device, Site, S3StorageSource) - Collection split/reassign logic - Pipeline config cloning - ProcessingService linking - Identifier role preservation - TaxaList linking, default filter copying - Edge cases (empty deployment, move all, multiple deployments) Co-Authored-By: Claude --- .../management/commands/move_project_data.py | 54 +- ami/main/tests/__init__.py | 0 ami/main/tests/test_move_project_data.py | 800 ++++++++++++++++++ 3 files changed, 839 insertions(+), 15 deletions(-) create mode 100644 ami/main/tests/__init__.py create mode 100644 ami/main/tests/test_move_project_data.py diff --git a/ami/main/management/commands/move_project_data.py b/ami/main/management/commands/move_project_data.py index 23eb91fc9..70d796915 100644 --- a/ami/main/management/commands/move_project_data.py +++ b/ami/main/management/commands/move_project_data.py @@ -36,7 +36,6 @@ SourceImage, SourceImageCollection, TaxaList, - Taxon, ) from ami.ml.models import ProjectPipelineConfig @@ -173,6 +172,9 @@ def handle(self, *args, **options): # Target project resolution create_project_name 
= options.get("create_project") + if create_project_name and options.get("target_project"): + raise CommandError("Use either --target-project or --create-project, not both") + if create_project_name: self.log(f"Target project: NEW — '{create_project_name}'") target_project = None @@ -330,7 +332,7 @@ def handle(self, *args, **options): from ami.users.models import User from ami.users.roles import Identifier, Role - self.log(f"\n Identifiers (users with identifications on moved data):") + self.log("\n Identifiers (users with identifications on moved data):") for uid in identifier_users: user = User.objects.get(pk=uid) source_role = Role.get_primary_role(source_project, user) @@ -344,7 +346,7 @@ def handle(self, *args, **options): self.log(f" {user.email}: " f"{role_to_assign.display_name} ({role_source})") # Default filter config - self.log(f"\n Source project default filters:") + self.log("\n Source project default filters:") include_taxa = list(source_project.default_filters_include_taxa.values_list("name", flat=True)) exclude_taxa = list(source_project.default_filters_exclude_taxa.values_list("name", flat=True)) self.log(f" Score threshold: {source_project.default_filters_score_threshold}") @@ -356,7 +358,7 @@ def handle(self, *args, **options): # TaxaLists source_taxa_lists = TaxaList.objects.filter(projects=source_project) if source_taxa_lists.exists(): - self.log(f"\n TaxaLists linked to source project:") + self.log("\n TaxaLists linked to source project:") for tl in source_taxa_lists: shared = tl.projects.count() self.log( @@ -409,8 +411,9 @@ def handle(self, *args, **options): with transaction.atomic(): # 0. 
Create target project inside transaction if create_project_name: - target_project = Project(name=create_project_name, owner=source_project.owner) - target_project.save() + target_project = Project.objects.create( + name=create_project_name, owner=source_project.owner, create_defaults=True + ) for membership in source_project.project_memberships.all(): target_project.members.add(membership.user) self.log(f" [1/12] Created project '{target_project.name}' (id={target_project.pk})") @@ -512,6 +515,8 @@ def handle(self, *args, **options): name=coll.name, project_id=target_id, description=coll.description or "", + method=coll.method, + kwargs=coll.kwargs or {}, ) new_coll.images.set(moved_image_ids) self.log( @@ -535,7 +540,12 @@ def handle(self, *args, **options): cloned_count = 0 for config in ProjectPipelineConfig.objects.filter(project_id=source_project_id): if config.pipeline_id not in existing_pipelines: - ProjectPipelineConfig.objects.create(project_id=target_id, pipeline_id=config.pipeline_id) + ProjectPipelineConfig.objects.create( + project_id=target_id, + pipeline_id=config.pipeline_id, + enabled=config.enabled, + config=config.config, + ) cloned_count += 1 total = ProjectPipelineConfig.objects.filter(project_id=target_id).count() self.log(f" [11/12] Pipeline configs: cloned {cloned_count}, target now has {total}") @@ -546,8 +556,7 @@ def handle(self, *args, **options): # 12. 
Link taxa to target project if taxa_ids: - for taxon in Taxon.objects.filter(pk__in=taxa_ids): - taxon.projects.add(target_project) + target_project.taxa.add(*taxa_ids) self.log(f" [12/12] Linked {len(taxa_ids):,} taxa to target project") else: self.log(" [12/12] No taxa to link (no occurrences with determinations)") @@ -613,11 +622,11 @@ def handle(self, *args, **options): self.log(f" Updated cached fields for {len(moved_event_pks)} events") # Update both projects' related calculated fields (events + deployments) - self.log(f" Updating source project cached fields...") + self.log(" Updating source project cached fields...") source_project.update_related_calculated_fields() self.log(f" Source project '{source_project.name}': related fields updated") - self.log(f" Updating target project cached fields...") + self.log(" Updating target project cached fields...") target_project.update_related_calculated_fields() self.log(f" Target project (id={target_id}): related fields updated") @@ -692,7 +701,9 @@ def handle(self, *args, **options): self.log(f" OK: All {model_name} point to target project") # Indirect access consistency - dets_via_project = Detection.objects.filter(source_image__project_id=target_id).count() + dets_via_project = Detection.objects.filter( + source_image__project_id=target_id, source_image__deployment_id__in=deployment_ids + ).count() dets_via_dep = Detection.objects.filter(source_image__deployment_id__in=deployment_ids).count() if dets_via_project != dets_via_dep: errors.append(f"Detection count mismatch: via project={dets_via_project}, via deployment={dets_via_dep}") @@ -704,21 +715,32 @@ def handle(self, *args, **options): f" OK: Detections consistent ({dets_via_project:,} via project, {dets_via_dep:,} via deployment)" ) - cls_via_project = Classification.objects.filter(detection__source_image__project_id=target_id).count() + cls_via_project = Classification.objects.filter( + detection__source_image__project_id=target_id, 
detection__source_image__deployment_id__in=deployment_ids + ).count() cls_via_dep = Classification.objects.filter(detection__source_image__deployment_id__in=deployment_ids).count() if cls_via_project != cls_via_dep: errors.append( f"Classification count mismatch: via project={cls_via_project}, via deployment={cls_via_dep}" ) + self.log( + f" FAIL: Classification count mismatch: via project={cls_via_project}, via deployment={cls_via_dep}" + ) else: self.log(f" OK: Classifications consistent ({cls_via_project:,})") - idents_via_project = Identification.objects.filter(occurrence__project_id=target_id).count() + idents_via_project = Identification.objects.filter( + occurrence__project_id=target_id, occurrence__deployment_id__in=deployment_ids + ).count() idents_via_dep = Identification.objects.filter(occurrence__deployment_id__in=deployment_ids).count() if idents_via_project != idents_via_dep: errors.append( f"Identification count mismatch: via project={idents_via_project}, via deployment={idents_via_dep}" ) + self.log( + f" FAIL: Identification count mismatch:" + f" via project={idents_via_project}, via deployment={idents_via_dep}" + ) else: self.log(f" OK: Identifications consistent ({idents_via_project:,})") @@ -736,7 +758,7 @@ def handle(self, *args, **options): leaked = coll.images.filter(deployment_id__in=deployment_ids).count() if leaked: errors.append(f"Source collection '{coll.name}' still has {leaked} moved images") - self.log(f" OK: No moved images in source collections" if not any("collection" in e for e in errors) else "") + self.log(" OK: No moved images in source collections" if not any("collection" in e for e in errors) else "") # Conservation: source + target = original totals for model_name in source_pre: @@ -761,6 +783,8 @@ def handle(self, *args, **options): self.log(" VALIDATION FAILED", style=self.style.ERROR) for err in errors: self.log(f" ✗ {err}", style=self.style.ERROR) + self.log(f"{'=' * 60}") + raise CommandError("Post-move validation 
failed; see log output above for details.") else: self.log(" ALL VALIDATION CHECKS PASSED", style=self.style.SUCCESS) self.log(f"{'=' * 60}") diff --git a/ami/main/tests/__init__.py b/ami/main/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ami/main/tests/test_move_project_data.py b/ami/main/tests/test_move_project_data.py new file mode 100644 index 000000000..c68f00e02 --- /dev/null +++ b/ami/main/tests/test_move_project_data.py @@ -0,0 +1,800 @@ +""" +Tests for the move_project_data management command. +""" + +import datetime +import uuid +from io import StringIO +from unittest.mock import patch + +from django.core.management import call_command +from django.core.management.base import CommandError +from django.db import connection +from django.test import TestCase + +from ami.jobs.models import Job +from ami.main.models import ( + Classification, + Deployment, + Detection, + Device, + Event, + Identification, + Occurrence, + Project, + S3StorageSource, + Site, + SourceImage, + SourceImageCollection, + TaxaList, + Taxon, + TaxonRank, + group_images_into_events, +) +from ami.ml.models.pipeline import Pipeline +from ami.ml.models.project_pipeline_config import ProjectPipelineConfig +from ami.users.models import User + + +def _uid(): + return uuid.uuid4().hex[:8] + + +def _create_user(email=None): + return User.objects.create_user(email=email or f"test-{_uid()}@example.com", password="testpass") + + +def _create_project(owner, name=None): + """Create a project with create_defaults=False to avoid side effects.""" + return Project.objects.create(name=name or f"Project {_uid()}", owner=owner, create_defaults=False) + + +def _create_s3_source(project, name=None): + return S3StorageSource.objects.create( + project=project, + name=name or f"S3 Source {_uid()}", + bucket="test-bucket", + endpoint_url="http://minio:9000", + access_key="test", + secret_key="test", + ) + + +def _create_device(project, name=None): + return 
Device.objects.create(project=project, name=name or f"Device {_uid()}") + + +def _create_site(project, name=None): + return Site.objects.create(project=project, name=name or f"Site {_uid()}") + + +def _create_deployment(project, s3_source=None, device=None, site=None, name=None): + return Deployment.objects.create( + project=project, + name=name or f"Deployment {_uid()}", + data_source=s3_source, + device=device, + research_site=site, + ) + + +def _create_captures(deployment, count=3): + """Create source images and group into events.""" + images = [] + base_time = datetime.datetime(2024, 6, 15, 22, 0) + for i in range(count): + img = SourceImage.objects.create( + deployment=deployment, + project=deployment.project, + timestamp=base_time + datetime.timedelta(minutes=i * 10), + path=f"test/{_uid()}_{i}.jpg", + ) + images.append(img) + group_images_into_events(deployment) + # Refresh to get event assignments + for img in images: + img.refresh_from_db() + return images + + +def _create_taxa(project): + """Create a small taxonomy tree and link to project.""" + order = Taxon.objects.create(name=f"Lepidoptera-{_uid()}", rank=TaxonRank.ORDER.name) + species = Taxon.objects.create(name=f"Vanessa atalanta-{_uid()}", rank=TaxonRank.SPECIES.name, parent=order) + project.taxa.add(order, species) + return [order, species] + + +def _create_occurrences(deployment, taxa, images): + """Create occurrences with detections and classifications.""" + occurrences = [] + for i, img in enumerate(images): + occ = Occurrence.objects.create( + event=img.event, + deployment=deployment, + project=deployment.project, + determination=taxa[i % len(taxa)], + determination_score=0.9, + ) + det = Detection.objects.create( + source_image=img, + timestamp=img.timestamp, + bbox=[0.1, 0.1, 0.2, 0.2], + occurrence=occ, + ) + Classification.objects.create( + detection=det, + taxon=taxa[i % len(taxa)], + score=0.85, + timestamp=img.timestamp, + ) + occurrences.append(occ) + return occurrences + + +def 
_create_identification(occurrence, user, taxon): + return Identification.objects.create(occurrence=occurrence, user=user, taxon=taxon) + + +def _create_pipeline(name=None): + return Pipeline.objects.create(name=name or f"Pipeline {_uid()}") + + +def _run_command(*args, **kwargs): + """Call move_project_data and return (stdout, stderr).""" + out = StringIO() + err = StringIO() + call_command("move_project_data", *args, stdout=out, stderr=err, **kwargs) + return out.getvalue(), err.getvalue() + + +class MoveProjectDataSetupMixin: + """Common setup for move_project_data tests.""" + + def _setup_source(self, num_images=3, with_occurrences=True, with_identifications=False): + """ + Create a source project with one deployment, images, taxa, and optionally + occurrences/identifications. + """ + self.owner = _create_user() + self.source_project = _create_project(self.owner, "Source Project") + self.s3_source = _create_s3_source(self.source_project) + self.device = _create_device(self.source_project) + self.site = _create_site(self.source_project) + self.deployment = _create_deployment( + self.source_project, s3_source=self.s3_source, device=self.device, site=self.site, name="Dep A" + ) + self.images = _create_captures(self.deployment, count=num_images) + self.taxa = _create_taxa(self.source_project) + + self.occurrences = [] + if with_occurrences: + self.occurrences = _create_occurrences(self.deployment, self.taxa, self.images) + + self.identifications = [] + if with_identifications: + self.identifier_user = _create_user(email="identifier@example.com") + for occ in self.occurrences: + ident = _create_identification(occ, self.identifier_user, self.taxa[0]) + self.identifications.append(ident) + + def _base_args(self): + return ["--source-project", str(self.source_project.pk), "--deployment-ids", str(self.deployment.pk)] + + +class TestMoveToExistingProject(MoveProjectDataSetupMixin, TestCase): + """Test moving deployments to an existing target project.""" + + def 
setUp(self):
        self._setup_source(num_images=3, with_occurrences=True)
        self.target_project = _create_project(self.owner, "Target Project")

    def test_basic_move(self):
        """All data is moved to the target project and removed from source."""
        # Capture the PK before the move so later queries don't depend on the
        # (soon to be stale) in-memory instance.
        dep_id = self.deployment.pk
        pre_source_occs = Occurrence.objects.filter(project=self.source_project).count()

        _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute")

        # Deployment now in target
        self.deployment.refresh_from_db()
        self.assertEqual(self.deployment.project_id, self.target_project.pk)

        # All images moved
        self.assertEqual(SourceImage.objects.filter(deployment_id=dep_id, project=self.target_project).count(), 3)
        self.assertEqual(SourceImage.objects.filter(deployment_id=dep_id, project=self.source_project).count(), 0)

        # All occurrences moved
        self.assertEqual(
            Occurrence.objects.filter(deployment_id=dep_id, project=self.target_project).count(), pre_source_occs
        )
        self.assertEqual(Occurrence.objects.filter(deployment_id=dep_id, project=self.source_project).count(), 0)

        # Events moved
        self.assertEqual(Event.objects.filter(deployment_id=dep_id, project=self.target_project).count(), 1)
        self.assertEqual(Event.objects.filter(deployment_id=dep_id, project=self.source_project).count(), 0)

        # Detections still accessible via deployment
        self.assertEqual(Detection.objects.filter(source_image__deployment_id=dep_id).count(), 3)
        # And via project
        self.assertEqual(Detection.objects.filter(source_image__project=self.target_project).count(), 3)

        # Classifications still accessible
        self.assertEqual(Classification.objects.filter(detection__source_image__deployment_id=dep_id).count(), 3)

        # Source project should be empty
        self.assertEqual(SourceImage.objects.filter(project=self.source_project).count(), 0)

    def test_taxa_linked_to_target(self):
        """Taxa referenced by moved occurrences are linked to target project."""
        _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute")

        target_taxa_ids = set(self.target_project.taxa.values_list("pk", flat=True))
        for taxon in self.taxa:
            self.assertIn(taxon.pk, target_taxa_ids)

    def test_conservation_counts(self):
        """Source + target row counts equal the original totals after move."""
        pre_total_imgs = SourceImage.objects.filter(project=self.source_project).count()
        pre_total_occs = Occurrence.objects.filter(project=self.source_project).count()

        _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute")

        # Conservation check: no rows created or lost, only relocated.
        post_source_imgs = SourceImage.objects.filter(project=self.source_project).count()
        post_target_imgs = SourceImage.objects.filter(project=self.target_project).count()
        self.assertEqual(post_source_imgs + post_target_imgs, pre_total_imgs)

        post_source_occs = Occurrence.objects.filter(project=self.source_project).count()
        post_target_occs = Occurrence.objects.filter(project=self.target_project).count()
        self.assertEqual(post_source_occs + post_target_occs, pre_total_occs)

    def test_jobs_moved(self):
        """Jobs associated with moved deployments are moved."""
        pipeline = _create_pipeline()
        Job.objects.create(project=self.source_project, deployment=self.deployment, pipeline=pipeline)

        _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute")

        self.assertEqual(Job.objects.filter(deployment=self.deployment, project=self.target_project).count(), 1)
        self.assertEqual(Job.objects.filter(deployment=self.deployment, project=self.source_project).count(), 0)


class TestDryRun(MoveProjectDataSetupMixin, TestCase):
    """Dry run must not modify any data."""

    def setUp(self):
        self._setup_source(num_images=3, with_occurrences=True)
        self.target_project = _create_project(self.owner, "Target Project")

    def test_dry_run_no_changes(self):
        """Without --execute, nothing is modified."""
        pre_img_count = SourceImage.objects.filter(project=self.source_project).count()
        pre_occ_count = Occurrence.objects.filter(project=self.source_project).count()

        # No --execute flag: the command should run in dry-run mode.
        out, _ = _run_command(*self._base_args(), "--target-project", str(self.target_project.pk))

        self.assertIn("DRY RUN", out)

        # Nothing changed
        self.deployment.refresh_from_db()
        self.assertEqual(self.deployment.project_id, self.source_project.pk)
        self.assertEqual(SourceImage.objects.filter(project=self.source_project).count(), pre_img_count)
        self.assertEqual(Occurrence.objects.filter(project=self.source_project).count(), pre_occ_count)
        self.assertEqual(SourceImage.objects.filter(project=self.target_project).count(), 0)


class TestCreateProject(MoveProjectDataSetupMixin, TestCase):
    """Test --create-project flag."""

    def setUp(self):
        self._setup_source(num_images=2, with_occurrences=False)

    # create_related_defaults is mocked so project creation doesn't spin up
    # default related objects (which are irrelevant to the move itself).
    @patch("ami.main.models.ProjectManager.create_related_defaults")
    def test_create_new_target(self, mock_defaults):
        """--create-project creates a new project and moves data into it."""
        new_name = f"New Project {_uid()}"

        _run_command(*self._base_args(), "--create-project", new_name, "--execute")

        target = Project.objects.get(name=new_name)
        self.deployment.refresh_from_db()
        self.assertEqual(self.deployment.project_id, target.pk)
        self.assertEqual(target.owner_id, self.owner.pk)
        self.assertEqual(SourceImage.objects.filter(project=target).count(), 2)
        mock_defaults.assert_called_once()

    @patch("ami.main.models.ProjectManager.create_related_defaults")
    def test_create_project_copies_members(self, mock_defaults):
        """Members from source are added to created target."""
        member = _create_user()
        self.source_project.members.add(member)

        new_name = f"New Project {_uid()}"
        _run_command(*self._base_args(), "--create-project", new_name, "--execute")

        target = Project.objects.get(name=new_name)
        self.assertIn(member, target.members.all())


class 
TestErrorHandling(MoveProjectDataSetupMixin, TestCase): + """Test argument validation and error conditions.""" + + def setUp(self): + self._setup_source(num_images=1, with_occurrences=False) + + def test_source_project_not_found(self): + with self.assertRaises(CommandError, msg="Source project 99999 does not exist"): + _run_command("--source-project", "99999", "--deployment-ids", "1", "--target-project", "1") + + def test_target_project_not_found(self): + with self.assertRaises(CommandError, msg="does not exist"): + _run_command(*self._base_args(), "--target-project", "99999") + + def test_deployment_not_found(self): + with self.assertRaises(CommandError, msg="not found"): + _run_command( + "--source-project", str(self.source_project.pk), "--deployment-ids", "99999", "--target-project", "1" + ) + + def test_deployment_wrong_project(self): + other_project = _create_project(self.owner, "Other") + other_dep = _create_deployment(other_project, name="Other Dep") + + with self.assertRaises(CommandError, msg="not in source project"): + _run_command( + "--source-project", + str(self.source_project.pk), + "--deployment-ids", + str(other_dep.pk), + "--target-project", + str(other_project.pk), + ) + + def test_both_target_and_create(self): + target = _create_project(self.owner, "Target") + with self.assertRaises(CommandError, msg="not both"): + _run_command( + *self._base_args(), "--target-project", str(target.pk), "--create-project", "New Project", "--execute" + ) + + def test_neither_target_nor_create(self): + with self.assertRaises(CommandError, msg="Must specify"): + _run_command(*self._base_args()) + + +class TestSharedResourceCloning(MoveProjectDataSetupMixin, TestCase): + """Test clone-vs-reassign logic for Device, Site, and S3StorageSource.""" + + def setUp(self): + self._setup_source(num_images=2, with_occurrences=False) + self.target_project = _create_project(self.owner, "Target Project") + + def test_exclusive_device_reassigned(self): + """When only the moved 
deployment uses a device, it's reassigned (not cloned)."""
        original_device_pk = self.device.pk
        _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute")

        self.device.refresh_from_db()
        self.assertEqual(self.device.pk, original_device_pk)  # Same PK, not cloned
        self.assertEqual(self.device.project_id, self.target_project.pk)

    def test_shared_device_cloned(self):
        """When another deployment in source also uses the device, it's cloned."""
        # Create a second deployment that shares the same device
        _create_deployment(self.source_project, device=self.device, name="Dep B (stays)")
        original_device_pk = self.device.pk

        _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute")

        # Original device still belongs to source project
        original_device = Device.objects.get(pk=original_device_pk)
        self.assertEqual(original_device.project_id, self.source_project.pk)

        # Moved deployment now points to a NEW device (cloned)
        self.deployment.refresh_from_db()
        self.assertNotEqual(self.deployment.device_id, original_device_pk)
        cloned_device = Device.objects.get(pk=self.deployment.device_id)
        self.assertEqual(cloned_device.project_id, self.target_project.pk)
        self.assertEqual(cloned_device.name, original_device.name)

    def test_exclusive_site_reassigned(self):
        """When only the moved deployment uses a site, it's reassigned."""
        original_site_pk = self.site.pk
        _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute")

        self.site.refresh_from_db()
        # Same row (PK unchanged), just re-homed to the target project.
        self.assertEqual(self.site.pk, original_site_pk)
        self.assertEqual(self.site.project_id, self.target_project.pk)

    def test_shared_site_cloned(self):
        """When another deployment in source also uses the site, it's cloned."""
        _create_deployment(self.source_project, site=self.site, name="Dep B (stays)")
        original_site_pk = self.site.pk

        _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute")

        original_site = Site.objects.get(pk=original_site_pk)
        self.assertEqual(original_site.project_id, self.source_project.pk)

        self.deployment.refresh_from_db()
        self.assertNotEqual(self.deployment.research_site_id, original_site_pk)
        cloned_site = Site.objects.get(pk=self.deployment.research_site_id)
        self.assertEqual(cloned_site.project_id, self.target_project.pk)
        self.assertEqual(cloned_site.name, original_site.name)

    def test_exclusive_s3_source_reassigned(self):
        """When only the moved deployment uses an S3 source, it's reassigned."""
        original_s3_pk = self.s3_source.pk
        _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute")

        self.s3_source.refresh_from_db()
        self.assertEqual(self.s3_source.pk, original_s3_pk)
        self.assertEqual(self.s3_source.project_id, self.target_project.pk)

    def test_shared_s3_source_cloned(self):
        """When another deployment in source also uses the S3 source, it's cloned."""
        _create_deployment(self.source_project, s3_source=self.s3_source, name="Dep B (stays)")
        original_s3_pk = self.s3_source.pk

        _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute")

        original_s3 = S3StorageSource.objects.get(pk=original_s3_pk)
        self.assertEqual(original_s3.project_id, self.source_project.pk)

        self.deployment.refresh_from_db()
        self.assertNotEqual(self.deployment.data_source_id, original_s3_pk)
        cloned_s3 = S3StorageSource.objects.get(pk=self.deployment.data_source_id)
        self.assertEqual(cloned_s3.project_id, self.target_project.pk)

    def test_device_owned_by_other_project_unchanged(self):
        """A device owned by a different project is left untouched."""
        other_project = _create_project(self.owner, "Other")
        external_device = _create_device(other_project, name="External Device")
        self.deployment.device = external_device
        # NOTE(review): these save kwargs appear to skip recalculation and
        # async regrouping side effects — confirm against Deployment.save().
        self.deployment.save(update_calculated_fields=False, regroup_async=False)

        _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute")

        external_device.refresh_from_db()
        self.assertEqual(external_device.project_id, other_project.pk)  # Unchanged
        self.deployment.refresh_from_db()
        self.assertEqual(self.deployment.device_id, external_device.pk)  # Still references it


class TestCollectionHandling(MoveProjectDataSetupMixin, TestCase):
    """Test collection split/reassign logic."""

    def setUp(self):
        self._setup_source(num_images=3, with_occurrences=False)
        self.target_project = _create_project(self.owner, "Target Project")

        # Create a second deployment that stays in source
        self.dep_b = _create_deployment(self.source_project, name="Dep B (stays)")
        self.images_b = _create_captures(self.dep_b, count=2)

    def test_exclusive_collection_reassigned(self):
        """Collection with only moved images is reassigned to target."""
        coll = SourceImageCollection.objects.create(name="Exclusive Coll", project=self.source_project)
        coll.images.set(self.images)

        _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute")

        coll.refresh_from_db()
        self.assertEqual(coll.project_id, self.target_project.pk)
        self.assertEqual(coll.images.count(), 3)

    def test_mixed_collection_split(self):
        """Collection with images from both deployments is split."""
        coll = SourceImageCollection.objects.create(
            name="Mixed Coll",
            project=self.source_project,
            method="random",
            kwargs={"seed": 42},
        )
        coll.images.set(list(self.images) + list(self.images_b))
        original_total = coll.images.count()

        _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute")

        # Source collection now has only dep_b images
        coll.refresh_from_db()
        self.assertEqual(coll.project_id, self.source_project.pk)
        self.assertEqual(coll.images.count(), len(self.images_b))

        # New collection created in target with moved images
        
target_coll = SourceImageCollection.objects.filter(project=self.target_project, name="Mixed Coll").first()
        self.assertIsNotNone(target_coll)
        self.assertEqual(target_coll.images.count(), len(self.images))
        # Preserves method and kwargs
        self.assertEqual(target_coll.method, "random")
        self.assertEqual(target_coll.kwargs, {"seed": 42})

        # Conservation: total images across both collections unchanged
        self.assertEqual(coll.images.count() + target_coll.images.count(), original_total)

    def test_no_clone_collections_flag(self):
        """--no-clone-collections removes images from source but doesn't create target collection."""
        coll = SourceImageCollection.objects.create(name="Mixed Coll", project=self.source_project)
        coll.images.set(list(self.images) + list(self.images_b))

        _run_command(
            *self._base_args(),
            "--target-project",
            str(self.target_project.pk),
            "--no-clone-collections",
            "--execute",
        )

        coll.refresh_from_db()
        self.assertEqual(coll.images.count(), len(self.images_b))
        # No collection created in target
        self.assertFalse(SourceImageCollection.objects.filter(project=self.target_project, name="Mixed Coll").exists())


class TestPipelineConfigCloning(MoveProjectDataSetupMixin, TestCase):
    """Test pipeline config clone logic."""

    def setUp(self):
        self._setup_source(num_images=1, with_occurrences=False)
        self.target_project = _create_project(self.owner, "Target Project")
        self.pipeline = _create_pipeline("Test Pipeline")
        # Source config uses non-default values (disabled, custom batch size)
        # so the tests can verify they are carried over verbatim.
        ProjectPipelineConfig.objects.create(
            project=self.source_project,
            pipeline=self.pipeline,
            enabled=False,
            config={"batch_size": 32},
        )

    def test_pipeline_config_cloned(self):
        """Pipeline configs are cloned to target preserving enabled and config."""
        _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute")

        target_config = ProjectPipelineConfig.objects.get(project=self.target_project, pipeline=self.pipeline)
        self.assertFalse(target_config.enabled)
        self.assertEqual(target_config.config, {"batch_size": 32})

    def test_existing_pipeline_config_not_duplicated(self):
        """If target already has a config for the same pipeline, it's not overwritten."""
        ProjectPipelineConfig.objects.create(
            project=self.target_project,
            pipeline=self.pipeline,
            enabled=True,
            config={"batch_size": 64},
        )

        _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute")

        target_config = ProjectPipelineConfig.objects.get(project=self.target_project, pipeline=self.pipeline)
        # Original target config preserved, not overwritten
        self.assertTrue(target_config.enabled)
        self.assertEqual(target_config.config, {"batch_size": 64})

    def test_no_clone_pipelines_flag(self):
        """--no-clone-pipelines skips pipeline config cloning."""
        _run_command(
            *self._base_args(),
            "--target-project",
            str(self.target_project.pk),
            "--no-clone-pipelines",
            "--execute",
        )

        self.assertFalse(ProjectPipelineConfig.objects.filter(project=self.target_project).exists())


class TestProcessingServiceLinking(MoveProjectDataSetupMixin, TestCase):
    """Test that ProcessingServices are linked to target project."""

    def setUp(self):
        self._setup_source(num_images=1, with_occurrences=False)
        self.target_project = _create_project(self.owner, "Target Project")

    def _create_processing_service_raw(self, name, project_ids):
        """Create a ProcessingService without triggering get_status() via the custom manager."""
        # NOTE(review): "RETURNING id" and NOW() are PostgreSQL syntax — these
        # tests assume a Postgres test database.
        with connection.cursor() as cursor:
            cursor.execute(
                "INSERT INTO ml_processingservice (name, description, endpoint_url, created_at, updated_at)"
                " VALUES (%s, '', 'http://test:2000', NOW(), NOW()) RETURNING id",
                [name],
            )
            ps_id = cursor.fetchone()[0]
            for pid in project_ids:
                cursor.execute(
                    "INSERT INTO ml_processingservice_projects (processingservice_id, project_id) VALUES (%s, %s)",
                    [ps_id, pid],
                )
        return ps_id

    def test_processing_services_linked(self):
        
"""Processing services from source are linked to target.""" + ps_id = self._create_processing_service_raw("Test PS", [self.source_project.pk]) + + _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute") + + # Check via raw SQL since ORM has column issues + with connection.cursor() as cursor: + cursor.execute( + "SELECT COUNT(*) FROM ml_processingservice_projects WHERE processingservice_id=%s AND project_id=%s", + [ps_id, self.target_project.pk], + ) + count = cursor.fetchone()[0] + self.assertEqual(count, 1) + + def test_already_linked_service_not_duplicated(self): + """If target already has the processing service linked, no duplicate is created.""" + ps_id = self._create_processing_service_raw("Test PS", [self.source_project.pk, self.target_project.pk]) + + _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute") + + with connection.cursor() as cursor: + cursor.execute( + "SELECT COUNT(*) FROM ml_processingservice_projects WHERE processingservice_id=%s AND project_id=%s", + [ps_id, self.target_project.pk], + ) + count = cursor.fetchone()[0] + self.assertEqual(count, 1) # Not duplicated + + +class TestIdentifierRolePreservation(MoveProjectDataSetupMixin, TestCase): + """Test that identifier users are added to target project with correct roles.""" + + def setUp(self): + self._setup_source(num_images=2, with_occurrences=True, with_identifications=True) + self.target_project = _create_project(self.owner, "Target Project") + + def test_identifiers_added_to_target(self): + """Users who made identifications on moved data are added to target project.""" + _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute") + + target_member_ids = set(self.target_project.members.values_list("pk", flat=True)) + self.assertIn(self.identifier_user.pk, target_member_ids) + + def test_identifier_already_member_not_duplicated(self): + """If the identifier is already a member of 
target, they're not re-added.""" + self.target_project.members.add(self.identifier_user) + pre_count = self.target_project.members.count() + + _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute") + + self.assertEqual(self.target_project.members.count(), pre_count) + + +class TestTaxaListLinking(MoveProjectDataSetupMixin, TestCase): + """Test TaxaList linking to target project.""" + + def setUp(self): + self._setup_source(num_images=1, with_occurrences=True) + self.target_project = _create_project(self.owner, "Target Project") + + def test_taxa_lists_linked(self): + """TaxaLists from source are linked to target project.""" + taxa_list = TaxaList.objects.create(name="Test List") + taxa_list.projects.add(self.source_project) + taxa_list.taxa.set(self.taxa) + + _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute") + + self.assertIn(self.target_project, taxa_list.projects.all()) + + +class TestDefaultFilterCopying(MoveProjectDataSetupMixin, TestCase): + """Test copying of default filter config.""" + + def setUp(self): + self._setup_source(num_images=1, with_occurrences=True) + self.target_project = _create_project(self.owner, "Target Project") + + def test_score_threshold_copied(self): + """Source project's score threshold is copied to target.""" + self.source_project.default_filters_score_threshold = 0.75 + self.source_project.save() + + _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), "--execute") + + self.target_project.refresh_from_db() + self.assertEqual(self.target_project.default_filters_score_threshold, 0.75) + + def test_include_exclude_taxa_copied(self): + """Source project's include/exclude taxa lists are copied.""" + self.source_project.default_filters_include_taxa.add(self.taxa[0]) + self.source_project.default_filters_exclude_taxa.add(self.taxa[1]) + + _run_command(*self._base_args(), "--target-project", str(self.target_project.pk), 
"--execute") + + self.target_project.refresh_from_db() + self.assertIn(self.taxa[0], self.target_project.default_filters_include_taxa.all()) + self.assertIn(self.taxa[1], self.target_project.default_filters_exclude_taxa.all()) + + +class TestEdgeCases(MoveProjectDataSetupMixin, TestCase): + """Edge cases and boundary conditions.""" + + def setUp(self): + self.owner = _create_user() + self.source_project = _create_project(self.owner, "Source Project") + + def test_move_deployment_with_no_images(self): + """Moving an empty deployment should succeed.""" + dep = _create_deployment(self.source_project, name="Empty Dep") + target = _create_project(self.owner, "Target") + + _run_command( + "--source-project", + str(self.source_project.pk), + "--deployment-ids", + str(dep.pk), + "--target-project", + str(target.pk), + "--execute", + ) + + dep.refresh_from_db() + self.assertEqual(dep.project_id, target.pk) + + def test_move_all_deployments_from_source(self): + """Moving all deployments leaves source project empty.""" + dep1 = _create_deployment(self.source_project, name="Dep 1") + dep2 = _create_deployment(self.source_project, name="Dep 2") + _create_captures(dep1, count=2) + _create_captures(dep2, count=2) + target = _create_project(self.owner, "Target") + + _run_command( + "--source-project", + str(self.source_project.pk), + "--deployment-ids", + f"{dep1.pk},{dep2.pk}", + "--target-project", + str(target.pk), + "--execute", + ) + + self.assertEqual(Deployment.objects.filter(project=self.source_project).count(), 0) + self.assertEqual(SourceImage.objects.filter(project=self.source_project).count(), 0) + self.assertEqual(Deployment.objects.filter(project=target).count(), 2) + + def test_move_multiple_deployments(self): + """Multiple comma-separated deployment IDs work.""" + dep1 = _create_deployment(self.source_project, name="Dep 1") + dep2 = _create_deployment(self.source_project, name="Dep 2") + _create_captures(dep1, count=2) + _create_captures(dep2, count=3) + target = 
_create_project(self.owner, "Target") + + _run_command( + "--source-project", + str(self.source_project.pk), + "--deployment-ids", + f"{dep1.pk},{dep2.pk}", + "--target-project", + str(target.pk), + "--execute", + ) + + self.assertEqual(SourceImage.objects.filter(project=target).count(), 5) + + def test_target_already_has_taxa(self): + """Moving to a project that already has taxa doesn't create duplicates.""" + dep = _create_deployment(self.source_project, name="Dep A") + imgs = _create_captures(dep, count=2) + taxa = _create_taxa(self.source_project) + # Create occurrences referencing both taxa (2 images, 2 taxa → both used as determinations) + _create_occurrences(dep, taxa, imgs) + + target = _create_project(self.owner, "Target") + target.taxa.add(taxa[0]) # Pre-existing + + _run_command( + "--source-project", + str(self.source_project.pk), + "--deployment-ids", + str(dep.pk), + "--target-project", + str(target.pk), + "--execute", + ) + + # taxa[0] should appear only once (no duplicate from .add()) + self.assertEqual(target.taxa.filter(pk=taxa[0].pk).count(), 1) + # taxa[1] should also be linked (referenced by second occurrence's determination) + self.assertIn(taxa[1], target.taxa.all())