Skip to content

Commit 7faa3a8

Browse files
authored
fix: remove terminated extension services from instance config (#620)
## Description Fixes a gap where extension services marked for removal were not cleaned up from instance config even after all DPUs reported successful termination. Previously, terminated extension service cleanup was only executed in the `WaitingForExtensionServicesConfig` instance state; however, extension service config updates do not transition the machine out of Ready (only tenant state moves to `Configuring`). This change adds instance extension service config cleanup in the `Ready` state so terminated services can be cleaned up. ## Type of Change <!-- Check one that best describes this PR --> - [ ] **Add** - New feature or capability - [ ] **Change** - Changes in existing functionality - [x] **Fix** - Bug fixes - [ ] **Remove** - Removed features or deprecated functionality - [ ] **Internal** - Internal changes (refactoring, tests, docs, etc.) ## Related Issues (Optional) <!-- If applicable, provide GitHub Issue. --> ## Breaking Changes - [ ] This PR contains breaking changes <!-- If checked above, describe the breaking changes and migration steps --> ## Testing <!-- How was this tested? Check all that apply --> - [x] Unit tests added/updated - [ ] Integration tests added/updated - [ ] Manual testing performed - [ ] No testing required (docs, internal refactor, etc.) Signed-off-by: Felicity Xu <hanyux@nvidia.com>
1 parent 8d52f26 commit 7faa3a8

File tree

3 files changed

+284
-47
lines changed

3 files changed

+284
-47
lines changed

crates/api/src/state_controller/machine/handler.rs

Lines changed: 102 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -5438,9 +5438,25 @@ impl StateHandler for InstanceStateHandler {
54385438
return Ok(StateHandlerOutcome::transition(next_state));
54395439
}
54405440

5441-
let mut txn = ctx.services.db_pool.begin().await?;
5442-
let extension_services_status =
5443-
get_extension_services_status(mh_snapshot, instance, &mut txn).await?;
5441+
let mut extension_services_status =
5442+
get_extension_services_status(mh_snapshot, instance);
5443+
let txn = if extension_services_status.configs_synced == SyncState::Synced
5444+
&& !extension_services_status
5445+
.get_terminated_service_ids()
5446+
.is_empty()
5447+
{
5448+
let mut txn = ctx.services.db_pool.begin().await?;
5449+
cleanup_terminated_extension_services(
5450+
instance,
5451+
&mut extension_services_status,
5452+
txn.as_mut(),
5453+
)
5454+
.await?;
5455+
5456+
Some(txn)
5457+
} else {
5458+
None
5459+
};
54445460
let outcome = match extension_service::compute_extension_services_readiness(&extension_services_status) {
54455461
ExtensionServicesReadiness::Ready => {
54465462
let next_state = ManagedHostState::Assigned {
@@ -5465,7 +5481,10 @@ impl StateHandler for InstanceStateHandler {
54655481
)
54665482
}
54675483
};
5468-
Ok(outcome.with_txn(txn))
5484+
Ok(match txn {
5485+
Some(txn) => outcome.with_txn(txn),
5486+
None => outcome,
5487+
})
54695488
}
54705489
InstanceState::WaitingForRebootToReady => {
54715490
// If custom_pxe_reboot_requested is set, this reboot was triggered by
@@ -5511,6 +5530,33 @@ impl StateHandler for InstanceStateHandler {
55115530
return Ok(StateHandlerOutcome::transition(next_state));
55125531
}
55135532

5533+
// Run cleanup here so fully terminated extension services are
5534+
// removed from persisted instance config.
5535+
let mut txn_opt = None;
5536+
if !instance
5537+
.config
5538+
.extension_services
5539+
.service_configs
5540+
.is_empty()
5541+
{
5542+
let mut extension_services_status =
5543+
get_extension_services_status(mh_snapshot, instance);
5544+
if extension_services_status.configs_synced == SyncState::Synced
5545+
&& !extension_services_status
5546+
.get_terminated_service_ids()
5547+
.is_empty()
5548+
{
5549+
let mut txn = ctx.services.db_pool.begin().await?;
5550+
cleanup_terminated_extension_services(
5551+
instance,
5552+
&mut extension_services_status,
5553+
txn.as_mut(),
5554+
)
5555+
.await?;
5556+
txn_opt = Some(txn);
5557+
}
5558+
}
5559+
55145560
let reprov_can_be_started =
55155561
if dpu_reprovisioning_needed(&mh_snapshot.dpu_snapshots) {
55165562
// Usually all DPUs are updated with user_approval_received field as true
@@ -5610,7 +5656,11 @@ impl StateHandler for InstanceStateHandler {
56105656
}
56115657
};
56125658

5613-
let mut txn = ctx.services.db_pool.begin().await?;
5659+
let mut txn = if let Some(txn) = txn_opt.take() {
5660+
txn
5661+
} else {
5662+
ctx.services.db_pool.begin().await?
5663+
};
56145664

56155665
if host_firmware_requested {
56165666
let health_override =
@@ -5642,6 +5692,8 @@ impl StateHandler for InstanceStateHandler {
56425692
}
56435693

56445694
Ok(StateHandlerOutcome::transition(next_state).with_txn(txn))
5695+
} else if let Some(txn) = txn_opt {
5696+
Ok(StateHandlerOutcome::do_nothing().with_txn(txn))
56455697
} else {
56465698
Ok(StateHandlerOutcome::do_nothing())
56475699
}
@@ -6137,56 +6189,63 @@ impl StateHandler for InstanceStateHandler {
61376189

61386190
// Gets extension services status from DB, checks if any removed services are fully terminated
61396191
// across all DPUs, if so, remove them from the instance config in the DB(without updating the version).
6140-
async fn get_extension_services_status(
6192+
fn get_extension_services_status(
61416193
mh_snapshot: &ManagedHostStateSnapshot,
61426194
instance: &InstanceSnapshot,
6143-
txn: &mut PgConnection,
6144-
) -> Result<InstanceExtensionServicesStatus, StateHandlerError> {
6195+
) -> InstanceExtensionServicesStatus {
61456196
let (_, dpu_id_to_device_map) = mh_snapshot
61466197
.host_snapshot
61476198
.get_dpu_device_and_id_mappings()
61486199
.unwrap_or_else(|_| (HashMap::default(), HashMap::default()));
61496200

61506201
// Gather instance extension services status from all DPU observations
6151-
let mut extension_services_status =
6152-
InstanceExtensionServicesStatus::from_config_and_observations(
6153-
&dpu_id_to_device_map,
6154-
Versioned::new(
6155-
&instance.config.extension_services,
6156-
instance.extension_services_config_version,
6157-
),
6158-
&instance.observations.extension_services,
6159-
);
6160-
6161-
if extension_services_status.configs_synced == SyncState::Synced {
6162-
let terminated_service_ids = extension_services_status.get_terminated_service_ids();
6163-
if !terminated_service_ids.is_empty() {
6164-
tracing::info!(
6165-
instance_id = %instance.id,
6166-
service_ids = ?terminated_service_ids,
6167-
"Cleaning up fully terminated extension services from instance config"
6168-
);
6169-
let new_config = instance
6170-
.config
6171-
.extension_services
6172-
.remove_terminated_services(&terminated_service_ids);
6202+
InstanceExtensionServicesStatus::from_config_and_observations(
6203+
&dpu_id_to_device_map,
6204+
Versioned::new(
6205+
&instance.config.extension_services,
6206+
instance.extension_services_config_version,
6207+
),
6208+
&instance.observations.extension_services,
6209+
)
6210+
}
61736211

6174-
db::instance::update_extension_services_config(
6175-
txn,
6176-
instance.id,
6177-
instance.extension_services_config_version,
6178-
&new_config,
6179-
false,
6180-
)
6181-
.await?;
6212+
async fn cleanup_terminated_extension_services(
6213+
instance: &InstanceSnapshot,
6214+
extension_services_status: &mut InstanceExtensionServicesStatus,
6215+
txn: &mut PgConnection,
6216+
) -> Result<(), StateHandlerError> {
6217+
if extension_services_status.configs_synced != SyncState::Synced {
6218+
return Ok(());
6219+
}
61826220

6183-
extension_services_status
6184-
.extension_services
6185-
.retain(|svc| !terminated_service_ids.contains(&svc.service_id));
6186-
}
6221+
let terminated_service_ids = extension_services_status.get_terminated_service_ids();
6222+
if terminated_service_ids.is_empty() {
6223+
return Ok(());
61876224
}
61886225

6189-
Ok(extension_services_status)
6226+
tracing::info!(
6227+
instance_id = %instance.id,
6228+
service_ids = ?terminated_service_ids,
6229+
"Cleaning up fully terminated extension services from instance config"
6230+
);
6231+
let new_config = instance
6232+
.config
6233+
.extension_services
6234+
.remove_terminated_services(&terminated_service_ids);
6235+
6236+
db::instance::update_extension_services_config(
6237+
txn,
6238+
instance.id,
6239+
instance.extension_services_config_version,
6240+
&new_config,
6241+
false,
6242+
)
6243+
.await?;
6244+
6245+
extension_services_status
6246+
.extension_services
6247+
.retain(|svc| !terminated_service_ids.contains(&svc.service_id));
6248+
Ok(())
61906249
}
61916250

61926251
async fn handle_instance_network_config_update_request(

crates/api/src/tests/common/api_fixtures/mod.rs

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2069,6 +2069,17 @@ pub async fn network_configured_with_health(
20692069
env: &TestEnv,
20702070
dpu_machine_id: &MachineId,
20712071
dpu_health: Option<rpc::health::HealthReport>,
2072+
) {
2073+
network_configured_with_health_and_ext_services(env, dpu_machine_id, dpu_health, None).await
2074+
}
2075+
2076+
/// Fake an iteration of forge-dpu-agent requesting network config, applying it, and reporting back.
2077+
/// When reporting back, the health and extension services statuses reported by the DPU can be overrridden
2078+
pub async fn network_configured_with_health_and_ext_services(
2079+
env: &TestEnv,
2080+
dpu_machine_id: &MachineId,
2081+
dpu_health: Option<rpc::health::HealthReport>,
2082+
extension_services_state: Option<rpc::forge::DpuExtensionServiceDeploymentStatus>,
20722083
) {
20732084
let network_config = env
20742085
.api
@@ -2160,9 +2171,9 @@ pub async fn network_configured_with_health(
21602171
service_type: extension_service.service_type,
21612172
service_name: "".to_string(),
21622173
version: extension_service.version.to_string(),
2163-
state:
2164-
rpc::forge::DpuExtensionServiceDeploymentStatus::DpuExtensionServiceRunning
2165-
as i32,
2174+
state: extension_services_state.unwrap_or(
2175+
rpc::forge::DpuExtensionServiceDeploymentStatus::DpuExtensionServiceRunning,
2176+
) as i32,
21662177
components: vec![],
21672178
message: "".to_string(),
21682179
removed: extension_service.removed.clone(),

0 commit comments

Comments
 (0)