diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md index 23af3043c1..2c3d9172a0 100644 --- a/book/src/SUMMARY.md +++ b/book/src/SUMMARY.md @@ -21,6 +21,7 @@ - [NIC and Port selection](architecture/infiniband/nic_selection.md) - [State Machines]() - [Managed Host](architecture/state_machines/managedhost.md) + - [Switch](architecture/state_machines/switch.md) # Manuals diff --git a/book/src/architecture/state_machines/switch.md b/book/src/architecture/state_machines/switch.md new file mode 100644 index 0000000000..e318b8da71 --- /dev/null +++ b/book/src/architecture/state_machines/switch.md @@ -0,0 +1,81 @@ +# Switch State Diagram + +This document describes the Finite State Machine (FSM) for Switches in Carbide: lifecycle from creation through configuration, validation, ready, optional reprovisioning, and deletion. + +## High-Level Overview + +The main flow shows the primary states and transitions: + +
+ +```plantuml +@startuml +skinparam state { + BackgroundColor White +} + +state "Initializing" as Initializing +state "Configuring\n(RotateOsPassword)" as Configuring +state "Validating" as Validating +state "BomValidating" as BomValidating +state "Ready" as Ready +state "ReProvisioning\n(Start → WaitFirmware)" as ReProvisioning +state "Error" as Error +state "Deleting" as Deleting + +[*] --> Initializing : Switch created + +Initializing --> Configuring : init complete +Configuring --> Validating : rotate password done +Validating --> BomValidating : validation complete +BomValidating --> Ready : BOM validation complete + +Ready --> Deleting : marked for deletion +Ready --> ReProvisioning : reprovision requested + +ReProvisioning --> Ready : firmware upgrade Completed +ReProvisioning --> Error : firmware upgrade Failed + +Error --> Deleting : marked for deletion + +Deleting --> [*] : final delete +@enduml +``` + +
+ +## States + +| State | Description | +|-------|-------------| +| **Initializing** | Switch is created in Carbide; controller performs initial setup. | +| **Configuring** | Switch is being configured (rotate OS password). Sub-state: `RotateOsPassword`. | +| **Validating** | Switch is being validated. Sub-state: `ValidateComplete`. | +| **BomValidating** | BOM (Bill of Materials) validation. Sub-state: `BomValidateComplete`. | +| **Ready** | Switch is ready for use. From here it can be deleted, or reprovisioning can be requested. | +| **ReProvisioning** | Reprovisioning (e.g. firmware update) in progress. Sub-states: `Start`, `WaitFirmwareUpdateCompletion`. Completion is driven by `firmware_upgrade_status` (Completed → Ready, Failed → Error). | +| **Error** | Switch is in error (e.g. firmware upgrade failed). Can transition to Deleting if marked for deletion; otherwise waits for manual intervention. | +| **Deleting** | Switch is being removed; ends in final delete (terminal). | + +## Transitions (by trigger) + +| From | To | Trigger / Condition | +|------|-----|----------------------| +| *(create)* | Initializing | Switch created | +| Initializing | Configuring (RotateOsPassword) | Initialization complete | +| Configuring (RotateOsPassword) | Validating (ValidateComplete) | OS password rotated | +| Validating (ValidateComplete) | BomValidating (BomValidateComplete) | Validation complete | +| BomValidating (BomValidateComplete) | Ready | BOM validation complete | +| Ready | Deleting | `deleted` set (marked for deletion) | +| Ready | ReProvisioning (Start) | `switch_reprovisioning_requested` is set | +| ReProvisioning (Start) | ReProvisioning (WaitFirmwareUpdateCompletion) | Reprovision triggered | +| ReProvisioning (WaitFirmwareUpdateCompletion) | Ready | `firmware_upgrade_status == Completed` | +| ReProvisioning (WaitFirmwareUpdateCompletion) | Error | `firmware_upgrade_status == Failed { cause }` | +| Error | Deleting | `deleted` set (marked for deletion) | +| 
Deleting | *(end)* | Final delete committed | + +## Implementation + +- **State type**: `SwitchControllerState` in `crates/api-model/src/switch/mod.rs`. +- **Handlers**: `crates/api/src/state_controller/switch/` — one module per top-level state (`initializing`, `configuring`, `validating`, `bom_validating`, `ready`, `reprovisioning`, `error_state`, `deleting`). +- **Orchestration**: `SwitchStateHandler` in `handler.rs` delegates to the handler for the current `controller_state`. diff --git a/crates/admin-cli/src/expected_switch/add/args.rs b/crates/admin-cli/src/expected_switch/add/args.rs index 3cba806162..4224742638 100644 --- a/crates/admin-cli/src/expected_switch/add/args.rs +++ b/crates/admin-cli/src/expected_switch/add/args.rs @@ -37,6 +37,8 @@ pub struct Args { )] pub switch_serial_number: String, + #[clap(long, help = "NVOS MAC address of the expected switch")] + pub nvos_mac_address: Option, #[clap(long, help = "NVOS username of the expected switch")] pub nvos_username: Option, #[clap(long, help = "NVOS password of the expected switch")] @@ -89,6 +91,7 @@ impl From for rpc::forge::ExpectedSwitch { switch_serial_number: value.switch_serial_number, metadata: Some(metadata), rack_id: value.rack_id, + nvos_mac_address: value.nvos_mac_address.map(|m| m.to_string()), nvos_username: value.nvos_username, nvos_password: value.nvos_password, } diff --git a/crates/admin-cli/src/expected_switch/common.rs b/crates/admin-cli/src/expected_switch/common.rs index 9324d0a32c..16c6288ed6 100644 --- a/crates/admin-cli/src/expected_switch/common.rs +++ b/crates/admin-cli/src/expected_switch/common.rs @@ -25,6 +25,7 @@ pub struct ExpectedSwitchJson { pub bmc_username: String, pub bmc_password: String, pub switch_serial_number: String, + pub nvos_mac_address: Option, pub nvos_username: Option, pub nvos_password: Option, #[serde(default)] diff --git a/crates/admin-cli/src/expected_switch/show/cmd.rs b/crates/admin-cli/src/expected_switch/show/cmd.rs index 03aae36764..e02a2a7e3d 
100644 --- a/crates/admin-cli/src/expected_switch/show/cmd.rs +++ b/crates/admin-cli/src/expected_switch/show/cmd.rs @@ -100,6 +100,7 @@ fn convert_and_print_into_nice_table( table.set_titles(row![ "Serial Number", "BMC Mac", + "NVOS Mac", "Interface IP", "Associated Machine", "Name", @@ -141,6 +142,7 @@ fn convert_and_print_into_nice_table( table.add_row(row![ expected_switch.switch_serial_number, expected_switch.bmc_mac_address, + expected_switch.nvos_mac_address.as_deref().unwrap_or_default(), machine_interface .map(|x| x.address.join("\n")) .unwrap_or("Undiscovered".to_string()), diff --git a/crates/admin-cli/src/expected_switch/update/args.rs b/crates/admin-cli/src/expected_switch/update/args.rs index 91d7449556..277de8e89e 100644 --- a/crates/admin-cli/src/expected_switch/update/args.rs +++ b/crates/admin-cli/src/expected_switch/update/args.rs @@ -27,6 +27,9 @@ use uuid::Uuid; "bmc_username", "bmc_password", "switch_serial_number", +"nvos_mac_address", +"nvos_username", +"nvos_password", ])))] pub struct Args { #[clap(short = 'a', long, help = "BMC MAC Address of the expected switch")] @@ -59,6 +62,12 @@ pub struct Args { )] pub switch_serial_number: Option, + #[clap( + long, + group = "group", + help = "NVOS MAC address of the expected switch" + )] + pub nvos_mac_address: Option, #[clap(long, group = "group", help = "NVOS username of the expected switch")] pub nvos_username: Option, #[clap(long, group = "group", help = "NVOS password of the expected switch")] @@ -140,6 +149,7 @@ impl TryFrom for rpc::forge::ExpectedSwitch { labels: crate::metadata::parse_rpc_labels(args.labels.unwrap_or_default()), }), rack_id: args.rack_id, + nvos_mac_address: args.nvos_mac_address.map(|m| m.to_string()), }) } } diff --git a/crates/admin-cli/src/rpc.rs b/crates/admin-cli/src/rpc.rs index 6778d12ca3..f92a742aec 100644 --- a/crates/admin-cli/src/rpc.rs +++ b/crates/admin-cli/src/rpc.rs @@ -564,7 +564,6 @@ impl ApiClient { Ok(self.0.update_expected_machine(request).await?) 
} - pub async fn replace_all_expected_machines( &self, expected_machine_list: Vec, @@ -635,6 +634,7 @@ impl ApiClient { bmc_username: switch.bmc_username, bmc_password: switch.bmc_password, switch_serial_number: switch.switch_serial_number, + nvos_mac_address: switch.nvos_mac_address.map(|m| m.to_string()), nvos_username: switch.nvos_username, nvos_password: switch.nvos_password, metadata: switch.metadata, diff --git a/crates/api-db/migrations/20260316120000_switch_reprovisioning_requested.sql b/crates/api-db/migrations/20260316120000_switch_reprovisioning_requested.sql new file mode 100644 index 0000000000..448cd08f88 --- /dev/null +++ b/crates/api-db/migrations/20260316120000_switch_reprovisioning_requested.sql @@ -0,0 +1,6 @@ +-- Add switch_reprovisioning_requested and firmware_upgrade_status columns to switches table. +-- switch_reprovisioning_requested: when set by an external entity, the state controller (when switch is Ready) transitions to ReProvisioning::Start. +-- firmware_upgrade_status: used during ReProvisioning (WaitFirmwareUpdateCompletion): Started, InProgress, Completed, Failed. +ALTER TABLE switches + ADD COLUMN switch_reprovisioning_requested JSONB, + ADD COLUMN firmware_upgrade_status JSONB; diff --git a/crates/api-db/migrations/20260316120002_expected_switches_nvos_mac_address.sql b/crates/api-db/migrations/20260316120002_expected_switches_nvos_mac_address.sql new file mode 100644 index 0000000000..718043c5a2 --- /dev/null +++ b/crates/api-db/migrations/20260316120002_expected_switches_nvos_mac_address.sql @@ -0,0 +1,3 @@ +-- Add nvos_mac_address column to expected_switches table (NVOS host MAC, similar to bmc_mac_address). 
+ALTER TABLE expected_switches + ADD COLUMN nvos_mac_address macaddr; diff --git a/crates/api-db/src/expected_switch.rs b/crates/api-db/src/expected_switch.rs index 8e3ac1e900..6ebae1738e 100644 --- a/crates/api-db/src/expected_switch.rs +++ b/crates/api-db/src/expected_switch.rs @@ -165,9 +165,9 @@ pub async fn create( ) -> DatabaseResult { let id = switch.expected_switch_id.unwrap_or_else(Uuid::new_v4); let query = "INSERT INTO expected_switches - (expected_switch_id, bmc_mac_address, bmc_username, bmc_password, serial_number, metadata_name, metadata_description, rack_id, metadata_labels, nvos_username, nvos_password) + (expected_switch_id, bmc_mac_address, bmc_username, bmc_password, serial_number, metadata_name, metadata_description, rack_id, metadata_labels, nvos_username, nvos_password, nvos_mac_address) VALUES - ($1::uuid, $2::macaddr, $3::varchar, $4::varchar, $5::varchar, $6::varchar, $7::varchar, $8::varchar, $9::jsonb, $10::varchar, $11::varchar) RETURNING *"; + ($1::uuid, $2::macaddr, $3::varchar, $4::varchar, $5::varchar, $6::varchar, $7::varchar, $8::varchar, $9::jsonb, $10::varchar, $11::varchar, $12::macaddr) RETURNING *"; sqlx::query_as(query) .bind(id) @@ -181,6 +181,7 @@ pub async fn create( .bind(sqlx::types::Json(&switch.metadata.labels)) .bind(&switch.nvos_username) .bind(&switch.nvos_password) + .bind(switch.nvos_mac_address) .fetch_one(txn) .await .map_err(|err: sqlx::Error| match err { diff --git a/crates/api-db/src/switch.rs b/crates/api-db/src/switch.rs index bc5f00b4dd..7fafb68b3f 100644 --- a/crates/api-db/src/switch.rs +++ b/crates/api-db/src/switch.rs @@ -22,7 +22,9 @@ use chrono::prelude::*; use config_version::{ConfigVersion, Versioned}; use futures::StreamExt; use model::controller_outcome::PersistentStateHandlerOutcome; -use model::switch::{NewSwitch, Switch, SwitchControllerState}; +use model::switch::{ + FirmwareUpgradeStatus, NewSwitch, Switch, SwitchControllerState, SwitchReprovisionRequest, +}; use sqlx::PgConnection; use 
crate::{ @@ -81,6 +83,8 @@ pub async fn create(txn: &mut PgConnection, new_switch: &NewSwitch) -> DatabaseR version, }, controller_state_outcome: None, + switch_reprovisioning_requested: None, + firmware_upgrade_status: None, }) } @@ -128,6 +132,18 @@ pub async fn find_by_id(txn: &mut PgConnection, id: &SwitchId) -> DatabaseResult } } +pub async fn find_by_host_mac_address( + txn: &mut PgConnection, + host_mac_address: &MacAddress, +) -> DatabaseResult> { + let query = sqlx::query_as::<_, Switch>("SELECT * FROM switches WHERE host_mac_address = $1"); + query + .bind(host_mac_address) + .fetch_optional(txn) + .await + .map_err(|e| DatabaseError::new("find_by_host_mac_address", e)) +} + pub async fn find_all(txn: &mut PgConnection) -> DatabaseResult> { let query = sqlx::query_as::<_, SwitchId>("SELECT id FROM switches WHERE deleted IS NULL"); @@ -207,6 +223,62 @@ pub async fn update_controller_state_outcome( Ok(()) } +/// Sets switch_reprovisioning_requested on the switch. Can be called from any state machine or +/// service. When the switch is in Ready state, the switch state controller will observe the flag +/// and transition to ReProvisioning::Start. +pub async fn set_switch_reprovisioning_requested( + txn: &mut PgConnection, + switch_id: SwitchId, + initiator: &str, +) -> DatabaseResult<()> { + let req = SwitchReprovisionRequest { + requested_at: Utc::now(), + initiator: initiator.to_string(), + }; + let query = + "UPDATE switches SET switch_reprovisioning_requested = $1 WHERE id = $2 RETURNING id"; + sqlx::query_as::<_, SwitchId>(query) + .bind(sqlx::types::Json(req)) + .bind(switch_id) + .fetch_optional(txn) + .await + .map_err(|e| DatabaseError::new("set_switch_reprovisioning_requested", e))?; + Ok(()) +} + +/// Clears switch_reprovisioning_requested. Typically called when reprovisioning completes or is +/// cancelled. 
+pub async fn clear_switch_reprovisioning_requested( + txn: &mut PgConnection, + switch_id: SwitchId, +) -> DatabaseResult<()> { + let query = + "UPDATE switches SET switch_reprovisioning_requested = NULL WHERE id = $1 RETURNING id"; + sqlx::query_as::<_, SwitchId>(query) + .bind(switch_id) + .fetch_optional(txn) + .await + .map_err(|e| DatabaseError::new("clear_switch_reprovisioning_requested", e))?; + Ok(()) +} + +/// Sets firmware_upgrade_status on the switch. Call from any state machine or service to report +/// upgrade progress. WaitFirmwareUpdateCompletion reads this: Completed → Ready, Failed → Error. +pub async fn update_firmware_upgrade_status( + txn: &mut PgConnection, + switch_id: SwitchId, + status: Option<&FirmwareUpgradeStatus>, +) -> DatabaseResult<()> { + let query = "UPDATE switches SET firmware_upgrade_status = $1 WHERE id = $2 RETURNING id"; + sqlx::query_as::<_, SwitchId>(query) + .bind(status.map(|s| sqlx::types::Json(s.clone()))) + .bind(switch_id) + .fetch_optional(txn) + .await + .map_err(|e| DatabaseError::new("update_firmware_upgrade_status", e))?; + Ok(()) +} + pub async fn mark_as_deleted<'a>( switch: &'a mut Switch, txn: &mut PgConnection, diff --git a/crates/api-model/src/expected_switch.rs b/crates/api-model/src/expected_switch.rs index 6ccf690114..8776ba0eb2 100644 --- a/crates/api-model/src/expected_switch.rs +++ b/crates/api-model/src/expected_switch.rs @@ -28,11 +28,13 @@ use uuid::Uuid; use crate::metadata::{Metadata, default_metadata_for_deserializer}; -#[derive(Debug, Clone, Default, Deserialize)] +#[derive(Default, Debug, Clone, Deserialize)] +#[serde(default)] pub struct ExpectedSwitch { #[serde(default)] pub expected_switch_id: Option, pub bmc_mac_address: MacAddress, + pub nvos_mac_address: Option, pub bmc_username: String, pub serial_number: String, pub bmc_password: String, @@ -52,9 +54,12 @@ impl<'r> FromRow<'r, PgRow> for ExpectedSwitch { labels: labels.0, }; + let nvos_mac_address: Option = 
row.try_get("nvos_mac_address").ok(); + Ok(ExpectedSwitch { expected_switch_id: row.try_get("expected_switch_id")?, bmc_mac_address: row.try_get("bmc_mac_address")?, + nvos_mac_address, bmc_username: row.try_get("bmc_username")?, serial_number: row.try_get("serial_number")?, bmc_password: row.try_get("bmc_password")?, @@ -75,6 +80,7 @@ impl From for rpc::forge::ExpectedSwitch { value: u.to_string(), }), bmc_mac_address: expected_switch.bmc_mac_address.to_string(), + nvos_mac_address: expected_switch.nvos_mac_address.map(|m| m.to_string()), bmc_username: expected_switch.bmc_username, bmc_password: expected_switch.bmc_password, switch_serial_number: expected_switch.serial_number, @@ -92,6 +98,15 @@ impl TryFrom for ExpectedSwitch { fn try_from(rpc: rpc::forge::ExpectedSwitch) -> Result { let bmc_mac_address = MacAddress::try_from(rpc.bmc_mac_address.as_str()) .map_err(|_| RpcDataConversionError::InvalidMacAddress(rpc.bmc_mac_address.clone()))?; + let nvos_mac_address = if rpc.nvos_mac_address.is_none() { + None + } else { + let mac_address = rpc.nvos_mac_address.unwrap(); + Some( + MacAddress::try_from(mac_address.as_str()) + .map_err(|_| RpcDataConversionError::InvalidMacAddress(mac_address))?, + ) + }; let expected_switch_id = rpc .expected_switch_id .map(|u| { @@ -111,6 +126,7 @@ impl TryFrom for ExpectedSwitch { nvos_password: rpc.nvos_password, metadata, rack_id: rpc.rack_id, + nvos_mac_address, }) } } diff --git a/crates/api-model/src/site_explorer/mod.rs b/crates/api-model/src/site_explorer/mod.rs index 35a6067135..1d766c860f 100644 --- a/crates/api-model/src/site_explorer/mod.rs +++ b/crates/api-model/src/site_explorer/mod.rs @@ -667,6 +667,27 @@ impl ExploredManagedHost { } } +/// A switch endpoint (identified by its BMC) that was discovered via Site Exploration +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct ExploredManagedSwitch { + /// The Switch's BMC IP + pub bmc_ip: IpAddr, + // Host mac address + 
pub nv_os_mac_addresses: Vec, + /// Exploration report for this switch endpoint + pub report: EndpointExplorationReport, +} + +impl ExploredManagedSwitch { + pub fn bmc_info(&self) -> BmcInfo { + BmcInfo { + ip: Some(self.bmc_ip.to_string()), + ..Default::default() + } + } +} + /// Serialization methods for types which support FromStr/Display mod serialize_option_display { use std::fmt::Display; @@ -927,7 +948,7 @@ impl EndpointExplorationReport { //TODO: refactor for common code with generate_power_shelf_id /// Tries to generate and store a MachineId for the discovered endpoint if /// enough data for generation is available - pub fn generate_switch_id(&mut self) -> ModelResult> { + pub fn generate_switch_id(&mut self) -> ModelResult> { if let Some(serial_number) = self .systems .first() @@ -968,8 +989,8 @@ impl EndpointExplorationReport { MissingHardwareInfo::Serial, )) })?; - - Ok(Some(self.switch_id.insert(switch_id))) + self.switch_id = Some(switch_id); + Ok(self.switch_id) } else { Err(ModelError::HardwareInfo( HardwareInfoError::MissingHardwareInfo(MissingHardwareInfo::Serial), diff --git a/crates/api-model/src/switch/mod.rs b/crates/api-model/src/switch/mod.rs index 34797d6082..cbc9d164c3 100644 --- a/crates/api-model/src/switch/mod.rs +++ b/crates/api-model/src/switch/mod.rs @@ -92,6 +92,25 @@ pub struct SwitchStatus { pub health_status: String, // "ok", "warning", "critical" } +/// Set by an external entity to request switch reprovisioning. When the switch is in Ready state, +/// the state controller checks this flag and transitions to ReProvisioning::Start. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct SwitchReprovisionRequest { + pub requested_at: DateTime, + pub initiator: String, +} + +/// Status of the firmware upgrade during ReProvisioning. Set by an external entity (e.g. switch +/// firmware updater). WaitFirmwareUpdateCompletion waits for Completed or Failed. 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum FirmwareUpgradeStatus { + Started, + InProgress, + Completed, + Failed { cause: String }, +} + #[derive(Debug, Clone)] pub struct Switch { pub id: SwitchId, @@ -105,6 +124,13 @@ pub struct Switch { /// The result of the last attempt to change state pub controller_state_outcome: Option, + + /// When set, the state controller (in Ready) transitions to ReProvisioning::Start. + pub switch_reprovisioning_requested: Option, + + /// Firmware upgrade status during ReProvisioning. WaitFirmwareUpdateCompletion polls this; + /// when Completed, transition to Ready; when Failed, transition to Error. + pub firmware_upgrade_status: Option, // Columns for these exist, but are unused in rust code // pub created: DateTime, // pub updated: DateTime, @@ -118,6 +144,10 @@ impl<'r> FromRow<'r, PgRow> for Switch { let status: Option> = row.try_get("status").ok(); let controller_state_outcome: Option> = row.try_get("controller_state_outcome").ok(); + let switch_reprovisioning_requested: Option> = + row.try_get("switch_reprovisioning_requested").ok(); + let firmware_upgrade_status: Option> = + row.try_get("firmware_upgrade_status").ok(); Ok(Switch { id: row.try_get("id")?, @@ -129,6 +159,8 @@ impl<'r> FromRow<'r, PgRow> for Switch { version: row.try_get("controller_state_version")?, }, controller_state_outcome: controller_state_outcome.map(|o| o.0), + switch_reprovisioning_requested: switch_reprovisioning_requested.map(|j| j.0), + firmware_upgrade_status: firmware_upgrade_status.map(|j| j.0), }) } } @@ -193,18 +225,54 @@ impl TryFrom for rpc::Switch { } } +/// Sub-state for SwitchControllerState::Configuring +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum ConfiguringState { + RotateOsPassword, +} + +/// Sub-state for SwitchControllerState::Validating +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum ValidatingState { + 
ValidateComplete, +} + +/// Sub-state for SwitchControllerState::BomValidating +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum BomValidatingState { + /// BOM validation is complete; handler transitions to Ready. + BomValidateComplete, +} + +/// Sub-state for SwitchControllerState::ReProvisioning +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum ReProvisioningState { + /// Re-provisioning has been started. + Start, + /// Waiting for firmware update to complete. + WaitFirmwareUpdateCompletion, +} + /// State of a Switch as tracked by the controller #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] #[serde(tag = "state", rename_all = "lowercase")] pub enum SwitchControllerState { /// The Switch is created in Carbide, waiting for initialization. Initializing, - /// The Switch is fetching data. - FetchingData, /// The Switch is configuring. - Configuring, + Configuring { config_state: ConfiguringState }, + /// The Switch is validating. + Validating { validating_state: ValidatingState }, + /// The Switch is validating the BOM. + BomValidating { + bom_validating_state: BomValidatingState, + }, /// The Switch is ready for use. Ready, + // ReProvisioning + ReProvisioning { + reprovisioning_state: ReProvisioningState, + }, /// There is error in Switch; Switch can not be used if it's in error. Error { cause: String }, /// The Switch is in the process of deleting. @@ -223,15 +291,23 @@ pub fn state_sla(state: &SwitchControllerState, state_version: &ConfigVersion) - std::time::Duration::from_secs(slas::INITIALIZING), time_in_state, ), - SwitchControllerState::FetchingData => StateSla::with_sla( - std::time::Duration::from_secs(slas::FETCHING_DATA), + SwitchControllerState::Configuring { .. } => StateSla::with_sla( + std::time::Duration::from_secs(slas::CONFIGURING), + time_in_state, + ), + SwitchControllerState::Validating { .. 
} => StateSla::with_sla( + std::time::Duration::from_secs(slas::VALIDATING), time_in_state, ), - SwitchControllerState::Configuring => StateSla::with_sla( + SwitchControllerState::BomValidating { .. } => StateSla::with_sla( std::time::Duration::from_secs(slas::CONFIGURING), time_in_state, ), SwitchControllerState::Ready => StateSla::no_sla(), + SwitchControllerState::ReProvisioning { .. } => StateSla::with_sla( + std::time::Duration::from_secs(slas::CONFIGURING), + time_in_state, + ), SwitchControllerState::Error { .. } => StateSla::no_sla(), SwitchControllerState::Deleting => StateSla::with_sla( std::time::Duration::from_secs(slas::DELETING), @@ -271,28 +347,26 @@ mod tests { #[test] fn serialize_controller_state() { - let state = SwitchControllerState::Initializing {}; + let state = SwitchControllerState::Initializing; let serialized = serde_json::to_string(&state).unwrap(); assert_eq!(serialized, "{\"state\":\"initializing\"}"); assert_eq!( serde_json::from_str::(&serialized).unwrap(), state ); - let state = SwitchControllerState::FetchingData {}; + let state = SwitchControllerState::Configuring { + config_state: ConfiguringState::RotateOsPassword, + }; let serialized = serde_json::to_string(&state).unwrap(); - assert_eq!(serialized, "{\"state\":\"fetchingdata\"}"); assert_eq!( - serde_json::from_str::(&serialized).unwrap(), - state + serialized, + "{\"state\":\"configuring\",\"config_state\":\"RotateOsPassword\"}" ); - let state = SwitchControllerState::Configuring {}; - let serialized = serde_json::to_string(&state).unwrap(); - assert_eq!(serialized, "{\"state\":\"configuring\"}"); assert_eq!( serde_json::from_str::(&serialized).unwrap(), state ); - let state = SwitchControllerState::Ready {}; + let state = SwitchControllerState::Ready; let serialized = serde_json::to_string(&state).unwrap(); assert_eq!(serialized, "{\"state\":\"ready\"}"); assert_eq!( @@ -308,7 +382,7 @@ mod tests { serde_json::from_str::(&serialized).unwrap(), state ); - let state = 
SwitchControllerState::Deleting {}; + let state = SwitchControllerState::Deleting; let serialized = serde_json::to_string(&state).unwrap(); assert_eq!(serialized, "{\"state\":\"deleting\"}"); assert_eq!( diff --git a/crates/api-model/src/switch/slas.rs b/crates/api-model/src/switch/slas.rs index 6a52f04f87..e82dc2c6d1 100644 --- a/crates/api-model/src/switch/slas.rs +++ b/crates/api-model/src/switch/slas.rs @@ -18,12 +18,12 @@ /// SLA for Switch initialization in seconds pub const INITIALIZING: u64 = 300; // 5 minutes -/// SLA for Switch fetching data in seconds -pub const FETCHING_DATA: u64 = 300; // 5 minutes - /// SLA for Switch configuring in seconds pub const CONFIGURING: u64 = 300; // 5 minutes +/// SLA for Switch validating in seconds +pub const VALIDATING: u64 = 300; // 5 minutes + // /// SLA for Switch ready in seconds // pub const READY: u64 = 0; // 0 minutes diff --git a/crates/api/src/site_explorer/mod.rs b/crates/api/src/site_explorer/mod.rs index 51349e2468..f91a3a440f 100644 --- a/crates/api/src/site_explorer/mod.rs +++ b/crates/api/src/site_explorer/mod.rs @@ -29,12 +29,11 @@ use carbide_network::sanitized_mac; use carbide_uuid::machine::MachineType; use carbide_uuid::network::NetworkSegmentId; use carbide_uuid::power_shelf::{PowerShelfIdSource, PowerShelfType}; -use carbide_uuid::switch::{SwitchIdSource, SwitchType}; use chrono::Utc; use config_version::ConfigVersion; use db::{ self, DatabaseError, ObjectFilter, Transaction, machine, network_segment as db_network_segment, - power_shelf as db_power_shelf, switch as db_switch, + power_shelf as db_power_shelf, }; use futures_util::stream::FuturesUnordered; use futures_util::{StreamExt, TryFutureExt}; @@ -51,10 +50,9 @@ use model::power_shelf::{NewPowerShelf, PowerShelfConfig}; use model::resource_pool::common::CommonPools; use model::site_explorer::{ EndpointExplorationError, EndpointExplorationReport, EndpointType, ExploredDpu, - ExploredEndpoint, ExploredManagedHost, MachineExpectation, PowerState, 
PreingestionState, - Service, is_bf3_dpu, is_bf3_supernic, is_bluefield_model, + ExploredEndpoint, ExploredManagedHost, ExploredManagedSwitch, MachineExpectation, PowerState, + PreingestionState, Service, is_bf3_dpu, is_bf3_supernic, is_bluefield_model, }; -use model::switch::{NewSwitch, SwitchConfig}; use sqlx::PgPool; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; @@ -89,6 +87,8 @@ use model::expected_machine::ExpectedMachine; use model::firmware::FirmwareComponentType; use model::machine_interface_address::MachineInterfaceAssociation; use model::network_segment::NetworkSegmentType; +mod switch_creator; +pub use switch_creator::SwitchCreator; use self::metrics::{PairingBlockerReason, exploration_error_to_metric_label}; use crate::site_explorer::explored_endpoint_index::ExploredEndpointIndex; @@ -134,6 +134,7 @@ pub struct SiteExplorer { firmware_config: Arc, work_lock_manager_handle: WorkLockManagerHandle, machine_creator: MachineCreator, + switch_creator: SwitchCreator, boot_order_tracker: BootOrderTracker, rms_client: Option>, } @@ -168,6 +169,10 @@ impl SiteExplorer { common_pools, rms_client.clone(), ), + switch_creator: SwitchCreator::new( + database_connection.clone(), + explorer_config.clone(), + ), database_connection, enabled: explorer_config.enabled, config: explorer_config, @@ -546,14 +551,14 @@ impl SiteExplorer { } // Identify and create switches - let explored_switches = self - .identify_switches_to_ingest(&expected_endpoint_index) - .await?; + let explored_switches = self.identify_switches_to_ingest().await?; if self.config.create_switches.load(Ordering::Relaxed) { let start_create_switches = std::time::Instant::now(); - let create_switches_res: Result<(), CarbideError> = - self.create_switches(metrics, explored_switches).await; + let create_switches_res: Result<(), CarbideError> = self + .switch_creator + .create_switches(metrics, &explored_switches, &expected_endpoint_index) + .await; metrics.create_switches_latency = 
Some(start_create_switches.elapsed()); create_switches_res?; } @@ -610,35 +615,6 @@ impl SiteExplorer { Ok(()) } - /// Creates a `Switch` object for an identified switch endpoint with initial states - async fn create_switches( - &self, - metrics: &mut SiteExplorationMetrics, - explored_switches: Vec<(ExploredEndpoint, &ExpectedSwitch)>, - ) -> CarbideResult<()> { - for (endpoint, expected_switch) in explored_switches { - let address = endpoint.address; - match self - .create_switch(endpoint, expected_switch, &self.database_connection) - .await - { - Ok(true) => { - metrics.created_switches_count += 1; - if metrics.created_switches_count as u64 == self.config.switches_created_per_run - { - break; - } - } - Ok(false) => {} - Err(error) => { - tracing::error!(%error, "Failed to create switch {:#?}", address) - } - } - } - - Ok(()) - } - pub async fn create_power_shelf( &self, explored_endpoint: ExploredEndpoint, @@ -817,158 +793,6 @@ impl SiteExplorer { Ok(true) } - pub async fn create_switch( - &self, - explored_endpoint: ExploredEndpoint, - expected_switch: &ExpectedSwitch, - pool: &PgPool, - ) -> CarbideResult { - let mut txn = pool - .begin() - .await - .map_err(|e| DatabaseError::new("begin load create_switch", e))?; - - let metadata = expected_switch.metadata.clone(); - - let Some(mac_address) = metadata.labels.get("host_mac_address") else { - return Err(CarbideError::InvalidArgument(format!( - "no host NVOS MAC address found for switch {}", - explored_endpoint.address - ))); - }; - - let host_mac_address = MacAddress::try_from(mac_address.as_str()) - .map_err(|e| CarbideError::InvalidArgument(format!("Invalid MAC address: {}", e)))?; - - let interface = - db::machine_interface::find_by_mac_address(&mut *txn, host_mac_address).await?; - - let (host_nvos_mac_addresses, host_nvos_ip_addresses) = - if let Some(interface) = interface.first() { - ( - vec![mac_address.clone()], - interface - .addresses - .iter() - .map(|ip| ip.to_string()) - .collect::>(), - ) - } 
else { - (vec![], vec![]) - }; - - // Generate switch_id similar to machine_id using deterministic hashing - // Extract switch metadata similar to how machine_id extracts hardware info - //TODO fetch these from chassis - let switch_serial = expected_switch.serial_number.as_str(); - let switch_vendor = "NVIDIA"; // Default vendor for switches - let switch_model = "Switch"; // Default model identifier - - let switch_id = match model::switch::switch_id::from_hardware_info( - switch_serial, - switch_vendor, - switch_model, - SwitchIdSource::ProductBoardChassisSerial, - SwitchType::NvLink, - ) { - Ok(id) => id, - Err(e) => { - tracing::error!(%e, "Failed to create switch ID"); - return Err(CarbideError::InvalidArgument(format!( - "Failed to create switch ID: {e}" - ))); - } - }; - - // TODO: review - // Check if a switch with the same SwitchId already exists - if let Some(_existing_switch) = db_switch::find_by_id(&mut txn, &switch_id).await? { - tracing::info!( - "Switch with ID '{}' already exists, skipping creation for endpoint {}", - switch_id, - explored_endpoint.address - ); - txn.rollback() - .await - .map_err(|e| DatabaseError::new("rollback create_switch", e))?; - return Ok(false); - } - - let config = SwitchConfig { - name: switch_serial.to_string(), // TODO: use metadata.name if it is not empty - enable_nmxc: false, - fabric_manager_config: None, - location: Some("US/CA/DC/San Jose/1000 N Mathilda Ave".to_string()), - }; - - let new_switch = NewSwitch { - id: switch_id, - config, - }; - - db_switch::create(&mut txn, &new_switch).await?; - - let mac_addresses = explored_endpoint.report.all_mac_addresses(); - for mac_address in mac_addresses { - let mi = db::machine_interface::find_by_mac_address(&mut *txn, mac_address).await?; - if let Some(interface) = mi.first() { - db::machine_interface::associate_interface_with_machine( - &interface.id, - MachineInterfaceAssociation::Switch(switch_id), - &mut txn, - ) - .await?; - } - } - - // No need to update the switch 
name again; it was already set in config above. - txn.commit() - .await - .map_err(|e| DatabaseError::new("end create_switch", e))?; - - tracing::info!( - "Created switch {} for endpoint {}", - switch_id, - explored_endpoint.address, - ); - - // Register the switch with Rack Manager if RMS client is available - if let Some(rms_client) = &self.rms_client { - if let Some(rack_id) = expected_switch.rack_id { - let bmc_mac_address = expected_switch.bmc_mac_address; - let new_node_info = NewNodeInfo { - rack_id: rack_id.to_string(), - node_id: switch_id.to_string(), - mac_address: bmc_mac_address.to_string(), - ip_address: explored_endpoint.address.to_string(), - port: 443, - username: None, - password: None, - r#type: Some(RmsNodeType::Switch.into()), - vault_path: format!("switch_nvos/{bmc_mac_address}/admin"), - host_ip_addresses: host_nvos_ip_addresses, - host_mac_addresses: host_nvos_mac_addresses, - }; - if let Err(e) = rms::add_node_to_rms(rms_client.as_ref(), new_node_info).await { - tracing::warn!("Failed to add switch {} to Rack Manager: {}", switch_id, e); - } else { - tracing::info!( - "Added switch {} to Rack Manager for endpoint {}", - switch_id, - explored_endpoint.address, - ); - } - } else { - tracing::warn!( - "Cannot add switch {} to Rack Manager: rack_id is missing", - switch_id - ); - } - } - - Ok(true) - } - /// identify_machines_to_ingest returns two maps. /// The first map returned identifies all of the DPUs that site explorer will try to ingest. /// The latter identifies all of the hosts the the site explorer will try to ingest. 
@@ -1405,10 +1229,7 @@ impl SiteExplorer { Ok(explored_power_shelves) } - async fn identify_switches_to_ingest<'a>( - &self, - expected_endpoint_index: &'a ExploredEndpointIndex, - ) -> CarbideResult> { + async fn identify_switches_to_ingest(&self) -> CarbideResult> { let mut txn = self .database_connection .begin() @@ -1421,20 +1242,19 @@ impl SiteExplorer { txn.commit() .await .map_err(|e| DatabaseError::new("end find_all_preingestion_complete data", e))?; + let mut managed_switches = Vec::new(); + for ep in explored_endpoints.into_iter() { + //TODO: if this can filterout @ DB query. May required new column in explored_endpoints table. + if ep.report.endpoint_type == EndpointType::Bmc && ep.report.is_switch() { + managed_switches.push(ExploredManagedSwitch { + bmc_ip: ep.address, + nv_os_mac_addresses: ep.report.all_mac_addresses(), + report: ep.report, + }); + } + } - Ok(explored_endpoints - .into_iter() - .filter_map(|ep| { - if ep.report.endpoint_type == EndpointType::Bmc - && let Some(expected_switch) = - expected_endpoint_index.matched_expected_switch(&ep.address) - { - Some((ep, expected_switch)) - } else { - None - } - }) - .collect()) + Ok(managed_switches) } /// Checks if all data that a site exploration run requires is actually configured diff --git a/crates/api/src/site_explorer/switch_creator.rs b/crates/api/src/site_explorer/switch_creator.rs new file mode 100644 index 0000000000..3acd6278d4 --- /dev/null +++ b/crates/api/src/site_explorer/switch_creator.rs @@ -0,0 +1,210 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use carbide_uuid::switch::SwitchId; +use db::DatabaseError; +use model::expected_switch::ExpectedSwitch; +use model::site_explorer::ExploredManagedSwitch; +use sqlx::{PgConnection, PgPool}; + +use crate::CarbideResult; +use crate::site_explorer::SiteExplorerConfig; +use crate::site_explorer::explored_endpoint_index::ExploredEndpointIndex; +use crate::site_explorer::metrics::SiteExplorationMetrics; + +pub struct SwitchCreator { + database_connection: PgPool, + config: SiteExplorerConfig, +} + +impl SwitchCreator { + pub fn new(database_connection: PgPool, config: SiteExplorerConfig) -> Self { + Self { + database_connection, + config, + } + } + + pub(crate) async fn create_switches( + &self, + metrics: &mut SiteExplorationMetrics, + explored_managed_switches: &[ExploredManagedSwitch], + expected_explored_endpoint_index: &ExploredEndpointIndex, + ) -> CarbideResult<()> { + for explored_managed_switch in explored_managed_switches { + let expected_switch = match expected_explored_endpoint_index + .matched_expected_switch(&explored_managed_switch.bmc_ip) + { + Some(expected_switch) => expected_switch, + None => continue, + }; + + match self + .create_managed_switch( + explored_managed_switch, + expected_switch, + &self.database_connection, + ) + .await + { + Ok(true) => { + metrics.created_switches_count += 1; + if metrics.created_switches_count as u64 == self.config.switches_created_per_run + { + break; + } + } + Ok(false) => {} + Err(error) => { + tracing::error!( + %error, + "Failed to create managed switch {:#?}", + 
explored_managed_switch.bmc_ip + ); + } + } + } + + Ok(()) + } + + pub async fn create_managed_switch( + &self, + explored_managed_switch: &ExploredManagedSwitch, + expected_switch: &ExpectedSwitch, + pool: &PgPool, + ) -> CarbideResult { + let mut txn = pool + .begin() + .await + .map_err(|e| DatabaseError::new("begin create_managed_switch", e))?; + + let created = self + .create_switch(&mut txn, explored_managed_switch, expected_switch) + .await? + .is_some(); + + if !created { + txn.commit() + .await + .map_err(|e| DatabaseError::new("commit create_managed_switch", e))?; + return Ok(false); + } + + txn.commit() + .await + .map_err(|e| DatabaseError::new("commit create_managed_switch", e))?; + + Ok(true) + } + + // Returns SwitchId if switch was created. + async fn create_switch( + &self, + txn: &mut PgConnection, + explored_managed_switch: &ExploredManagedSwitch, + expected_switch: &ExpectedSwitch, + ) -> CarbideResult> { + let nv_os_mac_addresses = if explored_managed_switch.nv_os_mac_addresses.is_empty() { + match expected_switch.nvos_mac_address { + // This is to accomidate bug in redfish BMC + Some(nvos_mac_address) => vec![nvos_mac_address], + None => vec![], + } + } else { + explored_managed_switch.nv_os_mac_addresses.clone() + }; + for mac_address in &nv_os_mac_addresses { + if db::switch::find_by_host_mac_address(txn, mac_address) + .await? + .is_some() + { + // already exists, skip + return Ok(None); + } + } + let switch_id = explored_managed_switch + .clone() + .report + .generate_switch_id()? + .unwrap(); + + tracing::info!(%switch_id, "switch ID generated"); + + let existing_switch = db::switch::find_by_id(txn, &switch_id).await?; + + if let Some(_existing_switch) = existing_switch { + //Possibly multiple eth ports are connected? + tracing::warn!( + %switch_id, + "Switch already exists, skipping. Potentially multiple eth ports with same switch host serial number?" 
+ ); + return Ok(None); + } + + self.create_switch_from_explored_switch( + txn, + explored_managed_switch, + expected_switch, + switch_id, + ) + .await?; + + Ok(Some(switch_id)) + } + + async fn create_switch_from_explored_switch( + &self, + txn: &mut PgConnection, + _explored_switch: &ExploredManagedSwitch, + expected_switch: &ExpectedSwitch, + switch_id: SwitchId, + ) -> CarbideResult<()> { + let name = match expected_switch.metadata.name.is_empty() { + true => expected_switch.serial_number.to_string(), + false => expected_switch.metadata.name.to_string(), + }; + let config = model::switch::SwitchConfig { + name, + enable_nmxc: false, + fabric_manager_config: None, + location: Some("US/CA/DC/San Jose/1000 N Mathilda Ave".to_string()), + }; + let new_switch = model::switch::NewSwitch { + id: switch_id, + config, + }; + + _ = db::switch::create(txn, &new_switch).await?; + // let bmc_info = explored_switch.bmc_info(); + // let hardware_info = HardwareInfo::default(); //TODO: Add this later when we have hardware info + // self.update_switch_topology(txn, &switch_id, bmc_info, hardware_info) + // .await?; + Ok(()) + } + + // async fn update_switch_topology( + // &self, + // _txn: &mut PgConnection, + // _switch_id: &SwitchId, + // _bmc_info: BmcInfo, + // _hardware_info: HardwareInfo, + // ) -> CarbideResult<()> { + // //TODO Add this later when and if required + // Ok(()) + // } +} diff --git a/crates/api/src/state_controller/switch/bom_validating.rs b/crates/api/src/state_controller/switch/bom_validating.rs new file mode 100644 index 0000000000..5cfbac1335 --- /dev/null +++ b/crates/api/src/state_controller/switch/bom_validating.rs @@ -0,0 +1,46 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! Handler for SwitchControllerState::BomValidating. + +use carbide_uuid::switch::SwitchId; +use model::switch::{BomValidatingState, Switch, SwitchControllerState}; + +use crate::state_controller::state_handler::StateHandlerContext; +use crate::state_controller::state_handler::{StateHandlerError, StateHandlerOutcome}; +use crate::state_controller::switch::context::SwitchStateHandlerContextObjects; + +/// Handles the BomValidating state for a switch. +pub async fn handle_bom_validating( + _switch_id: &SwitchId, + state: &mut Switch, + _ctx: &mut StateHandlerContext<'_, SwitchStateHandlerContextObjects>, +) -> Result, StateHandlerError> { + let bom_validating_state = match &state.controller_state.value { + SwitchControllerState::BomValidating { + bom_validating_state, + } => bom_validating_state, + _ => unreachable!("handle_bom_validating called with non-BomValidating state"), + }; + + match bom_validating_state { + BomValidatingState::BomValidateComplete => { + tracing::info!("BOM Validating Switch: BomValidateComplete, moving to Ready"); + Ok(StateHandlerOutcome::transition(SwitchControllerState::Ready)) + } + } +} diff --git a/crates/api/src/state_controller/switch/configuring.rs b/crates/api/src/state_controller/switch/configuring.rs new file mode 100644 index 0000000000..6263f20ca9 --- /dev/null +++ b/crates/api/src/state_controller/switch/configuring.rs @@ -0,0 +1,50 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! Handler for SwitchControllerState::Configuring. + +use carbide_uuid::switch::SwitchId; +use model::switch::{ConfiguringState, Switch, SwitchControllerState, ValidatingState}; + +use crate::state_controller::state_handler::StateHandlerContext; +use crate::state_controller::state_handler::{StateHandlerError, StateHandlerOutcome}; +use crate::state_controller::switch::context::SwitchStateHandlerContextObjects; + +/// Handles the Configuring state for a switch. +pub async fn handle_configuring( + _switch_id: &SwitchId, + state: &mut Switch, + _ctx: &mut StateHandlerContext<'_, SwitchStateHandlerContextObjects>, +) -> Result, StateHandlerError> { + let config_state = match &state.controller_state.value { + SwitchControllerState::Configuring { config_state } => config_state, + _ => unreachable!("handle_configuring called with non-Configuring state"), + }; + + match config_state { + ConfiguringState::RotateOsPassword => { + tracing::info!("Configuring Switch: RotateOsPassword"); + // TODO: Rotate OS password. Then transition to Validating. + Ok(StateHandlerOutcome::transition( + SwitchControllerState::Validating { + // TODO: Implement validation logic. 
+ validating_state: ValidatingState::ValidateComplete, + }, + )) + } + } +} diff --git a/crates/api/src/state_controller/switch/deleting.rs b/crates/api/src/state_controller/switch/deleting.rs new file mode 100644 index 0000000000..b3e101f3bb --- /dev/null +++ b/crates/api/src/state_controller/switch/deleting.rs @@ -0,0 +1,39 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! Handler for SwitchControllerState::Deleting. + +use carbide_uuid::switch::SwitchId; +use db::switch as db_switch; +use model::switch::{Switch, SwitchControllerState}; + +use crate::state_controller::state_handler::{StateHandlerError, StateHandlerOutcome}; +use crate::state_controller::switch::context::SwitchStateHandlerContextObjects; +use crate::state_controller::state_handler::StateHandlerContext; + +/// Handles the Deleting state for a switch. +/// TODO: Implement full deletion logic (check in use, shut down, release resources). 
+pub async fn handle_deleting( + switch_id: &SwitchId, + _state: &mut Switch, + ctx: &mut StateHandlerContext<'_, SwitchStateHandlerContextObjects>, +) -> Result, StateHandlerError> { + tracing::info!("Deleting Switch"); + let mut txn = ctx.services.db_pool.begin().await?; + db_switch::final_delete(*switch_id, &mut txn).await?; + Ok(StateHandlerOutcome::deleted().with_txn(txn)) +} diff --git a/crates/api/src/state_controller/switch/error_state.rs b/crates/api/src/state_controller/switch/error_state.rs new file mode 100644 index 0000000000..5773772dd9 --- /dev/null +++ b/crates/api/src/state_controller/switch/error_state.rs @@ -0,0 +1,42 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! Handler for SwitchControllerState::Error. + +use carbide_uuid::switch::SwitchId; +use model::switch::{Switch, SwitchControllerState}; + +use crate::state_controller::state_handler::{StateHandlerError, StateHandlerOutcome}; +use crate::state_controller::switch::context::SwitchStateHandlerContextObjects; +use crate::state_controller::state_handler::StateHandlerContext; + +/// Handles the Error state for a switch. +/// If marked for deletion, transition to Deleting; otherwise wait for manual intervention. 
+pub async fn handle_error( + _switch_id: &SwitchId, + state: &mut Switch, + _ctx: &mut StateHandlerContext<'_, SwitchStateHandlerContextObjects>, +) -> Result, StateHandlerError> { + tracing::info!("Switch is in error state"); + if state.is_marked_as_deleted() { + Ok(StateHandlerOutcome::transition( + SwitchControllerState::Deleting, + )) + } else { + Ok(StateHandlerOutcome::do_nothing()) + } +} diff --git a/crates/api/src/state_controller/switch/handler.rs b/crates/api/src/state_controller/switch/handler.rs index 78725568eb..7469ebabe4 100644 --- a/crates/api/src/state_controller/switch/handler.rs +++ b/crates/api/src/state_controller/switch/handler.rs @@ -14,19 +14,70 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +//! State Handler implementation for Switches (mirrors Machine state handler structure). + use carbide_uuid::switch::SwitchId; -use db::switch as db_switch; use model::switch::{Switch, SwitchControllerState}; +use tracing::instrument; use crate::state_controller::state_handler::{ StateHandler, StateHandlerContext, StateHandlerError, StateHandlerOutcome, }; +use crate::state_controller::switch::bom_validating::handle_bom_validating; +use crate::state_controller::switch::configuring::handle_configuring; use crate::state_controller::switch::context::SwitchStateHandlerContextObjects; +use crate::state_controller::switch::deleting::handle_deleting; +use crate::state_controller::switch::error_state::handle_error; +use crate::state_controller::switch::initializing::handle_initializing; +use crate::state_controller::switch::ready::handle_ready; +use crate::state_controller::switch::reprovisioning::handle_reprovisioning; +use crate::state_controller::switch::validating::handle_validating; -/// The actual Switch State handler +/// The actual Switch State handler (structure mirrors MachineStateHandler). 
#[derive(Debug, Default, Clone)] pub struct SwitchStateHandler {} +impl SwitchStateHandler { + /// Records metrics for the switch. Stub for now; extend when switch metrics are defined. + fn record_metrics( + &self, + _state: &Switch, + _ctx: &mut StateHandlerContext<'_, SwitchStateHandlerContextObjects>, + ) { + // TODO: Populate when SwitchMetrics has fields (e.g. health, version, etc.) + } + + /// Attempts a state transition by delegating to the appropriate state handler. + async fn attempt_state_transition( + &self, + switch_id: &SwitchId, + state: &mut Switch, + ctx: &mut StateHandlerContext<'_, SwitchStateHandlerContextObjects>, + ) -> Result, StateHandlerError> { + let controller_state = &state.controller_state.value; + + match controller_state { + SwitchControllerState::Initializing => handle_initializing(switch_id, state, ctx).await, + SwitchControllerState::Configuring { .. } => { + handle_configuring(switch_id, state, ctx).await + } + SwitchControllerState::Validating { .. } => { + handle_validating(switch_id, state, ctx).await + } + SwitchControllerState::BomValidating { .. } => { + handle_bom_validating(switch_id, state, ctx).await + } + SwitchControllerState::ReProvisioning { .. } => { + handle_reprovisioning(switch_id, state, ctx).await + } + SwitchControllerState::Ready => handle_ready(switch_id, state, ctx).await, + SwitchControllerState::Deleting => handle_deleting(switch_id, state, ctx).await, + SwitchControllerState::Error { .. 
} => handle_error(switch_id, state, ctx).await, + } + } +} + #[async_trait::async_trait] impl StateHandler for SwitchStateHandler { type ObjectId = SwitchId; @@ -34,86 +85,15 @@ impl StateHandler for SwitchStateHandler { type ControllerState = SwitchControllerState; type ContextObjects = SwitchStateHandlerContextObjects; + #[instrument(skip_all, fields(object_id=%switch_id))] async fn handle_object_state( &self, switch_id: &SwitchId, state: &mut Switch, - controller_state: &Self::ControllerState, + _controller_state: &SwitchControllerState, ctx: &mut StateHandlerContext, ) -> Result, StateHandlerError> { - match controller_state { - SwitchControllerState::Initializing => { - // TODO: Implement Switch initialization logic - // This would typically involve: - // 1. Validating the Switch configuration - // 2. Allocating resources - tracing::info!("Initializing Switch"); - let new_state = SwitchControllerState::FetchingData; - Ok(StateHandlerOutcome::transition(new_state)) - } - - SwitchControllerState::FetchingData => { - tracing::info!("Fetching Switch data"); - // TODO: Implement Switch fetching data logic - // This would typically involve: - // 1. Fetching data from the Switch - // 2. Updating the Switch status - let new_state = SwitchControllerState::Configuring; - Ok(StateHandlerOutcome::transition(new_state)) - } - - SwitchControllerState::Configuring => { - tracing::info!("Configuring Switch"); - // TODO: Implement Switch configuring logic - // This would typically involve: - // 1. Configuring the Switch - // 2. Updating the Switch status - let new_state = SwitchControllerState::Ready; - Ok(StateHandlerOutcome::transition(new_state)) - } - - SwitchControllerState::Deleting => { - tracing::info!("Deleting Switch"); - // TODO: Implement Switch deletion logic - // This would typically involve: - // 1. Checking if the Switch is in use - // 2. Safely shutting down the Switch - // 3. 
Releasing allocated resources - - // For now, just delete the Switch from the database - let mut txn = ctx.services.db_pool.begin().await?; - db_switch::final_delete(*switch_id, &mut txn).await?; - Ok(StateHandlerOutcome::deleted().with_txn(txn)) - } - - SwitchControllerState::Ready => { - tracing::info!("Switch is ready"); - if state.is_marked_as_deleted() { - Ok(StateHandlerOutcome::transition( - SwitchControllerState::Deleting, - )) - } else { - // TODO: Implement Switch monitoring logic - // This would typically involve: - // 1. Checking Switch health status - // 2. Updating Switch status - - // For now, just do nothing - Ok(StateHandlerOutcome::do_nothing()) - } - } - - SwitchControllerState::Error { .. } => { - tracing::info!("Switch is in error state"); - if state.is_marked_as_deleted() { - Ok(StateHandlerOutcome::transition( - SwitchControllerState::Deleting, - )) - } else { - // If Switch is in error state, keep it there for manual intervention - Ok(StateHandlerOutcome::do_nothing()) - } - } - } + self.record_metrics(state, ctx); + self.attempt_state_transition(switch_id, state, ctx).await } } diff --git a/crates/api/src/state_controller/switch/initializing.rs b/crates/api/src/state_controller/switch/initializing.rs new file mode 100644 index 0000000000..537f68f7e9 --- /dev/null +++ b/crates/api/src/state_controller/switch/initializing.rs @@ -0,0 +1,41 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! Handler for SwitchControllerState::Initializing. + +use carbide_uuid::switch::SwitchId; +use model::switch::{ConfiguringState, Switch, SwitchControllerState}; + +use crate::state_controller::state_handler::StateHandlerContext; +use crate::state_controller::state_handler::{StateHandlerError, StateHandlerOutcome}; +use crate::state_controller::switch::context::SwitchStateHandlerContextObjects; + +/// Handles the Initializing state for a switch. +/// TODO: Implement Switch initialization logic (validate config, allocate resources, etc.). +pub async fn handle_initializing( + _switch_id: &SwitchId, + _state: &mut Switch, + _ctx: &mut StateHandlerContext<'_, SwitchStateHandlerContextObjects>, +) -> Result, StateHandlerError> { + tracing::info!("Initializing Switch"); + //TODO: Implement switch initialization logic. + Ok(StateHandlerOutcome::transition( + SwitchControllerState::Configuring { + config_state: ConfiguringState::RotateOsPassword, + }, + )) +} diff --git a/crates/api/src/state_controller/switch/io.rs b/crates/api/src/state_controller/switch/io.rs index b335a99464..253dd8ed4d 100644 --- a/crates/api/src/state_controller/switch/io.rs +++ b/crates/api/src/state_controller/switch/io.rs @@ -122,9 +122,11 @@ impl StateControllerIO for SwitchStateControllerIO { fn metric_state_names(state: &SwitchControllerState) -> (&'static str, &'static str) { match state { SwitchControllerState::Initializing => ("initializing", ""), - SwitchControllerState::FetchingData => ("fetching_data", ""), - SwitchControllerState::Configuring => ("configuring", ""), + SwitchControllerState::Configuring { .. } => ("configuring", ""), + SwitchControllerState::Validating { .. } => ("validating", ""), + SwitchControllerState::BomValidating { .. } => ("bomvalidating", ""), SwitchControllerState::Ready => ("ready", ""), + SwitchControllerState::ReProvisioning { .. 
} => ("reprovisioning", ""), SwitchControllerState::Error { .. } => ("error", ""), SwitchControllerState::Deleting => ("deleting", ""), } diff --git a/crates/api/src/state_controller/switch/mod.rs b/crates/api/src/state_controller/switch/mod.rs index e9da5cecbd..71384cc9f1 100644 --- a/crates/api/src/state_controller/switch/mod.rs +++ b/crates/api/src/state_controller/switch/mod.rs @@ -17,6 +17,14 @@ //! State Controller implementation for Switches. +pub mod bom_validating; pub mod context; +pub mod configuring; +pub mod deleting; +pub mod error_state; pub mod handler; +pub mod initializing; pub mod io; +pub mod ready; +pub mod reprovisioning; +pub mod validating; diff --git a/crates/api/src/state_controller/switch/ready.rs b/crates/api/src/state_controller/switch/ready.rs new file mode 100644 index 0000000000..024a0a9f2a --- /dev/null +++ b/crates/api/src/state_controller/switch/ready.rs @@ -0,0 +1,55 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! Handler for SwitchControllerState::Ready. 
+ +use carbide_uuid::switch::SwitchId; +use model::switch::{ReProvisioningState, Switch, SwitchControllerState}; + +use crate::state_controller::state_handler::StateHandlerContext; +use crate::state_controller::state_handler::{StateHandlerError, StateHandlerOutcome}; +use crate::state_controller::switch::context::SwitchStateHandlerContextObjects; + +/// Handles the Ready state for a switch. +/// TODO: Implement Switch monitoring (health checks, status updates, etc.). +pub async fn handle_ready( + _switch_id: &SwitchId, + state: &mut Switch, + _ctx: &mut StateHandlerContext<'_, SwitchStateHandlerContextObjects>, +) -> Result, StateHandlerError> { + if state.is_marked_as_deleted() { + return Ok(StateHandlerOutcome::transition( + SwitchControllerState::Deleting, + )); + } + + if is_switch_reprovisioning_requested(state) { + tracing::info!("Switch reprovisioning requested, transitioning to ReProvisioning::Start"); + return Ok(StateHandlerOutcome::transition( + SwitchControllerState::ReProvisioning { + reprovisioning_state: ReProvisioningState::Start, + }, + )); + } + + tracing::info!("Switch is ready"); + Ok(StateHandlerOutcome::do_nothing()) +} + +fn is_switch_reprovisioning_requested(switch: &Switch) -> bool { + switch.switch_reprovisioning_requested.is_some() +} diff --git a/crates/api/src/state_controller/switch/reprovisioning.rs b/crates/api/src/state_controller/switch/reprovisioning.rs new file mode 100644 index 0000000000..3f68b1510b --- /dev/null +++ b/crates/api/src/state_controller/switch/reprovisioning.rs @@ -0,0 +1,80 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! Handler for SwitchControllerState::ReProvisioning. + +use carbide_uuid::switch::SwitchId; +use model::switch::{FirmwareUpgradeStatus, ReProvisioningState, Switch, SwitchControllerState}; + +use crate::state_controller::state_handler::StateHandlerContext; +use crate::state_controller::state_handler::{StateHandlerError, StateHandlerOutcome}; +use crate::state_controller::switch::context::SwitchStateHandlerContextObjects; + +/// Handles the ReProvisioning state for a switch. +pub async fn handle_reprovisioning( + _switch_id: &SwitchId, + state: &mut Switch, + _ctx: &mut StateHandlerContext<'_, SwitchStateHandlerContextObjects>, +) -> Result, StateHandlerError> { + let reprovisioning_state = match &state.controller_state.value { + SwitchControllerState::ReProvisioning { + reprovisioning_state, + } => reprovisioning_state, + _ => unreachable!("handle_reprovisioning called with non-ReProvisioning state"), + }; + + match reprovisioning_state { + ReProvisioningState::Start => { + tracing::info!("ReProvisioning Switch: Start"); + // TODO: Trigger reprovisioning (e.g. call switch API). Then transition to waiting. 
+ Ok(StateHandlerOutcome::transition( + SwitchControllerState::ReProvisioning { + reprovisioning_state: ReProvisioningState::WaitFirmwareUpdateCompletion, + }, + )) + } + ReProvisioningState::WaitFirmwareUpdateCompletion => { + match state.firmware_upgrade_status.as_ref() { + Some(FirmwareUpgradeStatus::Completed) => { + tracing::info!( + "ReProvisioning Switch: firmware upgrade completed, moving to Ready" + ); + Ok(StateHandlerOutcome::transition( + SwitchControllerState::Ready, + )) + } + Some(FirmwareUpgradeStatus::Failed { cause }) => { + tracing::warn!("ReProvisioning Switch: firmware upgrade failed: {}", cause); + Ok(StateHandlerOutcome::transition( + SwitchControllerState::Error { + cause: cause.clone(), + }, + )) + } + Some(FirmwareUpgradeStatus::Started) + | Some(FirmwareUpgradeStatus::InProgress) + | None => { + tracing::info!( + "ReProvisioning Switch: WaitFirmwareUpdateCompletion, status {:?} — keep waiting", + state.firmware_upgrade_status + ); + Ok(StateHandlerOutcome::do_nothing()) + } + } + } + } +} diff --git a/crates/api/src/state_controller/switch/validating.rs b/crates/api/src/state_controller/switch/validating.rs new file mode 100644 index 0000000000..db2d1b1be7 --- /dev/null +++ b/crates/api/src/state_controller/switch/validating.rs @@ -0,0 +1,50 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! Handler for SwitchControllerState::Validating. + +use carbide_uuid::switch::SwitchId; +use model::switch::{BomValidatingState, Switch, SwitchControllerState, ValidatingState}; + +use crate::state_controller::state_handler::StateHandlerContext; +use crate::state_controller::state_handler::{StateHandlerError, StateHandlerOutcome}; +use crate::state_controller::switch::context::SwitchStateHandlerContextObjects; + +/// Handles the Validating state for a switch. +/// TODO: Implement Switch validation logic. +pub async fn handle_validating( + _switch_id: &SwitchId, + state: &mut Switch, + _ctx: &mut StateHandlerContext<'_, SwitchStateHandlerContextObjects>, +) -> Result, StateHandlerError> { + tracing::info!("Validating Switch"); + let validating_state = match &state.controller_state.value { + SwitchControllerState::Validating { validating_state } => validating_state, + _ => unreachable!("handle_validating called with non-Validating state"), + }; + + match validating_state { + ValidatingState::ValidateComplete => { + tracing::info!("Validating Switch: ValidateComplete"); + Ok(StateHandlerOutcome::transition( + SwitchControllerState::BomValidating { + bom_validating_state: BomValidatingState::BomValidateComplete, + }, + )) + } + } +} diff --git a/crates/api/src/tests/common/api_fixtures/site_explorer.rs b/crates/api/src/tests/common/api_fixtures/site_explorer.rs index 2d61ca61f2..3704bb1642 100644 --- a/crates/api/src/tests/common/api_fixtures/site_explorer.rs +++ b/crates/api/src/tests/common/api_fixtures/site_explorer.rs @@ -66,6 +66,7 @@ use crate::tests::common::api_fixtures::{ machine_validation_completed, persist_machine_validation_result, reboot_completed, update_machine_validation_run, }; +use crate::tests::common::mac_address_pool::EXPECTED_SWITCH_NVOS_MAC_ADDRESS_POOL; use crate::tests::common::rpc_builder::DhcpDiscovery; /// MockExploredHost 
presents a fluent interface for declaring a mock host and running it through @@ -1755,6 +1756,7 @@ pub async fn create_expected_switches( let switch = ExpectedSwitch { expected_switch_id: None, bmc_mac_address: EXPECTED_SWITCH_BMC_MAC_ADDRESS_POOL.allocate(), + nvos_mac_address: Some(EXPECTED_SWITCH_NVOS_MAC_ADDRESS_POOL.allocate()), serial_number: format!("SW-SN-{:03}", i + 1), bmc_username: "ADMIN".into(), bmc_password: "Pwd2023x0x0x0x7".into(), diff --git a/crates/api/src/tests/common/mac_address_pool.rs b/crates/api/src/tests/common/mac_address_pool.rs index 6253abf87c..290a7cb6e9 100644 --- a/crates/api/src/tests/common/mac_address_pool.rs +++ b/crates/api/src/tests/common/mac_address_pool.rs @@ -120,6 +120,13 @@ lazy_static::lazy_static! { start: [0x44, 0x44, 0x22, 0x22, 0x0, 0x0], length: 65536, }); + + /// Pool of Expected Switch NVOS MAC addresses + pub static ref EXPECTED_SWITCH_NVOS_MAC_ADDRESS_POOL: MacAddressPool = + MacAddressPool::new(MacAddressPoolConfig { + start: [0x44, 0x44, 0x33, 0x33, 0x0, 0x0], + length: 65536, + }); } fn to_u64_be(bytes: [u8; 6]) -> u64 { diff --git a/crates/api/src/tests/expected_switch.rs b/crates/api/src/tests/expected_switch.rs index 73c2114516..6d29b80d56 100644 --- a/crates/api/src/tests/expected_switch.rs +++ b/crates/api/src/tests/expected_switch.rs @@ -55,6 +55,7 @@ async fn test_duplicate_fail_create(pool: sqlx::PgPool) -> Result<(), Box = switches[1].nvos_mac_address; for mut updated_switch in [ rpc::forge::ExpectedSwitch { expected_switch_id: None, bmc_mac_address: bmc_mac_address.to_string(), + nvos_mac_address: nvos_mac_address.map(|m| m.to_string()), bmc_username: "ADMIN_UPDATE".into(), bmc_password: "PASS_UPDATE".into(), switch_serial_number: "SW-UPD-001".into(), @@ -287,6 +293,7 @@ async fn test_update_expected_switch(pool: sqlx::PgPool) { rpc::forge::ExpectedSwitch { expected_switch_id: None, bmc_mac_address: bmc_mac_address.to_string(), + nvos_mac_address: nvos_mac_address.map(|m| m.to_string()), 
bmc_username: "ADMIN_UPDATE".into(), bmc_password: "PASS_UPDATE".into(), switch_serial_number: "SW-UPD-002".into(), @@ -298,6 +305,7 @@ async fn test_update_expected_switch(pool: sqlx::PgPool) { rpc::forge::ExpectedSwitch { expected_switch_id: None, bmc_mac_address: bmc_mac_address.to_string(), + nvos_mac_address: nvos_mac_address.map(|m| m.to_string()), bmc_username: "ADMIN_UPDATE1".into(), bmc_password: "PASS_UPDATE1".into(), switch_serial_number: "SW-UPD-003".into(), @@ -361,6 +369,7 @@ async fn test_get_expected_switch_by_id(pool: sqlx::PgPool) { value: explicit_id.to_string(), }), bmc_mac_address: "3A:3B:3C:3D:3E:3F".to_string(), + nvos_mac_address: Some("3A:3B:3C:3D:3E:40".to_string()), bmc_username: "ADMIN".into(), bmc_password: "PASS".into(), switch_serial_number: "SW-ID-001".into(), @@ -400,6 +409,7 @@ async fn test_delete_expected_switch_by_id(pool: sqlx::PgPool) { value: explicit_id.to_string(), }), bmc_mac_address: "3A:3B:3C:3D:3E:3F".to_string(), + nvos_mac_address: Some("3A:3B:3C:3D:3E:40".to_string()), bmc_username: "ADMIN".into(), bmc_password: "PASS".into(), switch_serial_number: "SW-DEL-ID-001".into(), @@ -451,6 +461,7 @@ async fn test_update_expected_switch_by_id(pool: sqlx::PgPool) { value: explicit_id.to_string(), }), bmc_mac_address: "3A:3B:3C:3D:3E:3F".to_string(), + nvos_mac_address: Some("3A:3B:3C:3D:3E:40".to_string()), bmc_username: "ADMIN".into(), bmc_password: "PASS".into(), switch_serial_number: "SW-UPD-ID-001".into(), @@ -470,6 +481,7 @@ async fn test_update_expected_switch_by_id(pool: sqlx::PgPool) { value: explicit_id.to_string(), }), bmc_mac_address: "3A:3B:3C:3D:3E:3F".to_string(), + nvos_mac_address: Some("4A:4B:4C:4D:4E:3F".to_string()), bmc_username: "ADMIN_UPDATED".into(), bmc_password: "PASS_UPDATED".into(), switch_serial_number: "SW-UPD-ID-002".into(), @@ -516,6 +528,7 @@ async fn test_create_expected_switch_with_explicit_id(pool: sqlx::PgPool) { value: explicit_id.to_string(), }), bmc_mac_address: 
"3A:3B:3C:3D:3E:3F".to_string(), + nvos_mac_address: Some("4A:4B:4C:4D:4E:3F".to_string()), bmc_username: "ADMIN".into(), bmc_password: "PASS".into(), switch_serial_number: "SW-EXPLICIT-001".into(), @@ -556,6 +569,7 @@ async fn test_create_expected_switch_auto_generates_id(pool: sqlx::PgPool) { let expected_switch = rpc::forge::ExpectedSwitch { expected_switch_id: None, bmc_mac_address: "3A:3B:3C:3D:3E:3F".to_string(), + nvos_mac_address: Some("4A:4B:4C:4D:4E:3F".to_string()), bmc_username: "ADMIN".into(), bmc_password: "PASS".into(), switch_serial_number: "SW-AUTO-001".into(), @@ -659,9 +673,11 @@ async fn test_delete_expected_switch_error(pool: sqlx::PgPool) { async fn test_update_expected_switch_error(pool: sqlx::PgPool) { let env = create_test_env(pool).await; let bmc_mac_address: MacAddress = "2A:2B:2C:2D:2E:2F".parse().unwrap(); + let nvos_mac_address: MacAddress = "3A:3B:3C:3D:3E:3F".parse().unwrap(); let expected_switch = rpc::forge::ExpectedSwitch { expected_switch_id: None, bmc_mac_address: bmc_mac_address.to_string(), + nvos_mac_address: Some(nvos_mac_address.to_string()), bmc_username: "ADMIN_UPDATE".into(), bmc_password: "PASS_UPDATE".into(), switch_serial_number: "SW-UPD-001".into(), @@ -765,6 +781,7 @@ async fn test_replace_all_expected_switches(pool: sqlx::PgPool) { let expected_switch_1 = rpc::forge::ExpectedSwitch { expected_switch_id: None, bmc_mac_address: "6A:6B:6C:6D:6E:6F".into(), + nvos_mac_address: Some("4A:4B:4C:4D:4E:6F".to_string()), bmc_username: "ADMIN_NEW".into(), bmc_password: "PASS_NEW".into(), switch_serial_number: "SW-NEW-001".into(), @@ -777,6 +794,7 @@ async fn test_replace_all_expected_switches(pool: sqlx::PgPool) { let expected_switch_2 = rpc::forge::ExpectedSwitch { expected_switch_id: None, bmc_mac_address: "7A:7B:7C:7D:7E:7F".into(), + nvos_mac_address: Some("4A:4B:4C:4D:4E:7F".to_string()), bmc_username: "ADMIN_NEW".into(), bmc_password: "PASS_NEW".into(), switch_serial_number: "SW-NEW-002".into(), diff --git 
a/crates/api/src/tests/switch.rs b/crates/api/src/tests/switch.rs index 27cf1905a7..509657a422 100644 --- a/crates/api/src/tests/switch.rs +++ b/crates/api/src/tests/switch.rs @@ -567,6 +567,7 @@ async fn test_find_switch_with_bmc_info( let switch_serial = "TestSwitch-001"; let switch_id = new_switch(&env, Some(switch_serial.to_string()), None).await?; let bmc_mac: MacAddress = "AA:BB:CC:DD:EE:FF".parse().unwrap(); + let nvos_mac: MacAddress = "AA:BB:CC:DD:EE:99".parse().unwrap(); let mut txn = db::Transaction::begin(&env.pool).await?; db_expected_switch::create( @@ -574,6 +575,7 @@ async fn test_find_switch_with_bmc_info( ExpectedSwitch { expected_switch_id: None, bmc_mac_address: bmc_mac, + nvos_mac_address: Some(nvos_mac), bmc_username: "admin".to_string(), bmc_password: "password".to_string(), serial_number: switch_serial.to_string(), diff --git a/crates/api/src/tests/switch_state_controller/mod.rs b/crates/api/src/tests/switch_state_controller/mod.rs index a3fe86378d..d285a321fb 100644 --- a/crates/api/src/tests/switch_state_controller/mod.rs +++ b/crates/api/src/tests/switch_state_controller/mod.rs @@ -22,7 +22,7 @@ use std::time::Duration; use carbide_uuid::switch::SwitchId; use db::switch as db_switch; -use model::switch::{Switch, SwitchControllerState}; +use model::switch::{ConfiguringState, Switch, SwitchControllerState}; use rpc::forge::forge_server::Forge; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; @@ -34,6 +34,7 @@ use crate::state_controller::state_handler::{ StateHandler, StateHandlerContext, StateHandlerError, StateHandlerOutcome, }; use crate::state_controller::switch::context::SwitchStateHandlerContextObjects; +use crate::state_controller::switch::handler::SwitchStateHandler; use crate::state_controller::switch::io::SwitchStateControllerIO; use crate::tests::common; use crate::tests::common::api_fixtures::create_test_env; @@ -345,8 +346,9 @@ async fn test_switch_state_transition_validation( // Test state transitions by 
manually setting different states
     let states = vec![
-        SwitchControllerState::FetchingData,
-        SwitchControllerState::Configuring,
+        SwitchControllerState::Configuring {
+            config_state: ConfiguringState::RotateOsPassword,
+        },
         SwitchControllerState::Ready,
         SwitchControllerState::Error {
             cause: "Test error".to_string(),
@@ -449,3 +451,111 @@ async fn test_switch_deletion_with_state_controller(
 
     Ok(())
 }
+
+/// Tests the entire Switch ControllerState transition flow: Initializing -> Configuring
+/// (RotateOsPassword) -> Validating (ValidateComplete) -> BomValidating
+/// (BomValidateComplete) -> Ready. Uses the real SwitchStateHandler so each state handler
+/// performs its transition.
+///
+/// Only the end points are asserted: the switch must start in Initializing and must be
+/// observed in Ready before MAX_WAIT elapses; intermediate states are not checked.
+#[crate::sqlx_test]
+async fn test_switch_entire_state_transition_flow(
+    pool: sqlx::PgPool,
+) -> Result<(), Box<dyn std::error::Error>> {
+    let env = create_test_env(pool.clone()).await;
+
+    let switch_id = common::api_fixtures::site_explorer::new_switch(
+        &env,
+        Some("Entire State Transition Test Switch".to_string()),
+        Some("Data Center A, Rack 1".to_string()),
+    )
+    .await?;
+
+    // Verify initial state is Initializing
+    {
+        let mut conn = pool.acquire().await?;
+        let switch = db_switch::find_by_id(&mut conn, &switch_id)
+            .await?
+            .expect("switch should exist");
+        assert!(
+            matches!(
+                switch.controller_state.value,
+                SwitchControllerState::Initializing
+            ),
+            "initial state should be Initializing, got {:?}",
+            switch.controller_state.value
+        );
+    }
+
+    let switch_handler = Arc::new(SwitchStateHandler::default());
+    const ITERATION_TIME: Duration = Duration::from_millis(50);
+    const POLL_INTERVAL: Duration = Duration::from_millis(200);
+    const MAX_WAIT: Duration = Duration::from_secs(30);
+
+    let handler_services = Arc::new(CommonStateHandlerServices {
+        db_pool: pool.clone(),
+        db_reader: pool.clone().into(),
+        redfish_client_pool: env.redfish_sim.clone(),
+        ib_fabric_manager: env.ib_fabric_manager.clone(),
+        ib_pools: env.common_pools.infiniband.clone(),
+        ipmi_tool: env.ipmi_tool.clone(),
+        site_config: env.config.clone(),
+        dpa_info: None,
+        rms_client: None,
+    });
+
+    let cancel_token = CancellationToken::new();
+    let mut join_set = JoinSet::new();
+    StateController::<SwitchStateControllerIO>::builder()
+        .iteration_config(IterationConfig {
+            iteration_time: ITERATION_TIME,
+            processor_dispatch_interval: Duration::from_millis(10),
+            ..Default::default()
+        })
+        .database(pool.clone(), env.api.work_lock_manager_handle.clone())
+        .processor_id(uuid::Uuid::new_v4().to_string())
+        .services(handler_services.clone())
+        .state_handler(switch_handler.clone())
+        .build_and_spawn(&mut join_set, cancel_token.clone())
+        .unwrap();
+
+    // Poll until the switch is observed in Ready, remembering the most recent state so a
+    // timeout failure reports where the state machine stalled.
+    let deadline = std::time::Instant::now() + MAX_WAIT;
+    let mut last_state = None;
+    while std::time::Instant::now() < deadline {
+        tokio::time::sleep(POLL_INTERVAL).await;
+        let mut conn = pool.acquire().await?;
+        last_state = db_switch::find_by_id(&mut conn, &switch_id)
+            .await?
+            .map(|switch| switch.controller_state.value);
+        if matches!(last_state, Some(SwitchControllerState::Ready)) {
+            break;
+        }
+    }
+
+    // Stop the controller and wait for its tasks before asserting on the outcome.
+    cancel_token.cancel();
+    join_set.join_all().await;
+
+    assert!(
+        matches!(last_state, Some(SwitchControllerState::Ready)),
+        "switch should reach Ready within {:?}; last observed state: {:?}",
+        MAX_WAIT,
+        last_state
+    );
+
+    // Final assertion: with the controller stopped, the persisted state is still Ready
+    let mut conn = pool.acquire().await?;
+    let switch = db_switch::find_by_id(&mut conn, &switch_id)
+        .await?
+        .expect("switch should exist");
+    assert!(
+        matches!(switch.controller_state.value, SwitchControllerState::Ready),
+        "expected Ready, got {:?}",
+        switch.controller_state.value
+    );
+
+    Ok(())
+}
diff --git a/crates/rpc/proto/forge.proto b/crates/rpc/proto/forge.proto
index 0a42e586b2..c9d0253846 100644
--- a/crates/rpc/proto/forge.proto
+++ b/crates/rpc/proto/forge.proto
@@ -1885,6 +1885,7 @@ message ExpectedSwitch {
   optional string nvos_password = 8;
   // Unique identifier for the expected switch. When omitted, server generates one.
optional common.UUID expected_switch_id = 9; + optional string nvos_mac_address = 10; } message ExpectedSwitchRequest {