Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- Create a GIN (generalized inverted) index on the 'merges' key of machines.health_report_overrides
CREATE INDEX IF NOT EXISTS machine_health_overrides_merges_gin_idx ON machines USING GIN ((health_report_overrides -> 'merges'));
16 changes: 16 additions & 0 deletions crates/api-db/src/machine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1657,6 +1657,14 @@ pub async fn find_machine_ids(
qb.push(" INNER JOIN machine_topologies mt ON machines.id = mt.machine_id");
}

// Return only machines whose last fetched power state matches the requested one.
// The requested value is lowercased before being cast to host_power_state_t.
if let Some(pstate) = &search_config.only_with_power_state {
let pstate_normalized = pstate.to_lowercase();
qb.push(" INNER JOIN power_options po ON po.host_id = machines.id AND po.last_fetched_power_state = ");
qb.push_bind(pstate_normalized);
qb.push("::host_power_state_t");
}

qb.push(" WHERE TRUE");

if search_config.only_maintenance {
Expand Down Expand Up @@ -1699,6 +1707,13 @@ pub async fn find_machine_ids(
));
}

if let Some(ovrrd_str) = &search_config.only_with_health_alert {
qb.push(" AND health_report_overrides->'merges' ? ");
qb.push_bind(ovrrd_str.clone());
qb.push(" AND jsonb_array_length(health_report_overrides->'merges'->");
qb.push_bind(ovrrd_str);
qb.push("->'alerts') > 0");
}
if search_config.mnnvl_only {
qb.push(
" AND mt.topology->'discovery_data'->'Info'->'dmi_data'->>'product_name' LIKE '%GB200%'",
Expand All @@ -1715,6 +1730,7 @@ pub async fn find_machine_ids(
}

let q = qb.build_query_as();

let machine_ids: Vec<MachineId> = q
.fetch_all(txn)
.await
Expand Down
4 changes: 4 additions & 0 deletions crates/api-model/src/machine/machine_search_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ pub struct MachineSearchConfig {
pub for_update: bool,
// Only include NVLink capable machines (GB200/GB300 etc)
pub mnnvl_only: bool,
pub only_with_power_state: Option<String>,
pub only_with_health_alert: Option<String>,
}

impl TryFrom<rpc::forge::MachineSearchConfig> for MachineSearchConfig {
Expand All @@ -63,6 +65,8 @@ impl TryFrom<rpc::forge::MachineSearchConfig> for MachineSearchConfig {
.transpose()?,
for_update: false, // This isn't exposed to API callers
mnnvl_only: value.mnnvl_only,
only_with_power_state: value.only_with_power_state,
only_with_health_alert: value.only_with_health_alert,
})
}
}
21 changes: 12 additions & 9 deletions crates/api/src/handlers/health.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,18 @@ pub async fn list_health_report_overrides(

txn.commit().await?;

let ovr = host_machine
.health_report_overrides
.clone()
.into_iter()
.map(|o| HealthReportOverride {
report: Some(o.0.into()),
mode: o.1 as i32,
})
.collect();

Ok(Response::new(rpc::ListHealthReportOverrideResponse {
overrides: host_machine
.health_report_overrides
.clone()
.into_iter()
.map(|o| HealthReportOverride {
report: Some(o.0.into()),
mode: o.1 as i32,
})
.collect(),
overrides: ovr,
}))
}

Expand Down Expand Up @@ -136,6 +138,7 @@ pub async fn insert_health_report_override(
)
.into());
}

let mut txn = api.txn_begin().await?;

let mut report = health_report::HealthReport::try_from(report.clone())
Expand Down
2 changes: 2 additions & 0 deletions crates/api/src/web/machine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,8 @@ pub async fn fetch_machines(
only_quarantine: false,
instance_type_id: None,
mnnvl_only: false,
only_with_power_state: None,
only_with_health_alert: None,
});

let machine_ids = api
Expand Down
2 changes: 2 additions & 0 deletions crates/api/src/web/managed_host.rs
Original file line number Diff line number Diff line change
Expand Up @@ -660,6 +660,8 @@ async fn fetch_managed_hosts_with_metadata(
only_quarantine: false,
instance_type_id: None,
mnnvl_only: false,
only_with_health_alert: None,
only_with_power_state: None,
}))
.await?
.into_inner()
Expand Down
2 changes: 2 additions & 0 deletions crates/api/src/web/nvlink.rs
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,8 @@ async fn fetch_logical_partitions(
only_quarantine: false,
instance_type_id: None,
mnnvl_only: true,
only_with_health_alert: None,
only_with_power_state: None,
});

let machine_ids = api
Expand Down
1 change: 1 addition & 0 deletions crates/health/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ logfmt = { path = "../logfmt" }
carbide-version = { path = "../version" }

[features]
default = []
bench-hooks = []

[dev-dependencies]
Expand Down
1 change: 1 addition & 0 deletions crates/health/src/processor/health_report.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ impl HealthReportProcessor {
calculated_status = ?state,
"Threshold check indicates issue but BMC reports sensor as OK - likely incorrect thresholds, reporting OK"
);

return SensorHealthResult::Success(HealthReportSuccess {
probe_id: Probe::Sensor,
target: Some(health.sensor_id.clone()),
Expand Down
4 changes: 4 additions & 0 deletions crates/rpc/proto/forge.proto
Original file line number Diff line number Diff line change
Expand Up @@ -2712,6 +2712,10 @@ message MachineSearchConfig {
bool only_quarantine = 7;
optional string instance_type_id = 8;
bool mnnvl_only = 9;
// Power state to filter by; values mirror the host_power_state_t database type (e.g. 'on', 'off')
optional string only_with_power_state = 10;
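// Key of the health report override to filter on; only machines with at least one alert under that key match.
// For example, to search for leak alerts, set this to "hardware-health.tray-leak-detection"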
optional string only_with_health_alert = 11;
}

message MachineStateHistoriesRequest {
Expand Down
2 changes: 2 additions & 0 deletions crates/rpc/src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ pub enum RpcDataConversionError {
InvalidVpcVirtualizationType(i32),
#[error("Invalid enum value received for critical error type: {0}")]
InvalidCriticalErrorType(i32),
#[error("PowerState {0} is not valid")]
InvalidPowerState(i32),
#[error("Instance ID {0} is not valid")]
InvalidInstanceId(String),
#[error("Remediation ID {0} is not valid")]
Expand Down