diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs
index 5174bfea858..57c342ff1f6 100644
--- a/dev-tools/omdb/src/bin/omdb/nexus.rs
+++ b/dev-tools/omdb/src/bin/omdb/nexus.rs
@@ -6,6 +6,7 @@
 mod quiesce;
 mod reconfigurator_config;
+mod support_bundle_config;
 mod update_status;
 
 use crate::Omdb;
@@ -70,6 +71,7 @@ use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus;
 use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus;
 use nexus_types::internal_api::background::SitrepGcStatus;
 use nexus_types::internal_api::background::SitrepLoadStatus;
+use nexus_types::internal_api::background::SupportBundleAutoDeletionReport;
 use nexus_types::internal_api::background::SupportBundleCleanupReport;
 use nexus_types::internal_api::background::SupportBundleCollectionReport;
 use nexus_types::internal_api::background::SupportBundleCollectionStepStatus;
@@ -101,6 +103,8 @@ use std::os::unix::fs::PermissionsExt;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
+use support_bundle_config::SupportBundleConfigArgs;
+use support_bundle_config::cmd_nexus_support_bundle_config;
 use support_bundle_viewer::LocalFileAccess;
 use support_bundle_viewer::SupportBundleAccessor;
 use tabled::Tabled;
@@ -160,6 +164,8 @@ enum NexusCommands {
     ReconfiguratorConfig(ReconfiguratorConfigArgs),
     /// view sagas, create and complete demo sagas
     Sagas(SagasArgs),
+    /// interact with support bundle auto-deletion config
+    SupportBundleConfig(SupportBundleConfigArgs),
     /// interact with sleds
     Sleds(SledsArgs),
     /// interact with support bundles
@@ -783,6 +789,10 @@ impl NexusArgs {
                 cmd_nexus_reconfigurator_config(&omdb, &client, args).await
             }
 
+            NexusCommands::SupportBundleConfig(args) => {
+                cmd_nexus_support_bundle_config(&omdb, log, args).await
+            }
+
             NexusCommands::Sagas(SagasArgs { command }) => {
                 if self.nexus_internal_url.is_none() {
                     eprintln!(
@@ -2561,6 +2571,7 @@ fn print_task_service_firewall_rule_propagation(details: &serde_json::Value) {
 fn print_task_support_bundle_collector(details: &serde_json::Value) {
     #[derive(Deserialize)]
     struct SupportBundleCollectionStatus {
+        auto_deletion_report: Option<SupportBundleAutoDeletionReport>,
         cleanup_report: Option<SupportBundleCleanupReport>,
         cleanup_err: Option<String>,
         collection_report: Option<SupportBundleCollectionReport>,
@@ -2575,11 +2586,36 @@
             error, details
         ),
         Ok(SupportBundleCollectionStatus {
+            auto_deletion_report,
             cleanup_report,
             cleanup_err,
             collection_report,
             collection_err,
         }) => {
+            // Print auto-deletion report first (since it runs first)
+            if let Some(SupportBundleAutoDeletionReport {
+                bundles_marked_for_deletion,
+                free_datasets,
+                total_datasets,
+                active_bundles,
+                errors,
+            }) = auto_deletion_report
+            {
+                println!("    Support Bundle Auto-Deletion Report:");
+                println!("      Total debug datasets: {total_datasets}");
+                println!("      Active bundles: {active_bundles}");
+                println!("      Free datasets: {free_datasets}");
+                println!(
+                    "      Bundles marked for deletion: {bundles_marked_for_deletion}"
+                );
+                if !errors.is_empty() {
+                    println!("      Errors:");
+                    for error in errors {
+                        println!("        {error}");
+                    }
+                }
+            }
+
             if let Some(cleanup_err) = cleanup_err {
                 println!("    failed to perform cleanup: {cleanup_err}");
             }
diff --git a/dev-tools/omdb/src/bin/omdb/nexus/support_bundle_config.rs b/dev-tools/omdb/src/bin/omdb/nexus/support_bundle_config.rs
new file mode 100644
index 00000000000..0cf6363c71d
--- /dev/null
+++ b/dev-tools/omdb/src/bin/omdb/nexus/support_bundle_config.rs
@@ -0,0 +1,150 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! omdb commands for support bundle auto-deletion configuration
+
+use crate::Omdb;
+use crate::check_allow_destructive::DestructiveOperationToken;
+use crate::db::DbUrlOptions;
+use anyhow::Context;
+use clap::Args;
+use clap::Subcommand;
+use nexus_db_queries::context::OpContext;
+use nexus_db_queries::db::DataStore;
+use std::sync::Arc;
+
+#[derive(Debug, Args)]
+pub struct SupportBundleConfigArgs {
+    #[clap(flatten)]
+    db_url_opts: DbUrlOptions,
+
+    #[command(subcommand)]
+    command: SupportBundleConfigCommands,
+}
+
+#[derive(Debug, Subcommand)]
+pub enum SupportBundleConfigCommands {
+    /// Show current support bundle auto-deletion config
+    Show,
+
+    /// Set support bundle auto-deletion config
+    Set(SupportBundleConfigSetArgs),
+}
+
+#[derive(Debug, Clone, Args)]
+pub struct SupportBundleConfigSetArgs {
+    /// Target percentage of datasets to keep free (0-100)
+    #[clap(long)]
+    target_free_percent: Option<u8>,
+
+    /// Minimum percentage of datasets to keep as bundles (0-100)
+    #[clap(long)]
+    min_keep_percent: Option<u8>,
+}
+
+pub async fn cmd_nexus_support_bundle_config(
+    omdb: &Omdb,
+    log: &slog::Logger,
+    args: &SupportBundleConfigArgs,
+) -> Result<(), anyhow::Error> {
+    let datastore = args.db_url_opts.connect(omdb, log).await?;
+    let opctx = OpContext::for_tests(log.clone(), datastore.clone());
+
+    let result = match &args.command {
+        SupportBundleConfigCommands::Show => {
+            support_bundle_config_show(&opctx, &datastore).await
+        }
+        SupportBundleConfigCommands::Set(set_args) => {
+            let token = omdb.check_allow_destructive()?;
+            support_bundle_config_set(&opctx, &datastore, set_args, token).await
+        }
+    };
+
+    datastore.terminate().await;
+    result
+}
+
+async fn support_bundle_config_show(
+    opctx: &OpContext,
+    datastore: &Arc<DataStore>,
+) -> Result<(), anyhow::Error> {
+    let config = datastore
+        .support_bundle_config_get(opctx)
+        .await
+        .context("failed to get support bundle config")?;
+
+    println!("Support Bundle Auto-Deletion Config:");
+    println!(
+        "  Target free datasets: {}% (CEIL calculation)",
+        config.target_free_percent
+    );
+    println!(
+        "  Minimum bundles to keep: {}% (CEIL calculation)",
+        config.min_keep_percent
+    );
+    println!("  Last modified: {}", config.time_modified);
+
+    Ok(())
+}
+
+async fn support_bundle_config_set(
+    opctx: &OpContext,
+    datastore: &Arc<DataStore>,
+    args: &SupportBundleConfigSetArgs,
+    _destruction_token: DestructiveOperationToken,
+) -> Result<(), anyhow::Error> {
+    // Get current config
+    let current = datastore
+        .support_bundle_config_get(opctx)
+        .await
+        .context("failed to get current support bundle config")?;
+
+    // Apply changes, using current values as defaults
+    let new_target_free =
+        args.target_free_percent.unwrap_or(current.target_free_percent as u8);
+    let new_min_keep =
+        args.min_keep_percent.unwrap_or(current.min_keep_percent as u8);
+
+    // Check if anything changed
+    if i64::from(new_target_free) == current.target_free_percent
+        && i64::from(new_min_keep) == current.min_keep_percent
+    {
+        println!("No changes to current config:");
+        println!(
+            "  Target free datasets: {}% (CEIL calculation)",
+            current.target_free_percent
+        );
+        println!(
+            "  Minimum bundles to keep: {}% (CEIL calculation)",
+            current.min_keep_percent
+        );
+        return Ok(());
+    }
+
+    // Apply the update
+    datastore
+        .support_bundle_config_set(opctx, new_target_free, new_min_keep)
+        .await
.context("failed to set support bundle config")?; + + println!("Support bundle config updated:"); + if i64::from(new_target_free) != current.target_free_percent { + println!( + " Target free datasets: {}% -> {}%", + current.target_free_percent, new_target_free + ); + } else { + println!(" Target free datasets: {}% (unchanged)", new_target_free); + } + if i64::from(new_min_keep) != current.min_keep_percent { + println!( + " Minimum bundles to keep: {}% -> {}%", + current.min_keep_percent, new_min_keep + ); + } else { + println!(" Minimum bundles to keep: {}% (unchanged)", new_min_keep); + } + + Ok(()) +} diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index b7d6d4b0ded..9e0bfe19e70 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -848,6 +848,11 @@ task: "support_bundle_collector" configured period: every days h m s last completed activation: , triggered by started at (s ago) and ran for ms + Support Bundle Auto-Deletion Report: + Total debug datasets: 0 + Active bundles: 0 + Free datasets: 0 + Bundles marked for deletion: 0 Support Bundle Cleanup Report: Bundles deleted from sleds: 0 Bundles not found on sleds: 0 @@ -1422,6 +1427,11 @@ task: "support_bundle_collector" configured period: every days h m s last completed activation: , triggered by started at (s ago) and ran for ms + Support Bundle Auto-Deletion Report: + Total debug datasets: 0 + Active bundles: 0 + Free datasets: 0 + Bundles marked for deletion: 0 Support Bundle Cleanup Report: Bundles deleted from sleds: 0 Bundles not found on sleds: 0 diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index c7177a64abd..2771c31de7f 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -901,6 +901,7 @@ Commands: quiesce view or modify the quiesce status reconfigurator-config interact with reconfigurator config sagas view sagas, create and complete demo sagas + support-bundle-config interact with support bundle auto-deletion config sleds interact with sleds support-bundles interact with support bundles [aliases: sb] update-status show running artifact versions diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index fca06a07b05..3418265d137 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(225, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(226, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. 
     // v
     // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
+    KnownVersion::new(226, "bundle-state-index"),
     KnownVersion::new(225, "dual-stack-ephemeral-ip"),
     KnownVersion::new(224, "add-external-subnets"),
     KnownVersion::new(223, "ip-pool-range-by-pool-id-index"),
diff --git a/nexus/db-model/src/support_bundle.rs b/nexus/db-model/src/support_bundle.rs
index b9dc9f55b17..6cb08a16b14 100644
--- a/nexus/db-model/src/support_bundle.rs
+++ b/nexus/db-model/src/support_bundle.rs
@@ -5,6 +5,7 @@
 use super::impl_enum_type;
 use crate::typed_uuid::DbTypedUuid;
 use nexus_db_schema::schema::support_bundle;
+use nexus_db_schema::schema::support_bundle_config;
 
 use chrono::{DateTime, Utc};
 use nexus_types::external_api::shared::SupportBundleInfo as SupportBundleView;
@@ -136,3 +137,18 @@ impl From<SupportBundle> for SupportBundleView {
         }
     }
 }
+
+/// Configuration for automatic support bundle deletion.
+///
+/// This table uses a singleton pattern - exactly one row exists, created by
+/// the schema migration. The row is only updated, never inserted or deleted.
+#[derive(Clone, Debug, Queryable, Selectable, Serialize, Deserialize)]
+#[diesel(table_name = support_bundle_config)]
+pub struct SupportBundleConfig {
+    pub singleton: bool,
+    /// Percentage (0-100) of total datasets to keep free for new allocations.
+    pub target_free_percent: i64,
+    /// Percentage (0-100) of total datasets to retain as bundles (minimum).
+    pub min_keep_percent: i64,
+    pub time_modified: DateTime<Utc>,
+}
diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs
index afb0557a44b..e588eaa142d 100644
--- a/nexus/db-queries/src/db/datastore/support_bundle.rs
+++ b/nexus/db-queries/src/db/datastore/support_bundle.rs
@@ -12,6 +12,7 @@ use crate::db::model::SupportBundle;
 use crate::db::model::SupportBundleState;
 use crate::db::pagination::paginated;
 use crate::db::pagination::paginated_multicolumn;
+use crate::db::raw_query_builder::TypedSqlQuery;
 use crate::db::update_and_check::{UpdateAndCheck, UpdateStatus};
 use async_bb8_diesel::AsyncRunQueryDsl;
 use diesel::prelude::*;
@@ -62,6 +63,19 @@ pub struct SupportBundleExpungementReport {
     pub bundles_reassigned: usize,
 }
 
+/// Result of atomically deleting support bundles for capacity management.
+#[derive(Debug, Clone)]
+pub struct AutoDeletionResult {
+    /// IDs of bundles that were transitioned to Destroying state.
+    pub deleted_ids: Vec<SupportBundleUuid>,
+    /// Total number of debug datasets available.
+    pub total_datasets: usize,
+    /// Number of active bundles (before any deletions).
+    pub active_bundles: usize,
+    /// Number of free debug datasets (before any deletions).
+    pub free_datasets: usize,
+}
+
 impl DataStore {
     /// Creates a new support bundle.
     ///
@@ -524,6 +538,220 @@
         Ok(())
     }
+
+    /// Atomically finds and deletes support bundles to maintain a free
+    /// dataset buffer.
+    ///
+    /// This method performs a single atomic operation that:
+    /// 1. Reads config from the `support_bundle_config` table
+    /// 2. Calculates thresholds based on percentage of total datasets
+    /// 3. Calculates how many deletions are needed (free_datasets < target)
+    /// 4. Applies min_keep constraint (never delete below minimum)
+    /// 5. Finds the N oldest Active bundles
+    /// 6. Transitions them to Destroying state atomically
+    /// 7. Returns what was actually deleted
+    ///
+    /// This prevents over-deletion when multiple Nexuses run concurrently,
+    /// because the calculation and state transitions happen in a single
+    /// database operation.
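+    ///
+    /// Worked example (the same numbers as the concurrent-execution test
+    /// added below): with 10 debug datasets, target_free_percent = 20, and
+    /// min_keep_percent = 0, the free target is CEIL(10 * 20 / 100) = 2.
+    /// If all ten datasets hold Active bundles, the two oldest are
+    /// transitioned to Destroying.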
+    ///
+    /// Configuration is read from the database, ensuring all Nexus replicas
+    /// use consistent values.
+    pub async fn support_bundle_auto_delete(
+        &self,
+        opctx: &OpContext,
+    ) -> Result<AutoDeletionResult, Error> {
+        opctx.authorize(authz::Action::Modify, &authz::FLEET).await?;
+
+        let conn = self.pool_connection_authorized(opctx).await?;
+
+        let query = support_bundle_auto_delete_query();
+
+        // Return type: (total_datasets, used_datasets, active_bundles, deleted_ids)
+        let result: (i64, i64, i64, Vec<Uuid>) = query
+            .get_result_async(&*conn)
+            .await
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
+
+        let total_datasets = result.0 as usize;
+        let used_datasets = result.1 as usize;
+        let active_bundles = result.2 as usize;
+        let deleted_ids: Vec<SupportBundleUuid> = result
+            .3
+            .into_iter()
+            .map(SupportBundleUuid::from_untyped_uuid)
+            .collect();
+
+        Ok(AutoDeletionResult {
+            deleted_ids,
+            total_datasets,
+            active_bundles,
+            free_datasets: total_datasets.saturating_sub(used_datasets),
+        })
+    }
+
+    /// Get the current support bundle auto-deletion config.
+    pub async fn support_bundle_config_get(
+        &self,
+        opctx: &OpContext,
+    ) -> Result<crate::db::model::SupportBundleConfig, Error> {
+        use nexus_db_schema::schema::support_bundle_config::dsl;
+
+        opctx.authorize(authz::Action::Read, &authz::FLEET).await?;
+
+        let conn = self.pool_connection_authorized(opctx).await?;
+
+        dsl::support_bundle_config
+            .select(crate::db::model::SupportBundleConfig::as_select())
+            .first_async(&*conn)
+            .await
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
+    }
+
+    /// Set support bundle auto-deletion config.
+    ///
+    /// Values are percentages (0-100).
+    pub async fn support_bundle_config_set(
+        &self,
+        opctx: &OpContext,
+        target_free_percent: u8,
+        min_keep_percent: u8,
+    ) -> Result<(), Error> {
+        use nexus_db_schema::schema::support_bundle_config::dsl;
+
+        opctx.authorize(authz::Action::Modify, &authz::FLEET).await?;
+
+        // Validate percentages are in range
+        if target_free_percent > 100 {
+            return Err(Error::invalid_request(
+                "target_free_percent must be between 0 and 100",
+            ));
+        }
+        if min_keep_percent > 100 {
+            return Err(Error::invalid_request(
+                "min_keep_percent must be between 0 and 100",
+            ));
+        }
+
+        let conn = self.pool_connection_authorized(opctx).await?;
+
+        diesel::update(dsl::support_bundle_config)
+            .filter(dsl::singleton.eq(true))
+            .set((
+                dsl::target_free_percent.eq(i64::from(target_free_percent)),
+                dsl::min_keep_percent.eq(i64::from(min_keep_percent)),
+                dsl::time_modified.eq(chrono::Utc::now()),
+            ))
+            .execute_async(&*conn)
+            .await
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
+
+        Ok(())
+    }
+}
+
+/// Builds the CTE query for atomic support bundle auto-deletion.
+///
+/// This query atomically:
+/// 1. Reads config from support_bundle_config table
+/// 2. Calculates thresholds based on percentage of total datasets using CEIL
+/// 3. Counts datasets and bundles
+/// 4. Calculates how many bundles to delete (respecting min_keep)
+/// 5. Selects the oldest Active bundles
+/// 6. Transitions them to Destroying state
+/// 7. Returns the stats and deleted IDs
+pub fn support_bundle_auto_delete_query() -> TypedSqlQuery<(
+    diesel::sql_types::BigInt,
+    diesel::sql_types::BigInt,
+    diesel::sql_types::BigInt,
+    diesel::sql_types::Array<diesel::sql_types::Uuid>,
+)> {
+    use crate::db::raw_query_builder::QueryBuilder;
+
+    let mut query = QueryBuilder::new();
+
+    // Build the CTE query
+    query.sql(
+        "
+WITH
+  -- Read config from database
+  config AS (
+    SELECT target_free_percent, min_keep_percent
+    FROM support_bundle_config
+    WHERE singleton = true
+  ),
+  -- Count non-tombstoned datasets
+  dataset_count AS (
+    SELECT COUNT(*) as total
+    FROM rendezvous_debug_dataset
+    WHERE time_tombstoned IS NULL
+  ),
+  -- Count bundles occupying datasets (stable states: Collecting, Active).
+  -- Bundles in other states (Destroying, Failing, Failed) either do not use
+  -- datasets, or will transition to not using datasets imminently.
+  used_count AS (
+    SELECT COUNT(*) as used
+    FROM support_bundle
+    WHERE state IN ('collecting', 'active')
+  ),
+  -- Count only Active bundles (which could be deleted).
+  -- We don't want to auto-delete bundles which are still being collected, so
+  -- this is effectively a count of 'viable targets to be deleted'.
+  active_count AS (
+    SELECT COUNT(*) as active
+    FROM support_bundle
+    WHERE state = 'active'
+  ),
+  -- Calculate how many bundles we want to delete AND are allowed to delete.
+  -- Uses CROSS JOIN to combine single-row CTEs, making all columns accessible.
+  -- CEIL ensures we always round up, so 10% of 5 datasets = 1, not 0.
+  deletion_calc AS (
+    SELECT
+      d.total as total_datasets,
+      u.used as used_datasets,
+      a.active as active_bundles,
+      -- 'Count we want free' - 'Count actually free'
+      GREATEST(0,
+        CEIL(d.total * c.target_free_percent / 100.0)::INT8 - (d.total - u.used)
+      ) as autodeletion_count,
+      -- 'Count we can delete' - 'Count we must keep'
+      GREATEST(0,
+        a.active - CEIL(d.total * c.min_keep_percent / 100.0)::INT8
+      ) as max_deletable
+    FROM dataset_count d
+    CROSS JOIN used_count u
+    CROSS JOIN active_count a
+    CROSS JOIN config c
+  ),
+  -- Find the N oldest active bundles we're allowed to delete.
+  -- Uses lookup_bundle_by_state_and_creation index for ordering.
+  candidates AS (
+    SELECT id
+    FROM support_bundle
+    WHERE state = 'active'
+    -- Secondary sort on id ensures deterministic selection when timestamps
+    -- match, which helps with test determinism if the timestamps don't have
+    -- sufficient granularity between bundle creation times.
+    ORDER BY time_created ASC, id ASC
+    LIMIT (SELECT LEAST(autodeletion_count, max_deletable) FROM deletion_calc)
+  ),
+  -- Atomically transition to Destroying (only if still Active).
+  -- The state='active' check handles concurrent user deletions.
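+  -- Illustrative walk-through (numbers taken from the datastore tests
+  -- below): with total=5, used=5, active=5, target_free_percent=40, and
+  -- min_keep_percent=0, autodeletion_count = CEIL(5*40/100) - (5-5) = 2 and
+  -- max_deletable = 5 - 0 = 5, so the LIMIT above keeps LEAST(2, 5) = 2
+  -- candidates: the two oldest Active bundles.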
+ deleted AS ( + UPDATE support_bundle + SET state = 'destroying' + WHERE id IN (SELECT id FROM candidates) + AND state = 'active' + RETURNING id + ) +SELECT + (SELECT total_datasets FROM deletion_calc), + (SELECT used_datasets FROM deletion_calc), + (SELECT active_bundles FROM deletion_calc), + ARRAY(SELECT id FROM deleted) as deleted_ids +", + ); + + query.query() } #[cfg(test)] @@ -1548,4 +1776,721 @@ mod test { db.terminate().await; logctx.cleanup_successful(); } + + #[tokio::test] + async fn test_auto_deletion_no_bundles() { + let logctx = dev::test_setup_log("test_auto_deletion_no_bundles"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + // Create 5 debug datasets (pools) + let _test_sled = create_sled_and_zpools(&datastore, &opctx, 5).await; + + // Set config: 60% target_free (CEIL(5*60/100)=3), 0% min_keep + // With 5 datasets, no bundles, free=5 >= 3, should delete no bundles + datastore + .support_bundle_config_set(&opctx, 60, 0) + .await + .expect("Should set config"); + + let result = datastore + .support_bundle_auto_delete(&opctx) + .await + .expect("Should succeed"); + + assert_eq!(result.total_datasets, 5); + assert_eq!(result.active_bundles, 0); + assert_eq!(result.free_datasets, 5); + assert!(result.deleted_ids.is_empty()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_auto_deletion_enough_free_datasets() { + let logctx = + dev::test_setup_log("test_auto_deletion_enough_free_datasets"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + let nexus_id = OmicronZoneUuid::new_v4(); + + // Create 10 debug datasets + let _test_sled = create_sled_and_zpools(&datastore, &opctx, 10).await; + + // Create 5 bundles, leaving 5 free + for _ in 0..5 { + let bundle = datastore + .support_bundle_create(&opctx, "for tests", nexus_id, None) + .await + .expect("Should be able to create bundle"); + + // Mark as active + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); + datastore + .support_bundle_update( + &opctx, + &authz_bundle, + SupportBundleState::Active, + ) + .await + .expect("Should update state"); + } + + // Set config: 30% target_free (CEIL(10*30/100)=3), 0% min_keep + // With 10 datasets, 5 bundles, free=5 >= 3, should delete no bundles + datastore + .support_bundle_config_set(&opctx, 30, 0) + .await + .expect("Should set config"); + + let result = datastore + .support_bundle_auto_delete(&opctx) + .await + .expect("Should succeed"); + + assert_eq!(result.total_datasets, 10); + assert_eq!(result.active_bundles, 5); + assert_eq!(result.free_datasets, 5); + assert!(result.deleted_ids.is_empty()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_auto_deletion_deletes_oldest_first() { + let logctx = + dev::test_setup_log("test_auto_deletion_deletes_oldest_first"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + let nexus_id = OmicronZoneUuid::new_v4(); + + // Create 5 debug datasets + let _test_sled = create_sled_and_zpools(&datastore, &opctx, 5).await; + + // Create 5 bundles (all slots filled) + let mut bundle_ids = Vec::new(); + for _ in 0..5 { + let bundle = datastore + .support_bundle_create(&opctx, "for tests", nexus_id, None) + .await + .expect("Should be able to create bundle"); + + // Mark as active + let authz_bundle = 
authz_support_bundle_from_id(bundle.id.into()); + datastore + .support_bundle_update( + &opctx, + &authz_bundle, + SupportBundleState::Active, + ) + .await + .expect("Should update state"); + + bundle_ids.push(bundle.id); + + // Small delay to ensure different creation times + tokio::time::sleep(tokio::time::Duration::from_millis(10)).await; + } + + // Set config: 40% target_free (CEIL(5*40/100)=2), 0% min_keep + // With 5 datasets, 5 bundles, free=0 < 2, need to delete 2 bundles + datastore + .support_bundle_config_set(&opctx, 40, 0) + .await + .expect("Should set config"); + + let result = datastore + .support_bundle_auto_delete(&opctx) + .await + .expect("Should succeed"); + + assert_eq!(result.total_datasets, 5); + assert_eq!(result.active_bundles, 5); + assert_eq!(result.free_datasets, 0); + assert_eq!(result.deleted_ids.len(), 2); + + // Verify the oldest bundles are selected (first two created) + assert!(result.deleted_ids.contains(&bundle_ids[0].into())); + assert!(result.deleted_ids.contains(&bundle_ids[1].into())); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_auto_deletion_respects_min_bundles_to_keep() { + let logctx = dev::test_setup_log( + "test_auto_deletion_respects_min_bundles_to_keep", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + let nexus_id = OmicronZoneUuid::new_v4(); + + // Create 5 debug datasets + let _test_sled = create_sled_and_zpools(&datastore, &opctx, 5).await; + + // Create 5 bundles (all slots filled) + for _ in 0..5 { + let bundle = datastore + .support_bundle_create(&opctx, "for tests", nexus_id, None) + .await + .expect("Should be able to create bundle"); + + // Mark as active + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); + datastore + .support_bundle_update( + &opctx, + &authz_bundle, + SupportBundleState::Active, + ) + .await + .expect("Should update state"); + } + + // Set config: 60% target_free (CEIL(5*60/100)=3), 80% min_keep (CEIL(5*80/100)=4) + // With free=0, we'd want to delete 3 bundles + // But min_keep=4 means we can only delete 1 (5-4=1) + datastore + .support_bundle_config_set(&opctx, 60, 80) + .await + .expect("Should set config"); + + let result = datastore + .support_bundle_auto_delete(&opctx) + .await + .expect("Should succeed"); + + assert_eq!(result.total_datasets, 5); + assert_eq!(result.active_bundles, 5); + assert_eq!(result.free_datasets, 0); + // Can only delete 1 bundle due to min_bundles_to_keep constraint + assert_eq!(result.deleted_ids.len(), 1); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_auto_deletion_min_bundles_prevents_all_deletion() { + let logctx = dev::test_setup_log( + "test_auto_deletion_min_bundles_prevents_all_deletion", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + let nexus_id = OmicronZoneUuid::new_v4(); + + // Create 5 debug datasets + let _test_sled = create_sled_and_zpools(&datastore, &opctx, 5).await; + + // Create 3 bundles + for _ in 0..3 { + let bundle = datastore + .support_bundle_create(&opctx, "for tests", nexus_id, None) + .await + .expect("Should be able to create bundle"); + + // Mark as active + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); + datastore + .support_bundle_update( + &opctx, + &authz_bundle, + SupportBundleState::Active, + ) + .await + .expect("Should update state"); + } + + // 
Set config: 100% target_free (CEIL(5*100/100)=5), 100% min_keep (CEIL(5*100/100)=5) + // With free=2, we'd want to delete 3 bundles, but min_keep=5 > active=3 + // So we can't delete any + datastore + .support_bundle_config_set(&opctx, 100, 100) + .await + .expect("Should set config"); + + let result = datastore + .support_bundle_auto_delete(&opctx) + .await + .expect("Should succeed"); + + assert_eq!(result.total_datasets, 5); + assert_eq!(result.active_bundles, 3); + assert_eq!(result.free_datasets, 2); + // min_keep (5) > active_bundles (3), so no deletion + assert!(result.deleted_ids.is_empty()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_auto_deletion_only_selects_active_bundles() { + let logctx = dev::test_setup_log( + "test_auto_deletion_only_selects_active_bundles", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + let nexus_id = OmicronZoneUuid::new_v4(); + + // Create 5 debug datasets + let _test_sled = create_sled_and_zpools(&datastore, &opctx, 5).await; + + // Create 3 bundles: 1 active, 1 collecting, 1 destroying + let bundle1 = datastore + .support_bundle_create(&opctx, "for tests", nexus_id, None) + .await + .expect("Should be able to create bundle"); + let authz_bundle1 = authz_support_bundle_from_id(bundle1.id.into()); + datastore + .support_bundle_update( + &opctx, + &authz_bundle1, + SupportBundleState::Active, + ) + .await + .expect("Should update state"); + + // Second bundle stays in Collecting + let _bundle2 = datastore + .support_bundle_create(&opctx, "for tests", nexus_id, None) + .await + .expect("Should be able to create bundle"); + + // Third bundle is Destroying + let bundle3 = datastore + .support_bundle_create(&opctx, "for tests", nexus_id, None) + .await + .expect("Should be able to create bundle"); + let authz_bundle3 = authz_support_bundle_from_id(bundle3.id.into()); + datastore + .support_bundle_update( + &opctx, + &authz_bundle3, + SupportBundleState::Destroying, + ) + .await + .expect("Should update state"); + + // Set config: 100% target_free (CEIL(5*100/100)=5), 0% min_keep + // With 3 bundles (1 Active, 1 Collecting, 1 Destroying): + // - used_count = 2 (Active + Collecting; Destroying not counted for deletion calc) + // - free_datasets = 5 - 2 = 3 + // We should only delete Active bundles though + datastore + .support_bundle_config_set(&opctx, 100, 0) + .await + .expect("Should set config"); + + let result = datastore + .support_bundle_auto_delete(&opctx) + .await + .expect("Should succeed"); + + assert_eq!(result.total_datasets, 5); + // Only 1 bundle is Active (candidates for deletion) + assert_eq!(result.active_bundles, 1); + // Free datasets: 5 total - 2 used (Active + Collecting) = 3 + // (Destroying bundles are not counted as they're already being freed) + assert_eq!(result.free_datasets, 3); + // We want 5 free but only have 3, so we want to delete 2 + // But we only have 1 Active bundle to delete + assert_eq!(result.deleted_ids.len(), 1); + assert_eq!(result.deleted_ids[0], bundle1.id.into()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_auto_deletion_verifies_state_transition() { + let logctx = + dev::test_setup_log("test_auto_deletion_verifies_state_transition"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + let nexus_id = OmicronZoneUuid::new_v4(); + + // Create 3 debug datasets + let 
_test_sled = create_sled_and_zpools(&datastore, &opctx, 3).await; + + // Create 3 bundles + let mut bundle_ids = Vec::new(); + for _ in 0..3 { + let bundle = datastore + .support_bundle_create(&opctx, "for tests", nexus_id, None) + .await + .expect("Should be able to create bundle"); + + // Mark as active + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); + datastore + .support_bundle_update( + &opctx, + &authz_bundle, + SupportBundleState::Active, + ) + .await + .expect("Should update state"); + + bundle_ids.push(bundle.id); + tokio::time::sleep(tokio::time::Duration::from_millis(10)).await; + } + + // Verify bundles start in Active state + for id in &bundle_ids { + let bundle = datastore + .support_bundle_get(&opctx, (*id).into()) + .await + .unwrap(); + assert_eq!(bundle.state, SupportBundleState::Active); + } + + // Set config: 50% target_free → CEIL(3*50/100)=2, 0% min_keep + datastore + .support_bundle_config_set(&opctx, 50, 0) + .await + .expect("Should set config"); + + // With target_free=2 (from 50% of 3 datasets) and free=0, delete 2 bundles + let result = datastore + .support_bundle_auto_delete(&opctx) + .await + .expect("Should succeed"); + + assert_eq!(result.deleted_ids.len(), 2); + + // Verify deleted bundles are now in Destroying state + for id in &result.deleted_ids { + let bundle = + datastore.support_bundle_get(&opctx, *id).await.unwrap(); + assert_eq!( + bundle.state, + SupportBundleState::Destroying, + "Bundle {} should be Destroying", + id + ); + } + + // Verify the remaining bundle is still Active + let remaining_id = bundle_ids + .iter() + .find(|id| { + !result.deleted_ids.contains(&SupportBundleUuid::from(**id)) + }) + .unwrap(); + let remaining_bundle = datastore + .support_bundle_get(&opctx, (*remaining_id).into()) + .await + .unwrap(); + assert_eq!(remaining_bundle.state, SupportBundleState::Active); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_auto_deletion_failed_bundles_dont_occupy_datasets() { + let logctx = dev::test_setup_log( + "test_auto_deletion_failed_bundles_dont_occupy_datasets", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + let nexus_id = OmicronZoneUuid::new_v4(); + + // Create 3 debug datasets + let _test_sled = create_sled_and_zpools(&datastore, &opctx, 3).await; + + // Create 3 bundles, all Active + let mut bundle_ids = Vec::new(); + for _ in 0..3 { + let bundle = datastore + .support_bundle_create(&opctx, "for tests", nexus_id, None) + .await + .expect("Should be able to create bundle"); + + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); + datastore + .support_bundle_update( + &opctx, + &authz_bundle, + SupportBundleState::Active, + ) + .await + .expect("Should update state"); + + bundle_ids.push(bundle.id); + } + + // Set config: 20% target_free → CEIL(3*20/100)=1, 0% min_keep + datastore + .support_bundle_config_set(&opctx, 20, 0) + .await + .expect("Should set config"); + + // All 3 datasets are used, free=0 + let result = datastore + .support_bundle_auto_delete(&opctx) + .await + .expect("Should succeed"); + + assert_eq!(result.free_datasets, 0); + assert_eq!(result.deleted_ids.len(), 1); + + // Manually mark one bundle as Failed (simulating dataset expungement) + // Failed bundles don't occupy datasets + let authz_bundle = authz_support_bundle_from_id(bundle_ids[1].into()); + datastore + .support_bundle_update( + &opctx, + &authz_bundle, + SupportBundleState::Failing, + 
) + .await + .expect("Should update state"); + datastore + .support_bundle_update( + &opctx, + &authz_bundle, + SupportBundleState::Failed, + ) + .await + .expect("Should update state"); + + // Now we have: 1 Destroying, 1 Failed, 1 Active + // - Total datasets: 3 + // - Used for deletion calc: 1 (only Active; Destroying not counted) + // - Free datasets: 3 - 1 = 2 + // Config still: 20% target_free (CEIL(3*20/100)=1), 0% min_keep + let result = datastore + .support_bundle_auto_delete(&opctx) + .await + .expect("Should succeed"); + + // Free should be 2 now: + // - Failed bundle doesn't count (dataset was expunged) + // - Destroying bundle doesn't count (already being freed) + assert_eq!(result.free_datasets, 2); + // Since free=2 >= target=1, no deletion needed + assert!(result.deleted_ids.is_empty()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + // Tests for the auto_delete CTE query + + #[tokio::test] + async fn test_auto_delete_query_explain() { + use crate::db::explain::ExplainableAsync; + use crate::db::pub_test_utils::TestDatabase; + + let logctx = dev::test_setup_log("test_auto_delete_query_explain"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let conn = db.pool().claim().await.unwrap(); + + let query = support_bundle_auto_delete_query(); + + let _ = query.explain_async(&conn).await.unwrap_or_else(|e| { + panic!("Failed to explain query, is it valid SQL?\nerror: {e:#?}") + }); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn expectorate_auto_delete_query() { + use crate::db::raw_query_builder::expectorate_query_contents; + + let query = support_bundle_auto_delete_query(); + expectorate_query_contents( + query, + "tests/output/support_bundle_auto_delete.sql", + ) + .await; + } + + /// Test that concurrent auto-deletion operations don't over-delete bundles. + /// + /// This verifies the atomic CTE prevents the TOCTTOU issue where multiple + /// Nexuses running concurrently could each decide to delete bundles based + /// on stale state, resulting in more deletions than intended. + #[tokio::test] + async fn test_auto_deletion_concurrent_execution_prevents_over_deletion() { + let logctx = dev::test_setup_log( + "test_auto_deletion_concurrent_execution_prevents_over_deletion", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + let nexus_id = OmicronZoneUuid::new_v4(); + + // Create 10 debug datasets + let _test_sled = create_sled_and_zpools(&datastore, &opctx, 10).await; + + // Create 10 bundles (all slots filled) + for _ in 0..10 { + let bundle = datastore + .support_bundle_create(&opctx, "for tests", nexus_id, None) + .await + .expect("Should be able to create bundle"); + + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); + datastore + .support_bundle_update( + &opctx, + &authz_bundle, + SupportBundleState::Active, + ) + .await + .expect("Should update state"); + } + + // Set config: 20% target_free (CEIL(10*20/100)=2), 0% min_keep + // With 10 datasets, 10 bundles, free=0 < 2, need to delete 2 bundles + datastore + .support_bundle_config_set(&opctx, 20, 0) + .await + .expect("Should set config"); + + // Spawn multiple concurrent auto-delete operations. + // Without the atomic CTE, each would see free=0 and try to delete 2, + // potentially resulting in 10 deletions (5 tasks × 2 each). + // With the atomic CTE, only the first operation(s) should delete, + // and subsequent ones should see the updated state. 
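+        // (Why this holds: the whole CTE runs as a single statement, and
+        // CockroachDB executes transactions at SERIALIZABLE isolation, so
+        // each task should observe the bundle-state transitions committed
+        // by earlier tasks rather than a stale snapshot.)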
+ let num_concurrent_tasks = 5; + let mut handles = Vec::new(); + + for _ in 0..num_concurrent_tasks { + let datastore = datastore.clone(); + let opctx = opctx.child(std::collections::BTreeMap::new()); + handles.push(tokio::spawn(async move { + datastore.support_bundle_auto_delete(&opctx).await + })); + } + + // Collect all results + let mut total_deleted = 0; + for handle in handles { + let result = handle.await.expect("Task should complete"); + let result = result.expect("Auto-delete should succeed"); + total_deleted += result.deleted_ids.len(); + } + + // The key assertion: we should have deleted exactly 2 bundles total, + // not 2 × num_concurrent_tasks. The atomic CTE ensures that once + // bundles are transitioned to Destroying, they're no longer candidates + // for other concurrent operations (the UPDATE's WHERE state='active' + // clause filters them out). + assert_eq!( + total_deleted, 2, + "Should delete exactly 2 bundles total across all concurrent \ + operations, not {} (which would indicate over-deletion)", + total_deleted + ); + + // Verify the final state: 8 Active bundles remain + use nexus_db_schema::schema::support_bundle::dsl; + let conn = datastore.pool_connection_authorized(&opctx).await.unwrap(); + let active_count: i64 = dsl::support_bundle + .filter(dsl::state.eq(SupportBundleState::Active)) + .count() + .get_result_async(&*conn) + .await + .expect("Should count active bundles"); + + assert_eq!( + active_count, 8, + "Should have 8 Active bundles remaining after concurrent deletion" + ); + + // Verify we now have 2 Destroying bundles + let destroying_count: i64 = dsl::support_bundle + .filter(dsl::state.eq(SupportBundleState::Destroying)) + .count() + .get_result_async(&*conn) + .await + .expect("Should count destroying bundles"); + + assert_eq!( + destroying_count, 2, + "Should have exactly 2 bundles in Destroying state" + ); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_config_set_rejects_invalid_target_free_percent() { + let logctx = dev::test_setup_log( + "test_config_set_rejects_invalid_target_free_percent", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + // 101% should be rejected + let result = datastore.support_bundle_config_set(&opctx, 101, 10).await; + + assert!( + result.is_err(), + "Setting target_free_percent > 100 should fail" + ); + let err = result.unwrap_err(); + assert!( + err.to_string().contains("target_free_percent"), + "Error message should mention target_free_percent: {}", + err + ); + + // Verify valid values still work + datastore + .support_bundle_config_set(&opctx, 100, 10) + .await + .expect("100% should be valid"); + + datastore + .support_bundle_config_set(&opctx, 0, 10) + .await + .expect("0% should be valid"); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_config_set_rejects_invalid_min_keep_percent() { + let logctx = dev::test_setup_log( + "test_config_set_rejects_invalid_min_keep_percent", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + // 101% should be rejected + let result = datastore.support_bundle_config_set(&opctx, 10, 101).await; + + assert!(result.is_err(), "Setting min_keep_percent > 100 should fail"); + let err = result.unwrap_err(); + assert!( + err.to_string().contains("min_keep_percent"), + "Error message should mention min_keep_percent: {}", + err + ); + + // 
Verify valid values still work + datastore + .support_bundle_config_set(&opctx, 10, 100) + .await + .expect("100% should be valid"); + + datastore + .support_bundle_config_set(&opctx, 10, 0) + .await + .expect("0% should be valid"); + + db.terminate().await; + logctx.cleanup_successful(); + } } diff --git a/nexus/db-queries/tests/output/support_bundle_auto_delete.sql b/nexus/db-queries/tests/output/support_bundle_auto_delete.sql new file mode 100644 index 00000000000..fd38aa1b3cd --- /dev/null +++ b/nexus/db-queries/tests/output/support_bundle_auto_delete.sql @@ -0,0 +1,54 @@ +WITH + config + AS ( + SELECT target_free_percent, min_keep_percent FROM support_bundle_config WHERE singleton = true + ), + dataset_count + AS (SELECT count(*) AS total FROM rendezvous_debug_dataset WHERE time_tombstoned IS NULL), + used_count + AS (SELECT count(*) AS used FROM support_bundle WHERE state IN ('collecting', 'active')), + active_count AS (SELECT count(*) AS active FROM support_bundle WHERE state = 'active'), + deletion_calc + AS ( + SELECT + d.total AS total_datasets, + u.used AS used_datasets, + a.active AS active_bundles, + greatest(0, ceil(d.total * c.target_free_percent / 100.0)::INT8 - (d.total - u.used)) + AS autodeletion_count, + greatest(0, a.active - ceil(d.total * c.min_keep_percent / 100.0)::INT8) AS max_deletable + FROM + dataset_count AS d + CROSS JOIN used_count AS u + CROSS JOIN active_count AS a + CROSS JOIN config AS c + ), + candidates + AS ( + SELECT + id + FROM + support_bundle + WHERE + state = 'active' + ORDER BY + time_created ASC, id ASC + LIMIT + (SELECT least(autodeletion_count, max_deletable) FROM deletion_calc) + ), + deleted + AS ( + UPDATE + support_bundle + SET + state = 'destroying' + WHERE + id IN (SELECT id FROM candidates) AND state = 'active' + RETURNING + id + ) +SELECT + (SELECT total_datasets FROM deletion_calc), + (SELECT used_datasets FROM deletion_calc), + (SELECT active_bundles FROM deletion_calc), + ARRAY (SELECT id FROM deleted) AS deleted_ids diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 3290ecad1f1..c9a2d281800 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1589,6 +1589,15 @@ table! { } } +table! { + support_bundle_config (singleton) { + singleton -> Bool, + target_free_percent -> Int8, + min_keep_percent -> Int8, + time_modified -> Timestamptz, + } +} + /* hardware inventory */ table! { diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 887be497a17..a648760123c 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -13,6 +13,7 @@ use nexus_db_model::SupportBundleState; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; +use nexus_types::internal_api::background::SupportBundleAutoDeletionReport; use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; use omicron_common::api::external::DataPageParams; @@ -308,6 +309,60 @@ impl SupportBundleCollector { Ok(report) } + /// Atomically finds and marks bundles for automatic deletion. + /// + /// This maintains a buffer of free debug datasets for new bundle + /// allocations. Configuration (target_free_percent, min_keep_percent) + /// is read from the database, ensuring all Nexus replicas use consistent + /// values. 
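+    ///
+    /// The knobs can be inspected or changed out of band via the omdb
+    /// subcommand added alongside this task (a usage sketch; `set` also
+    /// requires omdb's destructive-operation opt-in):
+    ///
+    /// ```text
+    /// omdb nexus support-bundle-config show
+    /// omdb nexus support-bundle-config set --target-free-percent 20 --min-keep-percent 10
+    /// ```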
+    ///
+    /// The operation is atomic: finding candidates and transitioning them
+    /// to Destroying state happens in a single database query. This prevents
+    /// over-deletion when multiple Nexuses run concurrently.
+    async fn auto_delete_bundles(
+        &self,
+        opctx: &OpContext,
+    ) -> SupportBundleAutoDeletionReport {
+        let mut report = SupportBundleAutoDeletionReport::default();
+
+        // Atomically find and delete bundles in a single query.
+        // Config is read from the database within the query.
+        let result = self.datastore.support_bundle_auto_delete(opctx).await;
+
+        let auto_deleted = match result {
+            Ok(r) => r,
+            Err(err) => {
+                warn!(
+                    &opctx.log,
+                    "SupportBundleCollector: Failed to auto-delete bundles";
+                    "err" => %err
+                );
+                report
+                    .errors
+                    .push(format!("Failed to auto-delete bundles: {}", err));
+                return report;
+            }
+        };
+
+        // Update report with state (as of before any deletions)
+        report.total_datasets = auto_deleted.total_datasets;
+        report.active_bundles = auto_deleted.active_bundles;
+        report.free_datasets = auto_deleted.free_datasets;
+        report.bundles_marked_for_deletion = auto_deleted.deleted_ids.len();
+
+        // Log each bundle that was marked for deletion
+        for id in &auto_deleted.deleted_ids {
+            info!(
+                &opctx.log,
+                "SupportBundleCollector: Auto-deleted bundle to free dataset capacity";
+                "id" => %id,
+                "free_datasets" => auto_deleted.free_datasets,
+            );
+        }
+
+        report
+    }
+
     async fn collect_bundle(
         &self,
         opctx: &OpContext,
@@ -403,11 +458,16 @@
             return json!({ "error": "task disabled" });
         }
 
+        let auto_deletion_report;
         let mut cleanup_report = None;
         let mut cleanup_err = None;
        let mut collection_report = None;
        let mut collection_err = None;
 
+        // Phase 1: Auto-delete eligible bundles to maintain a free dataset buffer
+        auto_deletion_report = self.auto_delete_bundles(&opctx).await;
+
+        // Phase 2: Cleanup destroyed/failing bundles
         match self.cleanup_destroyed_bundles(&opctx).await {
             Ok(report) => cleanup_report = Some(report),
             Err(err) => {
@@ -416,6 +476,7 @@
             }
         };
 
+        // Phase 3: Collect pending bundles
         let request = BundleRequest::default();
         match self.collect_bundle(&opctx, &request).await {
             Ok(report) => collection_report = Some(report),
@@ -426,6 +487,7 @@
         };
 
         json!({
+            "auto_deletion_report": Some(auto_deletion_report),
             "cleanup_report": cleanup_report,
             "cleanup_err": cleanup_err,
             "collection_report": collection_report,
diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs
index 11ce5119ede..a4449a9b613 100644
--- a/nexus/tests/integration_tests/support_bundles.rs
+++ b/nexus/tests/integration_tests/support_bundles.rs
@@ -20,6 +20,7 @@
 use nexus_test_utils::http_testing::RequestBuilder;
 use nexus_test_utils_macros::nexus_test;
 use nexus_types::external_api::shared::SupportBundleInfo;
 use nexus_types::external_api::shared::SupportBundleState;
+use nexus_types::internal_api::background::SupportBundleAutoDeletionReport;
 use nexus_types::internal_api::background::SupportBundleCleanupReport;
 use nexus_types::internal_api::background::SupportBundleCollectionReport;
 use nexus_types::internal_api::background::SupportBundleCollectionStep;
@@ -331,6 +332,8 @@ async fn bundle_update_comment(
 
 #[derive(Deserialize)]
 struct TaskOutput {
+    auto_deletion_err: Option<String>,
+    auto_deletion_report: Option<SupportBundleAutoDeletionReport>,
     cleanup_err: Option<String>,
     collection_err: Option<String>,
     cleanup_report: Option<SupportBundleCleanupReport>,
@@ -930,3 +933,116 @@ async fn test_support_bundle_delete_failed_bundle(
         "Deleted bundle should not appear in bundle list"
     );
 }
+
+// Test automatic deletion of support bundles to maintain free dataset capacity.
+//
+// This test verifies that:
+// 1. Auto-deletion kicks in when free_datasets < target threshold
+// 2. The oldest bundles are marked for deletion first
+// 3. min_keep percentage is respected
+//
+// Configuration: 20% target_free (CEIL(5*20/100)=1), 20% min_keep (CEIL(5*20/100)=1), 5 datasets
+// Create 5 bundles (filling all datasets), and verify auto-deletion happens
+// when we run out of free datasets.
+#[tokio::test]
+async fn test_support_bundle_auto_deletion() {
+    let cptestctx = nexus_test_utils::ControlPlaneBuilder::new(
+        "test_support_bundle_auto_deletion",
+    )
+    .start::<omicron_nexus::Server>()
+    .await;
+
+    let client = &cptestctx.external_client;
+    let nexus = &cptestctx.server.server_context().nexus;
+    let datastore = nexus.datastore();
+    let opctx =
+        OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone());
+
+    // Set auto-deletion config in the database:
+    // - target_free_percent=20: CEIL(5*20/100)=1 free dataset target
+    // - min_keep_percent=20: CEIL(5*20/100)=1 bundle minimum
+    datastore
+        .support_bundle_config_set(&opctx, 20, 20)
+        .await
+        .expect("Should be able to set config");
+
+    // Create 5 zpools, giving us 5 debug datasets
+    let _disk_test =
+        DiskTestBuilder::new(&cptestctx).with_zpool_count(5).build().await;
+
+    // Create and activate bundles one by one.
+    // With 5 datasets and 20% target_free (CEIL(5*20/100)=1):
+    // - After bundles 1-4: free >= 1, so no auto-delete needed
+    // - When creating bundle 5:
+    //   - Before collection: 4 Active + 1 Collecting = 5 bundles, free = 0
+    //   - Auto-delete triggers: want 1 free, have 0, delete 1 oldest
+    //   - 20% min_keep (CEIL(5*20/100)=1), active=4, max_deletable=3, so we CAN delete
+    //   - Result: oldest bundle deleted, then bundle 5 gets collected
+    let mut bundle_ids = Vec::new();
+    for i in 0..5 {
+        let bundle = bundle_create(&client).await.unwrap();
+        bundle_ids.push(bundle.id);
+        let output =
+            activate_bundle_collection_background_task(&cptestctx).await;
+        assert_eq!(
+            output.collection_err, None,
+            "Bundle {} collection failed",
+            i
+        );
+
+        // Check auto-deletion report
+        assert_eq!(output.auto_deletion_err, None);
+        let report = output
+            .auto_deletion_report
+            .expect("Should have auto_deletion_report");
+        assert_eq!(report.total_datasets, 5);
+
+        // Small delay to ensure different creation times
+        tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
+    }
+
+    // After creating 5 bundles:
+    // - Bundle 5 creation triggered auto-delete of bundle 1 (oldest)
+    // - Then cleanup removed bundle 1, and collection activated bundle 5
+    // So we should have 4 Active bundles remaining
+    let bundles = bundles_list(&client).await.unwrap();
+    assert_eq!(bundles.len(), 4, "Should have 4 bundles after auto-deletion");
+
+    // All remaining bundles should be Active
+    for bundle in &bundles {
+        assert_eq!(bundle.state, SupportBundleState::Active);
+    }
+
+    // The oldest bundle (first created) should have been deleted
+    assert!(
+        !bundles.iter().any(|b| b.id == bundle_ids[0]),
+        "Oldest bundle (bundle 1) should have been auto-deleted"
+    );
+
+    // Bundles 2-5 should remain
+    for i in 1..5 {
+        assert!(
+            bundles.iter().any(|b| b.id == bundle_ids[i]),
+            "Bundle {} should remain",
+            i + 1
+        );
+    }
+
+    // Now verify the auto-deletion report from the last run.
+    // Re-run bg task to get a fresh report (no deletion should happen now)
+    let output = activate_bundle_collection_background_task(&cptestctx).await;
+    assert_eq!(output.auto_deletion_err, None);
+    let report = output.auto_deletion_report.expect("Should have report");
+
+    // Now we have: 4 bundles, 5 datasets, 1 free
+    // free (1) >= target (1), so no deletion needed
+    assert_eq!(report.total_datasets, 5);
+    assert_eq!(report.active_bundles, 4);
+    assert_eq!(report.free_datasets, 1);
+    assert_eq!(
+        report.bundles_marked_for_deletion, 0,
+        "No deletion needed when free >= target"
+    );
+
+    cptestctx.teardown().await;
+}
diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs
index 8e9570e8f11..1f25dbb29be 100644
--- a/nexus/types/src/internal_api/background.rs
+++ b/nexus/types/src/internal_api/background.rs
@@ -268,6 +268,21 @@ pub struct SupportBundleCleanupReport {
     pub db_failing_bundles_updated: usize,
 }
 
+/// Describes what happened during automatic support bundle deletion.
+#[derive(Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
+pub struct SupportBundleAutoDeletionReport {
+    /// Number of bundles marked for deletion to free up dataset capacity.
+    pub bundles_marked_for_deletion: usize,
+    /// Current count of free debug datasets (before deletions).
+    pub free_datasets: usize,
+    /// Total debug datasets available.
+    pub total_datasets: usize,
+    /// Active bundles count (before deletions).
+    pub active_bundles: usize,
+    /// Errors encountered during auto-deletion.
+    pub errors: Vec<String>,
+}
+
 /// Identifies what we could or could not store within a support bundle.
 ///
 /// This struct will get emitted as part of the background task infrastructure.
diff --git a/schema/crdb/bundle-state-index/up1.sql b/schema/crdb/bundle-state-index/up1.sql
new file mode 100644
index 00000000000..862dc1685f9
--- /dev/null
+++ b/schema/crdb/bundle-state-index/up1.sql
@@ -0,0 +1,4 @@
+CREATE INDEX IF NOT EXISTS lookup_bundle_by_state_and_creation ON omicron.public.support_bundle (
+    state,
+    time_created
+);
diff --git a/schema/crdb/bundle-state-index/up2.sql b/schema/crdb/bundle-state-index/up2.sql
new file mode 100644
index 00000000000..fa26870b6f2
--- /dev/null
+++ b/schema/crdb/bundle-state-index/up2.sql
@@ -0,0 +1,18 @@
+CREATE TABLE IF NOT EXISTS omicron.public.support_bundle_config (
+    -- Singleton pattern: only one row allowed
+    singleton BOOL PRIMARY KEY DEFAULT TRUE CHECK (singleton = TRUE),
+
+    -- Percentage (0-100) of total datasets to keep free for new allocations.
+    -- Calculated as CEIL(total_datasets * target_free_percent / 100).
+    -- Example: 10% of 100 datasets = 10 free, 10% of 5 datasets = 1 free.
+    target_free_percent INT8 NOT NULL
+        CHECK (target_free_percent >= 0 AND target_free_percent <= 100),
+
+    -- Percentage (0-100) of total datasets to retain as bundles (minimum).
+    -- Calculated as CEIL(total_datasets * min_keep_percent / 100).
+    -- Prevents aggressive cleanup on small systems.
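+    -- Example: with the default of 10 (see up3.sql), a 5-dataset system
+    -- always retains CEIL(5 * 10 / 100) = 1 bundle.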
+ min_keep_percent INT8 NOT NULL + CHECK (min_keep_percent >= 0 AND min_keep_percent <= 100), + + time_modified TIMESTAMPTZ NOT NULL DEFAULT NOW() +); diff --git a/schema/crdb/bundle-state-index/up3.sql b/schema/crdb/bundle-state-index/up3.sql new file mode 100644 index 00000000000..458ac3d3354 --- /dev/null +++ b/schema/crdb/bundle-state-index/up3.sql @@ -0,0 +1,4 @@ +-- Default: 10% free datasets, keep at least 10% worth of bundles +INSERT INTO omicron.public.support_bundle_config (singleton, target_free_percent, min_keep_percent, time_modified) +VALUES (TRUE, 10, 10, NOW()) +ON CONFLICT (singleton) DO NOTHING; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 99db83904f7..384acb7d6f6 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3171,6 +3171,41 @@ CREATE INDEX IF NOT EXISTS lookup_bundle_by_creation ON omicron.public.support_b time_created ); +CREATE INDEX IF NOT EXISTS lookup_bundle_by_state_and_creation ON omicron.public.support_bundle ( + state, + time_created +); + +/* + * Support Bundle Config + * + * Configuration for automatic support bundle deletion. This table uses a + * singleton pattern (exactly one row) to store cluster-wide configuration. + */ +CREATE TABLE IF NOT EXISTS omicron.public.support_bundle_config ( + -- Singleton pattern: only one row allowed + singleton BOOL PRIMARY KEY DEFAULT TRUE CHECK (singleton = TRUE), + + -- Percentage (0-100) of total datasets to keep free for new allocations. + -- Calculated as CEIL(total_datasets * target_free_percent / 100). + -- Example: 10% of 100 datasets = 10 free, 10% of 5 datasets = 1 free. + target_free_percent INT8 NOT NULL + CHECK (target_free_percent >= 0 AND target_free_percent <= 100), + + -- Percentage (0-100) of total datasets to retain as bundles (minimum). + -- Calculated as CEIL(total_datasets * min_keep_percent / 100). + -- Prevents aggressive cleanup on small systems. + min_keep_percent INT8 NOT NULL + CHECK (min_keep_percent >= 0 AND min_keep_percent <= 100), + + time_modified TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +-- Default: 10% free datasets, keep at least 10% worth of bundles +INSERT INTO omicron.public.support_bundle_config (singleton, target_free_percent, min_keep_percent, time_modified) +VALUES (TRUE, 10, 10, NOW()) +ON CONFLICT (singleton) DO NOTHING; + /*******************************************************************/ /* @@ -8062,7 +8097,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '225.0.0', NULL) + (TRUE, NOW(), NOW(), '226.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT;