diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs
index 6a9c7215876..f0dfbe289ea 100644
--- a/clients/sled-agent-client/src/lib.rs
+++ b/clients/sled-agent-client/src/lib.rs
@@ -83,6 +83,7 @@ progenitor::generate_api!(
         TypedUuidForOmicronZoneKind = omicron_uuid_kinds::OmicronZoneUuid,
         TypedUuidForPropolisKind = omicron_uuid_kinds::PropolisUuid,
         TypedUuidForSledKind = omicron_uuid_kinds::SledUuid,
+        TypedUuidForSupportBundleKind = omicron_uuid_kinds::SupportBundleUuid,
         TypedUuidForZpoolKind = omicron_uuid_kinds::ZpoolUuid,
         Vni = omicron_common::api::external::Vni,
         ZpoolKind = omicron_common::zpool_name::ZpoolKind,
diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs
index 66723e0b185..b6aea15c892 100644
--- a/nexus/db-model/src/lib.rs
+++ b/nexus/db-model/src/lib.rs
@@ -96,6 +96,7 @@ mod sled_state;
 mod sled_underlay_subnet_allocation;
 mod snapshot;
 mod ssh_key;
+mod support_bundle;
 mod switch;
 mod tuf_repo;
 mod typed_uuid;
@@ -204,6 +205,7 @@ pub use sled_state::*;
 pub use sled_underlay_subnet_allocation::*;
 pub use snapshot::*;
 pub use ssh_key::*;
+pub use support_bundle::*;
 pub use switch::*;
 pub use switch_interface::*;
 pub use switch_port::*;
diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs
index a54c2e30299..a8e1141db6e 100644
--- a/nexus/db-model/src/schema.rs
+++ b/nexus/db-model/src/schema.rs
@@ -1364,6 +1364,20 @@ joinable!(tuf_repo_artifact -> tuf_repo (tuf_repo_id));
 // Can't specify joinable for a composite primary key (tuf_repo_artifact ->
 // tuf_artifact).
 
+table! {
+    support_bundle {
+        id -> Uuid,
+        time_created -> Timestamptz,
+        reason_for_creation -> Text,
+        reason_for_failure -> Nullable<Text>,
+        state -> crate::SupportBundleStateEnum,
+        zpool_id -> Uuid,
+        dataset_id -> Uuid,
+
+        assigned_nexus -> Nullable<Uuid>,
+    }
+}
+
 /* hardware inventory */
 
 table! {
@@ -2034,6 +2048,7 @@ allow_tables_to_appear_in_same_query!(
     console_session,
     sled,
     sled_resource,
+    support_bundle,
     router_route,
     vmm,
     volume,
diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs
index 8b663d25499..4542283aac1 100644
--- a/nexus/db-model/src/schema_versions.rs
+++ b/nexus/db-model/src/schema_versions.rs
@@ -17,7 +17,7 @@ use std::collections::BTreeMap;
 ///
 /// This must be updated when you change the database schema. Refer to
 /// schema/crdb/README.adoc in the root of this repository for details.
-pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(117, 0, 0);
+pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(118, 0, 0);
 
 /// List of all past database schema versions, in *reverse* order
 ///
@@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
         // | leaving the first copy as an example for the next person.
         // v
         // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
+        KnownVersion::new(118, "support-bundles"),
         KnownVersion::new(117, "add-completing-and-new-region-volume"),
         KnownVersion::new(116, "bp-physical-disk-disposition"),
         KnownVersion::new(115, "inv-omicron-physical-disks-generation"),
diff --git a/nexus/db-model/src/support_bundle.rs b/nexus/db-model/src/support_bundle.rs
new file mode 100644
index 00000000000..a4b14d363b2
--- /dev/null
+++ b/nexus/db-model/src/support_bundle.rs
@@ -0,0 +1,133 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+use super::impl_enum_type;
+use crate::schema::support_bundle;
+use crate::typed_uuid::DbTypedUuid;
+
+use chrono::{DateTime, Utc};
+use nexus_types::external_api::shared::SupportBundleInfo as SupportBundleView;
+use nexus_types::external_api::shared::SupportBundleState as SupportBundleStateView;
+use omicron_uuid_kinds::DatasetKind;
+use omicron_uuid_kinds::DatasetUuid;
+use omicron_uuid_kinds::OmicronZoneKind;
+use omicron_uuid_kinds::OmicronZoneUuid;
+use omicron_uuid_kinds::SupportBundleKind;
+use omicron_uuid_kinds::SupportBundleUuid;
+use omicron_uuid_kinds::ZpoolKind;
+use omicron_uuid_kinds::ZpoolUuid;
+use serde::{Deserialize, Serialize};
+
+impl_enum_type!(
+    #[derive(SqlType, Debug, QueryId)]
+    #[diesel(postgres_type(name = "support_bundle_state", schema = "public"))]
+    pub struct SupportBundleStateEnum;
+
+    #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)]
+    #[diesel(sql_type = SupportBundleStateEnum)]
+    pub enum SupportBundleState;
+
+    // Enum values
+    Collecting => b"collecting"
+    Active => b"active"
+    Destroying => b"destroying"
+    Failing => b"failing"
+    Failed => b"failed"
+);
+
+impl SupportBundleState {
+    /// Returns the list of states from which a transition to `self` is legal.
+    ///
+    /// This is used to confirm that state updates are performed legally,
+    /// and defines the possible state transitions.
+    pub fn valid_old_states(&self) -> Vec<SupportBundleState> {
+        use SupportBundleState::*;
+
+        match self {
+            Collecting => vec![],
+            Active => vec![Collecting],
+            // "Destroying" is terminal: it is never a valid prior state.
+            Destroying => vec![Active, Collecting, Failing],
+            Failing => vec![Collecting, Active],
+            // "Failed" is terminal: it is never a valid prior state.
+            Failed => vec![Active, Collecting, Failing],
+        }
+    }
+}
+
+impl From<SupportBundleState> for SupportBundleStateView {
+    fn from(state: SupportBundleState) -> Self {
+        use SupportBundleState::*;
+
+        match state {
+            Collecting => SupportBundleStateView::Collecting,
+            Active => SupportBundleStateView::Active,
+            Destroying => SupportBundleStateView::Destroying,
+            // The distinction between "failing" and "failed" should not be
+            // visible to end-users. This is internal book-keeping to decide
+            // whether or not the bundle record can be safely deleted.
+            //
+            // Either way, it should be possible to delete the bundle.
+            // If a user requests that we delete a bundle in these states:
+            // - "Failing" bundles will become "Destroying"
+            // - "Failed" bundles can be deleted immediately
+            Failing => SupportBundleStateView::Failed,
+            Failed => SupportBundleStateView::Failed,
+        }
+    }
+}
+
+#[derive(
+    Queryable,
+    Insertable,
+    Debug,
+    Clone,
+    Selectable,
+    Deserialize,
+    Serialize,
+    PartialEq,
+)]
+#[diesel(table_name = support_bundle)]
+pub struct SupportBundle {
+    pub id: DbTypedUuid<SupportBundleKind>,
+    pub time_created: DateTime<Utc>,
+    pub reason_for_creation: String,
+    pub reason_for_failure: Option<String>,
+    pub state: SupportBundleState,
+    pub zpool_id: DbTypedUuid<ZpoolKind>,
+    pub dataset_id: DbTypedUuid<DatasetKind>,
+    pub assigned_nexus: Option<DbTypedUuid<OmicronZoneKind>>,
+}
+
+impl SupportBundle {
+    pub fn new(
+        reason_for_creation: &'static str,
+        zpool_id: ZpoolUuid,
+        dataset_id: DatasetUuid,
+        nexus_id: OmicronZoneUuid,
+    ) -> Self {
+        Self {
+            id: SupportBundleUuid::new_v4().into(),
+            time_created: Utc::now(),
+            reason_for_creation: reason_for_creation.to_string(),
+            reason_for_failure: None,
+            state: SupportBundleState::Collecting,
+            zpool_id: zpool_id.into(),
+            dataset_id: dataset_id.into(),
+            assigned_nexus: Some(nexus_id.into()),
+        }
+    }
+}
+
+impl From<SupportBundle> for SupportBundleView {
+    fn from(bundle: SupportBundle) -> Self {
+        Self {
+            id: bundle.id.into(),
+            time_created: bundle.time_created,
+            reason_for_creation: bundle.reason_for_creation,
+            reason_for_failure: bundle.reason_for_failure,
+            state: bundle.state.into(),
+        }
+    }
+}
diff --git a/nexus/db-queries/src/db/datastore/dataset.rs b/nexus/db-queries/src/db/datastore/dataset.rs
index 992db3705c9..938927ff5e4 100644
--- a/nexus/db-queries/src/db/datastore/dataset.rs
+++ b/nexus/db-queries/src/db/datastore/dataset.rs
@@ -352,14 +352,13 @@ impl DataStore {
 #[cfg(test)]
 mod test {
     use super::*;
+    use crate::db::datastore::test::bp_insert_and_make_target;
     use crate::db::pub_test_utils::TestDatabase;
     use nexus_db_model::Generation;
     use nexus_db_model::SledBaseboard;
     use nexus_db_model::SledSystemHardware;
     use nexus_db_model::SledUpdate;
     use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder;
-    use nexus_types::deployment::Blueprint;
-    use nexus_types::deployment::BlueprintTarget;
     use omicron_common::api::internal::shared::DatasetKind as ApiDatasetKind;
     use omicron_test_utils::dev;
     use omicron_uuid_kinds::DatasetUuid;
@@ -523,28 +522,6 @@ mod test {
         logctx.cleanup_successful();
     }
 
-    async fn bp_insert_and_make_target(
-        opctx: &OpContext,
-        datastore: &DataStore,
-        bp: &Blueprint,
-    ) {
-        datastore
-            .blueprint_insert(opctx, bp)
-            .await
-            .expect("inserted blueprint");
-        datastore
-            .blueprint_target_set_current(
-                opctx,
-                BlueprintTarget {
-                    target_id: bp.id,
-                    enabled: true,
-                    time_made_target: Utc::now(),
-                },
-            )
-            .await
-            .expect("made blueprint the target");
-    }
-
     fn new_dataset_on(zpool_id: ZpoolUuid) -> Dataset {
         Dataset::new(
             DatasetUuid::new_v4(),
diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs
index d20f24e773d..3b19677fe89 100644
--- a/nexus/db-queries/src/db/datastore/mod.rs
+++ b/nexus/db-queries/src/db/datastore/mod.rs
@@ -93,6 +93,7 @@ mod sled;
 mod sled_instance;
 mod snapshot;
 mod ssh_key;
+mod support_bundle;
 mod switch;
 mod switch_interface;
 mod switch_port;
@@ -464,6 +465,8 @@ mod test {
     use nexus_db_fixed_data::silo::DEFAULT_SILO;
     use nexus_db_model::IpAttachState;
     use
nexus_db_model::{to_db_typed_uuid, Generation};
+    use nexus_types::deployment::Blueprint;
+    use nexus_types::deployment::BlueprintTarget;
     use nexus_types::external_api::params;
     use nexus_types::silo::DEFAULT_SILO_ID;
     use omicron_common::api::external::{
@@ -504,6 +507,33 @@ mod test {
         }
     }
 
+    /// Inserts a blueprint in the DB and forcibly makes it the current target.
+    ///
+    /// WARNING: This makes no attempt to validate the blueprint relative to
+    /// its parents -- this is just a test-only helper to make testing
+    /// blueprint-specific checks easier. Panics if either step fails.
+    pub async fn bp_insert_and_make_target(
+        opctx: &OpContext,
+        datastore: &DataStore,
+        bp: &Blueprint,
+    ) {
+        datastore
+            .blueprint_insert(opctx, bp)
+            .await
+            .expect("inserted blueprint");
+        datastore
+            .blueprint_target_set_current(
+                opctx,
+                BlueprintTarget {
+                    target_id: bp.id,
+                    enabled: true,
+                    time_made_target: Utc::now(),
+                },
+            )
+            .await
+            .expect("made blueprint the target");
+    }
+
     #[tokio::test]
     async fn test_project_creation() {
         let logctx = dev::test_setup_log("test_project_creation");
diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs
new file mode 100644
index 00000000000..d616b4356b7
--- /dev/null
+++ b/nexus/db-queries/src/db/datastore/support_bundle.rs
@@ -0,0 +1,1412 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! [`DataStore`] methods on [`SupportBundle`]s.
+
+use super::DataStore;
+use crate::authz;
+use crate::context::OpContext;
+use crate::db;
+use crate::db::error::public_error_from_diesel;
+use crate::db::error::ErrorHandler;
+use crate::db::model::Dataset;
+use crate::db::model::DatasetKind;
+use crate::db::model::SupportBundle;
+use crate::db::model::SupportBundleState;
+use crate::db::pagination::paginated;
+use crate::db::update_and_check::{UpdateAndCheck, UpdateStatus};
+use crate::transaction_retry::OptionalError;
+use async_bb8_diesel::AsyncRunQueryDsl;
+use diesel::prelude::*;
+use futures::FutureExt;
+use nexus_types::identity::Asset;
+use omicron_common::api::external;
+use omicron_common::api::external::CreateResult;
+use omicron_common::api::external::DataPageParams;
+use omicron_common::api::external::Error;
+use omicron_common::api::external::ListResultVec;
+use omicron_common::api::external::LookupResult;
+use omicron_uuid_kinds::BlueprintUuid;
+use omicron_uuid_kinds::GenericUuid;
+use omicron_uuid_kinds::OmicronZoneUuid;
+use omicron_uuid_kinds::SupportBundleUuid;
+use omicron_uuid_kinds::ZpoolUuid;
+use uuid::Uuid;
+
+const CANNOT_ALLOCATE_ERR_MSG: &str =
+"Current policy limits support bundle creation to 'one per external disk', and \
+ no disks are available. You must delete old support bundles before new ones \
+ can be created";
+
+const FAILURE_REASON_NO_DATASET: &str =
+    "Allocated dataset no longer exists";
+const FAILURE_REASON_NO_NEXUS: &str =
+    "Nexus managing this bundle no longer exists";
+
+/// Provides a report on how many bundles were expunged, and why.
+#[derive(Default, Debug, Clone, PartialEq)]
+pub struct SupportBundleExpungementReport {
+    /// Bundles marked "failed" because the datasets storing them have been
+    /// expunged.
+    pub bundles_failed_missing_datasets: usize,
+    /// Bundles already in the "destroying" state that have been deleted because
+    /// the datasets storing them have been expunged.
+    pub bundles_deleted_missing_datasets: usize,
+
+    /// Bundles marked "failing" because the nexuses managing them have been
+    /// expunged.
+    ///
+    /// These bundles should be re-assigned to a different nexus for cleanup.
+    pub bundles_failing_missing_nexus: usize,
+
+    /// Bundles which had a new Nexus assigned to them.
+    pub bundles_reassigned: usize,
+}
+
+impl DataStore {
+    /// Creates a new support bundle on a debug dataset with no bundle yet.
+    ///
+    /// Requires that the UUID of the calling Nexus be supplied as input -
+    /// this particular Zone is responsible for the collection process.
+    ///
+    /// Note that really any functioning Nexus would work as the "assignee",
+    /// but it's clear that our instance will work, because we're currently
+    /// running. Fails with "insufficient capacity" if no dataset is free.
+    pub async fn support_bundle_create(
+        &self,
+        opctx: &OpContext,
+        reason_for_creation: &'static str,
+        this_nexus_id: OmicronZoneUuid,
+    ) -> CreateResult<SupportBundle> {
+        opctx.authorize(authz::Action::Modify, &authz::FLEET).await?;
+        let conn = self.pool_connection_authorized(opctx).await?;
+
+        #[derive(Debug)]
+        enum SupportBundleError {
+            TooManyBundles,
+        }
+
+        let err = OptionalError::new();
+        self.transaction_retry_wrapper("support_bundle_create")
+            .transaction(&conn, |conn| {
+                let err = err.clone();
+
+                async move {
+                    use db::schema::dataset::dsl as dataset_dsl;
+                    use db::schema::support_bundle::dsl as support_bundle_dsl;
+
+                    // Observe all "non-deleted, debug datasets".
+                    //
+                    // Return the first one we find that doesn't already
+                    // have a support bundle allocated to it.
+                    let free_dataset = dataset_dsl::dataset
+                        .filter(dataset_dsl::time_deleted.is_null())
+                        .filter(dataset_dsl::kind.eq(DatasetKind::Debug))
+                        .left_join(support_bundle_dsl::support_bundle.on(
+                            dataset_dsl::id.eq(support_bundle_dsl::dataset_id),
+                        ))
+                        .filter(support_bundle_dsl::dataset_id.is_null())
+                        .select(Dataset::as_select())
+                        .first_async(&conn)
+                        .await
+                        .optional()?;
+
+                    let Some(dataset) = free_dataset else {
+                        return Err(
+                            err.bail(SupportBundleError::TooManyBundles)
+                        );
+                    };
+
+                    // We could check that "this_nexus_id" is not expunged, but
+                    // we have some evidence that it is valid: this Nexus is
+                    // currently running!
+                    //
+                    // Besides, we COULD be expunged immediately after inserting
+                    // the SupportBundle. In this case, we'd fall back to the
+                    // case of "clean up a bundle which is managed by an
+                    // expunged Nexus" anyway.
+
+                    let bundle = SupportBundle::new(
+                        reason_for_creation,
+                        ZpoolUuid::from_untyped_uuid(dataset.pool_id),
+                        dataset.id(),
+                        this_nexus_id,
+                    );
+
+                    diesel::insert_into(support_bundle_dsl::support_bundle)
+                        .values(bundle.clone())
+                        .execute_async(&conn)
+                        .await?;
+
+                    Ok(bundle)
+                }
+            })
+            .await
+            .map_err(|e| {
+                if let Some(err) = err.take() {
+                    match err {
+                        SupportBundleError::TooManyBundles => {
+                            return external::Error::insufficient_capacity(
+                                CANNOT_ALLOCATE_ERR_MSG,
+                                "Support Bundle storage exhausted",
+                            );
+                        }
+                    }
+                }
+                public_error_from_diesel(e, ErrorHandler::Server)
+            })
+    }
+
+    /// Looks up a single support bundle by its ID.
+    pub async fn support_bundle_get(
+        &self,
+        opctx: &OpContext,
+        id: SupportBundleUuid,
+    ) -> LookupResult<SupportBundle> {
+        opctx.authorize(authz::Action::Read, &authz::FLEET).await?;
+        use db::schema::support_bundle::dsl;
+
+        let conn = self.pool_connection_authorized(opctx).await?;
+        dsl::support_bundle
+            .filter(dsl::id.eq(id.into_untyped_uuid()))
+            .select(SupportBundle::as_select())
+            .first_async::<SupportBundle>(&*conn)
+            .await
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
+    }
+
+    /// Lists one page of support bundles, paginated by bundle ID.
+    pub async fn support_bundle_list(
+        &self,
+        opctx: &OpContext,
+        pagparams: &DataPageParams<'_, Uuid>,
+    ) -> ListResultVec<SupportBundle> {
+        opctx.authorize(authz::Action::Read, &authz::FLEET).await?;
+        use db::schema::support_bundle::dsl;
+
+        let conn = self.pool_connection_authorized(opctx).await?;
+        paginated(dsl::support_bundle, dsl::id, pagparams)
+            .select(SupportBundle::as_select())
+            .load_async(&*conn)
+            .await
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
+    }
+
+    /// Lists one page of support bundles in any of the given `states` that are
+    /// assigned to a particular Nexus, ordered by creation time.
+    pub async fn support_bundle_list_assigned_to_nexus(
+        &self,
+        opctx: &OpContext,
+        pagparams: &DataPageParams<'_, Uuid>,
+        nexus_id: OmicronZoneUuid,
+        states: Vec<SupportBundleState>,
+    ) -> ListResultVec<SupportBundle> {
+        opctx.authorize(authz::Action::Read, &authz::FLEET).await?;
+        use db::schema::support_bundle::dsl;
+
+        let conn = self.pool_connection_authorized(opctx).await?;
+        paginated(dsl::support_bundle, dsl::id, pagparams)
+            .filter(dsl::assigned_nexus.eq(nexus_id.into_untyped_uuid()))
+            .filter(dsl::state.eq_any(states))
+            .order(dsl::time_created.asc())
+            .select(SupportBundle::as_select())
+            .load_async(&*conn)
+            .await
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
+    }
+
+    /// Marks support bundles as failed if their assigned Nexus or backing
+    /// dataset has been destroyed.
+    pub async fn support_bundle_fail_expunged(
+        &self,
+        opctx: &OpContext,
+        blueprint: &nexus_types::deployment::Blueprint,
+    ) -> Result<SupportBundleExpungementReport, Error> {
+        opctx.authorize(authz::Action::Modify, &authz::FLEET).await?;
+
+        // For this blueprint: The set of all expunged Nexus zones
+        let invalid_nexus_zones = blueprint
+            .all_omicron_zones(
+                nexus_types::deployment::BlueprintZoneFilter::Expunged,
+            )
+            .filter_map(|(_sled, zone)| {
+                if matches!(
+                    zone.zone_type,
+                    nexus_types::deployment::BlueprintZoneType::Nexus(_)
+                ) {
+                    Some(zone.id.into_untyped_uuid())
+                } else {
+                    None
+                }
+            })
+            .collect::<Vec<_>>();
+        let valid_nexus_zones = blueprint
+            .all_omicron_zones(
+                nexus_types::deployment::BlueprintZoneFilter::ShouldBeRunning,
+            )
+            .filter_map(|(_sled, zone)| {
+                if matches!(
+                    zone.zone_type,
+                    nexus_types::deployment::BlueprintZoneType::Nexus(_)
+                ) {
+                    Some(zone.id.into_untyped_uuid())
+                } else {
+                    None
+                }
+            })
+            .collect::<Vec<_>>();
+
+        // For this blueprint: The set of expunged debug datasets
+        let invalid_datasets = blueprint
+            .all_omicron_datasets(
+                nexus_types::deployment::BlueprintDatasetFilter::Expunged,
+            )
+            .filter_map(|(_sled_id, dataset_config)| {
+                if matches!(
+                    dataset_config.kind,
+                    omicron_common::api::internal::shared::DatasetKind::Debug
+                ) {
+                    Some(dataset_config.id.into_untyped_uuid())
+                } else {
+                    None
+                }
+            })
+            .collect::<Vec<_>>();
+
+        let conn = self.pool_connection_authorized(opctx).await?;
+
+        self.transaction_if_current_blueprint_is(
+            &conn,
+            "support_bundle_fail_expunged",
+            opctx,
+            BlueprintUuid::from_untyped_uuid(blueprint.id),
+            |conn| {
+                let invalid_nexus_zones = invalid_nexus_zones.clone();
+                let valid_nexus_zones = valid_nexus_zones.clone();
+                let invalid_datasets = invalid_datasets.clone();
+                async move {
+                    use db::schema::support_bundle::dsl;
+
+                    // Find all bundles without backing storage.
+                    let bundles_with_bad_datasets = dsl::support_bundle
+                        .filter(dsl::dataset_id.eq_any(invalid_datasets))
+                        .select(SupportBundle::as_select())
+                        .load_async(conn)
+                        .await?;
+
+                    // Split these bundles into two categories:
+                    // - Ones that are being destroyed anyway, and that can be
+                    //   fully deleted.
+                    // - Ones that are NOT being destroyed, and should be marked
+                    //   failed so the end-user has visibility into their
+                    //   destruction.
+                    let (bundles_to_delete, bundles_to_fail): (Vec<_>, Vec<_>) =
+                        bundles_with_bad_datasets.into_iter().partition(
+                            |bundle| bundle.state == SupportBundleState::Destroying
+                        );
+                    let bundles_to_delete = bundles_to_delete.into_iter().map(|b| b.id).collect::<Vec<_>>();
+                    let bundles_to_fail = bundles_to_fail.into_iter().map(|b| b.id).collect::<Vec<_>>();
+
+                    // Find all non-destroying bundles on datasets that no
+                    // longer exist, and mark them "failed". They skip the
+                    // "failing" state because there is no remaining storage to
+                    // be cleaned up.
+                    let state = SupportBundleState::Failed;
+                    let bundles_failed_missing_datasets =
+                        diesel::update(dsl::support_bundle)
+                            .filter(dsl::state.eq_any(state.valid_old_states()))
+                            .filter(dsl::id.eq_any(bundles_to_fail))
+                            .set((
+                                dsl::state.eq(state),
+                                dsl::reason_for_failure.eq(FAILURE_REASON_NO_DATASET),
+                            ))
+                            .execute_async(conn)
+                            .await?;
+                    // For bundles that are in the process of being destroyed,
+                    // the dataset expungement speeds up the process.
+                    let bundles_deleted_missing_datasets =
+                        diesel::delete(dsl::support_bundle)
+                            .filter(dsl::id.eq_any(bundles_to_delete))
+                            // This check should be redundant (we already
+                            // partitioned above based on this state) but out of
+                            // an abundance of caution we don't auto-delete a
+                            // bundle in any other state.
+                            .filter(dsl::state.eq(SupportBundleState::Destroying))
+                            .execute_async(conn)
+                            .await?;
+
+                    let Some(arbitrary_valid_nexus) =
+                        valid_nexus_zones.first().cloned()
+                    else {
+                        return Err(external::Error::internal_error(
+                            "No valid Nexuses, we cannot re-assign this support bundle",
+                        )
+                        .into());
+                    };
+
+                    // Find all bundles on nexuses that no longer exist.
+                    let bundles_with_bad_nexuses = dsl::support_bundle
+                        .filter(dsl::assigned_nexus.eq_any(invalid_nexus_zones))
+                        .select(SupportBundle::as_select())
+                        .load_async(conn)
+                        .await?;
+
+                    let bundles_to_mark_failing = bundles_with_bad_nexuses.iter()
+                        .map(|b| b.id).collect::<Vec<_>>();
+                    let bundles_to_reassign = bundles_with_bad_nexuses.iter()
+                        .filter_map(|bundle| {
+                            if bundle.state != SupportBundleState::Failed {
+                                Some(bundle.id)
+                            } else {
+                                None
+                            }
+                        }).collect::<Vec<_>>();
+
+                    // Mark these support bundles as failing, and assign them
+                    // to a nexus that should still exist.
+                    //
+                    // This should lead to their storage being freed, if it
+                    // exists.
+                    let state = SupportBundleState::Failing;
+                    let bundles_failing_missing_nexus = diesel::update(dsl::support_bundle)
+                        .filter(dsl::state.eq_any(state.valid_old_states()))
+                        .filter(dsl::id.eq_any(bundles_to_mark_failing))
+                        .set((
+                            dsl::state.eq(state),
+                            dsl::reason_for_failure.eq(FAILURE_REASON_NO_NEXUS),
+                        ))
+                        .execute_async(conn)
+                        .await?;
+                    let bundles_reassigned = diesel::update(dsl::support_bundle)
+                        .filter(dsl::id.eq_any(bundles_to_reassign))
+                        .set(dsl::assigned_nexus.eq(arbitrary_valid_nexus))
+                        .execute_async(conn)
+                        .await?;
+
+                    Ok(SupportBundleExpungementReport {
+                        bundles_failed_missing_datasets,
+                        bundles_deleted_missing_datasets,
+                        bundles_failing_missing_nexus,
+                        bundles_reassigned,
+                    })
+                }
+                .boxed()
+            },
+        )
+        .await
+    }
+
+    /// Updates the state of a support bundle.
+    ///
+    /// Returns:
+    /// - "Ok" if the bundle was updated successfully.
+    /// - "Err::InvalidRequest" if the bundle exists, but could not be updated
+    ///   because the state transition is invalid.
+    pub async fn support_bundle_update(
+        &self,
+        opctx: &OpContext,
+        id: SupportBundleUuid,
+        state: SupportBundleState,
+    ) -> Result<(), Error> {
+        opctx.authorize(authz::Action::Modify, &authz::FLEET).await?;
+
+        use db::schema::support_bundle::dsl;
+        let conn = self.pool_connection_authorized(opctx).await?;
+        let result = diesel::update(dsl::support_bundle)
+            .filter(dsl::id.eq(id.into_untyped_uuid()))
+            .filter(dsl::state.eq_any(state.valid_old_states()))
+            .set(dsl::state.eq(state))
+            .check_if_exists::<SupportBundle>(id.into_untyped_uuid())
+            .execute_and_check(&conn)
+            .await
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
+
+        match result.status {
+            UpdateStatus::Updated => Ok(()),
+            UpdateStatus::NotUpdatedButExists => {
+                Err(Error::invalid_request(format!(
+                    "Cannot update support bundle state from {:?} to {:?}",
+                    result.found.state, state
+                )))
+            }
+        }
+    }
+
+    /// Deletes a support bundle that is "destroying" or "failed".
+    ///
+    /// This should only be invoked after all storage for the support bundle has
+    /// been cleared. Bundles in any other state are left untouched.
+    pub async fn support_bundle_delete(
+        &self,
+        opctx: &OpContext,
+        id: SupportBundleUuid,
+    ) -> Result<(), Error> {
+        opctx.authorize(authz::Action::Modify, &authz::FLEET).await?;
+
+        use db::schema::support_bundle::dsl;
+
+        let conn = self.pool_connection_authorized(opctx).await?;
+        diesel::delete(dsl::support_bundle)
+            .filter(
+                dsl::state
+                    .eq(SupportBundleState::Destroying)
+                    .or(dsl::state.eq(SupportBundleState::Failed)),
+            )
+            .filter(dsl::id.eq(id.into_untyped_uuid()))
+            .execute_async(&*conn)
+            .await
+            .map(|_rows_modified| ())
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
+
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use crate::db::datastore::test::bp_insert_and_make_target;
+    use crate::db::pub_test_utils::TestDatabase;
+    use nexus_db_model::Generation;
+    use nexus_db_model::SledBaseboard;
+    use nexus_db_model::SledSystemHardware;
+    use nexus_db_model::SledUpdate;
+    use nexus_db_model::Zpool;
+    use nexus_reconfigurator_planning::example::ExampleSystemBuilder;
+    use nexus_reconfigurator_planning::example::SimRngState;
+    use nexus_types::deployment::Blueprint;
+    use nexus_types::deployment::BlueprintDatasetDisposition;
+    use nexus_types::deployment::BlueprintDatasetFilter;
+    use nexus_types::deployment::BlueprintZoneDisposition;
+    use nexus_types::deployment::BlueprintZoneFilter;
+    use nexus_types::deployment::BlueprintZoneType;
+    use omicron_common::api::internal::shared::DatasetKind::Debug as DebugDatasetKind;
+    use omicron_test_utils::dev;
+    use omicron_uuid_kinds::DatasetUuid;
+    use omicron_uuid_kinds::PhysicalDiskUuid;
+    use omicron_uuid_kinds::SledUuid;
+    use rand::Rng;
+
+    // Pool/Dataset pairs, for debug datasets only.
+    struct TestPool {
+        pool: ZpoolUuid,
+        dataset: DatasetUuid,
+    }
+
+    // Sleds and their pools, with a focus on debug datasets only.
+ struct TestSled { + sled: SledUuid, + pools: Vec, + } + + impl TestSled { + fn new_with_pool_count(pool_count: usize) -> Self { + Self { + sled: SledUuid::new_v4(), + pools: (0..pool_count) + .map(|_| TestPool { + pool: ZpoolUuid::new_v4(), + dataset: DatasetUuid::new_v4(), + }) + .collect(), + } + } + + fn new_from_blueprint(blueprint: &Blueprint) -> Vec { + let mut sleds = vec![]; + for (sled, datasets) in &blueprint.blueprint_datasets { + let pools = datasets + .datasets + .values() + .filter_map(|dataset| { + if !matches!(dataset.kind, DebugDatasetKind) + || !dataset + .disposition + .matches(BlueprintDatasetFilter::InService) + { + return None; + }; + + Some(TestPool { + pool: dataset.pool.id(), + dataset: dataset.id, + }) + }) + .collect(); + + sleds.push(TestSled { sled: *sled, pools }); + } + sleds + } + + async fn create_database_records( + &self, + datastore: &DataStore, + opctx: &OpContext, + ) { + let rack_id = Uuid::new_v4(); + let sled = SledUpdate::new( + *self.sled.as_untyped_uuid(), + "[::1]:0".parse().unwrap(), + SledBaseboard { + serial_number: format!( + "test-{}", + rand::thread_rng().gen::() + ), + part_number: "test-pn".to_string(), + revision: 0, + }, + SledSystemHardware { + is_scrimlet: false, + usable_hardware_threads: 128, + usable_physical_ram: (64 << 30).try_into().unwrap(), + reservoir_size: (16 << 30).try_into().unwrap(), + }, + rack_id, + Generation::new(), + ); + datastore.sled_upsert(sled).await.expect("failed to upsert sled"); + + // Create fake zpools that back our fake datasets. 
+ for pool in &self.pools { + let zpool = Zpool::new( + *pool.pool.as_untyped_uuid(), + *self.sled.as_untyped_uuid(), + PhysicalDiskUuid::new_v4(), + ); + datastore + .zpool_insert(opctx, zpool) + .await + .expect("failed to upsert zpool"); + + let dataset = Dataset::new( + pool.dataset, + pool.pool.into_untyped_uuid(), + None, + DebugDatasetKind, + ); + datastore + .dataset_upsert(dataset) + .await + .expect("failed to upsert dataset"); + } + } + } + + // Creates a fake sled with `pool_count` zpools, and a debug dataset on each + // zpool. + async fn create_sled_and_zpools( + datastore: &DataStore, + opctx: &OpContext, + pool_count: usize, + ) -> TestSled { + let sled = TestSled::new_with_pool_count(pool_count); + sled.create_database_records(&datastore, &opctx).await; + sled + } + + async fn support_bundle_create_expect_no_capacity( + datastore: &DataStore, + opctx: &OpContext, + this_nexus_id: OmicronZoneUuid, + ) { + let err = datastore + .support_bundle_create(&opctx, "for tests", this_nexus_id) + .await + .expect_err("Shouldn't provision bundle without datasets"); + let Error::InsufficientCapacity { message } = err else { + panic!("Unexpected error: {err:?} - we expected 'InsufficientCapacity'"); + }; + assert_eq!( + CANNOT_ALLOCATE_ERR_MSG, + message.external_message(), + "Unexpected error: {message:?}" + ); + } + + #[tokio::test] + async fn test_bundle_list_filtering() { + let logctx = dev::test_setup_log("test_bundle_create_capacity_limits"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let nexus_a = OmicronZoneUuid::new_v4(); + let nexus_b = OmicronZoneUuid::new_v4(); + + let _test_sled = create_sled_and_zpools(&datastore, &opctx, 5).await; + + let pagparams = DataPageParams::max_page(); + + // No bundles exist yet, so the list should be empty + + assert_eq!( + datastore + .support_bundle_list_assigned_to_nexus( + &opctx, + &pagparams, + nexus_a, + 
vec![SupportBundleState::Collecting] + ) + .await + .expect("Should always be able to list bundles"), + vec![] + ); + + // Create two bundles on "nexus A", one bundle on "nexus B" + + let bundle_a1 = datastore + .support_bundle_create(&opctx, "for the test", nexus_a) + .await + .expect("Should be able to create bundle"); + let bundle_a2 = datastore + .support_bundle_create(&opctx, "for the test", nexus_a) + .await + .expect("Should be able to create bundle"); + let bundle_b1 = datastore + .support_bundle_create(&opctx, "for the test", nexus_b) + .await + .expect("Should be able to create bundle"); + + assert_eq!( + datastore + .support_bundle_list_assigned_to_nexus( + &opctx, + &pagparams, + nexus_a, + vec![SupportBundleState::Collecting] + ) + .await + .expect("Should always be able to list bundles") + .iter() + .map(|b| b.id) + .collect::>(), + vec![bundle_a1.id, bundle_a2.id,] + ); + assert_eq!( + datastore + .support_bundle_list_assigned_to_nexus( + &opctx, + &pagparams, + nexus_b, + vec![SupportBundleState::Collecting] + ) + .await + .expect("Should always be able to list bundles") + .iter() + .map(|b| b.id) + .collect::>(), + vec![bundle_b1.id,] + ); + + // When we update the state of the bundles, the list results + // should also be filtered. + datastore + .support_bundle_update( + &opctx, + bundle_a1.id.into(), + SupportBundleState::Active, + ) + .await + .expect("Should have been able to update state"); + + // "bundle_a1" is no longer collecting, so it won't appear here. + assert_eq!( + datastore + .support_bundle_list_assigned_to_nexus( + &opctx, + &pagparams, + nexus_a, + vec![SupportBundleState::Collecting] + ) + .await + .expect("Should always be able to list bundles") + .iter() + .map(|b| b.id) + .collect::>(), + vec![bundle_a2.id,] + ); + + // ... 
but if we ask for enough states, it'll show up + assert_eq!( + datastore + .support_bundle_list_assigned_to_nexus( + &opctx, + &pagparams, + nexus_a, + vec![ + SupportBundleState::Active, + SupportBundleState::Collecting + ] + ) + .await + .expect("Should always be able to list bundles") + .iter() + .map(|b| b.id) + .collect::>(), + vec![bundle_a1.id, bundle_a2.id,] + ); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_bundle_create_capacity_limits() { + let logctx = dev::test_setup_log("test_bundle_create_capacity_limits"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let this_nexus_id = OmicronZoneUuid::new_v4(); + + // No sleds, no datasets. Allocation should fail. + + support_bundle_create_expect_no_capacity( + &datastore, + &opctx, + this_nexus_id, + ) + .await; + + // Create a sled with a couple pools. Allocation should succeed. + + const POOL_COUNT: usize = 2; + let _test_sled = + create_sled_and_zpools(&datastore, &opctx, POOL_COUNT).await; + let mut bundles = vec![]; + for _ in 0..POOL_COUNT { + bundles.push( + datastore + .support_bundle_create( + &opctx, + "for the test", + this_nexus_id, + ) + .await + .expect("Should be able to create bundle"), + ); + } + + // If we try to allocate any more bundles, we'll run out of capacity. + + support_bundle_create_expect_no_capacity( + &datastore, + &opctx, + this_nexus_id, + ) + .await; + + // If we destroy a bundle, it isn't deleted (yet). + // This operation should signify that we can start to free up + // storage on the dataset, but that needs to happen outside the + // database. + // + // We should still expect to hit capacity limits. 
+ + datastore + .support_bundle_update( + &opctx, + bundles[0].id.into(), + SupportBundleState::Destroying, + ) + .await + .expect("Should be able to destroy this bundle"); + support_bundle_create_expect_no_capacity( + &datastore, + &opctx, + this_nexus_id, + ) + .await; + + // If we delete a bundle, it should be gone. This means we can + // re-allocate from that dataset which was just freed up. + + datastore + .support_bundle_delete(&opctx, bundles[0].id.into()) + .await + .expect("Should be able to destroy this bundle"); + datastore + .support_bundle_create(&opctx, "for the test", this_nexus_id) + .await + .expect("Should be able to create bundle"); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_crud_operations() { + let logctx = dev::test_setup_log("test_crud_operations"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let test_sled = create_sled_and_zpools(&datastore, &opctx, 1).await; + let reason = "Bundle for test"; + let this_nexus_id = OmicronZoneUuid::new_v4(); + + // Create the bundle, then observe it through the "getter" APIs + + let mut bundle = datastore + .support_bundle_create(&opctx, reason, this_nexus_id) + .await + .expect("Should be able to create bundle"); + assert_eq!(bundle.reason_for_creation, reason); + assert_eq!(bundle.reason_for_failure, None); + assert_eq!(bundle.assigned_nexus, Some(this_nexus_id.into())); + assert_eq!(bundle.state, SupportBundleState::Collecting); + assert_eq!(bundle.zpool_id, test_sled.pools[0].pool.into()); + assert_eq!(bundle.dataset_id, test_sled.pools[0].dataset.into()); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just created"); + // Overwrite this column; it is modified slightly upon database insertion. 
+ bundle.time_created = observed_bundle.time_created; + assert_eq!(bundle, observed_bundle); + + let pagparams = DataPageParams::max_page(); + let observed_bundles = datastore + .support_bundle_list(&opctx, &pagparams) + .await + .expect("Should be able to get bundle we just created"); + assert_eq!(1, observed_bundles.len()); + assert_eq!(bundle, observed_bundles[0]); + + // Destroy the bundle, observe the new state + + datastore + .support_bundle_update( + &opctx, + bundle.id.into(), + SupportBundleState::Destroying, + ) + .await + .expect("Should be able to destroy our bundle"); + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just created"); + assert_eq!(SupportBundleState::Destroying, observed_bundle.state); + + // Delete the bundle, observe that it's gone + + datastore + .support_bundle_delete(&opctx, bundle.id.into()) + .await + .expect("Should be able to destroy our bundle"); + let observed_bundles = datastore + .support_bundle_list(&opctx, &pagparams) + .await + .expect("Should be able to query when no bundles exist"); + assert!(observed_bundles.is_empty()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + fn get_nexuses_from_blueprint( + bp: &Blueprint, + filter: BlueprintZoneFilter, + ) -> Vec { + bp.blueprint_zones + .values() + .flat_map(|zones_config| { + let mut nexus_zones = vec![]; + for zone in &zones_config.zones { + if matches!(zone.zone_type, BlueprintZoneType::Nexus(_)) + && zone.disposition.matches(filter) + { + nexus_zones.push(zone.id); + } + } + nexus_zones + }) + .collect() + } + + fn get_debug_datasets_from_blueprint( + bp: &Blueprint, + filter: BlueprintDatasetFilter, + ) -> Vec { + bp.blueprint_datasets + .values() + .flat_map(|datasets_config| { + let mut debug_datasets = vec![]; + for dataset in datasets_config.datasets.values() { + if matches!(dataset.kind, DebugDatasetKind) + && dataset.disposition.matches(filter) + { + 
debug_datasets.push(dataset.id); + } + } + debug_datasets + }) + .collect() + } + + fn expunge_dataset_for_bundle(bp: &mut Blueprint, bundle: &SupportBundle) { + for datasets in bp.blueprint_datasets.values_mut() { + for dataset in datasets.datasets.values_mut() { + if dataset.id == bundle.dataset_id.into() { + dataset.disposition = BlueprintDatasetDisposition::Expunged; + } + } + } + } + + fn expunge_nexus_for_bundle(bp: &mut Blueprint, bundle: &SupportBundle) { + for zones in bp.blueprint_zones.values_mut() { + for zone in &mut zones.zones { + if zone.id == bundle.assigned_nexus.unwrap().into() { + zone.disposition = BlueprintZoneDisposition::Expunged; + } + } + } + } + + #[tokio::test] + async fn test_bundle_failed_from_expunged_dataset() { + static TEST_NAME: &str = "test_bundle_failed_from_expunged_dataset"; + let logctx = dev::test_setup_log(TEST_NAME); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let mut rng = SimRngState::from_seed(TEST_NAME); + let (_example, mut bp1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .build(); + + // Weirdly, the "ExampleSystemBuilder" blueprint has a parent blueprint, + // but which isn't exposed through the API. Since we're only able to see + // the blueprint it emits, that means we can't actually make it the + // target because "the parent blueprint is not the current target". + // + // Instead of dealing with that, we lie: claim this is the primordial + // blueprint, with no parent. + // + // Regardless, make this starter blueprint our target. + bp1.parent_blueprint_id = None; + bp_insert_and_make_target(&opctx, &datastore, &bp1).await; + + // Manually perform the equivalent of blueprint execution to populate + // database records. 
+ let sleds = TestSled::new_from_blueprint(&bp1); + for sled in &sleds { + sled.create_database_records(&datastore, &opctx).await; + } + + // Extract Nexus and Dataset information from the generated blueprint. + let this_nexus_id = get_nexuses_from_blueprint( + &bp1, + BlueprintZoneFilter::ShouldBeRunning, + ) + .get(0) + .map(|id| *id) + .expect("There should be a Nexus in the example blueprint"); + let debug_datasets = get_debug_datasets_from_blueprint( + &bp1, + BlueprintDatasetFilter::InService, + ); + assert!(!debug_datasets.is_empty()); + + // When we create a bundle, it should exist on a dataset provisioned by + // the blueprint. + let bundle = datastore + .support_bundle_create(&opctx, "for the test", this_nexus_id) + .await + .expect("Should be able to create bundle"); + assert_eq!(bundle.assigned_nexus, Some(this_nexus_id.into())); + assert!( + debug_datasets.contains(&DatasetUuid::from(bundle.dataset_id)), + "Bundle should have been allocated from a blueprint dataset" + ); + + // If we try to "fail support bundles" from expunged datasets/nexuses, + // we should see a no-op. Nothing has been expunged yet! 
+ let report = + datastore.support_bundle_fail_expunged(&opctx, &bp1).await.expect( + "Should have been able to perform no-op support bundle failure", + ); + assert_eq!(SupportBundleExpungementReport::default(), report); + + // Expunge the bundle's dataset (manually) + let bp2 = { + let mut bp2 = bp1.clone(); + bp2.id = Uuid::new_v4(); + bp2.parent_blueprint_id = Some(bp1.id); + expunge_dataset_for_bundle(&mut bp2, &bundle); + bp2 + }; + bp_insert_and_make_target(&opctx, &datastore, &bp2).await; + + datastore + .support_bundle_fail_expunged(&opctx, &bp1) + .await + .expect_err("bp1 is no longer the target; this should fail"); + let report = datastore + .support_bundle_fail_expunged(&opctx, &bp2) + .await + .expect("Should have been able to mark bundle state as failed"); + assert_eq!( + SupportBundleExpungementReport { + bundles_failed_missing_datasets: 1, + ..Default::default() + }, + report + ); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just failed"); + assert_eq!(SupportBundleState::Failed, observed_bundle.state); + assert!(observed_bundle + .reason_for_failure + .unwrap() + .contains(FAILURE_REASON_NO_DATASET)); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_bundle_deleted_from_expunged_dataset() { + static TEST_NAME: &str = "test_bundle_deleted_from_expunged_dataset"; + let logctx = dev::test_setup_log(TEST_NAME); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let mut rng = SimRngState::from_seed(TEST_NAME); + let (_example, mut bp1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .build(); + + // Weirdly, the "ExampleSystemBuilder" blueprint has a parent blueprint, + // but which isn't exposed through the API. 
Since we're only able to see + // the blueprint it emits, that means we can't actually make it the + // target because "the parent blueprint is not the current target". + // + // Instead of dealing with that, we lie: claim this is the primordial + // blueprint, with no parent. + // + // Regardless, make this starter blueprint our target. + bp1.parent_blueprint_id = None; + bp_insert_and_make_target(&opctx, &datastore, &bp1).await; + + // Manually perform the equivalent of blueprint execution to populate + // database records. + let sleds = TestSled::new_from_blueprint(&bp1); + for sled in &sleds { + sled.create_database_records(&datastore, &opctx).await; + } + + // Extract Nexus and Dataset information from the generated blueprint. + let this_nexus_id = get_nexuses_from_blueprint( + &bp1, + BlueprintZoneFilter::ShouldBeRunning, + ) + .get(0) + .map(|id| *id) + .expect("There should be a Nexus in the example blueprint"); + let debug_datasets = get_debug_datasets_from_blueprint( + &bp1, + BlueprintDatasetFilter::InService, + ); + assert!(!debug_datasets.is_empty()); + + // When we create a bundle, it should exist on a dataset provisioned by + // the blueprint. + let bundle = datastore + .support_bundle_create(&opctx, "for the test", this_nexus_id) + .await + .expect("Should be able to create bundle"); + assert_eq!(bundle.assigned_nexus, Some(this_nexus_id.into())); + assert!( + debug_datasets.contains(&DatasetUuid::from(bundle.dataset_id)), + "Bundle should have been allocated from a blueprint dataset" + ); + + // Start the deletion of this bundle + datastore + .support_bundle_update( + &opctx, + bundle.id.into(), + SupportBundleState::Destroying, + ) + .await + .expect("Should have been able to update state"); + + // If we try to "fail support bundles" from expunged datasets/nexuses, + // we should see a no-op. Nothing has been expunged yet! 
+ let report = + datastore.support_bundle_fail_expunged(&opctx, &bp1).await.expect( + "Should have been able to perform no-op support bundle failure", + ); + assert_eq!(SupportBundleExpungementReport::default(), report); + + // Expunge the bundle's dataset (manually) + let bp2 = { + let mut bp2 = bp1.clone(); + bp2.id = Uuid::new_v4(); + bp2.parent_blueprint_id = Some(bp1.id); + expunge_dataset_for_bundle(&mut bp2, &bundle); + bp2 + }; + bp_insert_and_make_target(&opctx, &datastore, &bp2).await; + + datastore + .support_bundle_fail_expunged(&opctx, &bp1) + .await + .expect_err("bp1 is no longer the target; this should fail"); + let report = datastore + .support_bundle_fail_expunged(&opctx, &bp2) + .await + .expect("Should have been able to mark bundle state as failed"); + assert_eq!( + SupportBundleExpungementReport { + bundles_deleted_missing_datasets: 1, + ..Default::default() + }, + report + ); + + // Should observe no bundles (it should have been deleted) + let pagparams = DataPageParams::max_page(); + let observed_bundles = datastore + .support_bundle_list(&opctx, &pagparams) + .await + .expect("Should be able to query when no bundles exist"); + assert!(observed_bundles.is_empty()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_bundle_failed_from_expunged_nexus_no_reassign() { + static TEST_NAME: &str = + "test_bundle_failed_from_expunged_nexus_no_reassign"; + let logctx = dev::test_setup_log(TEST_NAME); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let mut rng = SimRngState::from_seed(TEST_NAME); + let (_example, mut bp1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .build(); + + bp1.parent_blueprint_id = None; + bp_insert_and_make_target(&opctx, &datastore, &bp1).await; + + // Manually perform the equivalent of blueprint execution to populate + // database records. 
+ let sleds = TestSled::new_from_blueprint(&bp1); + for sled in &sleds { + sled.create_database_records(&datastore, &opctx).await; + } + + // Extract Nexus and Dataset information from the generated blueprint. + let nexus_ids = get_nexuses_from_blueprint( + &bp1, + BlueprintZoneFilter::ShouldBeRunning, + ); + let debug_datasets = get_debug_datasets_from_blueprint( + &bp1, + BlueprintDatasetFilter::InService, + ); + assert!(!debug_datasets.is_empty()); + + // When we create a bundle, it should exist on a dataset provisioned by + // the blueprint. + let bundle = datastore + .support_bundle_create(&opctx, "for the test", nexus_ids[0]) + .await + .expect("Should be able to create bundle"); + + assert_eq!(bundle.state, SupportBundleState::Collecting); + assert_eq!(bundle.assigned_nexus, Some(nexus_ids[0].into())); + assert!( + debug_datasets.contains(&DatasetUuid::from(bundle.dataset_id)), + "Bundle should have been allocated from a blueprint dataset" + ); + + // Expunge the bundle's dataset. This marks it as "failed", and + // is a prerequisite for the bundle not later being re-assigned. 
+ let bp2 = { + let mut bp2 = bp1.clone(); + bp2.id = Uuid::new_v4(); + bp2.parent_blueprint_id = Some(bp1.id); + expunge_dataset_for_bundle(&mut bp2, &bundle); + bp2 + }; + bp_insert_and_make_target(&opctx, &datastore, &bp2).await; + + let report = datastore + .support_bundle_fail_expunged(&opctx, &bp2) + .await + .expect("Should have been able to mark bundle state as failed"); + assert_eq!( + SupportBundleExpungementReport { + bundles_failed_missing_datasets: 1, + ..Default::default() + }, + report + ); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just failed"); + assert_eq!(SupportBundleState::Failed, observed_bundle.state); + assert!(observed_bundle + .reason_for_failure + .unwrap() + .contains(FAILURE_REASON_NO_DATASET)); + + // Expunge the bundle's Nexus + let bp3 = { + let mut bp3 = bp2.clone(); + bp3.id = Uuid::new_v4(); + bp3.parent_blueprint_id = Some(bp2.id); + expunge_nexus_for_bundle(&mut bp3, &bundle); + bp3 + }; + bp_insert_and_make_target(&opctx, &datastore, &bp3).await; + + let report = datastore + .support_bundle_fail_expunged(&opctx, &bp3) + .await + .expect("Should have been able to mark bundle state as failed"); + + // Although the record for this bundle already exists, it is not + // re-assigned, and the original reason for it failing (dataset loss) is + // preserved. 
+ assert_eq!(SupportBundleExpungementReport::default(), report); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just failed"); + assert_eq!(SupportBundleState::Failed, observed_bundle.state); + assert!(observed_bundle + .reason_for_failure + .unwrap() + .contains(FAILURE_REASON_NO_DATASET)); + + datastore + .support_bundle_delete(&opctx, bundle.id.into()) + .await + .expect("Should have been able to delete support bundle"); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_bundle_failed_from_expunged_nexus_with_reassign() { + static TEST_NAME: &str = + "test_bundle_failed_from_expunged_nexus_with_reassign"; + let logctx = dev::test_setup_log(TEST_NAME); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let mut rng = SimRngState::from_seed(TEST_NAME); + let (_example, mut bp1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .build(); + + bp1.parent_blueprint_id = None; + bp_insert_and_make_target(&opctx, &datastore, &bp1).await; + + // Manually perform the equivalent of blueprint execution to populate + // database records. + let sleds = TestSled::new_from_blueprint(&bp1); + for sled in &sleds { + sled.create_database_records(&datastore, &opctx).await; + } + + // Extract Nexus and Dataset information from the generated blueprint. + let nexus_ids = get_nexuses_from_blueprint( + &bp1, + BlueprintZoneFilter::ShouldBeRunning, + ); + let debug_datasets = get_debug_datasets_from_blueprint( + &bp1, + BlueprintDatasetFilter::InService, + ); + assert!(!debug_datasets.is_empty()); + + // When we create a bundle, it should exist on a dataset provisioned by + // the blueprint. 
+ let bundle = datastore + .support_bundle_create(&opctx, "for the test", nexus_ids[0]) + .await + .expect("Should be able to create bundle"); + + assert_eq!(bundle.state, SupportBundleState::Collecting); + assert_eq!(bundle.assigned_nexus, Some(nexus_ids[0].into())); + assert!( + debug_datasets.contains(&DatasetUuid::from(bundle.dataset_id)), + "Bundle should have been allocated from a blueprint dataset" + ); + + // Update the bundle's state. + // + // This is what we would do when we finish collecting, and + // provisioned storage on a sled. + datastore + .support_bundle_update( + &opctx, + bundle.id.into(), + SupportBundleState::Active, + ) + .await + .expect("Should have been able to update state"); + + // Expunge the bundle's Nexus (manually) + let bp2 = { + let mut bp2 = bp1.clone(); + bp2.id = Uuid::new_v4(); + bp2.parent_blueprint_id = Some(bp1.id); + expunge_nexus_for_bundle(&mut bp2, &bundle); + bp2 + }; + bp_insert_and_make_target(&opctx, &datastore, &bp2).await; + + let report = datastore + .support_bundle_fail_expunged(&opctx, &bp2) + .await + .expect("Should have been able to mark bundle state as destroying"); + + assert_eq!( + SupportBundleExpungementReport { + bundles_failing_missing_nexus: 1, + bundles_reassigned: 1, + ..Default::default() + }, + report + ); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just failed"); + assert_eq!(SupportBundleState::Failing, observed_bundle.state); + assert!(observed_bundle + .reason_for_failure + .unwrap() + .contains(FAILURE_REASON_NO_NEXUS)); + + db.terminate().await; + logctx.cleanup_successful(); + } +} diff --git a/nexus/external-api/output/nexus_tags.txt b/nexus/external-api/output/nexus_tags.txt index a979a9804ba..4fc92b18d8a 100644 --- a/nexus/external-api/output/nexus_tags.txt +++ b/nexus/external-api/output/nexus_tags.txt @@ -30,6 +30,15 @@ probe_create POST /experimental/v1/probes probe_delete DELETE 
/experimental/v1/probes/{probe} probe_list GET /experimental/v1/probes probe_view GET /experimental/v1/probes/{probe} +support_bundle_create POST /experimental/v1/system/support-bundles +support_bundle_delete DELETE /experimental/v1/system/support-bundles/{support_bundle} +support_bundle_download GET /experimental/v1/system/support-bundles/{support_bundle}/download +support_bundle_download_file GET /experimental/v1/system/support-bundles/{support_bundle}/download/{file} +support_bundle_head HEAD /experimental/v1/system/support-bundles/{support_bundle}/download +support_bundle_head_file HEAD /experimental/v1/system/support-bundles/{support_bundle}/download/{file} +support_bundle_index GET /experimental/v1/system/support-bundles/{support_bundle}/index +support_bundle_list GET /experimental/v1/system/support-bundles +support_bundle_view GET /experimental/v1/system/support-bundles/{support_bundle} timeseries_query POST /v1/timeseries/query API operations found with tag "images" diff --git a/nexus/external-api/src/lib.rs b/nexus/external-api/src/lib.rs index e2b53a7e6fd..54ba3ab34bc 100644 --- a/nexus/external-api/src/lib.rs +++ b/nexus/external-api/src/lib.rs @@ -2780,6 +2780,109 @@ pub trait NexusExternalApi { path_params: Path, ) -> Result; + // Support bundles (experimental) + + /// List all support bundles + #[endpoint { + method = GET, + path = "/experimental/v1/system/support-bundles", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_list( + rqctx: RequestContext, + query_params: Query, + ) -> Result>, HttpError>; + + /// View a support bundle + #[endpoint { + method = GET, + path = "/experimental/v1/system/support-bundles/{support_bundle}", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_view( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Download the index of a support bundle + #[endpoint { + method = GET, + path = 
"/experimental/v1/system/support-bundles/{support_bundle}/index", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_index( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Download the contents of a support bundle + #[endpoint { + method = GET, + path = "/experimental/v1/system/support-bundles/{support_bundle}/download", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_download( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Download a file within a support bundle + #[endpoint { + method = GET, + path = "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_download_file( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Download the metadata of a support bundle + #[endpoint { + method = HEAD, + path = "/experimental/v1/system/support-bundles/{support_bundle}/download", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_head( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Download the metadata of a file within the support bundle + #[endpoint { + method = HEAD, + path = "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_head_file( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Create a new support bundle + #[endpoint { + method = POST, + path = "/experimental/v1/system/support-bundles", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_create( + rqctx: RequestContext, + ) -> Result, HttpError>; + + /// Delete an existing support bundle + /// + /// 
May also be used to cancel a support bundle which is currently being + /// collected, or to remove metadata for a support bundle that has failed. + #[endpoint { + method = DELETE, + path = "/experimental/v1/system/support-bundles/{support_bundle}", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_delete( + rqctx: RequestContext, + path_params: Path, + ) -> Result; + // Probes (experimental) /// List instrumentation probes diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 740895b7e44..cfc9f99851a 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -6026,6 +6026,213 @@ impl NexusExternalApi for NexusExternalApiImpl { .await } + async fn support_bundle_list( + rqctx: RequestContext, + _query_params: Query, + ) -> Result>, HttpError> + { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_view( + rqctx: RequestContext, + _path_params: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_index( + rqctx: RequestContext, + _path_params: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + 
crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_download( + rqctx: RequestContext, + _path_params: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_download_file( + rqctx: RequestContext, + _path_params: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_head( + rqctx: RequestContext, + _path_params: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_head_file( + rqctx: RequestContext, + _path_params: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + 
+ Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_create( + rqctx: RequestContext, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_delete( + rqctx: RequestContext, + _path_params: Path, + ) -> Result { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + async fn probe_list( rqctx: RequestContext, query_params: Query>, diff --git a/nexus/tests/output/uncovered-authz-endpoints.txt b/nexus/tests/output/uncovered-authz-endpoints.txt index c5091c5a3bc..0a9e62707f0 100644 --- a/nexus/tests/output/uncovered-authz-endpoints.txt +++ b/nexus/tests/output/uncovered-authz-endpoints.txt @@ -1,13 +1,22 @@ API endpoints with no coverage in authz tests: probe_delete (delete "/experimental/v1/probes/{probe}") +support_bundle_delete (delete "/experimental/v1/system/support-bundles/{support_bundle}") probe_list (get "/experimental/v1/probes") probe_view (get "/experimental/v1/probes/{probe}") +support_bundle_list (get "/experimental/v1/system/support-bundles") +support_bundle_view (get "/experimental/v1/system/support-bundles/{support_bundle}") +support_bundle_download (get 
"/experimental/v1/system/support-bundles/{support_bundle}/download") +support_bundle_download_file (get "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}") +support_bundle_index (get "/experimental/v1/system/support-bundles/{support_bundle}/index") ping (get "/v1/ping") networking_switch_port_status (get "/v1/system/hardware/switch-port/{port}/status") +support_bundle_head (head "/experimental/v1/system/support-bundles/{support_bundle}/download") +support_bundle_head_file (head "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}") device_auth_request (post "/device/auth") device_auth_confirm (post "/device/confirm") device_access_token (post "/device/token") probe_create (post "/experimental/v1/probes") +support_bundle_create (post "/experimental/v1/system/support-bundles") login_saml (post "/login/{silo_name}/saml/{provider_name}") login_local (post "/v1/login/{silo_name}/local") logout (post "/v1/logout") diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs index 9b4c2474ad9..4e616e698f9 100644 --- a/nexus/types/src/external_api/params.rs +++ b/nexus/types/src/external_api/params.rs @@ -90,6 +90,7 @@ path_param!(AddressLotPath, address_lot, "address lot"); path_param!(ProbePath, probe, "probe"); path_param!(CertificatePath, certificate, "certificate"); +id_path_param!(SupportBundlePath, support_bundle, "support bundle"); id_path_param!(GroupPath, group_id, "group"); // TODO: The hardware resources should be represented by its UUID or a hardware @@ -142,6 +143,15 @@ impl From for SiloSelector { } } +#[derive(Serialize, Deserialize, JsonSchema)] +pub struct SupportBundleFilePath { + #[serde(flatten)] + pub bundle: SupportBundlePath, + + /// The file within the bundle to download + pub file: String, +} + #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq)] pub struct OptionalSiloSelector { /// Name or ID of the silo diff --git 
a/nexus/types/src/external_api/shared.rs b/nexus/types/src/external_api/shared.rs index 9bfa9c8358d..9e9bc26ffe8 100644 --- a/nexus/types/src/external_api/shared.rs +++ b/nexus/types/src/external_api/shared.rs @@ -6,8 +6,11 @@ use std::net::IpAddr; +use chrono::DateTime; +use chrono::Utc; use omicron_common::api::external::Name; use omicron_common::api::internal::shared::NetworkInterface; +use omicron_uuid_kinds::SupportBundleUuid; use parse_display::FromStr; use schemars::JsonSchema; use serde::de::Error as _; @@ -414,6 +417,43 @@ mod test { } } +#[derive(Debug, Clone, Copy, JsonSchema, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum SupportBundleState { + /// Support Bundle still actively being collected. + /// + /// This is the initial state for a Support Bundle, and it will + /// automatically transition to either "Failed" or "Active". + /// + /// If a user no longer wants to access a Support Bundle, they can + /// request cancellation, which will transition to the "Destroying" state. + Collecting, + + /// Support Bundle is being destroyed. + /// + /// Once backing storage has been freed, this bundle is destroyed. + Destroying, + + /// Support Bundle was not created successfully, or was created and has lost + /// backing storage. + /// + /// The record of the bundle still exists for readability, but the only + /// valid operation on these bundles is to destroy them. + Failed, + + /// Support Bundle has been processed, and is ready for usage. 
+ Active, +} + +#[derive(Debug, Clone, JsonSchema, Serialize, Deserialize)] +pub struct SupportBundleInfo { + pub id: SupportBundleUuid, + pub time_created: DateTime<Utc>, + pub reason_for_creation: String, + pub reason_for_failure: Option<String>, + pub state: SupportBundleState, +} + #[derive(Debug, Clone, JsonSchema, Serialize, Deserialize)] pub struct ProbeInfo { pub id: Uuid, diff --git a/openapi/nexus.json b/openapi/nexus.json index c0b6a96fcf3..bc043059dd2 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -301,6 +301,329 @@ } } }, + "/experimental/v1/system/support-bundles": { + "get": { + "tags": [ + "hidden" + ], + "summary": "List all support bundles", + "operationId": "support_bundle_list", + "parameters": [ + { + "in": "query", + "name": "limit", + "description": "Maximum number of items returned by a single call", + "schema": { + "nullable": true, + "type": "integer", + "format": "uint32", + "minimum": 1 + } + }, + { + "in": "query", + "name": "page_token", + "description": "Token returned by previous call to retrieve the subsequent page", + "schema": { + "nullable": true, + "type": "string" + } + }, + { + "in": "query", + "name": "sort_by", + "schema": { + "$ref": "#/components/schemas/IdSortMode" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupportBundleInfoResultsPage" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + }, + "x-dropshot-pagination": { + "required": [] + } + }, + "post": { + "tags": [ + "hidden" + ], + "summary": "Create a new support bundle", + "operationId": "support_bundle_create", + "responses": { + "201": { + "description": "successful creation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupportBundleInfo" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + 
"$ref": "#/components/responses/Error" + } + } + } + }, + "/experimental/v1/system/support-bundles/{support_bundle}": { + "get": { + "tags": [ + "hidden" + ], + "summary": "View a support bundle", + "operationId": "support_bundle_view", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupportBundleInfo" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "delete": { + "tags": [ + "hidden" + ], + "summary": "Delete an existing support bundle", + "description": "May also be used to cancel a support bundle which is currently being collected, or to remove metadata for a support bundle that has failed.", + "operationId": "support_bundle_delete", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "204": { + "description": "successful deletion" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/experimental/v1/system/support-bundles/{support_bundle}/download": { + "get": { + "tags": [ + "hidden" + ], + "summary": "Download the contents of a support bundle", + "operationId": "support_bundle_download", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "default": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + } + } + }, + "head": { + "tags": [ + "hidden" + 
], + "summary": "Download the metadata of a support bundle", + "operationId": "support_bundle_head", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "default": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + } + } + } + }, + "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}": { + "get": { + "tags": [ + "hidden" + ], + "summary": "Download a file within a support bundle", + "operationId": "support_bundle_download_file", + "parameters": [ + { + "in": "path", + "name": "file", + "description": "The file within the bundle to download", + "required": true, + "schema": { + "type": "string" + } + }, + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "default": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + } + } + }, + "head": { + "tags": [ + "hidden" + ], + "summary": "Download the metadata of a file within the support bundle", + "operationId": "support_bundle_head_file", + "parameters": [ + { + "in": "path", + "name": "file", + "description": "The file within the bundle to download", + "required": true, + "schema": { + "type": "string" + } + }, + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "default": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + } + } + } + }, + "/experimental/v1/system/support-bundles/{support_bundle}/index": { + "get": { + "tags": [ + "hidden" + ], + "summary": "Download the index of a support bundle", + "operationId": "support_bundle_index", + "parameters": [ + { + "in": "path", + "name": 
"support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "default": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + } + } + } + }, "/login/{silo_name}/saml/{provider_name}": { "post": { "tags": [ @@ -20059,6 +20382,87 @@ "items" ] }, + "SupportBundleInfo": { + "type": "object", + "properties": { + "id": { + "$ref": "#/components/schemas/TypedUuidForSupportBundleKind" + }, + "reason_for_creation": { + "type": "string" + }, + "reason_for_failure": { + "nullable": true, + "type": "string" + }, + "state": { + "$ref": "#/components/schemas/SupportBundleState" + }, + "time_created": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "id", + "reason_for_creation", + "state", + "time_created" + ] + }, + "SupportBundleInfoResultsPage": { + "description": "A single page of results", + "type": "object", + "properties": { + "items": { + "description": "list of items on this page of results", + "type": "array", + "items": { + "$ref": "#/components/schemas/SupportBundleInfo" + } + }, + "next_page": { + "nullable": true, + "description": "token used to fetch the next page of results (if any)", + "type": "string" + } + }, + "required": [ + "items" + ] + }, + "SupportBundleState": { + "oneOf": [ + { + "description": "Support Bundle still actively being collected.\n\nThis is the initial state for a Support Bundle, and it will automatically transition to either \"Failing\" or \"Active\".\n\nIf a user no longer wants to access a Support Bundle, they can request cancellation, which will transition to the \"Destroying\" state.", + "type": "string", + "enum": [ + "collecting" + ] + }, + { + "description": "Support Bundle is being destroyed.\n\nOnce backing storage has been freed, this bundle is destroyed.", + "type": "string", + "enum": [ + "destroying" + ] + }, + { + "description": "Support Bundle was not created successfully, or 
was created and has lost backing storage.\n\nThe record of the bundle still exists for readability, but the only valid operation on these bundles is to destroy them.", + "type": "string", + "enum": [ + "failed" + ] + }, + { + "description": "Support Bundle has been processed, and is ready for usage.", + "type": "string", + "enum": [ + "active" + ] + } + ] + }, "Switch": { "description": "An operator's view of a Switch.", "type": "object", @@ -21073,6 +21477,10 @@ } } }, + "TypedUuidForSupportBundleKind": { + "type": "string", + "format": "uuid" + }, "UninitializedSled": { "description": "A sled that has not been added to an initialized rack yet", "type": "object", @@ -22509,6 +22917,18 @@ } ] }, + "IdSortMode": { + "description": "Supported set of sort modes for scanning by id only.\n\nCurrently, we only support scanning in ascending order.", + "oneOf": [ + { + "description": "sort in increasing order of \"id\"", + "type": "string", + "enum": [ + "id_ascending" + ] + } + ] + }, "DiskMetricName": { "type": "string", "enum": [ @@ -22528,18 +22948,6 @@ "descending" ] }, - "IdSortMode": { - "description": "Supported set of sort modes for scanning by id only.\n\nCurrently, we only support scanning in ascending order.", - "oneOf": [ - { - "description": "sort in increasing order of \"id\"", - "type": "string", - "enum": [ - "id_ascending" - ] - } - ] - }, "SystemMetricName": { "type": "string", "enum": [ diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 67b8ab00415..ce6764bd17c 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2395,6 +2395,66 @@ CREATE TABLE IF NOT EXISTS omicron.public.tuf_repo_artifact ( /*******************************************************************/ +/* + * Support Bundles + */ + + +CREATE TYPE IF NOT EXISTS omicron.public.support_bundle_state AS ENUM ( + -- The bundle is currently being created. + -- + -- It might have storage that is partially allocated on a sled. 
+ 'collecting', + + -- The bundle has been collected successfully, and has storage on + -- a particular sled. + 'active', + + -- The user has explicitly requested that a bundle be destroyed. + -- We must ensure that storage backing that bundle is gone before + -- it is automatically deleted. + 'destroying', + + -- The support bundle is failing. + -- This happens when Nexus is expunged partway through collection. + -- + -- A different Nexus must ensure that storage is gone before the + -- bundle can be marked "failed". + 'failing', + + -- The bundle has finished failing. + -- + -- The only action that can be taken on this bundle is to delete it. + 'failed' +); + +CREATE TABLE IF NOT EXISTS omicron.public.support_bundle ( + id UUID PRIMARY KEY, + time_created TIMESTAMPTZ NOT NULL, + reason_for_creation TEXT NOT NULL, + reason_for_failure TEXT, + state omicron.public.support_bundle_state NOT NULL, + zpool_id UUID NOT NULL, + dataset_id UUID NOT NULL, + + -- The Nexus which is in charge of collecting the support bundle, + -- and later managing its storage. + assigned_nexus UUID +); + +-- The "UNIQUE" part of this index helps enforce that we allow one support bundle +-- per debug dataset. This constraint can be removed, if the query responsible +-- for allocation changes to allocate more intelligently. 
+CREATE UNIQUE INDEX IF NOT EXISTS one_bundle_per_dataset ON omicron.public.support_bundle ( + dataset_id +); + +CREATE INDEX IF NOT EXISTS lookup_bundle_by_nexus ON omicron.public.support_bundle ( + assigned_nexus +); + +/*******************************************************************/ + /* * DNS Propagation * @@ -4697,7 +4757,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '117.0.0', NULL) + (TRUE, NOW(), NOW(), '118.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/support-bundles/up01.sql b/schema/crdb/support-bundles/up01.sql new file mode 100644 index 00000000000..1260e79e18c --- /dev/null +++ b/schema/crdb/support-bundles/up01.sql @@ -0,0 +1,28 @@ +CREATE TYPE IF NOT EXISTS omicron.public.support_bundle_state AS ENUM ( + -- The bundle is currently being created. + -- + -- It might have storage that is partially allocated on a sled. + 'collecting', + + -- The bundle has been collected successfully, and has storage on + -- a particular sled. + 'active', + + -- The user has explicitly requested that a bundle be destroyed. + -- We must ensure that storage backing that bundle is gone before + -- it is automatically deleted. + 'destroying', + + -- The support bundle is failing. + -- This happens when Nexus is expunged partway through collection. + -- + -- A different Nexus must ensure that storage is gone before the + -- bundle can be marked "failed". + 'failing', + + -- The bundle has finished failing. + -- + -- The only action that can be taken on this bundle is to delete it. 
+ 'failed' +); + diff --git a/schema/crdb/support-bundles/up02.sql b/schema/crdb/support-bundles/up02.sql new file mode 100644 index 00000000000..bc61e704800 --- /dev/null +++ b/schema/crdb/support-bundles/up02.sql @@ -0,0 +1,14 @@ +CREATE TABLE IF NOT EXISTS omicron.public.support_bundle ( + id UUID PRIMARY KEY, + time_created TIMESTAMPTZ NOT NULL, + reason_for_creation TEXT NOT NULL, + reason_for_failure TEXT, + state omicron.public.support_bundle_state NOT NULL, + zpool_id UUID NOT NULL, + dataset_id UUID NOT NULL, + + -- The Nexus which is in charge of collecting the support bundle, + -- and later managing its storage. + assigned_nexus UUID +); + diff --git a/schema/crdb/support-bundles/up03.sql b/schema/crdb/support-bundles/up03.sql new file mode 100644 index 00000000000..8d4bdb47caa --- /dev/null +++ b/schema/crdb/support-bundles/up03.sql @@ -0,0 +1,4 @@ +CREATE UNIQUE INDEX IF NOT EXISTS one_bundle_per_dataset ON omicron.public.support_bundle ( + dataset_id +); + diff --git a/schema/crdb/support-bundles/up04.sql b/schema/crdb/support-bundles/up04.sql new file mode 100644 index 00000000000..58b903e5009 --- /dev/null +++ b/schema/crdb/support-bundles/up04.sql @@ -0,0 +1,4 @@ +CREATE INDEX IF NOT EXISTS lookup_bundle_by_nexus ON omicron.public.support_bundle ( + assigned_nexus +); +