From cd3aabd75bdb937cecba1f99b306913adfd4bd2a Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 7 Nov 2024 12:54:04 -0800 Subject: [PATCH 01/22] Add support bundle API skeleton to Nexus --- nexus/external-api/output/nexus_tags.txt | 5 + nexus/external-api/src/lib.rs | 59 +++++ nexus/src/external_api/http_entrypoints.rs | 99 +++++++ nexus/types/src/external_api/params.rs | 1 + nexus/types/src/external_api/shared.rs | 37 +++ openapi/nexus.json | 291 ++++++++++++++++++++- 6 files changed, 480 insertions(+), 12 deletions(-) diff --git a/nexus/external-api/output/nexus_tags.txt b/nexus/external-api/output/nexus_tags.txt index 2d64679ca02..6ac531cf61b 100644 --- a/nexus/external-api/output/nexus_tags.txt +++ b/nexus/external-api/output/nexus_tags.txt @@ -30,6 +30,11 @@ probe_create POST /experimental/v1/probes probe_delete DELETE /experimental/v1/probes/{probe} probe_list GET /experimental/v1/probes probe_view GET /experimental/v1/probes/{probe} +support_bundle_create POST /experimental/v1/system/support-bundles +support_bundle_delete DELETE /experimental/v1/system/support-bundles/{support_bundle} +support_bundle_download GET /experimental/v1/system/support-bundles/{support_bundle}/download +support_bundle_list GET /experimental/v1/system/support-bundles +support_bundle_view GET /experimental/v1/system/support-bundles/{support_bundle} API operations found with tag "images" OPERATION ID METHOD URL PATH diff --git a/nexus/external-api/src/lib.rs b/nexus/external-api/src/lib.rs index b05366e0c56..d7f523a9006 100644 --- a/nexus/external-api/src/lib.rs +++ b/nexus/external-api/src/lib.rs @@ -2760,6 +2760,65 @@ pub trait NexusExternalApi { path_params: Path, ) -> Result; + // Support bundles (experimental) + + /// List all support bundles + #[endpoint { + method = GET, + path = "/experimental/v1/system/support-bundles", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_list( + rqctx: RequestContext, + query_params: Query, + ) -> Result>, HttpError>; + + /// View a single support bundle + #[endpoint { + method = GET, + path = "/experimental/v1/system/support-bundles/{support_bundle}", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_view( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Download the contents of a single support bundle + #[endpoint { + method = GET, + path = "/experimental/v1/system/support-bundles/{support_bundle}/download", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_download( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Create a new support bundle + #[endpoint { + method = POST, + path = "/experimental/v1/system/support-bundles", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_create( + rqctx: RequestContext, + ) -> Result, HttpError>; + + /// Delete an existing support bundle + /// + /// May also be used to cancel a support bundle which is currently being + /// collected, or to remove metadata for a support bundle that has failed. + #[endpoint { + method = DELETE, + path = "/experimental/v1/system/support-bundles/{support_bundle}", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_delete( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + // Probes (experimental) /// List instrumentation probes diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index df8f1aa24e5..655c56320e5 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -5999,6 +5999,105 @@ impl NexusExternalApi for NexusExternalApiImpl { .await } + async fn support_bundle_list( + rqctx: RequestContext, + _query_params: Query, + ) -> Result>, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus.unimplemented_todo(&opctx, crate::app::Unimpl::Public).await.into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_view( + rqctx: RequestContext, + _path_params: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus.unimplemented_todo(&opctx, crate::app::Unimpl::Public).await.into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_download( + rqctx: RequestContext, + _path_params: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus.unimplemented_todo(&opctx, crate::app::Unimpl::Public).await.into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_create( + rqctx: RequestContext, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus.unimplemented_todo(&opctx, crate::app::Unimpl::Public).await.into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_delete( + rqctx: RequestContext, + _path_params: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus.unimplemented_todo(&opctx, crate::app::Unimpl::Public).await.into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + async fn probe_list( rqctx: RequestContext, query_params: Query>, diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs index 9af9d3be1e6..a0bd36ed3cf 100644 --- a/nexus/types/src/external_api/params.rs +++ b/nexus/types/src/external_api/params.rs @@ -89,6 +89,7 @@ path_param!(SshKeyPath, ssh_key, "SSH key"); path_param!(AddressLotPath, address_lot, "address lot"); path_param!(ProbePath, probe, "probe"); path_param!(CertificatePath, certificate, "certificate"); +path_param!(SupportBundlePath, support_bundle, "support bundle"); id_path_param!(GroupPath, group_id, "group"); diff --git a/nexus/types/src/external_api/shared.rs b/nexus/types/src/external_api/shared.rs index 9bfa9c8358d..35367400e0b 100644 --- a/nexus/types/src/external_api/shared.rs +++ b/nexus/types/src/external_api/shared.rs @@ -6,6 +6,8 @@ use std::net::IpAddr; +use chrono::DateTime; +use chrono::Utc; use omicron_common::api::external::Name; use omicron_common::api::internal::shared::NetworkInterface; use parse_display::FromStr; @@ -414,6 +416,41 @@ mod test { } } +#[derive(Debug, Clone, Copy, JsonSchema, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum SupportBundleState { + /// Support Bundle still actively being collected. + /// + /// This is the initial state for a Support Bundle, and it will + /// automatically transition to either "Failed" or "Active". + /// + /// If a user no longer wants to access a Support Bundle, they can + /// request cancellation, which will transition to the "Cancelling" state. + Collecting, + + /// Support Bundle has been cancelled, and will be deleted shortly. + Cancelling, + + /// Support Bundle was not created successfully, or was created and has lost + /// backing storage. + /// + /// The record of the bundle still exists for readability, but the only + /// valid operation on these bundles is to delete them. + Failed, + + /// Support Bundle has been processed, and is ready for usage. + Active, +} + +#[derive(Debug, Clone, JsonSchema, Serialize, Deserialize)] +pub struct SupportBundleInfo { + // XXX: Strongly-typed Support Bundle UUID? + pub id: Uuid, + pub time_created: DateTime, + pub reason_for_creation: String, + pub state: SupportBundleState, +} + #[derive(Debug, Clone, JsonSchema, Serialize, Deserialize)] pub struct ProbeInfo { pub id: Uuid, diff --git a/openapi/nexus.json b/openapi/nexus.json index f12ff4730cf..6cf143ac49c 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -301,6 +301,195 @@ } } }, + "/experimental/v1/system/support-bundles": { + "get": { + "tags": [ + "hidden" + ], + "summary": "List all support bundles", + "operationId": "support_bundle_list", + "parameters": [ + { + "in": "query", + "name": "limit", + "description": "Maximum number of items returned by a single call", + "schema": { + "nullable": true, + "type": "integer", + "format": "uint32", + "minimum": 1 + } + }, + { + "in": "query", + "name": "page_token", + "description": "Token returned by previous call to retrieve the subsequent page", + "schema": { + "nullable": true, + "type": "string" + } + }, + { + "in": "query", + "name": "sort_by", + "schema": { + "$ref": "#/components/schemas/IdSortMode" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupportBundleInfoResultsPage" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + }, + "x-dropshot-pagination": { + "required": [] + } + }, + "post": { + "tags": [ + "hidden" + ], + "summary": "Create a new support bundle", + "operationId": "support_bundle_create", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupportBundleInfo" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/experimental/v1/system/support-bundles/{support_bundle}": { + "get": { + "tags": [ + "hidden" + ], + "summary": "View a single support bundle", + "operationId": "support_bundle_view", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "Name or ID of the support bundle", + "required": true, + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupportBundleInfo" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "delete": { + "tags": [ + "hidden" + ], + "summary": "Delete an existing support bundle", + "description": "May also be used to cancel a support bundle which is currently being collected, or to remove metadata for a support bundle that has failed.", + "operationId": "support_bundle_delete", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "Name or ID of the support bundle", + "required": true, + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupportBundleInfo" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/experimental/v1/system/support-bundles/{support_bundle}/download": { + "get": { + "tags": [ + "hidden" + ], + "summary": "Download the contents of a single support bundle", + "operationId": "support_bundle_download", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "Name or ID of the support bundle", + "required": true, + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + } + ], + "responses": { + "default": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + } + } + } + }, "/login/{silo_name}/saml/{provider_name}": { "post": { "tags": [ @@ -19990,6 +20179,84 @@ "items" ] }, + "SupportBundleInfo": { + "type": "object", + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "reason_for_creation": { + "type": "string" + }, + "state": { + "$ref": "#/components/schemas/SupportBundleState" + }, + "time_created": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "id", + "reason_for_creation", + "state", + "time_created" + ] + }, + "SupportBundleInfoResultsPage": { + "description": "A single page of results", + "type": "object", + "properties": { + "items": { + "description": "list of items on this page of results", + "type": "array", + "items": { + "$ref": "#/components/schemas/SupportBundleInfo" + } + }, + "next_page": { + "nullable": true, + "description": "token used to fetch the next page of results (if any)", + "type": "string" + } + }, + "required": [ + "items" + ] + }, + "SupportBundleState": { + "oneOf": [ + { + "description": "Support Bundle still actively being collected.\n\nThis is the initial state for a Support Bundle, and it will automatically transition to either \"Failed\" or \"Active\".\n\nIf a user no longer wants to access a Support Bundle, they can request cancellation, which will transition to the \"Cancelling\" state.", + "type": "string", + "enum": [ + "collecting" + ] + }, + { + "description": "Support Bundle has been cancelled, and will be deleted shortly.", + "type": "string", + "enum": [ + "cancelling" + ] + }, + { + "description": "Support Bundle was not created successfully, or was created and has lost backing storage.\n\nThe record of the bundle still exists for readability, but the only valid operation on these bundles is to delete them.", + "type": "string", + "enum": [ + "failed" + ] + }, + { + "description": "Support Bundle has been processed, and is ready for usage.", + "type": "string", + "enum": [ + "active" + ] + } + ] + }, "Switch": { "description": "An operator's view of a Switch.", "type": "object", @@ -22440,6 +22707,18 @@ } ] }, + "IdSortMode": { + "description": "Supported set of sort modes for scanning by id only.\n\nCurrently, we only support scanning in ascending order.", + "oneOf": [ + { + "description": "sort in increasing order of \"id\"", + "type": "string", + "enum": [ + "id_ascending" + ] + } + ] + }, "DiskMetricName": { "type": "string", "enum": [ @@ -22459,18 +22738,6 @@ "descending" ] }, - "IdSortMode": { - "description": "Supported set of sort modes for scanning by id only.\n\nCurrently, we only support scanning in ascending order.", - "oneOf": [ - { - "description": "sort in increasing order of \"id\"", - "type": "string", - "enum": [ - "id_ascending" - ] - } - ] - }, "SystemMetricName": { "type": "string", "enum": [ From 3774ecbe97839af6fba78c7a55d8ec5c21eb4d62 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 7 Nov 2024 12:57:50 -0800 Subject: [PATCH 02/22] fmt --- nexus/src/external_api/http_entrypoints.rs | 28 +++++++++++++++++----- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 655c56320e5..3f22b958118 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -6002,7 +6002,8 @@ impl NexusExternalApi for NexusExternalApiImpl { async fn support_bundle_list( rqctx: RequestContext, _query_params: Query, - ) -> Result>, HttpError> { + ) -> Result>, HttpError> + { let apictx = rqctx.context(); let handler = async { let nexus = &apictx.context.nexus; @@ -6010,7 +6011,10 @@ impl NexusExternalApi for NexusExternalApiImpl { let opctx = crate::context::op_context_for_external_api(&rqctx).await?; - Err(nexus.unimplemented_todo(&opctx, crate::app::Unimpl::Public).await.into()) + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) }; apictx .context @@ -6030,7 +6034,10 @@ impl NexusExternalApi for NexusExternalApiImpl { let opctx = crate::context::op_context_for_external_api(&rqctx).await?; - Err(nexus.unimplemented_todo(&opctx, crate::app::Unimpl::Public).await.into()) + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) }; apictx .context @@ -6050,7 +6057,10 @@ impl NexusExternalApi for NexusExternalApiImpl { let opctx = crate::context::op_context_for_external_api(&rqctx).await?; - Err(nexus.unimplemented_todo(&opctx, crate::app::Unimpl::Public).await.into()) + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) }; apictx .context @@ -6069,7 +6079,10 @@ impl NexusExternalApi for NexusExternalApiImpl { let opctx = crate::context::op_context_for_external_api(&rqctx).await?; - Err(nexus.unimplemented_todo(&opctx, crate::app::Unimpl::Public).await.into()) + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) }; apictx .context @@ -6089,7 +6102,10 @@ impl NexusExternalApi for NexusExternalApiImpl { let opctx = crate::context::op_context_for_external_api(&rqctx).await?; - Err(nexus.unimplemented_todo(&opctx, crate::app::Unimpl::Public).await.into()) + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) }; apictx .context From 0b3fd29b97db396a3e937c39ad2d83c3ade22797 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 7 Nov 2024 14:54:57 -0800 Subject: [PATCH 03/22] EXPECTORATE --- nexus/reconfigurator/planning/src/system.rs | 6 +++--- nexus/tests/output/uncovered-authz-endpoints.txt | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index 66a53932757..28b8bcf2ae2 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -845,9 +845,9 @@ impl Sled { name: config.name.full_name(), available: ByteCount::from_gibibytes_u32(1), used: ByteCount::from_gibibytes_u32(0), - quota: config.quota, - reservation: config.reservation, - compression: config.compression.to_string(), + quota: config.inner.quota, + reservation: config.inner.reservation, + compression: config.inner.compression.to_string(), }); } diff --git a/nexus/tests/output/uncovered-authz-endpoints.txt b/nexus/tests/output/uncovered-authz-endpoints.txt index c5091c5a3bc..ec4790c071d 100644 --- a/nexus/tests/output/uncovered-authz-endpoints.txt +++ b/nexus/tests/output/uncovered-authz-endpoints.txt @@ -1,13 +1,18 @@ API endpoints with no coverage in authz tests: probe_delete (delete "/experimental/v1/probes/{probe}") +support_bundle_delete (delete "/experimental/v1/system/support-bundles/{support_bundle}") probe_list (get "/experimental/v1/probes") probe_view (get "/experimental/v1/probes/{probe}") +support_bundle_list (get "/experimental/v1/system/support-bundles") +support_bundle_view (get "/experimental/v1/system/support-bundles/{support_bundle}") +support_bundle_download (get "/experimental/v1/system/support-bundles/{support_bundle}/download") ping (get "/v1/ping") networking_switch_port_status (get "/v1/system/hardware/switch-port/{port}/status") device_auth_request (post "/device/auth") device_auth_confirm (post "/device/confirm") device_access_token (post "/device/token") probe_create (post "/experimental/v1/probes") +support_bundle_create (post "/experimental/v1/system/support-bundles") login_saml (post "/login/{silo_name}/saml/{provider_name}") login_local (post "/v1/login/{silo_name}/local") logout (post "/v1/logout") From ec897f77162dfe4884bf5acc480c019d7715e54e Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 8 Nov 2024 15:30:20 -0800 Subject: [PATCH 04/22] Add database schema for support bundles --- nexus/db-model/src/lib.rs | 2 + nexus/db-model/src/schema.rs | 14 +++++ nexus/db-model/src/support_bundle.rs | 77 ++++++++++++++++++++++++++++ schema/crdb/dbinit.sql | 28 ++++++++++ 4 files changed, 121 insertions(+) create mode 100644 nexus/db-model/src/support_bundle.rs diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 9137b0b3c3a..546f53b1ac0 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -96,6 +96,7 @@ mod sled_state; mod sled_underlay_subnet_allocation; mod snapshot; mod ssh_key; +mod support_bundle; mod switch; mod tuf_repo; mod typed_uuid; @@ -203,6 +204,7 @@ pub use sled_state::*; pub use sled_underlay_subnet_allocation::*; pub use snapshot::*; pub use ssh_key::*; +pub use support_bundle::*; pub use switch::*; pub use switch_interface::*; pub use switch_port::*; diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 8a97e436398..56f1e7e8acd 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1362,6 +1362,20 @@ joinable!(tuf_repo_artifact -> tuf_repo (tuf_repo_id)); // Can't specify joinable for a composite primary key (tuf_repo_artifact -> // tuf_artifact). +table! { + support_bundle { + id -> Uuid, + time_created -> Timestamptz, + reason_for_creation -> Text, + reason_for_failure -> Nullable, + state -> crate::SupportBundleStateEnum, + zpool_id -> Uuid, + dataset_id -> Uuid, + + assigned_nexus -> Nullable, + } +} + /* hardware inventory */ table! { diff --git a/nexus/db-model/src/support_bundle.rs b/nexus/db-model/src/support_bundle.rs new file mode 100644 index 00000000000..56ebbf2e2f2 --- /dev/null +++ b/nexus/db-model/src/support_bundle.rs @@ -0,0 +1,77 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::impl_enum_type; +use crate::schema::support_bundle; +use crate::typed_uuid::DbTypedUuid; + +use chrono::{DateTime, Utc}; +use nexus_types::external_api::shared::SupportBundleInfo as SupportBundleView; +use nexus_types::external_api::shared::SupportBundleState as SupportBundleStateView; +use omicron_uuid_kinds::DatasetKind; +use omicron_uuid_kinds::ZpoolKind; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "support_bundle_state", schema = "public"))] + pub struct SupportBundleStateEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] + #[diesel(sql_type = SupportBundleStateEnum)] + pub enum SupportBundleState; + + // Enum values + Collecting => b"collecting" + Cancelling => b"cancelling" + Failed => b"failed" + Active => b"active" +); + +impl From for SupportBundleStateView { + fn from(state: SupportBundleState) -> Self { + use SupportBundleState::*; + + match state { + Collecting => SupportBundleStateView::Collecting, + Cancelling => SupportBundleStateView::Cancelling, + Failed => SupportBundleStateView::Failed, + Active => SupportBundleStateView::Active, + } + } +} + +#[derive( + Queryable, + Insertable, + Debug, + Clone, + Selectable, + Deserialize, + Serialize, + PartialEq, +)] +#[diesel(table_name = support_bundle)] +pub struct SupportBundle { + id: Uuid, + time_created: DateTime, + reason_for_creation: String, + reason_for_failure: Option, + state: SupportBundleState, + zpool_id: DbTypedUuid, + dataset_id: DbTypedUuid, + assigned_nexus: Option, +} + +impl From for SupportBundleView { + fn from(bundle: SupportBundle) -> Self { + Self { + id: bundle.id, + time_created: bundle.time_created, + reason_for_creation: bundle.reason_for_creation, + state: bundle.state.into(), + } + } +} diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index b095c4c89a9..12883554630 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2384,6 +2384,34 @@ CREATE TABLE IF NOT EXISTS omicron.public.tuf_repo_artifact ( /*******************************************************************/ +/* + * Support Bundles + */ + + +CREATE TYPE IF NOT EXISTS omicron.public.support_bundle_state AS ENUM ( + 'collecting', + 'cancelling', + 'failed', + 'active' +); + +CREATE TABLE IF NOT EXISTS omicron.public.support_bundle ( + id UUID NOT NULL, + time_created TIMESTAMPTZ NOT NULL, + reason_for_creation TEXT NOT NULL, + reason_for_failure TEXT, + state omicron.public.support_bundle_state NOT NULL, + zpool_id NOT NULL UUID, + dataset_id NOT NULL UUID, + + -- The Nexus which is in charge of collecting the support bundle, + -- and later managing its storage. + assigned_nexus UUID, +); + +/*******************************************************************/ + /* * DNS Propagation * From 38b6ea86de4452c0a831641a2c6bf8b7b5b59cff Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 8 Nov 2024 15:47:55 -0800 Subject: [PATCH 05/22] Add failure reason --- nexus/types/src/external_api/shared.rs | 1 + openapi/nexus.json | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/nexus/types/src/external_api/shared.rs b/nexus/types/src/external_api/shared.rs index 35367400e0b..2928fab46bc 100644 --- a/nexus/types/src/external_api/shared.rs +++ b/nexus/types/src/external_api/shared.rs @@ -448,6 +448,7 @@ pub struct SupportBundleInfo { pub id: Uuid, pub time_created: DateTime, pub reason_for_creation: String, + pub reason_for_failure: Option, pub state: SupportBundleState, } diff --git a/openapi/nexus.json b/openapi/nexus.json index 6cf143ac49c..1aaba49e9eb 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -20189,6 +20189,10 @@ "reason_for_creation": { "type": "string" }, + "reason_for_failure": { + "nullable": true, + "type": "string" + }, "state": { "$ref": "#/components/schemas/SupportBundleState" }, From 908f661a755cb7a1bf7ea4c159c0c1b830c1d809 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 8 Nov 2024 15:48:44 -0800 Subject: [PATCH 06/22] Merge reason for failure --- nexus/db-model/src/support_bundle.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/nexus/db-model/src/support_bundle.rs b/nexus/db-model/src/support_bundle.rs index 56ebbf2e2f2..aedcdbf760e 100644 --- a/nexus/db-model/src/support_bundle.rs +++ b/nexus/db-model/src/support_bundle.rs @@ -71,6 +71,7 @@ impl From for SupportBundleView { id: bundle.id, time_created: bundle.time_created, reason_for_creation: bundle.reason_for_creation, + reason_for_failure: bundle.reason_for_failure, state: bundle.state.into(), } } From aa3978f4cedac5ce787ce851f0c0c456e7f74021 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 12 Nov 2024 12:03:56 -0800 Subject: [PATCH 07/22] Fix SQL, moving forward on tests --- nexus/db-model/src/schema.rs | 1 + nexus/db-model/src/support_bundle.rs | 63 +- nexus/db-queries/src/db/datastore/dataset.rs | 25 +- nexus/db-queries/src/db/datastore/mod.rs | 30 + .../src/db/datastore/support_bundle.rs | 855 ++++++++++++++++++ nexus/types/src/external_api/shared.rs | 4 +- schema/crdb/dbinit.sql | 39 +- uuid-kinds/src/lib.rs | 1 + 8 files changed, 978 insertions(+), 40 deletions(-) create mode 100644 nexus/db-queries/src/db/datastore/support_bundle.rs diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 56f1e7e8acd..34a29e7297e 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -2042,6 +2042,7 @@ allow_tables_to_appear_in_same_query!( console_session, sled, sled_resource, + support_bundle, router_route, vmm, volume, diff --git a/nexus/db-model/src/support_bundle.rs b/nexus/db-model/src/support_bundle.rs index aedcdbf760e..f2450473f54 100644 --- a/nexus/db-model/src/support_bundle.rs +++ b/nexus/db-model/src/support_bundle.rs @@ -10,9 +10,14 @@ use chrono::{DateTime, Utc}; use nexus_types::external_api::shared::SupportBundleInfo as SupportBundleView; use nexus_types::external_api::shared::SupportBundleState as SupportBundleStateView; use omicron_uuid_kinds::DatasetKind; +use omicron_uuid_kinds::DatasetUuid; +use omicron_uuid_kinds::OmicronZoneKind; +use omicron_uuid_kinds::OmicronZoneUuid; +use omicron_uuid_kinds::SupportBundleKind; +use omicron_uuid_kinds::SupportBundleUuid; use omicron_uuid_kinds::ZpoolKind; +use omicron_uuid_kinds::ZpoolUuid; use serde::{Deserialize, Serialize}; -use uuid::Uuid; impl_enum_type!( #[derive(SqlType, Debug, QueryId)] @@ -25,17 +30,35 @@ impl_enum_type!( // Enum values Collecting => b"collecting" + Collected => b"collected" Cancelling => b"cancelling" Failed => b"failed" Active => b"active" ); +impl SupportBundleState { + pub fn might_have_dataset_storage(&self) -> bool { + use SupportBundleState::*; + + match self { + Collecting => false, + Collected => true, + Cancelling => true, + Failed => false, + Active => true, + } + } +} + impl From for SupportBundleStateView { fn from(state: SupportBundleState) -> Self { use SupportBundleState::*; match state { Collecting => SupportBundleStateView::Collecting, + // The distinction between "collected" and "collecting" is an + // internal detail that doesn't need to be exposed through the API. + Collected => SupportBundleStateView::Collecting, Cancelling => SupportBundleStateView::Cancelling, Failed => SupportBundleStateView::Failed, Active => SupportBundleStateView::Active, @@ -55,20 +78,40 @@ impl From for SupportBundleStateView { )] #[diesel(table_name = support_bundle)] pub struct SupportBundle { - id: Uuid, - time_created: DateTime, - reason_for_creation: String, - reason_for_failure: Option, - state: SupportBundleState, - zpool_id: DbTypedUuid, - dataset_id: DbTypedUuid, - assigned_nexus: Option, + pub id: DbTypedUuid, + pub time_created: DateTime, + pub reason_for_creation: String, + pub reason_for_failure: Option, + pub state: SupportBundleState, + pub zpool_id: DbTypedUuid, + pub dataset_id: DbTypedUuid, + pub assigned_nexus: Option>, +} + +impl SupportBundle { + pub fn new( + reason_for_creation: &'static str, + zpool_id: ZpoolUuid, + dataset_id: DatasetUuid, + nexus_id: OmicronZoneUuid, + ) -> Self { + Self { + id: SupportBundleUuid::new_v4().into(), + time_created: Utc::now(), + reason_for_creation: reason_for_creation.to_string(), + reason_for_failure: None, + state: SupportBundleState::Collecting, + zpool_id: zpool_id.into(), + dataset_id: dataset_id.into(), + assigned_nexus: Some(nexus_id.into()), + } + } } impl From for SupportBundleView { fn from(bundle: SupportBundle) -> Self { Self { - id: bundle.id, + id: bundle.id.into(), time_created: bundle.time_created, reason_for_creation: bundle.reason_for_creation, reason_for_failure: bundle.reason_for_failure, diff --git a/nexus/db-queries/src/db/datastore/dataset.rs b/nexus/db-queries/src/db/datastore/dataset.rs index 996f105254c..e5a72c5fb33 100644 --- a/nexus/db-queries/src/db/datastore/dataset.rs +++ b/nexus/db-queries/src/db/datastore/dataset.rs @@ -347,14 +347,13 @@ impl DataStore { #[cfg(test)] mod test { use super::*; + use crate::db::datastore::test::bp_insert_and_make_target; use crate::db::pub_test_utils::TestDatabase; use nexus_db_model::Generation; use nexus_db_model::SledBaseboard; use nexus_db_model::SledSystemHardware; use nexus_db_model::SledUpdate; use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; - use nexus_types::deployment::Blueprint; - use nexus_types::deployment::BlueprintTarget; use omicron_common::api::internal::shared::DatasetKind as ApiDatasetKind; use omicron_test_utils::dev; use omicron_uuid_kinds::SledUuid; @@ -516,28 +515,6 @@ mod test { logctx.cleanup_successful(); } - async fn bp_insert_and_make_target( - opctx: &OpContext, - datastore: &DataStore, - bp: &Blueprint, - ) { - datastore - .blueprint_insert(opctx, bp) - .await - .expect("inserted blueprint"); - datastore - .blueprint_target_set_current( - opctx, - BlueprintTarget { - target_id: bp.id, - enabled: true, - time_made_target: Utc::now(), - }, - ) - .await - .expect("made blueprint the target"); - } - fn new_dataset_on(zpool_id: ZpoolUuid) -> Dataset { Dataset::new( Uuid::new_v4(), diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 5344ffc1d9a..c0b31ed2eff 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -93,6 +93,7 @@ mod sled; mod sled_instance; mod snapshot; mod ssh_key; +mod support_bundle; mod switch; mod switch_interface; mod switch_port; @@ -465,6 +466,8 @@ mod test { use nexus_db_fixed_data::silo::DEFAULT_SILO; use nexus_db_model::IpAttachState; use nexus_db_model::{to_db_typed_uuid, Generation}; + use nexus_types::deployment::Blueprint; + use nexus_types::deployment::BlueprintTarget; use nexus_types::external_api::params; use nexus_types::silo::DEFAULT_SILO_ID; use omicron_common::api::external::{ @@ -503,6 +506,33 @@ mod test { } } + /// Inserts a blueprint in the DB and forcibly makes it the target + /// + /// WARNING: This makes no attempts to validate the blueprint relative to + /// parents -- this is just a test-only helper to make testing + /// blueprint-specific checks easier. + pub async fn bp_insert_and_make_target( + opctx: &OpContext, + datastore: &DataStore, + bp: &Blueprint, + ) { + datastore + .blueprint_insert(opctx, bp) + .await + .expect("inserted blueprint"); + datastore + .blueprint_target_set_current( + opctx, + BlueprintTarget { + target_id: bp.id, + enabled: true, + time_made_target: Utc::now(), + }, + ) + .await + .expect("made blueprint the target"); + } + #[tokio::test] async fn test_project_creation() { let logctx = dev::test_setup_log("test_project_creation"); diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs new file mode 100644 index 00000000000..d289127f4b7 --- /dev/null +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -0,0 +1,855 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! [`DataStore`] methods on [`SupportBundle`]s. + +use super::DataStore; +use crate::authz; +use crate::context::OpContext; +use crate::db; +use crate::db::error::public_error_from_diesel; +use crate::db::error::ErrorHandler; +use crate::db::model::Dataset; +use crate::db::model::DatasetKind; +use crate::db::model::SupportBundle; +use crate::db::model::SupportBundleState; +use crate::db::pagination::paginated; +use crate::transaction_retry::OptionalError; +use async_bb8_diesel::AsyncRunQueryDsl; +use diesel::prelude::*; +use futures::FutureExt; +use nexus_types::identity::Asset; +use omicron_common::api::external; +use omicron_common::api::external::CreateResult; +use omicron_common::api::external::DataPageParams; +use omicron_common::api::external::Error; +use omicron_common::api::external::ListResultVec; +use omicron_common::api::external::LookupResult; +use omicron_uuid_kinds::BlueprintUuid; +use omicron_uuid_kinds::DatasetUuid; +use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::OmicronZoneUuid; +use omicron_uuid_kinds::SupportBundleUuid; +use omicron_uuid_kinds::ZpoolUuid; +use uuid::Uuid; + +const CANNOT_ALLOCATE_ERR_MSG: &'static str = +"Current policy limits support bundle creation to 'one per external disk', and \ + no disks are available. Either delete old support bundles or add additional \ + disks"; + +/// Provides a report on how many bundle were expunged, and why. +#[derive(Debug, Clone)] +pub struct SupportBundleExpungementReport { + pub bundles_failed_missing_datasets: usize, + pub bundles_cancelled_missing_nexus: usize, + pub bundles_failed_missing_nexus: usize, +} + +impl DataStore { + /// Creates a new support bundle. + /// + /// Requires that the UUID of the calling Nexus be supplied as input - + /// this particular Zone is responsible for the collection process. + /// + /// Note that really any functioning Nexus would work as the "assignee", + /// but it's clear that our instance will work, because we're currently + /// running. + pub async fn support_bundle_create( + &self, + opctx: &OpContext, + reason_for_creation: &'static str, + this_nexus_id: OmicronZoneUuid, + ) -> CreateResult { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + let conn = self.pool_connection_authorized(opctx).await?; + + #[derive(Debug)] + enum SupportBundleError { + TooManyBundles, + } + + let err = OptionalError::new(); + self.transaction_retry_wrapper("support_bundle_create") + .transaction(&conn, |conn| { + let err = err.clone(); + + async move { + use db::schema::dataset::dsl as dataset_dsl; + use db::schema::support_bundle::dsl as support_bundle_dsl; + + // Observe all "non-deleted, debug datasets". + // + // Return the first one we find that doesn't already + // have a support bundle allocated to it. + let free_dataset = dataset_dsl::dataset + .filter(dataset_dsl::time_deleted.is_null()) + .filter(dataset_dsl::kind.eq(DatasetKind::Debug)) + .left_join(support_bundle_dsl::support_bundle.on( + dataset_dsl::id.eq(support_bundle_dsl::dataset_id), + )) + .filter(support_bundle_dsl::dataset_id.is_null()) + .select(Dataset::as_select()) + .first_async(&conn) + .await + .optional()?; + + let Some(dataset) = free_dataset else { + return Err( + err.bail(SupportBundleError::TooManyBundles) + ); + }; + + // We could check that "this_nexus_id" is not expunged, but + // we have some evidence that it is valid: this Nexus is + // currently running! + // + // Besides, we COULD be expunged immediately after inserting + // the SupportBundle. In this case, we'd fall back to the + // case of "clean up a bundle which is managed by an + // expunged Nexus" anyway. + + let bundle = SupportBundle::new( + reason_for_creation, + ZpoolUuid::from_untyped_uuid(dataset.pool_id), + DatasetUuid::from_untyped_uuid(dataset.id()), + this_nexus_id, + ); + + diesel::insert_into(support_bundle_dsl::support_bundle) + .values(bundle.clone()) + .execute_async(&conn) + .await?; + + Ok(bundle) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + SupportBundleError::TooManyBundles => { + return external::Error::insufficient_capacity( + CANNOT_ALLOCATE_ERR_MSG, + "Support Bundle storage exhausted", + ); + } + } + } + public_error_from_diesel(e, ErrorHandler::Server) + }) + } + + /// Looks up a single support bundle + pub async fn support_bundle_get( + &self, + opctx: &OpContext, + id: SupportBundleUuid, + ) -> LookupResult { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + use db::schema::support_bundle::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + dsl::support_bundle + .filter(dsl::id.eq(id.into_untyped_uuid())) + .select(SupportBundle::as_select()) + .first_async::(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// Lists one page of support bundles + pub async fn support_bundle_list( + &self, + opctx: &OpContext, + pagparams: &DataPageParams<'_, Uuid>, + ) -> ListResultVec { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + use db::schema::support_bundle::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + paginated(dsl::support_bundle, dsl::id, pagparams) + .select(SupportBundle::as_select()) + .load_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// Marks support bundles as failed if their assigned Nexus or backing + /// dataset has been destroyed. + pub async fn support_bundle_fail_expunged( + &self, + opctx: &OpContext, + blueprint: &nexus_types::deployment::Blueprint, + ) -> Result { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + + // For this blueprint: The set of all expunged Nexus zones + let invalid_nexus_zones = blueprint + .all_omicron_zones( + nexus_types::deployment::BlueprintZoneFilter::Expunged, + ) + .filter_map(|(_sled, zone)| { + if matches!( + zone.zone_type, + nexus_types::deployment::BlueprintZoneType::Nexus(_) + ) { + Some(zone.id.into_untyped_uuid()) + } else { + None + } + }) + .collect::>(); + let valid_nexus_zones = blueprint + .all_omicron_zones( + nexus_types::deployment::BlueprintZoneFilter::ShouldBeRunning, + ) + .filter_map(|(_sled, zone)| { + if matches!( + zone.zone_type, + nexus_types::deployment::BlueprintZoneType::Nexus(_) + ) { + Some(zone.id.into_untyped_uuid()) + } else { + None + } + }) + .collect::>(); + + // For this blueprint: The set of expunged debug datasets + let invalid_datasets = blueprint + .all_omicron_datasets( + nexus_types::deployment::BlueprintDatasetFilter::Expunged, + ) + .filter_map(|dataset_config| { + if matches!( + dataset_config.kind, + omicron_common::api::internal::shared::DatasetKind::Debug + ) { + Some(dataset_config.id.into_untyped_uuid()) + } else { + None + } + }) + .collect::>(); + + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_if_current_blueprint_is( + &conn, + "support_bundle_fail_expunged", + opctx, + BlueprintUuid::from_untyped_uuid(blueprint.id), + |conn| { + let invalid_nexus_zones = invalid_nexus_zones.clone(); + let valid_nexus_zones = valid_nexus_zones.clone(); + let invalid_datasets = invalid_datasets.clone(); + async move { + use db::schema::support_bundle::dsl; + + // Find all bundles on datasets that no longer exist, and + // mark them "failed". + let bundles_with_bad_datasets = dsl::support_bundle + .filter(dsl::dataset_id.eq_any(invalid_datasets)) + .select(SupportBundle::as_select()) + .load_async(&*conn) + .await?; + + let bundles_failed_missing_datasets = + diesel::update(dsl::support_bundle) + .filter(dsl::state.ne(SupportBundleState::Failed)) + .filter( + dsl::id.eq_any( + bundles_with_bad_datasets + .iter() + .map(|b| b.id) + .collect::>(), + ), + ) + .set(( + dsl::state.eq(SupportBundleState::Failed), + dsl::reason_for_failure + .eq("Allocated dataset no longer exists"), + )) + .execute_async(&*conn) + .await?; + + let Some(arbitrary_valid_nexus) = + valid_nexus_zones.get(0).cloned() + else { + return Err(external::Error::internal_error( + "No valid Nexuses", + ) + .into()); + }; + + // Find all bundles on nexuses that no longer exist. + // + // If the bundle might have storage: + // - Mark it cancelled and re-assign the managing Nexus + // - Otherwise: mark it failed + let bundles_with_bad_nexuses = dsl::support_bundle + .filter(dsl::assigned_nexus.eq_any(invalid_nexus_zones)) + .select(SupportBundle::as_select()) + .load_async(&*conn) + .await?; + let (needs_cancellation, needs_failure): (Vec<_>, Vec<_>) = + bundles_with_bad_nexuses.into_iter().partition( + |bundle| bundle.state.might_have_dataset_storage(), + ); + + // Mark these support bundles as cancelled, and assign then + // to a nexus that should still exist. + // + // This should lead to their storage being freed, if it + // exists. + let bundles_cancelled_missing_nexus = diesel::update( + dsl::support_bundle, + ) + .filter(dsl::state.ne(SupportBundleState::Cancelling)) + .filter(dsl::state.ne(SupportBundleState::Failed)) + .filter( + dsl::id.eq_any( + needs_cancellation + .iter() + .map(|b| b.id) + .collect::>(), + ), + ) + .set(( + dsl::assigned_nexus.eq(arbitrary_valid_nexus), + dsl::state.eq(SupportBundleState::Cancelling), + dsl::reason_for_failure + .eq("Nexus managing this bundle no longer exists"), + )) + .execute_async(&*conn) + .await?; + + // Mark these support bundles as failed. + // + // If they don't have storage (e.g., they never got that + // far, or their underlying dataset was expunged) there + // isn't anything left to free. This means we can skip the + // "Cancelling" state and jump to "Failed". + let bundles_failed_missing_nexus = diesel::update( + dsl::support_bundle, + ) + .filter(dsl::state.ne(SupportBundleState::Cancelling)) + .filter(dsl::state.ne(SupportBundleState::Failed)) + .filter(dsl::id.eq_any( + needs_failure.iter().map(|b| b.id).collect::>(), + )) + .set(( + dsl::state.eq(SupportBundleState::Failed), + dsl::reason_for_failure + .eq("Nexus managing this bundle no longer exists"), + )) + .execute_async(&*conn) + .await?; + + Ok(SupportBundleExpungementReport { + bundles_failed_missing_datasets, + bundles_cancelled_missing_nexus, + bundles_failed_missing_nexus, + }) + } + .boxed() + }, + ) + .await + } + + /// Cancels a single support bundle. + /// + /// Note that this does not delete the support bundle record, but starts the + /// process by which it may later be deleted. + pub async fn support_bundle_cancel( + &self, + opctx: &OpContext, + id: SupportBundleUuid, + ) -> Result<(), Error> { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + + use db::schema::support_bundle::dsl as support_bundle_dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + diesel::update(support_bundle_dsl::support_bundle) + .filter(support_bundle_dsl::id.eq(id.into_untyped_uuid())) + .set(support_bundle_dsl::state.eq(SupportBundleState::Cancelling)) + .execute_async(&*conn) + .await + .map(|_rows_modified| ()) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(()) + } + + /// Deletes a support bundle. + /// + /// This should only be invoked after all storage for the support bundle has + /// been cleared. + pub async fn support_bundle_delete( + &self, + opctx: &OpContext, + id: SupportBundleUuid, + ) -> Result<(), Error> { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + + use db::schema::support_bundle::dsl as support_bundle_dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + diesel::delete(support_bundle_dsl::support_bundle) + .filter(support_bundle_dsl::id.eq(id.into_untyped_uuid())) + .execute_async(&*conn) + .await + .map(|_rows_modified| ()) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(()) + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::db::datastore::test::bp_insert_and_make_target; + use crate::db::pub_test_utils::TestDatabase; + use nexus_db_model::Generation; + use nexus_db_model::SledBaseboard; + use nexus_db_model::SledSystemHardware; + use nexus_db_model::SledUpdate; + use nexus_db_model::Zpool; + use nexus_reconfigurator_planning::example::ExampleSystemBuilder; + use nexus_reconfigurator_planning::example::SimRngState; + use nexus_types::deployment::Blueprint; + use nexus_types::deployment::BlueprintDatasetDisposition; + use nexus_types::deployment::BlueprintDatasetFilter; + use nexus_types::deployment::BlueprintZoneFilter; + use nexus_types::deployment::BlueprintZoneType; + use omicron_common::api::internal::shared::DatasetKind::Debug as DebugDatasetKind; + use omicron_test_utils::dev; + use omicron_uuid_kinds::SledUuid; + use rand::Rng; + + // Pool/Dataset pairs, for debug datasets only. + struct TestPool { + pool: ZpoolUuid, + dataset: DatasetUuid, + } + + // Sleds and their pools, with a focus on debug datasets only. + struct TestSled { + sled: SledUuid, + pools: Vec, + } + + impl TestSled { + fn new_with_pool_count(pool_count: usize) -> Self { + Self { + sled: SledUuid::new_v4(), + pools: (0..pool_count) + .map(|_| TestPool { + pool: ZpoolUuid::new_v4(), + dataset: DatasetUuid::new_v4(), + }) + .collect(), + } + } + + fn new_from_blueprint(blueprint: &Blueprint) -> Vec { + let mut sleds = vec![]; + for (sled, datasets) in &blueprint.blueprint_datasets { + let pools = datasets + .datasets + .values() + .filter_map(|dataset| { + if !matches!(dataset.kind, DebugDatasetKind) + || !dataset + .disposition + .matches(BlueprintDatasetFilter::InService) + { + return None; + }; + + Some(TestPool { + pool: dataset.pool.id(), + dataset: dataset.id, + }) + }) + .collect(); + + sleds.push(TestSled { sled: *sled, pools }); + } + sleds + } + + async fn create_database_records( + &self, + datastore: &DataStore, + opctx: &OpContext, + ) { + let rack_id = Uuid::new_v4(); + let sled = SledUpdate::new( + *self.sled.as_untyped_uuid(), + "[::1]:0".parse().unwrap(), + SledBaseboard { + serial_number: format!( + "test-{}", + rand::thread_rng().gen::() + ), + part_number: "test-pn".to_string(), + revision: 0, + }, + SledSystemHardware { + is_scrimlet: false, + usable_hardware_threads: 128, + usable_physical_ram: (64 << 30).try_into().unwrap(), + reservoir_size: (16 << 30).try_into().unwrap(), + }, + rack_id, + Generation::new(), + ); + datastore.sled_upsert(sled).await.expect("failed to upsert sled"); + + // Create fake zpools that back our fake datasets. + for pool in &self.pools { + let zpool = Zpool::new( + *pool.pool.as_untyped_uuid(), + *self.sled.as_untyped_uuid(), + Uuid::new_v4(), + ); + datastore + .zpool_insert(opctx, zpool) + .await + .expect("failed to upsert zpool"); + + let dataset = Dataset::new( + pool.dataset.into_untyped_uuid(), + pool.pool.into_untyped_uuid(), + None, + DebugDatasetKind, + ); + datastore + .dataset_upsert(dataset) + .await + .expect("failed to upsert dataset"); + } + } + } + + // Creates a fake sled with `pool_count` zpools, and a debug dataset on each + // zpool. + async fn create_sled_and_zpools( + datastore: &DataStore, + opctx: &OpContext, + pool_count: usize, + ) -> TestSled { + let sled = TestSled::new_with_pool_count(pool_count); + sled.create_database_records(&datastore, &opctx).await; + sled + } + + async fn support_bundle_create_expect_no_capacity( + datastore: &DataStore, + opctx: &OpContext, + this_nexus_id: OmicronZoneUuid, + ) { + let err = datastore + .support_bundle_create(&opctx, "for tests", this_nexus_id) + .await + .expect_err("Shouldn't provision bundle without datasets"); + let Error::InsufficientCapacity { message } = err else { + panic!("Unexpected error: {err:?} - we expected 'InsufficientCapacity'"); + }; + assert_eq!( + CANNOT_ALLOCATE_ERR_MSG, + message.external_message(), + "Unexpected error: {message:?}" + ); + } + + #[tokio::test] + async fn test_bundle_create_capacity_limits() { + let logctx = dev::test_setup_log("test_bundle_create_capacity_limits"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let this_nexus_id = OmicronZoneUuid::new_v4(); + + // No sleds, no datasets. Allocation should fail. + + support_bundle_create_expect_no_capacity( + &datastore, + &opctx, + this_nexus_id, + ) + .await; + + // Create a sled with a couple pools. Allocation should succeed. + + const POOL_COUNT: usize = 2; + let _test_sled = + create_sled_and_zpools(&datastore, &opctx, POOL_COUNT).await; + let mut bundles = vec![]; + for _ in 0..POOL_COUNT { + bundles.push( + datastore + .support_bundle_create( + &opctx, + "for the test", + this_nexus_id, + ) + .await + .expect("Should be able to create bundle"), + ); + } + + // If we try to allocate any more bundles, we'll run out of capacity. + + support_bundle_create_expect_no_capacity( + &datastore, + &opctx, + this_nexus_id, + ) + .await; + + // If we cancel a bundle, it isn't deleted (yet). + // This operation should signify that we can start to free up + // storage on the dataset, but that needs to happen outside the + // database. + // + // We should still expect to hit capacity limits. + + datastore + .support_bundle_cancel(&opctx, bundles[0].id.into()) + .await + .expect("Should be able to cancel this bundle"); + support_bundle_create_expect_no_capacity( + &datastore, + &opctx, + this_nexus_id, + ) + .await; + + // If we delete a bundle, it should be gone. This means we can + // re-allocate from that dataset which was just freed up. + + datastore + .support_bundle_delete(&opctx, bundles[0].id.into()) + .await + .expect("Should be able to cancel this bundle"); + datastore + .support_bundle_create(&opctx, "for the test", this_nexus_id) + .await + .expect("Should be able to create bundle"); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_crud_operations() { + let logctx = dev::test_setup_log("test_crud_operations"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let test_sled = create_sled_and_zpools(&datastore, &opctx, 1).await; + let reason = "Bundle for test"; + let this_nexus_id = OmicronZoneUuid::new_v4(); + + // Create the bundle, then observe it through the "getter" APIs + + let mut bundle = datastore + .support_bundle_create(&opctx, reason, this_nexus_id) + .await + .expect("Should be able to create bundle"); + assert_eq!(bundle.reason_for_creation, reason); + assert_eq!(bundle.reason_for_failure, None); + assert_eq!(bundle.assigned_nexus, Some(this_nexus_id.into())); + assert_eq!(bundle.state, SupportBundleState::Collecting); + assert_eq!(bundle.zpool_id, test_sled.pools[0].pool.into()); + assert_eq!(bundle.dataset_id, test_sled.pools[0].dataset.into()); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just created"); + // Overwrite this column; it is modified slightly upon database insertion. + bundle.time_created = observed_bundle.time_created; + assert_eq!(bundle, observed_bundle); + + let pagparams = DataPageParams::max_page(); + let observed_bundles = datastore + .support_bundle_list(&opctx, &pagparams) + .await + .expect("Should be able to get bundle we just created"); + assert_eq!(1, observed_bundles.len()); + assert_eq!(bundle, observed_bundles[0]); + + // Cancel the bundle, observe the new state + + datastore + .support_bundle_cancel(&opctx, bundle.id.into()) + .await + .expect("Should be able to cancel our bundle"); + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just created"); + assert_eq!(SupportBundleState::Cancelling, observed_bundle.state); + + // Delete the bundle, observe that it's gone + + datastore + .support_bundle_delete(&opctx, bundle.id.into()) + .await + .expect("Should be able to cancel our bundle"); + let observed_bundles = datastore + .support_bundle_list(&opctx, &pagparams) + .await + .expect("Should be able to get bundle we just created"); + assert!(observed_bundles.is_empty()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + fn get_nexuses_from_blueprint( + bp: &Blueprint, + filter: BlueprintZoneFilter, + ) -> Vec { + bp.blueprint_zones + .values() + .map(|zones_config| { + let mut nexus_zones = vec![]; + for zone in &zones_config.zones { + if matches!(zone.zone_type, BlueprintZoneType::Nexus(_)) + && zone.disposition.matches(filter) + { + nexus_zones.push(zone.id); + } + } + nexus_zones + }) + .flatten() + .collect() + } + + fn get_debug_datasets_from_blueprint( + bp: &Blueprint, + filter: BlueprintDatasetFilter, + ) -> Vec { + bp.blueprint_datasets + .values() + .map(|datasets_config| { + let mut debug_datasets = vec![]; + for dataset in datasets_config.datasets.values() { + if matches!(dataset.kind, DebugDatasetKind) + && dataset.disposition.matches(filter) + { + debug_datasets.push(dataset.id); + } + } + debug_datasets + }) + .flatten() + .collect() + } + + #[tokio::test] + async fn test_bundle_expungement() { + static TEST_NAME: &str = "test_bundle_expungement"; + let logctx = dev::test_setup_log(TEST_NAME); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let mut rng = SimRngState::from_seed(TEST_NAME); + let (_example, mut bp1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .build(); + + // Weirdly, the "ExampleSystemBuilder" blueprint has a parent blueprint, + // but which isn't exposed through the API. Since we're only able to see + // the blueprint it emits, that means we can't actually make it the + // target because "the parent blueprint is not the current target". + // + // Instead of dealing with that, we lie: claim this is the primordial + // blueprint, with no parent. + // + // Regardless, make this starter blueprint our target. + bp1.parent_blueprint_id = None; + bp_insert_and_make_target(&opctx, &datastore, &bp1).await; + + // Manually perform the equivalent of blueprint execution to populate + // database records. + let sleds = TestSled::new_from_blueprint(&bp1); + for sled in &sleds { + sled.create_database_records(&datastore, &opctx).await; + } + + // Extract Nexus and Dataset information from the generated blueprint. + let this_nexus_id = get_nexuses_from_blueprint( + &bp1, + BlueprintZoneFilter::ShouldBeRunning, + ) + .get(0) + .map(|id| *id) + .expect("There should be a Nexus in the example blueprint"); + let debug_datasets = get_debug_datasets_from_blueprint( + &bp1, + BlueprintDatasetFilter::InService, + ); + assert!(!debug_datasets.is_empty()); + + // When we create a bundle, it should exist on a dataset provisioned by + // the blueprint. + let bundle = datastore + .support_bundle_create(&opctx, "for the test", this_nexus_id) + .await + .expect("Should be able to create bundle"); + assert_eq!(bundle.assigned_nexus, Some(this_nexus_id.into())); + + assert!( + debug_datasets.contains(&DatasetUuid::from(bundle.dataset_id)), + "Bundle should have been allocated from a blueprint dataset" + ); + + // Expunge the bundle's dataset. + let bp2 = { + let mut bp2 = bp1.clone(); + bp2.id = Uuid::new_v4(); + bp2.parent_blueprint_id = Some(bp1.id); + + for datasets in bp2.blueprint_datasets.values_mut() { + for dataset in datasets.datasets.values_mut() { + if dataset.id == bundle.dataset_id.into() { + dataset.disposition = + BlueprintDatasetDisposition::Expunged; + } + } + } + bp2 + }; + bp_insert_and_make_target(&opctx, &datastore, &bp2).await; + + // TODO: Call this on bp1, observe no changes? + let report = datastore + .support_bundle_fail_expunged(&opctx, &bp2) + .await + .expect("Should have been able to mark bundle state as failed"); + + assert_eq!(1, report.bundles_failed_missing_datasets); + assert_eq!(0, report.bundles_cancelled_missing_nexus); + assert_eq!(0, report.bundles_failed_missing_nexus); + + // TODO: Another test, maybe? + // TODO: Find the nexus where we allocated the bundle + // TODO: expunge it + + db.terminate().await; + logctx.cleanup_successful(); + } +} diff --git a/nexus/types/src/external_api/shared.rs b/nexus/types/src/external_api/shared.rs index 2928fab46bc..b53dd78cdba 100644 --- a/nexus/types/src/external_api/shared.rs +++ b/nexus/types/src/external_api/shared.rs @@ -10,6 +10,7 @@ use chrono::DateTime; use chrono::Utc; use omicron_common::api::external::Name; use omicron_common::api::internal::shared::NetworkInterface; +use omicron_uuid_kinds::SupportBundleUuid; use parse_display::FromStr; use schemars::JsonSchema; use serde::de::Error as _; @@ -444,8 +445,7 @@ pub enum SupportBundleState { #[derive(Debug, Clone, JsonSchema, Serialize, Deserialize)] pub struct SupportBundleInfo { - // XXX: Strongly-typed Support Bundle UUID? - pub id: Uuid, + pub id: SupportBundleUuid, pub time_created: DateTime, pub reason_for_creation: String, pub reason_for_failure: Option, diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 12883554630..fe29b2e65f0 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2390,24 +2390,55 @@ CREATE TABLE IF NOT EXISTS omicron.public.tuf_repo_artifact ( CREATE TYPE IF NOT EXISTS omicron.public.support_bundle_state AS ENUM ( + -- The bundle is currently being created. + -- + -- All storage for this bundle exists in Nexus's transient filesystem, as it + -- is being assembled. 'collecting', + + -- The bundle has been created, and is being transferred to a Sled. + -- + -- This bundle may been partially allocated to a Sled. + 'collected', + + -- The bundle has been explicitly marked cancelled. + -- + -- It may or may not have durable storage which needs to be freed. 'cancelling', + + -- The bundle, for whatever reason, has failed. + -- + -- It should no longer have any managed durable storage. 'failed', + + -- The bundle has been collected successfully, and has storage on + -- a particular sled. 'active' ); CREATE TABLE IF NOT EXISTS omicron.public.support_bundle ( - id UUID NOT NULL, + id UUID PRIMARY KEY, time_created TIMESTAMPTZ NOT NULL, reason_for_creation TEXT NOT NULL, reason_for_failure TEXT, state omicron.public.support_bundle_state NOT NULL, - zpool_id NOT NULL UUID, - dataset_id NOT NULL UUID, + zpool_id UUID NOT NULL, + dataset_id UUID NOT NULL, -- The Nexus which is in charge of collecting the support bundle, -- and later managing its storage. - assigned_nexus UUID, + assigned_nexus UUID +); + +-- The "UNIQUE" part of this index helps enforce that we allow one support bundle +-- per debug dataset. This constraint can be removed, if the query responsible +-- for allocation changes to allocate more intelligently. +CREATE UNIQUE INDEX IF NOT EXISTS one_bundle_per_dataset ON omicron.public.support_bundle ( + dataset_id +); + +CREATE INDEX IF NOT EXISTS lookup_bundle_by_neuxs ON omicron.public.support_bundle ( + assigned_nexus ); /*******************************************************************/ diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 7947062a82b..50b263fa0b6 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -65,6 +65,7 @@ impl_typed_uuid_kind! { RackReset => "rack_reset", Region => "region", Sled => "sled", + SupportBundle => "support_bundle", TufRepo => "tuf_repo", Upstairs => "upstairs", UpstairsRepair => "upstairs_repair", From d6d53c974e66e64a48d99f83e3319d6fcd4a6973 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 12 Nov 2024 17:46:47 -0800 Subject: [PATCH 08/22] Refactor state machine --- nexus/db-model/src/support_bundle.rs | 39 +- .../src/db/datastore/support_bundle.rs | 522 ++++++++++++++---- nexus/types/src/external_api/shared.rs | 12 +- schema/crdb/dbinit.sql | 33 +- 4 files changed, 455 insertions(+), 151 deletions(-) diff --git a/nexus/db-model/src/support_bundle.rs b/nexus/db-model/src/support_bundle.rs index f2450473f54..0e530708770 100644 --- a/nexus/db-model/src/support_bundle.rs +++ b/nexus/db-model/src/support_bundle.rs @@ -30,22 +30,28 @@ impl_enum_type!( // Enum values Collecting => b"collecting" - Collected => b"collected" - Cancelling => b"cancelling" - Failed => b"failed" Active => b"active" + Destroying => b"destroying" + Failing => b"failing" + Failed => b"failed" ); impl SupportBundleState { - pub fn might_have_dataset_storage(&self) -> bool { + /// Returns the list of valid prior states. + /// + /// This is used to confirm that state updates are performed legally, + /// and defines the possible state transitions. + pub fn valid_old_states(&self) -> Vec { use SupportBundleState::*; match self { - Collecting => false, - Collected => true, - Cancelling => true, - Failed => false, - Active => true, + Collecting => vec![], + Active => vec![Collecting], + // The "Destroying" state is terminal. + Destroying => vec![Active, Collecting, Failing], + Failing => vec![Collecting, Active], + // The "Failed" state is terminal. + Failed => vec![Collecting, Active, Failing], } } } @@ -56,12 +62,17 @@ impl From for SupportBundleStateView { match state { Collecting => SupportBundleStateView::Collecting, - // The distinction between "collected" and "collecting" is an - // internal detail that doesn't need to be exposed through the API. - Collected => SupportBundleStateView::Collecting, - Cancelling => SupportBundleStateView::Cancelling, - Failed => SupportBundleStateView::Failed, Active => SupportBundleStateView::Active, + Destroying => SupportBundleStateView::Destroying, + // The distinction between "failing" and "failed" should not be + // visible to end-users. This is internal book-keeping to decide + // whether or not the bundle record can be safely deleted. + // + // Either way, it should be possible to delete the bundle. + // - "Failing" bundles will become "Destroying" + // - "Failed" bundles will be destroyed immediately + Failing => SupportBundleStateView::Failed, + Failed => SupportBundleStateView::Failed, } } } diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index d289127f4b7..4d4a6645c2a 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -15,6 +15,7 @@ use crate::db::model::DatasetKind; use crate::db::model::SupportBundle; use crate::db::model::SupportBundleState; use crate::db::pagination::paginated; +use crate::db::update_and_check::{UpdateAndCheck, UpdateStatus}; use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; @@ -39,12 +40,29 @@ const CANNOT_ALLOCATE_ERR_MSG: &'static str = no disks are available. Either delete old support bundles or add additional \ disks"; +const FAILURE_REASON_NO_DATASET: &'static str = + "Allocated dataset no longer exists"; +const FAILURE_REASON_NO_NEXUS: &'static str = + "Nexus managing this bundle no longer exists"; + /// Provides a report on how many bundle were expunged, and why. -#[derive(Debug, Clone)] +#[derive(Default, Debug, Clone, PartialEq)] pub struct SupportBundleExpungementReport { + /// Bundles marked "failed" because the datasets storing them have been + /// expunged. pub bundles_failed_missing_datasets: usize, - pub bundles_cancelled_missing_nexus: usize, - pub bundles_failed_missing_nexus: usize, + /// Bundles deleted because the datasets storing them have been + /// expunged. + pub bundles_deleted_missing_datasets: usize, + + /// Bundles marked "destroying" because the nexuses managing them have been + /// expunged. + /// + /// These bundles should be re-assigned to a different nexus for cleanup. + pub bundles_failing_missing_nexus: usize, + + /// Bundles which had a new Nexus assigned to them. + pub bundles_reassigned: usize, } impl DataStore { @@ -248,110 +266,108 @@ impl DataStore { async move { use db::schema::support_bundle::dsl; - // Find all bundles on datasets that no longer exist, and - // mark them "failed". + // Find all bundles without backing storage. let bundles_with_bad_datasets = dsl::support_bundle .filter(dsl::dataset_id.eq_any(invalid_datasets)) .select(SupportBundle::as_select()) .load_async(&*conn) .await?; + // Split these bundles into two categories: + // - Ones that are being destroyed anyway, and that can be + // fully deleted. + // - Ones that are NOT being destroyed, and should be marked + // failed so the end-user has visibility into their + // destruction. + let (bundles_to_delete, bundles_to_fail): (Vec<_>, Vec<_>) = + bundles_with_bad_datasets.into_iter().partition( + |bundle| bundle.state == SupportBundleState::Destroying + ); + let bundles_to_delete = bundles_to_delete.into_iter().map(|b| b.id).collect::>(); + let bundles_to_fail = bundles_to_fail.into_iter().map(|b| b.id).collect::>(); + + // Find all non-destroying bundles on datasets that no + // longer exist, and mark them "failed". They skip the + // "failing" state because there is no remaining storage to + // be cleaned up. + let state = SupportBundleState::Failed; let bundles_failed_missing_datasets = diesel::update(dsl::support_bundle) - .filter(dsl::state.ne(SupportBundleState::Failed)) - .filter( - dsl::id.eq_any( - bundles_with_bad_datasets - .iter() - .map(|b| b.id) - .collect::>(), - ), - ) + .filter(dsl::state.eq_any(state.valid_old_states())) + .filter(dsl::id.eq_any(bundles_to_fail)) .set(( - dsl::state.eq(SupportBundleState::Failed), - dsl::reason_for_failure - .eq("Allocated dataset no longer exists"), + dsl::state.eq(state), + dsl::reason_for_failure.eq(FAILURE_REASON_NO_DATASET), )) .execute_async(&*conn) .await?; + // For bundles that are in the process of being destroyed, + // the dataset expungement speeds up the process. + let bundles_deleted_missing_datasets = + diesel::delete(dsl::support_bundle) + .filter(dsl::state.eq_any(state.valid_old_states())) + .filter(dsl::id.eq_any(bundles_to_delete)) + // This check should be redundant (we already + // partitioned above based on this state) but out of + // an abundance of catuion we don't auto-delete a + // bundle in any other state. + .filter(dsl::state.eq(SupportBundleState::Destroying)) + .execute_async(&*conn) + .await?; let Some(arbitrary_valid_nexus) = valid_nexus_zones.get(0).cloned() else { return Err(external::Error::internal_error( - "No valid Nexuses", + "No valid Nexuses, we cannot re-assign this support bundle", ) .into()); }; // Find all bundles on nexuses that no longer exist. - // - // If the bundle might have storage: - // - Mark it cancelled and re-assign the managing Nexus - // - Otherwise: mark it failed let bundles_with_bad_nexuses = dsl::support_bundle .filter(dsl::assigned_nexus.eq_any(invalid_nexus_zones)) .select(SupportBundle::as_select()) .load_async(&*conn) .await?; - let (needs_cancellation, needs_failure): (Vec<_>, Vec<_>) = - bundles_with_bad_nexuses.into_iter().partition( - |bundle| bundle.state.might_have_dataset_storage(), - ); - // Mark these support bundles as cancelled, and assign then + let bundles_to_mark_failing = bundles_with_bad_nexuses.iter() + .map(|b| b.id).collect::>(); + let bundles_to_reassign = bundles_with_bad_nexuses.iter() + .filter_map(|bundle| { + if bundle.state != SupportBundleState::Failed { + Some(bundle.id) + } else { + None + } + }).collect::>(); + + // Mark these support bundles as failing, and assign then // to a nexus that should still exist. // // This should lead to their storage being freed, if it // exists. - let bundles_cancelled_missing_nexus = diesel::update( - dsl::support_bundle, - ) - .filter(dsl::state.ne(SupportBundleState::Cancelling)) - .filter(dsl::state.ne(SupportBundleState::Failed)) - .filter( - dsl::id.eq_any( - needs_cancellation - .iter() - .map(|b| b.id) - .collect::>(), - ), - ) - .set(( - dsl::assigned_nexus.eq(arbitrary_valid_nexus), - dsl::state.eq(SupportBundleState::Cancelling), - dsl::reason_for_failure - .eq("Nexus managing this bundle no longer exists"), - )) - .execute_async(&*conn) - .await?; - - // Mark these support bundles as failed. - // - // If they don't have storage (e.g., they never got that - // far, or their underlying dataset was expunged) there - // isn't anything left to free. This means we can skip the - // "Cancelling" state and jump to "Failed". - let bundles_failed_missing_nexus = diesel::update( - dsl::support_bundle, - ) - .filter(dsl::state.ne(SupportBundleState::Cancelling)) - .filter(dsl::state.ne(SupportBundleState::Failed)) - .filter(dsl::id.eq_any( - needs_failure.iter().map(|b| b.id).collect::>(), - )) - .set(( - dsl::state.eq(SupportBundleState::Failed), - dsl::reason_for_failure - .eq("Nexus managing this bundle no longer exists"), - )) - .execute_async(&*conn) - .await?; + let state = SupportBundleState::Failing; + let bundles_failing_missing_nexus = diesel::update(dsl::support_bundle) + .filter(dsl::state.eq_any(state.valid_old_states())) + .filter(dsl::id.eq_any(bundles_to_mark_failing)) + .set(( + dsl::state.eq(state), + dsl::reason_for_failure.eq(FAILURE_REASON_NO_NEXUS), + )) + .execute_async(&*conn) + .await?; + let bundles_reassigned = diesel::update(dsl::support_bundle) + .filter(dsl::id.eq_any(bundles_to_reassign)) + .set(dsl::assigned_nexus.eq(arbitrary_valid_nexus)) + .execute_async(&*conn) + .await?; Ok(SupportBundleExpungementReport { bundles_failed_missing_datasets, - bundles_cancelled_missing_nexus, - bundles_failed_missing_nexus, + bundles_deleted_missing_datasets, + bundles_failing_missing_nexus, + bundles_reassigned, }) } .boxed() @@ -360,29 +376,35 @@ impl DataStore { .await } - /// Cancels a single support bundle. - /// - /// Note that this does not delete the support bundle record, but starts the - /// process by which it may later be deleted. - pub async fn support_bundle_cancel( + /// Updates the state of a support bundle + pub async fn support_bundle_update( &self, opctx: &OpContext, id: SupportBundleUuid, + state: SupportBundleState, ) -> Result<(), Error> { opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; - use db::schema::support_bundle::dsl as support_bundle_dsl; - + use db::schema::support_bundle::dsl; let conn = self.pool_connection_authorized(opctx).await?; - diesel::update(support_bundle_dsl::support_bundle) - .filter(support_bundle_dsl::id.eq(id.into_untyped_uuid())) - .set(support_bundle_dsl::state.eq(SupportBundleState::Cancelling)) - .execute_async(&*conn) + let result = diesel::update(dsl::support_bundle) + .filter(dsl::id.eq(id.into_untyped_uuid())) + .filter(dsl::state.eq_any(state.valid_old_states())) + .set(dsl::state.eq(state)) + .check_if_exists::(id.into_untyped_uuid()) + .execute_and_check(&*conn) .await - .map(|_rows_modified| ()) .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - Ok(()) + match result.status { + UpdateStatus::Updated => Ok(()), + UpdateStatus::NotUpdatedButExists => { + Err(Error::invalid_request(format!( + "Cannot update support bundle state from {:?} to {:?}", + result.found.state, state + ))) + } + } } /// Deletes a support bundle. @@ -396,11 +418,16 @@ impl DataStore { ) -> Result<(), Error> { opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; - use db::schema::support_bundle::dsl as support_bundle_dsl; + use db::schema::support_bundle::dsl; let conn = self.pool_connection_authorized(opctx).await?; - diesel::delete(support_bundle_dsl::support_bundle) - .filter(support_bundle_dsl::id.eq(id.into_untyped_uuid())) + diesel::delete(dsl::support_bundle) + .filter( + dsl::state + .eq(SupportBundleState::Destroying) + .or(dsl::state.eq(SupportBundleState::Failed)), + ) + .filter(dsl::id.eq(id.into_untyped_uuid())) .execute_async(&*conn) .await .map(|_rows_modified| ()) @@ -425,6 +452,7 @@ mod test { use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintDatasetDisposition; use nexus_types::deployment::BlueprintDatasetFilter; + use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneFilter; use nexus_types::deployment::BlueprintZoneType; use omicron_common::api::internal::shared::DatasetKind::Debug as DebugDatasetKind; @@ -614,7 +642,7 @@ mod test { ) .await; - // If we cancel a bundle, it isn't deleted (yet). + // If we destroy a bundle, it isn't deleted (yet). // This operation should signify that we can start to free up // storage on the dataset, but that needs to happen outside the // database. @@ -622,9 +650,13 @@ mod test { // We should still expect to hit capacity limits. datastore - .support_bundle_cancel(&opctx, bundles[0].id.into()) + .support_bundle_update( + &opctx, + bundles[0].id.into(), + SupportBundleState::Destroying, + ) .await - .expect("Should be able to cancel this bundle"); + .expect("Should be able to destroy this bundle"); support_bundle_create_expect_no_capacity( &datastore, &opctx, @@ -638,7 +670,7 @@ mod test { datastore .support_bundle_delete(&opctx, bundles[0].id.into()) .await - .expect("Should be able to cancel this bundle"); + .expect("Should be able to destroy this bundle"); datastore .support_bundle_create(&opctx, "for the test", this_nexus_id) .await @@ -687,24 +719,28 @@ mod test { assert_eq!(1, observed_bundles.len()); assert_eq!(bundle, observed_bundles[0]); - // Cancel the bundle, observe the new state + // Destroy the bundle, observe the new state datastore - .support_bundle_cancel(&opctx, bundle.id.into()) + .support_bundle_update( + &opctx, + bundle.id.into(), + SupportBundleState::Destroying, + ) .await - .expect("Should be able to cancel our bundle"); + .expect("Should be able to destroy our bundle"); let observed_bundle = datastore .support_bundle_get(&opctx, bundle.id.into()) .await .expect("Should be able to get bundle we just created"); - assert_eq!(SupportBundleState::Cancelling, observed_bundle.state); + assert_eq!(SupportBundleState::Destroying, observed_bundle.state); // Delete the bundle, observe that it's gone datastore .support_bundle_delete(&opctx, bundle.id.into()) .await - .expect("Should be able to cancel our bundle"); + .expect("Should be able to destroy our bundle"); let observed_bundles = datastore .support_bundle_list(&opctx, &pagparams) .await @@ -757,9 +793,29 @@ mod test { .collect() } + fn expunge_dataset_for_bundle(bp: &mut Blueprint, bundle: &SupportBundle) { + for datasets in bp.blueprint_datasets.values_mut() { + for dataset in datasets.datasets.values_mut() { + if dataset.id == bundle.dataset_id.into() { + dataset.disposition = BlueprintDatasetDisposition::Expunged; + } + } + } + } + + fn expunge_nexus_for_bundle(bp: &mut Blueprint, bundle: &SupportBundle) { + for zones in bp.blueprint_zones.values_mut() { + for zone in &mut zones.zones { + if zone.id == bundle.assigned_nexus.unwrap().into() { + zone.disposition = BlueprintZoneDisposition::Expunged; + } + } + } + } + #[tokio::test] - async fn test_bundle_expungement() { - static TEST_NAME: &str = "test_bundle_expungement"; + async fn test_bundle_failed_from_expunged_dataset() { + static TEST_NAME: &str = "test_bundle_failed_from_expunged_dataset"; let logctx = dev::test_setup_log(TEST_NAME); let db = TestDatabase::new_with_datastore(&logctx.log).await; let (opctx, datastore) = (db.opctx(), db.datastore()); @@ -811,43 +867,277 @@ mod test { .await .expect("Should be able to create bundle"); assert_eq!(bundle.assigned_nexus, Some(this_nexus_id.into())); - assert!( debug_datasets.contains(&DatasetUuid::from(bundle.dataset_id)), "Bundle should have been allocated from a blueprint dataset" ); - // Expunge the bundle's dataset. + // If we try to "fail support bundles" from expunged datasets/nexuses, + // we should see a no-op. Nothing has been expunged yet! + let report = + datastore.support_bundle_fail_expunged(&opctx, &bp1).await.expect( + "Should have been able to perform no-op support bundle failure", + ); + assert_eq!(SupportBundleExpungementReport::default(), report); + + // Expunge the bundle's dataset (manually) let bp2 = { let mut bp2 = bp1.clone(); bp2.id = Uuid::new_v4(); bp2.parent_blueprint_id = Some(bp1.id); + expunge_dataset_for_bundle(&mut bp2, &bundle); + bp2 + }; + bp_insert_and_make_target(&opctx, &datastore, &bp2).await; - for datasets in bp2.blueprint_datasets.values_mut() { - for dataset in datasets.datasets.values_mut() { - if dataset.id == bundle.dataset_id.into() { - dataset.disposition = - BlueprintDatasetDisposition::Expunged; - } - } - } + datastore + .support_bundle_fail_expunged(&opctx, &bp1) + .await + .expect_err("bp1 is no longer the target; this should fail"); + let report = datastore + .support_bundle_fail_expunged(&opctx, &bp2) + .await + .expect("Should have been able to mark bundle state as failed"); + assert_eq!( + SupportBundleExpungementReport { + bundles_failed_missing_datasets: 1, + ..Default::default() + }, + report + ); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just failed"); + assert_eq!(SupportBundleState::Failed, observed_bundle.state); + assert!(observed_bundle + .reason_for_failure + .unwrap() + .contains(FAILURE_REASON_NO_DATASET)); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_bundle_failed_from_expunged_nexus_no_reassign() { + static TEST_NAME: &str = + "test_bundle_failed_from_expunged_nexus_no_reassign"; + let logctx = dev::test_setup_log(TEST_NAME); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let mut rng = SimRngState::from_seed(TEST_NAME); + let (_example, mut bp1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .build(); + + bp1.parent_blueprint_id = None; + bp_insert_and_make_target(&opctx, &datastore, &bp1).await; + + // Manually perform the equivalent of blueprint execution to populate + // database records. + let sleds = TestSled::new_from_blueprint(&bp1); + for sled in &sleds { + sled.create_database_records(&datastore, &opctx).await; + } + + // Extract Nexus and Dataset information from the generated blueprint. + let nexus_ids = get_nexuses_from_blueprint( + &bp1, + BlueprintZoneFilter::ShouldBeRunning, + ); + let debug_datasets = get_debug_datasets_from_blueprint( + &bp1, + BlueprintDatasetFilter::InService, + ); + assert!(!debug_datasets.is_empty()); + + // When we create a bundle, it should exist on a dataset provisioned by + // the blueprint. + let bundle = datastore + .support_bundle_create(&opctx, "for the test", nexus_ids[0]) + .await + .expect("Should be able to create bundle"); + + assert_eq!(bundle.state, SupportBundleState::Collecting); + assert_eq!(bundle.assigned_nexus, Some(nexus_ids[0].into())); + assert!( + debug_datasets.contains(&DatasetUuid::from(bundle.dataset_id)), + "Bundle should have been allocated from a blueprint dataset" + ); + + // Expunge the bundle's dataset. This marks it as "failed", and + // is a prerequisite for the bundle not later being re-assigned. + let bp2 = { + let mut bp2 = bp1.clone(); + bp2.id = Uuid::new_v4(); + bp2.parent_blueprint_id = Some(bp1.id); + expunge_dataset_for_bundle(&mut bp2, &bundle); bp2 }; bp_insert_and_make_target(&opctx, &datastore, &bp2).await; - // TODO: Call this on bp1, observe no changes? let report = datastore .support_bundle_fail_expunged(&opctx, &bp2) .await .expect("Should have been able to mark bundle state as failed"); + assert_eq!( + SupportBundleExpungementReport { + bundles_failed_missing_datasets: 1, + ..Default::default() + }, + report + ); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just failed"); + assert_eq!(SupportBundleState::Failed, observed_bundle.state); + assert!(observed_bundle + .reason_for_failure + .unwrap() + .contains(FAILURE_REASON_NO_DATASET)); + + // Expunge the bundle's Nexus + let bp3 = { + let mut bp3 = bp2.clone(); + bp3.id = Uuid::new_v4(); + bp3.parent_blueprint_id = Some(bp2.id); + expunge_nexus_for_bundle(&mut bp3, &bundle); + bp3 + }; + bp_insert_and_make_target(&opctx, &datastore, &bp3).await; + + let report = datastore + .support_bundle_fail_expunged(&opctx, &bp3) + .await + .expect("Should have been able to mark bundle state as failed"); + + // Although the record for this bundle already exists, it is not + // re-assigned, and the original reason for it failing (dataset loss) is + // preserved. + assert_eq!(SupportBundleExpungementReport::default(), report); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just failed"); + assert_eq!(SupportBundleState::Failed, observed_bundle.state); + assert!(observed_bundle + .reason_for_failure + .unwrap() + .contains(FAILURE_REASON_NO_DATASET)); + + datastore + .support_bundle_delete(&opctx, bundle.id.into()) + .await + .expect("Should have been able to delete support bundle"); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_bundle_failed_from_expunged_nexus_with_reassign() { + static TEST_NAME: &str = + "test_bundle_failed_from_expunged_nexus_with_reassign"; + let logctx = dev::test_setup_log(TEST_NAME); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); - assert_eq!(1, report.bundles_failed_missing_datasets); - assert_eq!(0, report.bundles_cancelled_missing_nexus); - assert_eq!(0, report.bundles_failed_missing_nexus); + let mut rng = SimRngState::from_seed(TEST_NAME); + let (_example, mut bp1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .build(); + + bp1.parent_blueprint_id = None; + bp_insert_and_make_target(&opctx, &datastore, &bp1).await; - // TODO: Another test, maybe? - // TODO: Find the nexus where we allocated the bundle - // TODO: expunge it + // Manually perform the equivalent of blueprint execution to populate + // database records. + let sleds = TestSled::new_from_blueprint(&bp1); + for sled in &sleds { + sled.create_database_records(&datastore, &opctx).await; + } + + // Extract Nexus and Dataset information from the generated blueprint. + let nexus_ids = get_nexuses_from_blueprint( + &bp1, + BlueprintZoneFilter::ShouldBeRunning, + ); + let debug_datasets = get_debug_datasets_from_blueprint( + &bp1, + BlueprintDatasetFilter::InService, + ); + assert!(!debug_datasets.is_empty()); + + // When we create a bundle, it should exist on a dataset provisioned by + // the blueprint. + let bundle = datastore + .support_bundle_create(&opctx, "for the test", nexus_ids[0]) + .await + .expect("Should be able to create bundle"); + + assert_eq!(bundle.state, SupportBundleState::Collecting); + assert_eq!(bundle.assigned_nexus, Some(nexus_ids[0].into())); + assert!( + debug_datasets.contains(&DatasetUuid::from(bundle.dataset_id)), + "Bundle should have been allocated from a blueprint dataset" + ); + + // Update the bundle's state. + // + // This is what we would do when we finish collecting, and + // provisioned storage on a sled. + datastore + .support_bundle_update( + &opctx, + bundle.id.into(), + SupportBundleState::Active, + ) + .await + .expect("Should have been able to update state"); + + // Expunge the bundle's Nexus (manually) + let bp2 = { + let mut bp2 = bp1.clone(); + bp2.id = Uuid::new_v4(); + bp2.parent_blueprint_id = Some(bp1.id); + expunge_nexus_for_bundle(&mut bp2, &bundle); + bp2 + }; + bp_insert_and_make_target(&opctx, &datastore, &bp2).await; + + let report = datastore + .support_bundle_fail_expunged(&opctx, &bp2) + .await + .expect("Should have been able to mark bundle state as destroying"); + + assert_eq!( + SupportBundleExpungementReport { + bundles_failing_missing_nexus: 1, + bundles_reassigned: 1, + ..Default::default() + }, + report + ); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just failed"); + assert_eq!(SupportBundleState::Failing, observed_bundle.state); + assert!(observed_bundle + .reason_for_failure + .unwrap() + .contains(FAILURE_REASON_NO_NEXUS)); db.terminate().await; logctx.cleanup_successful(); diff --git a/nexus/types/src/external_api/shared.rs b/nexus/types/src/external_api/shared.rs index b53dd78cdba..9e9bc26ffe8 100644 --- a/nexus/types/src/external_api/shared.rs +++ b/nexus/types/src/external_api/shared.rs @@ -423,20 +423,22 @@ pub enum SupportBundleState { /// Support Bundle still actively being collected. /// /// This is the initial state for a Support Bundle, and it will - /// automatically transition to either "Failed" or "Active". + /// automatically transition to either "Failing" or "Active". /// /// If a user no longer wants to access a Support Bundle, they can - /// request cancellation, which will transition to the "Cancelling" state. + /// request cancellation, which will transition to the "Destroying" state. Collecting, - /// Support Bundle has been cancelled, and will be deleted shortly. - Cancelling, + /// Support Bundle is being destroyed. + /// + /// Once backing storage has been freed, this bundle is destroyed. + Destroying, /// Support Bundle was not created successfully, or was created and has lost /// backing storage. /// /// The record of the bundle still exists for readability, but the only - /// valid operation on these bundles is to delete them. + /// valid operation on these bundles is to destroy them. Failed, /// Support Bundle has been processed, and is ready for usage. diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index fe29b2e65f0..52af30612c7 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2392,28 +2392,29 @@ CREATE TABLE IF NOT EXISTS omicron.public.tuf_repo_artifact ( CREATE TYPE IF NOT EXISTS omicron.public.support_bundle_state AS ENUM ( -- The bundle is currently being created. -- - -- All storage for this bundle exists in Nexus's transient filesystem, as it - -- is being assembled. + -- It might have storage that is partially allocated on a sled. 'collecting', - -- The bundle has been created, and is being transferred to a Sled. - -- - -- This bundle may been partially allocated to a Sled. - 'collected', + -- The bundle has been collected successfully, and has storage on + -- a particular sled. + 'active', - -- The bundle has been explicitly marked cancelled. - -- - -- It may or may not have durable storage which needs to be freed. - 'cancelling', + -- The user has explicitly requested that a bundle be destroyed. + -- We must ensure that storage backing that bundle is gone before + -- it is automatically deleted. + 'destroying', - -- The bundle, for whatever reason, has failed. + -- The support bundle is failing. + -- This happens when Nexus is expunged partway through collection. -- - -- It should no longer have any managed durable storage. - 'failed', + -- A different Nexus must ensure that storage is gone before the + -- bundle can be marked "failed". + 'failing', - -- The bundle has been collected successfully, and has storage on - -- a particular sled. - 'active' + -- The bundle has finished failing. + -- + -- The only action that can be taken on this bundle is to delete it. + 'failed' ); CREATE TABLE IF NOT EXISTS omicron.public.support_bundle ( From 483801a9b0456ebc7d9352a9aaa24364491cb761 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 13 Nov 2024 11:33:30 -0800 Subject: [PATCH 09/22] Clippy, schema changes --- nexus/db-model/src/schema_versions.rs | 3 +- .../src/db/datastore/support_bundle.rs | 14 +++++----- openapi/nexus.json | 15 ++++++---- schema/crdb/dbinit.sql | 2 +- schema/crdb/support-bundles/up01.sql | 28 +++++++++++++++++++ schema/crdb/support-bundles/up02.sql | 14 ++++++++++ schema/crdb/support-bundles/up03.sql | 4 +++ schema/crdb/support-bundles/up04.sql | 4 +++ 8 files changed, 69 insertions(+), 15 deletions(-) create mode 100644 schema/crdb/support-bundles/up01.sql create mode 100644 schema/crdb/support-bundles/up02.sql create mode 100644 schema/crdb/support-bundles/up03.sql create mode 100644 schema/crdb/support-bundles/up04.sql diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 14eba1cb1cf..580b654b75c 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(113, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(114, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(114, "support-bundles"), KnownVersion::new(113, "add-tx-eq"), KnownVersion::new(112, "blueprint-dataset"), KnownVersion::new(111, "drop-omicron-zone-underlay-address"), diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 4d4a6645c2a..71d10700719 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -270,7 +270,7 @@ impl DataStore { let bundles_with_bad_datasets = dsl::support_bundle .filter(dsl::dataset_id.eq_any(invalid_datasets)) .select(SupportBundle::as_select()) - .load_async(&*conn) + .load_async(conn) .await?; // Split these bundles into two categories: @@ -299,7 +299,7 @@ impl DataStore { dsl::state.eq(state), dsl::reason_for_failure.eq(FAILURE_REASON_NO_DATASET), )) - .execute_async(&*conn) + .execute_async(conn) .await?; // For bundles that are in the process of being destroyed, // the dataset expungement speeds up the process. @@ -312,7 +312,7 @@ impl DataStore { // an abundance of catuion we don't auto-delete a // bundle in any other state. .filter(dsl::state.eq(SupportBundleState::Destroying)) - .execute_async(&*conn) + .execute_async(conn) .await?; let Some(arbitrary_valid_nexus) = @@ -328,7 +328,7 @@ impl DataStore { let bundles_with_bad_nexuses = dsl::support_bundle .filter(dsl::assigned_nexus.eq_any(invalid_nexus_zones)) .select(SupportBundle::as_select()) - .load_async(&*conn) + .load_async(conn) .await?; let bundles_to_mark_failing = bundles_with_bad_nexuses.iter() @@ -355,12 +355,12 @@ impl DataStore { dsl::state.eq(state), dsl::reason_for_failure.eq(FAILURE_REASON_NO_NEXUS), )) - .execute_async(&*conn) + .execute_async(conn) .await?; let bundles_reassigned = diesel::update(dsl::support_bundle) .filter(dsl::id.eq_any(bundles_to_reassign)) .set(dsl::assigned_nexus.eq(arbitrary_valid_nexus)) - .execute_async(&*conn) + .execute_async(conn) .await?; Ok(SupportBundleExpungementReport { @@ -392,7 +392,7 @@ impl DataStore { .filter(dsl::state.eq_any(state.valid_old_states())) .set(dsl::state.eq(state)) .check_if_exists::(id.into_untyped_uuid()) - .execute_and_check(&*conn) + .execute_and_check(&conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; diff --git a/openapi/nexus.json b/openapi/nexus.json index 1aaba49e9eb..18cb260fd08 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -20183,8 +20183,7 @@ "type": "object", "properties": { "id": { - "type": "string", - "format": "uuid" + "$ref": "#/components/schemas/TypedUuidForSupportBundleKind" }, "reason_for_creation": { "type": "string" @@ -20232,21 +20231,21 @@ "SupportBundleState": { "oneOf": [ { - "description": "Support Bundle still actively being collected.\n\nThis is the initial state for a Support Bundle, and it will automatically transition to either \"Failed\" or \"Active\".\n\nIf a user no longer wants to access a Support Bundle, they can request cancellation, which will transition to the \"Cancelling\" state.", + "description": "Support Bundle still actively being collected.\n\nThis is the initial state for a Support Bundle, and it will automatically transition to either \"Failing\" or \"Active\".\n\nIf a user no longer wants to access a Support Bundle, they can request cancellation, which will transition to the \"Destroying\" state.", "type": "string", "enum": [ "collecting" ] }, { - "description": "Support Bundle has been cancelled, and will be deleted shortly.", + "description": "Support Bundle is being destroyed.\n\nOnce backing storage has been freed, this bundle is destroyed.", "type": "string", "enum": [ - "cancelling" + "destroying" ] }, { - "description": "Support Bundle was not created successfully, or was created and has lost backing storage.\n\nThe record of the bundle still exists for readability, but the only valid operation on these bundles is to delete them.", + "description": "Support Bundle was not created successfully, or was created and has lost backing storage.\n\nThe record of the bundle still exists for readability, but the only valid operation on these bundles is to destroy them.", "type": "string", "enum": [ "failed" @@ -21275,6 +21274,10 @@ } } }, + "TypedUuidForSupportBundleKind": { + "type": "string", + "format": "uuid" + }, "UninitializedSled": { "description": "A sled that has not been added to an initialized rack yet", "type": "object", diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 52af30612c7..0beb32c92a6 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4661,7 +4661,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '113.0.0', NULL) + (TRUE, NOW(), NOW(), '114.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/support-bundles/up01.sql b/schema/crdb/support-bundles/up01.sql new file mode 100644 index 00000000000..1260e79e18c --- /dev/null +++ b/schema/crdb/support-bundles/up01.sql @@ -0,0 +1,28 @@ +CREATE TYPE IF NOT EXISTS omicron.public.support_bundle_state AS ENUM ( + -- The bundle is currently being created. + -- + -- It might have storage that is partially allocated on a sled. + 'collecting', + + -- The bundle has been collected successfully, and has storage on + -- a particular sled. + 'active', + + -- The user has explicitly requested that a bundle be destroyed. + -- We must ensure that storage backing that bundle is gone before + -- it is automatically deleted. + 'destroying', + + -- The support bundle is failing. + -- This happens when Nexus is expunged partway through collection. + -- + -- A different Nexus must ensure that storage is gone before the + -- bundle can be marked "failed". + 'failing', + + -- The bundle has finished failing. + -- + -- The only action that can be taken on this bundle is to delete it. + 'failed' +); + diff --git a/schema/crdb/support-bundles/up02.sql b/schema/crdb/support-bundles/up02.sql new file mode 100644 index 00000000000..bc61e704800 --- /dev/null +++ b/schema/crdb/support-bundles/up02.sql @@ -0,0 +1,14 @@ +CREATE TABLE IF NOT EXISTS omicron.public.support_bundle ( + id UUID PRIMARY KEY, + time_created TIMESTAMPTZ NOT NULL, + reason_for_creation TEXT NOT NULL, + reason_for_failure TEXT, + state omicron.public.support_bundle_state NOT NULL, + zpool_id UUID NOT NULL, + dataset_id UUID NOT NULL, + + -- The Nexus which is in charge of collecting the support bundle, + -- and later managing its storage. + assigned_nexus UUID +); + diff --git a/schema/crdb/support-bundles/up03.sql b/schema/crdb/support-bundles/up03.sql new file mode 100644 index 00000000000..8d4bdb47caa --- /dev/null +++ b/schema/crdb/support-bundles/up03.sql @@ -0,0 +1,4 @@ +CREATE UNIQUE INDEX IF NOT EXISTS one_bundle_per_dataset ON omicron.public.support_bundle ( + dataset_id +); + diff --git a/schema/crdb/support-bundles/up04.sql b/schema/crdb/support-bundles/up04.sql new file mode 100644 index 00000000000..6df95640fd9 --- /dev/null +++ b/schema/crdb/support-bundles/up04.sql @@ -0,0 +1,4 @@ +CREATE INDEX IF NOT EXISTS lookup_bundle_by_neuxs ON omicron.public.support_bundle ( + assigned_nexus +); + From 5f78af9015371bff473b50651172cc98212611fc Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 13 Nov 2024 11:34:56 -0800 Subject: [PATCH 10/22] more clippy --- nexus/db-queries/src/db/datastore/support_bundle.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 71d10700719..eb099a1a480 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -757,7 +757,7 @@ mod test { ) -> Vec { bp.blueprint_zones .values() - .map(|zones_config| { + .flat_map(|zones_config| { let mut nexus_zones = vec![]; for zone in &zones_config.zones { if matches!(zone.zone_type, BlueprintZoneType::Nexus(_)) @@ -768,7 +768,6 @@ mod test { } nexus_zones }) - .flatten() .collect() } @@ -778,7 +777,7 @@ mod test { ) -> Vec { bp.blueprint_datasets .values() - .map(|datasets_config| { + .flat_map(|datasets_config| { let mut debug_datasets = vec![]; for dataset in datasets_config.datasets.values() { if matches!(dataset.kind, DebugDatasetKind) @@ -789,7 +788,6 @@ mod test { } debug_datasets }) - .flatten() .collect() } From 92b7d62aa28bb4e9ae9e2791ba889e6880968a7b Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 13 Nov 2024 12:10:32 -0800 Subject: [PATCH 11/22] Strongly-typed UUID --- nexus/types/src/external_api/shared.rs | 16 +++++++++------- openapi/nexus.json | 15 +++++++++------ uuid-kinds/src/lib.rs | 1 + 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/nexus/types/src/external_api/shared.rs b/nexus/types/src/external_api/shared.rs index 2928fab46bc..9e9bc26ffe8 100644 --- a/nexus/types/src/external_api/shared.rs +++ b/nexus/types/src/external_api/shared.rs @@ -10,6 +10,7 @@ use chrono::DateTime; use chrono::Utc; use omicron_common::api::external::Name; use omicron_common::api::internal::shared::NetworkInterface; +use omicron_uuid_kinds::SupportBundleUuid; use parse_display::FromStr; use schemars::JsonSchema; use serde::de::Error as _; @@ -422,20 +423,22 @@ pub enum SupportBundleState { /// Support Bundle still actively being collected. /// /// This is the initial state for a Support Bundle, and it will - /// automatically transition to either "Failed" or "Active". + /// automatically transition to either "Failing" or "Active". /// /// If a user no longer wants to access a Support Bundle, they can - /// request cancellation, which will transition to the "Cancelling" state. + /// request cancellation, which will transition to the "Destroying" state. Collecting, - /// Support Bundle has been cancelled, and will be deleted shortly. - Cancelling, + /// Support Bundle is being destroyed. + /// + /// Once backing storage has been freed, this bundle is destroyed. + Destroying, /// Support Bundle was not created successfully, or was created and has lost /// backing storage. /// /// The record of the bundle still exists for readability, but the only - /// valid operation on these bundles is to delete them. + /// valid operation on these bundles is to destroy them. Failed, /// Support Bundle has been processed, and is ready for usage. @@ -444,8 +447,7 @@ pub enum SupportBundleState { #[derive(Debug, Clone, JsonSchema, Serialize, Deserialize)] pub struct SupportBundleInfo { - // XXX: Strongly-typed Support Bundle UUID? - pub id: Uuid, + pub id: SupportBundleUuid, pub time_created: DateTime, pub reason_for_creation: String, pub reason_for_failure: Option, diff --git a/openapi/nexus.json b/openapi/nexus.json index 1aaba49e9eb..18cb260fd08 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -20183,8 +20183,7 @@ "type": "object", "properties": { "id": { - "type": "string", - "format": "uuid" + "$ref": "#/components/schemas/TypedUuidForSupportBundleKind" }, "reason_for_creation": { "type": "string" @@ -20232,21 +20231,21 @@ "SupportBundleState": { "oneOf": [ { - "description": "Support Bundle still actively being collected.\n\nThis is the initial state for a Support Bundle, and it will automatically transition to either \"Failed\" or \"Active\".\n\nIf a user no longer wants to access a Support Bundle, they can request cancellation, which will transition to the \"Cancelling\" state.", + "description": "Support Bundle still actively being collected.\n\nThis is the initial state for a Support Bundle, and it will automatically transition to either \"Failing\" or \"Active\".\n\nIf a user no longer wants to access a Support Bundle, they can request cancellation, which will transition to the \"Destroying\" state.", "type": "string", "enum": [ "collecting" ] }, { - "description": "Support Bundle has been cancelled, and will be deleted shortly.", + "description": "Support Bundle is being destroyed.\n\nOnce backing storage has been freed, this bundle is destroyed.", "type": "string", "enum": [ - "cancelling" + "destroying" ] }, { - "description": "Support Bundle was not created successfully, or was created and has lost backing storage.\n\nThe record of the bundle still exists for readability, but the only valid operation on these bundles is to delete them.", + "description": "Support Bundle was not created successfully, or was created and has lost backing storage.\n\nThe record of the bundle still exists for readability, but the only valid operation on these bundles is to destroy them.", "type": "string", "enum": [ "failed" @@ -21275,6 +21274,10 @@ } } }, + "TypedUuidForSupportBundleKind": { + "type": "string", + "format": "uuid" + }, "UninitializedSled": { "description": "A sled that has not been added to an initialized rack yet", "type": "object", diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 7947062a82b..50b263fa0b6 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -65,6 +65,7 @@ impl_typed_uuid_kind! { RackReset => "rack_reset", Region => "region", Sled => "sled", + SupportBundle => "support_bundle", TufRepo => "tuf_repo", Upstairs => "upstairs", UpstairsRepair => "upstairs_repair", From 6c529d1b8cdf0f5b7b6b536eb07aca636f9e27a7 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 13 Nov 2024 16:30:15 -0800 Subject: [PATCH 12/22] Add list-by-nexus method, test it --- .../src/db/datastore/support_bundle.rs | 156 +++++++++++++++++- 1 file changed, 155 insertions(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index eb099a1a480..562b1ec2b12 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -194,6 +194,29 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + /// Lists one page of support bundles in a particular state, assigned to + /// a particular Nexus. + pub async fn support_bundle_list_assigned_to_nexus( + &self, + opctx: &OpContext, + pagparams: &DataPageParams<'_, Uuid>, + nexus_id: OmicronZoneUuid, + states: Vec, + ) -> ListResultVec { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + use db::schema::support_bundle::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + paginated(dsl::support_bundle, dsl::id, pagparams) + .filter(dsl::assigned_nexus.eq(nexus_id.into_untyped_uuid())) + .filter(dsl::state.eq_any(states)) + .order(dsl::time_created.asc()) + .select(SupportBundle::as_select()) + .load_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + /// Marks support bundles as failed if their assigned Nexus or backing /// dataset has been destroyed. pub async fn support_bundle_fail_expunged( @@ -376,7 +399,12 @@ impl DataStore { .await } - /// Updates the state of a support bundle + /// Updates the state of a support bundle. + /// + /// Returns: + /// - "Ok" if the bundle was updated successfully. + /// - "Err::InvalidRequest" if the bundle exists, but could not be updated + /// because the state transition is invalid. pub async fn support_bundle_update( &self, opctx: &OpContext, @@ -597,6 +625,132 @@ mod test { ); } + #[tokio::test] + async fn test_bundle_list_filtering() { + let logctx = dev::test_setup_log("test_bundle_create_capacity_limits"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let nexus_a = OmicronZoneUuid::new_v4(); + let nexus_b = OmicronZoneUuid::new_v4(); + + let _test_sled = create_sled_and_zpools(&datastore, &opctx, 5).await; + + let pagparams = DataPageParams::max_page(); + + // No bundles exist yet, so the list should be empty + + assert_eq!( + datastore + .support_bundle_list_assigned_to_nexus( + &opctx, + &pagparams, + nexus_a, + vec![SupportBundleState::Collecting] + ) + .await + .expect("Should always be able to list bundles"), + vec![] + ); + + // Create two bundles on "nexus A", one bundle on "nexus B" + + let bundle_a1 = datastore + .support_bundle_create(&opctx, "for the test", nexus_a) + .await + .expect("Should be able to create bundle"); + let bundle_a2 = datastore + .support_bundle_create(&opctx, "for the test", nexus_a) + .await + .expect("Should be able to create bundle"); + let bundle_b1 = datastore + .support_bundle_create(&opctx, "for the test", nexus_b) + .await + .expect("Should be able to create bundle"); + + assert_eq!( + datastore + .support_bundle_list_assigned_to_nexus( + &opctx, + &pagparams, + nexus_a, + vec![SupportBundleState::Collecting] + ) + .await + .expect("Should always be able to list bundles") + .iter() + .map(|b| b.id) + .collect::>(), + vec![bundle_a1.id, bundle_a2.id,] + ); + assert_eq!( + datastore + .support_bundle_list_assigned_to_nexus( + &opctx, + &pagparams, + nexus_b, + vec![SupportBundleState::Collecting] + ) + .await + .expect("Should always be able to list bundles") + .iter() + .map(|b| b.id) + .collect::>(), + vec![bundle_b1.id,] + ); + + // When we update the state of the bundles, the list results + // should also be filtered. + datastore + .support_bundle_update( + &opctx, + bundle_a1.id.into(), + SupportBundleState::Active, + ) + .await + .expect("Should have been able to update state"); + + // "bundle_a1" is no longer collecting, so it won't appear here. + assert_eq!( + datastore + .support_bundle_list_assigned_to_nexus( + &opctx, + &pagparams, + nexus_a, + vec![SupportBundleState::Collecting] + ) + .await + .expect("Should always be able to list bundles") + .iter() + .map(|b| b.id) + .collect::>(), + vec![bundle_a2.id,] + ); + + // ... but if we ask for enough states, it'll show up + assert_eq!( + datastore + .support_bundle_list_assigned_to_nexus( + &opctx, + &pagparams, + nexus_a, + vec![ + SupportBundleState::Active, + SupportBundleState::Collecting + ] + ) + .await + .expect("Should always be able to list bundles") + .iter() + .map(|b| b.id) + .collect::>(), + vec![bundle_a1.id, bundle_a2.id,] + ); + + db.terminate().await; + logctx.cleanup_successful(); + } + #[tokio::test] async fn test_bundle_create_capacity_limits() { let logctx = dev::test_setup_log("test_bundle_create_capacity_limits"); From e61365a1c000f22b06500685c23c1cc7e07babb3 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Nov 2024 13:21:23 -0800 Subject: [PATCH 13/22] merge --- nexus/test-utils/src/resource_helpers.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/test-utils/src/resource_helpers.rs b/nexus/test-utils/src/resource_helpers.rs index ce1fb38f016..6f704fac00e 100644 --- a/nexus/test-utils/src/resource_helpers.rs +++ b/nexus/test-utils/src/resource_helpers.rs @@ -1175,7 +1175,7 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { for disk in &disks_config.disks { self.add_zpool_with_dataset_ext( *sled_id, - disk.id, + PhysicalDiskUuid::from_untyped_uuid(disk.id), disk.pool_id, DatasetUuid::new_v4(), Self::DEFAULT_ZPOOL_SIZE_GIB, From 81c7121fa3c13e42167386dd579aaf86e70e1c08 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Nov 2024 13:23:41 -0800 Subject: [PATCH 14/22] Merge UUID changes --- nexus/db-queries/src/db/datastore/support_bundle.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 562b1ec2b12..b89ce5cec09 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -28,7 +28,6 @@ use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_uuid_kinds::BlueprintUuid; -use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SupportBundleUuid; @@ -131,7 +130,7 @@ impl DataStore { let bundle = SupportBundle::new( reason_for_creation, ZpoolUuid::from_untyped_uuid(dataset.pool_id), - DatasetUuid::from_untyped_uuid(dataset.id()), + dataset.id(), this_nexus_id, ); @@ -485,6 +484,7 @@ mod test { use nexus_types::deployment::BlueprintZoneType; use omicron_common::api::internal::shared::DatasetKind::Debug as DebugDatasetKind; use omicron_test_utils::dev; + use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::SledUuid; use rand::Rng; @@ -581,7 +581,7 @@ mod test { .expect("failed to upsert zpool"); let dataset = Dataset::new( - pool.dataset.into_untyped_uuid(), + pool.dataset, pool.pool.into_untyped_uuid(), None, DebugDatasetKind, From 7c33f19af9504fba317ae3587602b8a47832b34b Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 27 Nov 2024 11:11:48 -0800 Subject: [PATCH 15/22] fix typed uuid merge --- nexus/test-utils/src/resource_helpers.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/test-utils/src/resource_helpers.rs b/nexus/test-utils/src/resource_helpers.rs index ebf652b8b29..c5cb0231d18 100644 --- a/nexus/test-utils/src/resource_helpers.rs +++ b/nexus/test-utils/src/resource_helpers.rs @@ -1175,7 +1175,7 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { for disk in &disks_config.disks { self.add_zpool_with_dataset_ext( *sled_id, - PhysicalDiskUuid::from_untyped_uuid(disk.id), + disk.id, disk.pool_id, DatasetUuid::new_v4(), Self::DEFAULT_ZPOOL_SIZE_GIB, From 10c3745349f5ef3c2890e208928b29dca7f74ef6 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 27 Nov 2024 11:16:30 -0800 Subject: [PATCH 16/22] more typed UUID merges --- nexus/db-queries/src/db/datastore/support_bundle.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index b89ce5cec09..3f04cb11352 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -485,6 +485,7 @@ mod test { use omicron_common::api::internal::shared::DatasetKind::Debug as DebugDatasetKind; use omicron_test_utils::dev; use omicron_uuid_kinds::DatasetUuid; + use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; use rand::Rng; @@ -573,7 +574,7 @@ mod test { let zpool = Zpool::new( *pool.pool.as_untyped_uuid(), *self.sled.as_untyped_uuid(), - Uuid::new_v4(), + PhysicalDiskUuid::new_v4(), ); datastore .zpool_insert(opctx, zpool) From 2b42d0e9eba874261cfe04e8dada79942d9b9633 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 27 Nov 2024 16:47:23 -0800 Subject: [PATCH 17/22] API updates pulled from 7187 as I built it --- clients/sled-agent-client/src/lib.rs | 3 + common/src/api/external/mod.rs | 18 +++ nexus/external-api/output/nexus_tags.txt | 1 + nexus/external-api/src/lib.rs | 17 ++- nexus/src/external_api/http_entrypoints.rs | 30 ++++- nexus/types/src/external_api/params.rs | 2 +- openapi/nexus.json | 145 ++++++++++++++++++--- sled-agent/api/src/lib.rs | 19 +-- 8 files changed, 196 insertions(+), 39 deletions(-) diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index f2eb957650c..f818f241fa4 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -75,12 +75,15 @@ progenitor::generate_api!( RouterVersion = omicron_common::api::internal::shared::RouterVersion, SledRole = nexus_sled_agent_shared::inventory::SledRole, SourceNatConfig = omicron_common::api::internal::shared::SourceNatConfig, + SupportBundleGetQueryParams = omicron_common::api::external::SupportBundleGetQueryParams, + SupportBundleQueryType = omicron_common::api::external::SupportBundleQueryType, SwitchLocation = omicron_common::api::external::SwitchLocation, TypedUuidForDatasetKind = omicron_uuid_kinds::DatasetUuid, TypedUuidForInstanceKind = omicron_uuid_kinds::InstanceUuid, TypedUuidForOmicronZoneKind = omicron_uuid_kinds::OmicronZoneUuid, TypedUuidForPropolisKind = omicron_uuid_kinds::PropolisUuid, TypedUuidForSledKind = omicron_uuid_kinds::SledUuid, + TypedUuidForSupportBundleKind = omicron_uuid_kinds::SupportBundleUuid, TypedUuidForZpoolKind = omicron_uuid_kinds::ZpoolUuid, Vni = omicron_common::api::external::Vni, ZpoolKind = omicron_common::zpool_name::ZpoolKind, diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index 38a9de05648..6db4d5c1890 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -3090,6 +3090,24 @@ pub enum ImportExportPolicy { Allow(Vec), } +/// Query parameters for reading the support bundle +#[derive(Deserialize, Serialize, JsonSchema)] +pub struct SupportBundleGetQueryParams { + pub query_type: SupportBundleQueryType, +} + +/// Describes the type of access to the support bundle +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum SupportBundleQueryType { + /// Access the whole support bundle + Whole, + /// Access the names of all files within the support bundle + Index, + /// Access a specific file within the support bundle + Path { file_path: String }, +} + #[cfg(test)] mod test { use serde::Deserialize; diff --git a/nexus/external-api/output/nexus_tags.txt b/nexus/external-api/output/nexus_tags.txt index 0526a10b33b..11e942496ec 100644 --- a/nexus/external-api/output/nexus_tags.txt +++ b/nexus/external-api/output/nexus_tags.txt @@ -33,6 +33,7 @@ probe_view GET /experimental/v1/probes/{probe support_bundle_create POST /experimental/v1/system/support-bundles support_bundle_delete DELETE /experimental/v1/system/support-bundles/{support_bundle} support_bundle_download GET /experimental/v1/system/support-bundles/{support_bundle}/download +support_bundle_head HEAD /experimental/v1/system/support-bundles/{support_bundle}/download support_bundle_list GET /experimental/v1/system/support-bundles support_bundle_view GET /experimental/v1/system/support-bundles/{support_bundle} diff --git a/nexus/external-api/src/lib.rs b/nexus/external-api/src/lib.rs index 2860db740e6..4c65825396a 100644 --- a/nexus/external-api/src/lib.rs +++ b/nexus/external-api/src/lib.rs @@ -2793,6 +2793,19 @@ pub trait NexusExternalApi { async fn support_bundle_download( rqctx: RequestContext, path_params: Path, + body: TypedBody, + ) -> Result, HttpError>; + + /// Download the metadata of a single support bundle + #[endpoint { + method = HEAD, + path = "/experimental/v1/system/support-bundles/{support_bundle}/download", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_head( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, ) -> Result, HttpError>; /// Create a new support bundle @@ -2803,7 +2816,7 @@ pub trait NexusExternalApi { }] async fn support_bundle_create( rqctx: RequestContext, - ) -> Result, HttpError>; + ) -> Result, HttpError>; /// Delete an existing support bundle /// @@ -2817,7 +2830,7 @@ pub trait NexusExternalApi { async fn support_bundle_delete( rqctx: RequestContext, path_params: Path, - ) -> Result, HttpError>; + ) -> Result; // Probes (experimental) diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 004a3482c56..bab84e8c38d 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -81,6 +81,7 @@ use omicron_common::api::external::NameOrId; use omicron_common::api::external::Probe; use omicron_common::api::external::RouterRoute; use omicron_common::api::external::RouterRouteKind; +use omicron_common::api::external::SupportBundleGetQueryParams; use omicron_common::api::external::SwitchPort; use omicron_common::api::external::SwitchPortSettings; use omicron_common::api::external::SwitchPortSettingsView; @@ -6049,6 +6050,31 @@ impl NexusExternalApi for NexusExternalApiImpl { async fn support_bundle_download( rqctx: RequestContext, _path_params: Path, + _body: TypedBody, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_head( + rqctx: RequestContext, + _path_params: Path, + _body: TypedBody, ) -> Result, HttpError> { let apictx = rqctx.context(); let handler = async { @@ -6071,7 +6097,7 @@ impl NexusExternalApi for NexusExternalApiImpl { async fn support_bundle_create( rqctx: RequestContext, - ) -> Result, HttpError> { + ) -> Result, HttpError> { let apictx = rqctx.context(); let handler = async { let nexus = &apictx.context.nexus; @@ -6094,7 +6120,7 @@ impl NexusExternalApi for NexusExternalApiImpl { async fn support_bundle_delete( rqctx: RequestContext, _path_params: Path, - ) -> Result, HttpError> { + ) -> Result { let apictx = rqctx.context(); let handler = async { let nexus = &apictx.context.nexus; diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs index 0929b2c04b5..4a4af8b052f 100644 --- a/nexus/types/src/external_api/params.rs +++ b/nexus/types/src/external_api/params.rs @@ -89,8 +89,8 @@ path_param!(SshKeyPath, ssh_key, "SSH key"); path_param!(AddressLotPath, address_lot, "address lot"); path_param!(ProbePath, probe, "probe"); path_param!(CertificatePath, certificate, "certificate"); -path_param!(SupportBundlePath, support_bundle, "support bundle"); +id_path_param!(SupportBundlePath, support_bundle, "support bundle"); id_path_param!(GroupPath, group_id, "group"); // TODO: The hardware resources should be represented by its UUID or a hardware diff --git a/openapi/nexus.json b/openapi/nexus.json index c9da9dac5a2..fe91ff98e31 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -366,8 +366,8 @@ "summary": "Create a new support bundle", "operationId": "support_bundle_create", "responses": { - "200": { - "description": "successful operation", + "201": { + "description": "successful creation", "content": { "application/json": { "schema": { @@ -396,10 +396,11 @@ { "in": "path", "name": "support_bundle", - "description": "Name or ID of the support bundle", + "description": "ID of the support bundle", "required": true, "schema": { - "$ref": "#/components/schemas/NameOrId" + "type": "string", + "format": "uuid" } } ], @@ -433,23 +434,17 @@ { "in": "path", "name": "support_bundle", - "description": "Name or ID of the support bundle", + "description": "ID of the support bundle", "required": true, "schema": { - "$ref": "#/components/schemas/NameOrId" + "type": "string", + "format": "uuid" } } ], "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/SupportBundleInfo" - } - } - } + "204": { + "description": "successful deletion" }, "4XX": { "$ref": "#/components/responses/Error" @@ -471,13 +466,63 @@ { "in": "path", "name": "support_bundle", - "description": "Name or ID of the support bundle", + "description": "ID of the support bundle", "required": true, "schema": { - "$ref": "#/components/schemas/NameOrId" + "type": "string", + "format": "uuid" } } ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupportBundleGetQueryParams" + } + } + }, + "required": true + }, + "responses": { + "default": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + } + } + }, + "head": { + "tags": [ + "hidden" + ], + "summary": "Download the metadata of a single support bundle", + "operationId": "support_bundle_head", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupportBundleGetQueryParams" + } + } + }, + "required": true + }, "responses": { "default": { "description": "", @@ -20199,6 +20244,18 @@ "items" ] }, + "SupportBundleGetQueryParams": { + "description": "Query parameters for reading the support bundle", + "type": "object", + "properties": { + "query_type": { + "$ref": "#/components/schemas/SupportBundleQueryType" + } + }, + "required": [ + "query_type" + ] + }, "SupportBundleInfo": { "type": "object", "properties": { @@ -20248,6 +20305,60 @@ "items" ] }, + "SupportBundleQueryType": { + "description": "Describes the type of access to the support bundle", + "oneOf": [ + { + "description": "Access the whole support bundle", + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "whole" + ] + } + }, + "required": [ + "type" + ] + }, + { + "description": "Access the names of all files within the support bundle", + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "index" + ] + } + }, + "required": [ + "type" + ] + }, + { + "description": "Access a specific file within the support bundle", + "type": "object", + "properties": { + "file_path": { + "type": "string" + }, + "type": { + "type": "string", + "enum": [ + "path" + ] + } + }, + "required": [ + "file_path", + "type" + ] + } + ] + }, "SupportBundleState": { "oneOf": [ { diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index 634640079a2..be0dbdf1d8f 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -702,23 +702,8 @@ pub struct SupportBundleGetHeaders { range: String, } -/// Query parameters for reading the support bundle -#[derive(Deserialize, Serialize, JsonSchema)] -pub struct SupportBundleGetQueryParams { - pub query_type: SupportBundleQueryType, -} - -/// Describes the type of access to the support bundle -#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] -#[serde(tag = "type", rename_all = "snake_case")] -pub enum SupportBundleQueryType { - /// Access the whole support bundle - Whole, - /// Access the names of all files within the support bundle - Index, - /// Access a specific file within the support bundle - Path { file_path: String }, -} +pub use omicron_common::api::external::SupportBundleGetQueryParams; +pub use omicron_common::api::external::SupportBundleQueryType; #[derive(Deserialize, Debug, Serialize, JsonSchema, PartialEq)] #[serde(rename_all = "snake_case")] From 1fccea8089bf39ff8806a6c0f767da59a3b11aa3 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 2 Dec 2024 10:33:20 -0800 Subject: [PATCH 18/22] expectorate --- nexus/tests/output/uncovered-authz-endpoints.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/nexus/tests/output/uncovered-authz-endpoints.txt b/nexus/tests/output/uncovered-authz-endpoints.txt index ec4790c071d..13fb389d6ec 100644 --- a/nexus/tests/output/uncovered-authz-endpoints.txt +++ b/nexus/tests/output/uncovered-authz-endpoints.txt @@ -8,6 +8,7 @@ support_bundle_view (get "/experimental/v1/system/suppor support_bundle_download (get "/experimental/v1/system/support-bundles/{support_bundle}/download") ping (get "/v1/ping") networking_switch_port_status (get "/v1/system/hardware/switch-port/{port}/status") +support_bundle_head (head "/experimental/v1/system/support-bundles/{support_bundle}/download") device_auth_request (post "/device/auth") device_auth_confirm (post "/device/confirm") device_access_token (post "/device/token") From 55e9c24a7d44ea9768041e6ce623ea86683e852c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 13 Dec 2024 13:14:38 -0800 Subject: [PATCH 19/22] use path params more --- nexus/external-api/output/nexus_tags.txt | 3 + nexus/external-api/src/lib.rs | 41 ++++- nexus/src/external_api/http_entrypoints.rs | 72 +++++++- nexus/types/src/external_api/params.rs | 9 + openapi/nexus.json | 193 ++++++++++++--------- 5 files changed, 225 insertions(+), 93 deletions(-) diff --git a/nexus/external-api/output/nexus_tags.txt b/nexus/external-api/output/nexus_tags.txt index 4c266f87b5f..4fc92b18d8a 100644 --- a/nexus/external-api/output/nexus_tags.txt +++ b/nexus/external-api/output/nexus_tags.txt @@ -33,7 +33,10 @@ probe_view GET /experimental/v1/probes/{probe support_bundle_create POST /experimental/v1/system/support-bundles support_bundle_delete DELETE /experimental/v1/system/support-bundles/{support_bundle} support_bundle_download GET /experimental/v1/system/support-bundles/{support_bundle}/download +support_bundle_download_file GET /experimental/v1/system/support-bundles/{support_bundle}/download/{file} support_bundle_head HEAD /experimental/v1/system/support-bundles/{support_bundle}/download +support_bundle_head_file HEAD /experimental/v1/system/support-bundles/{support_bundle}/download/{file} +support_bundle_index GET /experimental/v1/system/support-bundles/{support_bundle}/index support_bundle_list GET /experimental/v1/system/support-bundles support_bundle_view GET /experimental/v1/system/support-bundles/{support_bundle} timeseries_query POST /v1/timeseries/query diff --git a/nexus/external-api/src/lib.rs b/nexus/external-api/src/lib.rs index 6429cfd06b5..54ba3ab34bc 100644 --- a/nexus/external-api/src/lib.rs +++ b/nexus/external-api/src/lib.rs @@ -2793,7 +2793,7 @@ pub trait NexusExternalApi { query_params: Query, ) -> Result>, HttpError>; - /// View a single support bundle + /// View a support bundle #[endpoint { method = GET, path = "/experimental/v1/system/support-bundles/{support_bundle}", @@ -2804,7 +2804,18 @@ pub trait NexusExternalApi { path_params: Path, ) -> Result, HttpError>; - /// Download the contents of a single support bundle + /// Download the index of a support bundle + #[endpoint { + method = GET, + path = "/experimental/v1/system/support-bundles/{support_bundle}/index", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_index( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Download the contents of a support bundle #[endpoint { method = GET, path = "/experimental/v1/system/support-bundles/{support_bundle}/download", @@ -2813,10 +2824,20 @@ pub trait NexusExternalApi { async fn support_bundle_download( rqctx: RequestContext, path_params: Path, - body: TypedBody, ) -> Result, HttpError>; - /// Download the metadata of a single support bundle + /// Download a file within a support bundle + #[endpoint { + method = GET, + path = "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_download_file( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Download the metadata of a support bundle #[endpoint { method = HEAD, path = "/experimental/v1/system/support-bundles/{support_bundle}/download", @@ -2825,7 +2846,17 @@ pub trait NexusExternalApi { async fn support_bundle_head( rqctx: RequestContext, path_params: Path, - body: TypedBody, + ) -> Result, HttpError>; + + /// Download the metadata of a file within the support bundle + #[endpoint { + method = HEAD, + path = "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_head_file( + rqctx: RequestContext, + path_params: Path, ) -> Result, HttpError>; /// Create a new support bundle diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index bc451e2a763..cfc9f99851a 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -81,7 +81,6 @@ use omicron_common::api::external::NameOrId; use omicron_common::api::external::Probe; use omicron_common::api::external::RouterRoute; use omicron_common::api::external::RouterRouteKind; -use omicron_common::api::external::SupportBundleGetQueryParams; use omicron_common::api::external::SwitchPort; use omicron_common::api::external::SwitchPortSettings; use omicron_common::api::external::SwitchPortSettingsView; @@ -6074,10 +6073,55 @@ impl NexusExternalApi for NexusExternalApiImpl { .await } + async fn support_bundle_index( + rqctx: RequestContext, + _path_params: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + async fn support_bundle_download( rqctx: RequestContext, _path_params: Path, - _body: TypedBody, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_download_file( + rqctx: RequestContext, + _path_params: Path, ) -> Result, HttpError> { let apictx = rqctx.context(); let handler = async { @@ -6101,7 +6145,29 @@ impl NexusExternalApi for NexusExternalApiImpl { async fn support_bundle_head( rqctx: RequestContext, _path_params: Path, - _body: TypedBody, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_head_file( + rqctx: RequestContext, + _path_params: Path, ) -> Result, HttpError> { let apictx = rqctx.context(); let handler = async { diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs index 4a4af8b052f..4e616e698f9 100644 --- a/nexus/types/src/external_api/params.rs +++ b/nexus/types/src/external_api/params.rs @@ -143,6 +143,15 @@ impl From for SiloSelector { } } +#[derive(Serialize, Deserialize, JsonSchema)] +pub struct SupportBundleFilePath { + #[serde(flatten)] + pub bundle: SupportBundlePath, + + /// The file within the bundle to download + pub file: String, +} + #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq)] pub struct OptionalSiloSelector { /// Name or ID of the silo diff --git a/openapi/nexus.json b/openapi/nexus.json index fff479b9df0..bc043059dd2 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -390,7 +390,7 @@ "tags": [ "hidden" ], - "summary": "View a single support bundle", + "summary": "View a support bundle", "operationId": "support_bundle_view", "parameters": [ { @@ -460,7 +460,7 @@ "tags": [ "hidden" ], - "summary": "Download the contents of a single support bundle", + "summary": "Download the contents of a support bundle", "operationId": "support_bundle_download", "parameters": [ { @@ -474,16 +474,75 @@ } } ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/SupportBundleGetQueryParams" + "responses": { + "default": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + } + } + }, + "head": { + "tags": [ + "hidden" + ], + "summary": "Download the metadata of a support bundle", + "operationId": "support_bundle_head", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "default": { + "description": "", + "content": { + "*/*": { + "schema": {} } } + } + } + } + }, + "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}": { + "get": { + "tags": [ + "hidden" + ], + "summary": "Download a file within a support bundle", + "operationId": "support_bundle_download_file", + "parameters": [ + { + "in": "path", + "name": "file", + "description": "The file within the bundle to download", + "required": true, + "schema": { + "type": "string" + } }, - "required": true - }, + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], "responses": { "default": { "description": "", @@ -499,9 +558,18 @@ "tags": [ "hidden" ], - "summary": "Download the metadata of a single support bundle", - "operationId": "support_bundle_head", + "summary": "Download the metadata of a file within the support bundle", + "operationId": "support_bundle_head_file", "parameters": [ + { + "in": "path", + "name": "file", + "description": "The file within the bundle to download", + "required": true, + "schema": { + "type": "string" + } + }, { "in": "path", "name": "support_bundle", @@ -513,16 +581,37 @@ } } ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/SupportBundleGetQueryParams" + "responses": { + "default": { + "description": "", + "content": { + "*/*": { + "schema": {} } } - }, - "required": true - }, + } + } + } + }, + "/experimental/v1/system/support-bundles/{support_bundle}/index": { + "get": { + "tags": [ + "hidden" + ], + "summary": "Download the index of a support bundle", + "operationId": "support_bundle_index", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], "responses": { "default": { "description": "", @@ -20293,18 +20382,6 @@ "items" ] }, - "SupportBundleGetQueryParams": { - "description": "Query parameters for reading the support bundle", - "type": "object", - "properties": { - "query_type": { - "$ref": "#/components/schemas/SupportBundleQueryType" - } - }, - "required": [ - "query_type" - ] - }, "SupportBundleInfo": { "type": "object", "properties": { @@ -20354,60 +20431,6 @@ "items" ] }, - "SupportBundleQueryType": { - "description": "Describes the type of access to the support bundle", - "oneOf": [ - { - "description": "Access the whole support bundle", - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "whole" - ] - } - }, - "required": [ - "type" - ] - }, - { - "description": "Access the names of all files within the support bundle", - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "index" - ] - } - }, - "required": [ - "type" - ] - }, - { - "description": "Access a specific file within the support bundle", - "type": "object", - "properties": { - "file_path": { - "type": "string" - }, - "type": { - "type": "string", - "enum": [ - "path" - ] - } - }, - "required": [ - "file_path", - "type" - ] - } - ] - }, "SupportBundleState": { "oneOf": [ { From efca59637727022106bf19f7f9db0f4c28a3afdb Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 13 Dec 2024 14:02:55 -0800 Subject: [PATCH 20/22] Review feedback --- nexus/db-model/src/support_bundle.rs | 5 +- .../src/db/datastore/support_bundle.rs | 142 ++++++++++++++++-- schema/crdb/dbinit.sql | 2 +- schema/crdb/support-bundles/up04.sql | 2 +- 4 files changed, 133 insertions(+), 18 deletions(-) diff --git a/nexus/db-model/src/support_bundle.rs b/nexus/db-model/src/support_bundle.rs index 0e530708770..a4b14d363b2 100644 --- a/nexus/db-model/src/support_bundle.rs +++ b/nexus/db-model/src/support_bundle.rs @@ -51,7 +51,7 @@ impl SupportBundleState { Destroying => vec![Active, Collecting, Failing], Failing => vec![Collecting, Active], // The "Failed" state is terminal. - Failed => vec![Collecting, Active, Failing], + Failed => vec![Active, Collecting, Failing], } } } @@ -69,8 +69,9 @@ impl From for SupportBundleStateView { // whether or not the bundle record can be safely deleted. // // Either way, it should be possible to delete the bundle. + // If a user requests that we delete a bundle in these states: // - "Failing" bundles will become "Destroying" - // - "Failed" bundles will be destroyed immediately + // - "Failed" bundles can be deleted immediately Failing => SupportBundleStateView::Failed, Failed => SupportBundleStateView::Failed, } diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 3f04cb11352..8f96c4fd7c2 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -36,8 +36,8 @@ use uuid::Uuid; const CANNOT_ALLOCATE_ERR_MSG: &'static str = "Current policy limits support bundle creation to 'one per external disk', and \ - no disks are available. Either delete old support bundles or add additional \ - disks"; + no disks are available. You must delete old support bundles before new ones \ + can be created"; const FAILURE_REASON_NO_DATASET: &'static str = "Allocated dataset no longer exists"; @@ -50,8 +50,8 @@ pub struct SupportBundleExpungementReport { /// Bundles marked "failed" because the datasets storing them have been /// expunged. pub bundles_failed_missing_datasets: usize, - /// Bundles deleted because the datasets storing them have been - /// expunged. + /// Bundles already in the "destroying" state that have been deleted because + /// the datasets storing them have been expunged. pub bundles_deleted_missing_datasets: usize, /// Bundles marked "destroying" because the nexuses managing them have been @@ -79,7 +79,7 @@ impl DataStore { reason_for_creation: &'static str, this_nexus_id: OmicronZoneUuid, ) -> CreateResult { - opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; let conn = self.pool_connection_authorized(opctx).await?; #[derive(Debug)] @@ -164,7 +164,7 @@ impl DataStore { opctx: &OpContext, id: SupportBundleUuid, ) -> LookupResult { - opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; use db::schema::support_bundle::dsl; let conn = self.pool_connection_authorized(opctx).await?; @@ -182,7 +182,7 @@ impl DataStore { opctx: &OpContext, pagparams: &DataPageParams<'_, Uuid>, ) -> ListResultVec { - opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; use db::schema::support_bundle::dsl; let conn = self.pool_connection_authorized(opctx).await?; @@ -202,7 +202,7 @@ impl DataStore { nexus_id: OmicronZoneUuid, states: Vec, ) -> ListResultVec { - opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; use db::schema::support_bundle::dsl; let conn = self.pool_connection_authorized(opctx).await?; @@ -223,7 +223,7 @@ impl DataStore { opctx: &OpContext, blueprint: &nexus_types::deployment::Blueprint, ) -> Result { - opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; // For this blueprint: The set of all expunged Nexus zones let invalid_nexus_zones = blueprint @@ -327,11 +327,10 @@ impl DataStore { // the dataset expungement speeds up the process. let bundles_deleted_missing_datasets = diesel::delete(dsl::support_bundle) - .filter(dsl::state.eq_any(state.valid_old_states())) .filter(dsl::id.eq_any(bundles_to_delete)) // This check should be redundant (we already // partitioned above based on this state) but out of - // an abundance of catuion we don't auto-delete a + // an abundance of caution we don't auto-delete a // bundle in any other state. .filter(dsl::state.eq(SupportBundleState::Destroying)) .execute_async(conn) @@ -410,7 +409,7 @@ impl DataStore { id: SupportBundleUuid, state: SupportBundleState, ) -> Result<(), Error> { - opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; use db::schema::support_bundle::dsl; let conn = self.pool_connection_authorized(opctx).await?; @@ -443,7 +442,7 @@ impl DataStore { opctx: &OpContext, id: SupportBundleUuid, ) -> Result<(), Error> { - opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; use db::schema::support_bundle::dsl; @@ -899,7 +898,7 @@ mod test { let observed_bundles = datastore .support_bundle_list(&opctx, &pagparams) .await - .expect("Should be able to get bundle we just created"); + .expect("Should be able to query when no bundles exist"); assert!(observed_bundles.is_empty()); db.terminate().await; @@ -1073,6 +1072,121 @@ mod test { logctx.cleanup_successful(); } + #[tokio::test] + async fn test_bundle_deleted_from_expunged_dataset() { + static TEST_NAME: &str = "test_bundle_deleted_from_expunged_dataset"; + let logctx = dev::test_setup_log(TEST_NAME); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let mut rng = SimRngState::from_seed(TEST_NAME); + let (_example, mut bp1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .build(); + + // Weirdly, the "ExampleSystemBuilder" blueprint has a parent blueprint, + // but which isn't exposed through the API. Since we're only able to see + // the blueprint it emits, that means we can't actually make it the + // target because "the parent blueprint is not the current target". + // + // Instead of dealing with that, we lie: claim this is the primordial + // blueprint, with no parent. + // + // Regardless, make this starter blueprint our target. + bp1.parent_blueprint_id = None; + bp_insert_and_make_target(&opctx, &datastore, &bp1).await; + + // Manually perform the equivalent of blueprint execution to populate + // database records. + let sleds = TestSled::new_from_blueprint(&bp1); + for sled in &sleds { + sled.create_database_records(&datastore, &opctx).await; + } + + // Extract Nexus and Dataset information from the generated blueprint. + let this_nexus_id = get_nexuses_from_blueprint( + &bp1, + BlueprintZoneFilter::ShouldBeRunning, + ) + .get(0) + .map(|id| *id) + .expect("There should be a Nexus in the example blueprint"); + let debug_datasets = get_debug_datasets_from_blueprint( + &bp1, + BlueprintDatasetFilter::InService, + ); + assert!(!debug_datasets.is_empty()); + + // When we create a bundle, it should exist on a dataset provisioned by + // the blueprint. + let bundle = datastore + .support_bundle_create(&opctx, "for the test", this_nexus_id) + .await + .expect("Should be able to create bundle"); + assert_eq!(bundle.assigned_nexus, Some(this_nexus_id.into())); + assert!( + debug_datasets.contains(&DatasetUuid::from(bundle.dataset_id)), + "Bundle should have been allocated from a blueprint dataset" + ); + + // Start the deletion of this bundle + datastore + .support_bundle_update( + &opctx, + bundle.id.into(), + SupportBundleState::Destroying, + ) + .await + .expect("Should have been able to update state"); + + // If we try to "fail support bundles" from expunged datasets/nexuses, + // we should see a no-op. Nothing has been expunged yet! + let report = + datastore.support_bundle_fail_expunged(&opctx, &bp1).await.expect( + "Should have been able to perform no-op support bundle failure", + ); + assert_eq!(SupportBundleExpungementReport::default(), report); + + // Expunge the bundle's dataset (manually) + let bp2 = { + let mut bp2 = bp1.clone(); + bp2.id = Uuid::new_v4(); + bp2.parent_blueprint_id = Some(bp1.id); + expunge_dataset_for_bundle(&mut bp2, &bundle); + bp2 + }; + bp_insert_and_make_target(&opctx, &datastore, &bp2).await; + + datastore + .support_bundle_fail_expunged(&opctx, &bp1) + .await + .expect_err("bp1 is no longer the target; this should fail"); + let report = datastore + .support_bundle_fail_expunged(&opctx, &bp2) + .await + .expect("Should have been able to mark bundle state as failed"); + assert_eq!( + SupportBundleExpungementReport { + bundles_deleted_missing_datasets: 1, + ..Default::default() + }, + report + ); + + // Should observe no bundles (it should have been deleted) + let pagparams = DataPageParams::max_page(); + let observed_bundles = datastore + .support_bundle_list(&opctx, &pagparams) + .await + .expect("Should be able to query when no bundles exist"); + assert!(observed_bundles.is_empty()); + + db.terminate().await; + logctx.cleanup_successful(); + } + #[tokio::test] async fn test_bundle_failed_from_expunged_nexus_no_reassign() { static TEST_NAME: &str = diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 399b06570a8..37380f6e432 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2449,7 +2449,7 @@ CREATE UNIQUE INDEX IF NOT EXISTS one_bundle_per_dataset ON omicron.public.suppo dataset_id ); -CREATE INDEX IF NOT EXISTS lookup_bundle_by_neuxs ON omicron.public.support_bundle ( +CREATE INDEX IF NOT EXISTS lookup_bundle_by_nexus ON omicron.public.support_bundle ( assigned_nexus ); diff --git a/schema/crdb/support-bundles/up04.sql b/schema/crdb/support-bundles/up04.sql index 6df95640fd9..58b903e5009 100644 --- a/schema/crdb/support-bundles/up04.sql +++ b/schema/crdb/support-bundles/up04.sql @@ -1,4 +1,4 @@ -CREATE INDEX IF NOT EXISTS lookup_bundle_by_neuxs ON omicron.public.support_bundle ( +CREATE INDEX IF NOT EXISTS lookup_bundle_by_nexus ON omicron.public.support_bundle ( assigned_nexus ); From e29733de837068a67fa8c1e367cb6291dc964db6 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 13 Dec 2024 14:25:51 -0800 Subject: [PATCH 21/22] Add to uncovered endpoints, since these aren't impl'd yet --- nexus/tests/output/uncovered-authz-endpoints.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nexus/tests/output/uncovered-authz-endpoints.txt b/nexus/tests/output/uncovered-authz-endpoints.txt index 13fb389d6ec..0a9e62707f0 100644 --- a/nexus/tests/output/uncovered-authz-endpoints.txt +++ b/nexus/tests/output/uncovered-authz-endpoints.txt @@ -6,9 +6,12 @@ probe_view (get "/experimental/v1/probes/{probe support_bundle_list (get "/experimental/v1/system/support-bundles") support_bundle_view (get "/experimental/v1/system/support-bundles/{support_bundle}") support_bundle_download (get "/experimental/v1/system/support-bundles/{support_bundle}/download") +support_bundle_download_file (get "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}") +support_bundle_index (get "/experimental/v1/system/support-bundles/{support_bundle}/index") ping (get "/v1/ping") networking_switch_port_status (get "/v1/system/hardware/switch-port/{port}/status") support_bundle_head (head "/experimental/v1/system/support-bundles/{support_bundle}/download") +support_bundle_head_file (head "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}") device_auth_request (post "/device/auth") device_auth_confirm (post "/device/confirm") device_access_token (post "/device/token") From 1506c87e7036d7233b2840b6216c1d2db9bbcdd5 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 6 Jan 2025 10:19:53 -0800 Subject: [PATCH 22/22] merging --- nexus/db-queries/src/db/datastore/support_bundle.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 8f96c4fd7c2..d616b4356b7 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -262,7 +262,7 @@ impl DataStore { .all_omicron_datasets( nexus_types::deployment::BlueprintDatasetFilter::Expunged, ) - .filter_map(|dataset_config| { + .filter_map(|(_sled_id, dataset_config)| { if matches!( dataset_config.kind, omicron_common::api::internal::shared::DatasetKind::Debug