diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index 177c52f31..a72751bfb 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -116,9 +116,32 @@ Check required Helm deployment secrets: kubectl -n openshell get secret \ openshell-server-tls \ openshell-server-client-ca \ - openshell-client-tls + openshell-client-tls \ + openshell-jwt-keys ``` +If the gateway exits with `failed to read sandbox JWT signing key from +/etc/openshell-jwt/signing.pem`, verify that `openshell-jwt-keys` contains +`signing.pem`, `public.pem`, and `kid`, and that the StatefulSet mounts the +`sandbox-jwt` secret at `/etc/openshell-jwt`. The sandbox JWT mount is required +even when local Helm values disable TLS. + +If `server.spiffe.enabled=true`, the sandbox JWT ConfigMap block and +`sandbox-jwt` StatefulSet mount are intentionally omitted. Instead verify that +SPIRE is installed, the CSI driver is available, and the gateway pod mounts the +SPIFFE Workload API socket: + +```bash +helm -n openshell get values openshell | grep -E 'spiffe|trustDomain|workloadApiSocketPath' +kubectl get pods -A | grep -E 'spire|spiffe' +kubectl -n openshell get statefulset openshell -o yaml | grep -E 'spiffe-workload-api|csi.spiffe.io' +``` + +Sandbox pods in SPIFFE mode should have `openshell.io/sandbox-id` and +`openshell.io/spiffe-id` annotations, an `openshell.ai/managed-by=openshell` +label, and supervisor env vars `OPENSHELL_SPIFFE_WORKLOAD_API_SOCKET`, +`OPENSHELL_SPIFFE_AUDIENCE`, and `OPENSHELL_SPIFFE_ID`. 
+ Check the image references currently used by the gateway deployment: ```bash diff --git a/.agents/skills/helm-dev-environment/SKILL.md b/.agents/skills/helm-dev-environment/SKILL.md index 623efb2e6..410981707 100644 --- a/.agents/skills/helm-dev-environment/SKILL.md +++ b/.agents/skills/helm-dev-environment/SKILL.md @@ -169,6 +169,21 @@ To remove Keycloak: mise run keycloak:k8s:teardown ``` +### SPIRE / SPIFFE Sandbox Identity + +Skaffold can install SPIRE with the SPIFFE hardened Helm charts. To activate +SPIFFE JWT-SVID supervisor authentication: + +1. Uncomment the `spire-crds` and `spire` releases in `deploy/helm/openshell/skaffold.yaml` +2. Uncomment `#- ci/values-spire.yaml` in the OpenShell release values files +3. Redeploy: `mise run helm:skaffold:run` + +`ci/values-spire-stack.yaml` configures the local SPIRE trust domain as +`openshell.local` and adds a `ClusterSPIFFEID` that maps sandbox pod +annotations to `spiffe://openshell.local/openshell/sandbox/<sandbox-id>`. +OpenShell mounts the SPIFFE CSI Workload API socket at +`/spiffe-workload-api/spire-agent.sock`.
+ --- ## Cluster Lifecycle (suspend/resume) @@ -196,6 +211,8 @@ mise run helm:k3s:status | `deploy/helm/openshell/ci/values-cert-manager.yaml` | cert-manager PKI overlay (opt-in; disables pkiInitJob) | | `deploy/helm/openshell/ci/values-gateway.yaml` | Envoy Gateway GRPCRoute + Gateway overlay | | `deploy/helm/openshell/ci/values-keycloak.yaml` | Keycloak OIDC overlay | +| `deploy/helm/openshell/ci/values-spire.yaml` | SPIFFE/SPIRE sandbox supervisor auth overlay | +| `deploy/helm/openshell/ci/values-spire-stack.yaml` | SPIRE hardened chart values for local dev | | `deploy/helm/openshell/ci/values-tls-disabled.yaml` | Lint-only: TLS + auth disabled (reverse-proxy edge termination) | | `deploy/kube/manifests/envoy-gateway-openshell.yaml` | GatewayClass for Envoy Gateway (`mise run helm:gateway:apply`) | | `tasks/scripts/helm-k3s-local.sh` | k3d cluster create/delete/start/stop/status | diff --git a/.markdownlint-cli2.jsonc b/.markdownlint-cli2.jsonc index 30cf48849..125df0f81 100644 --- a/.markdownlint-cli2.jsonc +++ b/.markdownlint-cli2.jsonc @@ -16,6 +16,7 @@ ".claude/**", ".opencode/**", ".github/**", + "architecture/plans/**", "**/node_modules/**", "target/**", ".pytest_cache/**", diff --git a/Cargo.lock b/Cargo.lock index cba681774..46142f201 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -156,6 +156,15 @@ dependencies = [ "thiserror 2.0.18", ] +[[package]] +name = "arc-swap" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207" +dependencies = [ + "rustversion", +] + [[package]] name = "argon2" version = "0.5.3" @@ -217,28 +226,6 @@ dependencies = [ "serde_json", ] -[[package]] -name = "async-stream" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" -dependencies = [ - "async-stream-impl", - "futures-core", - "pin-project-lite", -] - 
-[[package]] -name = "async-stream-impl" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - [[package]] name = "async-trait" version = "0.1.89" @@ -303,40 +290,13 @@ dependencies = [ "fs_extra", ] -[[package]] -name = "axum" -version = "0.7.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" -dependencies = [ - "async-trait", - "axum-core 0.4.5", - "bytes", - "futures-util", - "http", - "http-body", - "http-body-util", - "itoa", - "matchit 0.7.3", - "memchr", - "mime", - "percent-encoding", - "pin-project-lite", - "rustversion", - "serde", - "sync_wrapper", - "tower 0.5.3", - "tower-layer", - "tower-service", -] - [[package]] name = "axum" version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "31b698c5f9a010f6573133b09e0de5408834d0c82f8d7475a89fc1867a71cd90" dependencies = [ - "axum-core 0.5.6", + "axum-core", "base64 0.22.1", "bytes", "form_urlencoded", @@ -347,7 +307,7 @@ dependencies = [ "hyper", "hyper-util", "itoa", - "matchit 0.8.4", + "matchit", "memchr", "mime", "percent-encoding", @@ -366,26 +326,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "axum-core" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" -dependencies = [ - "async-trait", - "bytes", - "futures-util", - "http", - "http-body", - "http-body-util", - "mime", - "pin-project-lite", - "rustversion", - "sync_wrapper", - "tower-layer", - "tower-service", -] - [[package]] name = "axum-core" version = "0.5.6" @@ -1855,19 +1795,13 @@ dependencies = [ "futures-core", "futures-sink", "http", - "indexmap 2.14.0", + "indexmap", "slab", "tokio", "tokio-util", "tracing", ] 
-[[package]] -name = "hashbrown" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" - [[package]] name = "hashbrown" version = "0.14.5" @@ -2149,7 +2083,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.3", + "socket2", "tokio", "tower-service", "tracing", @@ -2328,16 +2262,6 @@ dependencies = [ "quote", ] -[[package]] -name = "indexmap" -version = "1.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" -dependencies = [ - "autocfg", - "hashbrown 0.12.3", -] - [[package]] name = "indexmap" version = "2.14.0" @@ -2957,12 +2881,6 @@ dependencies = [ "regex-automata", ] -[[package]] -name = "matchit" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" - [[package]] name = "matchit" version = "0.8.4" @@ -3011,7 +2929,7 @@ dependencies = [ "http-body-util", "hyper", "hyper-util", - "indexmap 2.14.0", + "indexmap", "ipnet", "metrics", "metrics-util", @@ -3395,6 +3313,7 @@ dependencies = [ "rcgen", "serde", "serde_json", + "sha2 0.10.9", "tar", "tempfile", "tokio", @@ -3464,7 +3383,8 @@ dependencies = [ "tempfile", "thiserror 2.0.18", "tonic", - "tonic-build", + "tonic-prost", + "tonic-prost-build", "url", ] @@ -3661,6 +3581,7 @@ dependencies = [ "serde_yml", "sha1 0.10.6", "sha2 0.10.9", + "spiffe", "temp-env", "tempfile", "thiserror 2.0.18", @@ -3681,7 +3602,8 @@ name = "openshell-server" version = "0.0.0" dependencies = [ "anyhow", - "axum 0.8.9", + "async-trait", + "axum", "bytes", "clap", "futures", @@ -3724,6 +3646,7 @@ dependencies = [ "serde", "serde_json", "sha2 0.10.9", + "spiffe", "sqlx", "tempfile", "thiserror 2.0.18", @@ -3737,6 +3660,7 @@ dependencies = [ "tower-http 0.6.8", "tracing", "tracing-subscriber", 
+ "url", "uuid", "wiremock", "x509-parser", @@ -4007,12 +3931,13 @@ dependencies = [ [[package]] name = "petgraph" -version = "0.7.1" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ "fixedbitset", - "indexmap 2.14.0", + "hashbrown 0.15.5", + "indexmap", ] [[package]] @@ -4250,9 +4175,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.13.5" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" +checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" dependencies = [ "bytes", "prost-derive", @@ -4260,19 +4185,20 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.13.5" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" +checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" dependencies = [ "heck", "itertools 0.14.0", "log", "multimap", - "once_cell", "petgraph", "prettyplease", "prost", "prost-types", + "pulldown-cmark", + "pulldown-cmark-to-cmark", "regex", "syn 2.0.117", "tempfile", @@ -4280,9 +4206,9 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.13.5" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" +checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" dependencies = [ "anyhow", "itertools 0.14.0", @@ -4293,9 +4219,9 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.13.5" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" +checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" dependencies = [ "prost", ] @@ -4309,6 +4235,26 @@ dependencies = [ "autotools", ] +[[package]] +name = "pulldown-cmark" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c3a14896dfa883796f1cb410461aef38810ea05f2b2c33c5aded3649095fdad" +dependencies = [ + "bitflags", + "memchr", + "unicase", +] + +[[package]] +name = "pulldown-cmark-to-cmark" +version = "22.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50793def1b900256624a709439404384204a5dc3a6ec580281bfaac35e882e90" +dependencies = [ + "pulldown-cmark", +] + [[package]] name = "quanta" version = "0.12.6" @@ -4337,7 +4283,7 @@ dependencies = [ "quinn-udp", "rustc-hash 2.1.2", "rustls", - "socket2 0.6.3", + "socket2", "thiserror 2.0.18", "tokio", "tracing", @@ -4375,7 +4321,7 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.6.3", + "socket2", "tracing", "windows-sys 0.60.2", ] @@ -5231,7 +5177,7 @@ version = "0.9.34+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" dependencies = [ - "indexmap 2.14.0", + "indexmap", "itoa", "ryu", "serde", @@ -5244,7 +5190,7 @@ version = "0.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "59e2dd588bf1597a252c3b920e0143eb99b0f76e4e082f4c92ce34fbc9e71ddd" dependencies = [ - "indexmap 2.14.0", + "indexmap", "itoa", "libyml", "memchr", @@ -5427,22 +5373,40 @@ dependencies = [ [[package]] name = "socket2" -version = "0.5.10" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" +checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" dependencies = [ "libc", - "windows-sys 
0.52.0", + "windows-sys 0.61.2", ] [[package]] -name = "socket2" -version = "0.6.3" +name = "spiffe" +version = "0.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" +checksum = "6d3f9e45e9e53f03cb452fe0f050101a9280ff4f4214e326037bc8275284d906" dependencies = [ - "libc", - "windows-sys 0.61.2", + "arc-swap", + "base64ct", + "fastrand", + "futures", + "hyper-util", + "log", + "prost", + "prost-types", + "serde", + "serde_json", + "thiserror 2.0.18", + "time", + "tokio", + "tokio-util", + "tonic", + "tonic-prost", + "tower 0.5.3", + "tracing", + "url", + "zeroize", ] [[package]] @@ -5505,7 +5469,7 @@ dependencies = [ "futures-util", "hashbrown 0.15.5", "hashlink", - "indexmap 2.14.0", + "indexmap", "log", "memchr", "once_cell", @@ -6071,7 +6035,7 @@ dependencies = [ "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2 0.6.3", + "socket2", "tokio-macros", "windows-sys 0.61.2", ] @@ -6177,7 +6141,7 @@ version = "0.22.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" dependencies = [ - "indexmap 2.14.0", + "indexmap", "serde", "serde_spanned", "toml_datetime", @@ -6193,13 +6157,12 @@ checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" [[package]] name = "tonic" -version = "0.12.3" +version = "0.14.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" +checksum = "ac2a5518c70fa84342385732db33fb3f44bc4cc748936eb5833d2df34d6445ef" dependencies = [ - "async-stream", "async-trait", - "axum 0.7.9", + "axum", "base64 0.22.1", "bytes", "h2", @@ -6211,14 +6174,13 @@ dependencies = [ "hyper-util", "percent-encoding", "pin-project", - "prost", "rustls-native-certs", - "rustls-pemfile", - "socket2 0.5.10", + "socket2", + "sync_wrapper", "tokio", 
"tokio-rustls", "tokio-stream", - "tower 0.4.13", + "tower 0.5.3", "tower-layer", "tower-service", "tracing", @@ -6226,9 +6188,32 @@ dependencies = [ [[package]] name = "tonic-build" -version = "0.12.3" +version = "0.14.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9557ce109ea773b399c9b9e5dca39294110b74f1f342cb347a80d1fce8c26a11" +checksum = "c68f61875ac5293cf72e6c8cf0158086428c82c37229e98c840878f1706b0322" +dependencies = [ + "prettyplease", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "tonic-prost" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50849f68853be452acf590cde0b146665b8d507b3b8af17261df47e02c209ea0" +dependencies = [ + "bytes", + "prost", + "tonic", +] + +[[package]] +name = "tonic-prost-build" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "654e5643eff75d7f8c99197ce1440ed19a3474eada74c12bbac488b2cafdae27" dependencies = [ "prettyplease", "proc-macro2", @@ -6236,6 +6221,8 @@ dependencies = [ "prost-types", "quote", "syn 2.0.117", + "tempfile", + "tonic-build", ] [[package]] @@ -6246,11 +6233,8 @@ checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" dependencies = [ "futures-core", "futures-util", - "indexmap 1.9.3", "pin-project", "pin-project-lite", - "rand 0.8.6", - "slab", "tokio", "tokio-util", "tower-layer", @@ -6266,9 +6250,12 @@ checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", + "indexmap", "pin-project-lite", + "slab", "sync_wrapper", "tokio", + "tokio-util", "tower-layer", "tower-service", "tracing", @@ -6755,7 +6742,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" dependencies = [ "anyhow", - "indexmap 2.14.0", + "indexmap", "wasm-encoder", "wasmparser", ] @@ -6781,7 +6768,7 @@ 
checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ "bitflags", "hashbrown 0.15.5", - "indexmap 2.14.0", + "indexmap", "semver", ] @@ -7346,7 +7333,7 @@ checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" dependencies = [ "anyhow", "heck", - "indexmap 2.14.0", + "indexmap", "prettyplease", "syn 2.0.117", "wasm-metadata", @@ -7377,7 +7364,7 @@ checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" dependencies = [ "anyhow", "bitflags", - "indexmap 2.14.0", + "indexmap", "log", "serde", "serde_derive", @@ -7396,7 +7383,7 @@ checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" dependencies = [ "anyhow", "id-arena", - "indexmap 2.14.0", + "indexmap", "log", "semver", "serde", @@ -7610,7 +7597,7 @@ dependencies = [ "flate2", "getrandom 0.4.2", "hmac", - "indexmap 2.14.0", + "indexmap", "lzma-rust2", "memchr", "pbkdf2", diff --git a/Cargo.toml b/Cargo.toml index 3fea379a2..ef7cb9f30 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,10 +17,11 @@ repository = "https://github.com/NVIDIA/OpenShell" tokio = { version = "1.43", features = ["full"] } # gRPC/Protobuf -tonic = "0.12" -tonic-build = "0.12" -prost = "0.13" -prost-types = "0.13" +tonic = "0.14" +tonic-prost = "0.14" +tonic-prost-build = "0.14" +prost = "0.14" +prost-types = "0.14" # HTTP server axum = { version = "0.8", features = ["ws"] } @@ -86,6 +87,7 @@ sha2 = "0.10" rand = "0.9" jsonwebtoken = "9" getrandom = "0.3" +spiffe = { version = "0.15", default-features = false, features = ["workload-api-jwt", "tracing"] } # Filesystem embedding include_dir = "0.7" diff --git a/architecture/gateway.md b/architecture/gateway.md index e9cbe187d..8a3ab2370 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -41,10 +41,15 @@ Supported auth modes: | Plaintext | Local development or a trusted reverse proxy boundary. 
| | Cloudflare JWT | Edge-authenticated deployments where Cloudflare Access supplies identity. | | OIDC | Bearer-token auth for users, with browser PKCE or client credentials login. | - -Sandbox supervisor RPCs authenticate with either mTLS material or a sandbox -secret depending on the runtime and deployment mode. User-facing mutations are -authorized by role policy when OIDC or edge identity is enabled. +| SPIFFE JWT-SVID | Sandbox supervisor authentication through a local SPIFFE Workload API implementation such as SPIRE. | + +Sandbox supervisor RPCs authenticate with mTLS material plus a sandbox workload +identity. Kubernetes deployments can use either the gateway-minted JWT +bootstrap path or SPIFFE JWT-SVIDs. In SPIFFE mode, the supervisor fetches a +JWT-SVID from the SPIFFE Workload API and the gateway validates it through its +own local Workload API socket, then maps `spiffe://<trust-domain>/openshell/sandbox/<sandbox-id>` +to `Principal::Sandbox`. User-facing mutations are authorized by role policy +when OIDC or edge identity is enabled. ## API Surface diff --git a/crates/openshell-bootstrap/Cargo.toml b/crates/openshell-bootstrap/Cargo.toml index c0fb7e9f4..578d59e65 100644 --- a/crates/openshell-bootstrap/Cargo.toml +++ b/crates/openshell-bootstrap/Cargo.toml @@ -16,6 +16,7 @@ bytes = { workspace = true } futures = { workspace = true } miette = { workspace = true } rcgen = { workspace = true } +sha2 = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } tar = "0.4" diff --git a/crates/openshell-bootstrap/src/jwt.rs b/crates/openshell-bootstrap/src/jwt.rs new file mode 100644 index 000000000..cf8ab0dc1 --- /dev/null +++ b/crates/openshell-bootstrap/src/jwt.rs @@ -0,0 +1,112 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Gateway-minted JWT signing-key generation. +//! +//! The gateway mints per-sandbox identity tokens (see PR 2 of the +//!
per-sandbox identity series, issue #1354) signed with an Ed25519 +//! keypair generated once at gateway init and persisted alongside the +//! existing PKI bundle. The signing key never leaves the gateway; the +//! public key plus a stable `kid` are consumed by the gateway's own +//! validator and any future external verifiers. + +use miette::{IntoDiagnostic, Result, WrapErr}; +use rcgen::{KeyPair, PKCS_ED25519}; +use sha2::{Digest, Sha256}; + +/// All PEM-encoded material needed to mint and validate sandbox JWTs. +/// +/// The signing key stays in the gateway process. The public key is shared +/// across gateway replicas (so any replica can validate a JWT minted by +/// any other replica). The `kid` is published in every minted JWT's +/// header so the validator can pick the right key after a future rotation. +pub struct JwtKeyMaterial { + /// PKCS#8 PEM-encoded Ed25519 private key. + pub signing_key_pem: String, + /// `SubjectPublicKeyInfo` PEM-encoded Ed25519 public key. + pub public_key_pem: String, + /// Stable identifier derived from the public key (SHA-256 hex prefix). + /// Embedded in every minted JWT's `kid` header so future rotation can + /// be performed in-place by adding a second key without breaking + /// in-flight tokens. + pub kid: String, +} + +/// Generate a fresh Ed25519 JWT signing key. +/// +/// Output PEM is in the formats `jsonwebtoken` consumes via +/// `EncodingKey::from_ed_pem` (signing) and `DecodingKey::from_ed_pem` +/// (validation), so the gateway can round-trip its own tokens with no +/// further conversion. 
+pub fn generate_jwt_key() -> Result { + let keypair = KeyPair::generate_for(&PKCS_ED25519) + .into_diagnostic() + .wrap_err("failed to generate Ed25519 JWT signing key")?; + let signing_key_pem = keypair.serialize_pem(); + let public_key_pem = keypair.public_key_pem(); + let kid = kid_from_public_key_der(&keypair.public_key_der()); + Ok(JwtKeyMaterial { + signing_key_pem, + public_key_pem, + kid, + }) +} + +/// Stable `kid` derived from the SHA-256 of the public-key DER. +/// +/// First 16 bytes hex-encoded — collision-resistant for the small N of +/// signing keys a single deployment ever has, while staying short enough +/// to keep JWT headers compact. +fn kid_from_public_key_der(public_key_der: &[u8]) -> String { + let digest = Sha256::digest(public_key_der); + hex_encode_prefix(&digest, 16) +} + +fn hex_encode_prefix(bytes: &[u8], n: usize) -> String { + use std::fmt::Write as _; + let mut out = String::with_capacity(n * 2); + for byte in bytes.iter().take(n) { + let _ = write!(out, "{byte:02x}"); + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn generate_jwt_key_produces_parseable_pem() { + let material = generate_jwt_key().expect("generate_jwt_key"); + assert!(material.signing_key_pem.contains("BEGIN PRIVATE KEY")); + assert!(material.public_key_pem.contains("BEGIN PUBLIC KEY")); + assert_eq!(material.kid.len(), 32, "kid is 16 bytes hex-encoded"); + assert!(material.kid.chars().all(|c| c.is_ascii_hexdigit())); + } + + #[test] + fn kid_is_stable_for_identical_public_keys() { + // Same input -> same kid. Hash of a fixed byte string. 
+ let kid_a = kid_from_public_key_der(b"abc"); + let kid_b = kid_from_public_key_der(b"abc"); + assert_eq!(kid_a, kid_b); + } + + #[test] + fn kid_differs_for_different_public_keys() { + let kid_a = kid_from_public_key_der(b"first"); + let kid_b = kid_from_public_key_der(b"second"); + assert_ne!(kid_a, kid_b); + } + + #[test] + fn generated_keys_are_unique() { + let a = generate_jwt_key().expect("generate_jwt_key"); + let b = generate_jwt_key().expect("generate_jwt_key"); + assert_ne!( + a.kid, b.kid, + "fresh keypairs must produce distinct public keys" + ); + assert_ne!(a.signing_key_pem, b.signing_key_pem); + } +} diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index 0988c4b6b..8845f0392 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -3,6 +3,7 @@ pub mod build; pub mod edge_token; +pub mod jwt; pub mod oidc_token; mod metadata; diff --git a/crates/openshell-bootstrap/src/pki.rs b/crates/openshell-bootstrap/src/pki.rs index ed93850df..bb103bf46 100644 --- a/crates/openshell-bootstrap/src/pki.rs +++ b/crates/openshell-bootstrap/src/pki.rs @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +use crate::jwt::{JwtKeyMaterial, generate_jwt_key}; use miette::{IntoDiagnostic, Result, WrapErr}; use rcgen::{BasicConstraints, CertificateParams, DnType, Ia5String, IsCa, KeyPair, SanType}; use std::net::IpAddr; @@ -15,6 +16,12 @@ pub struct PkiBundle { pub server_key_pem: String, pub client_cert_pem: String, pub client_key_pem: String, + /// PKCS#8 PEM Ed25519 private key for minting per-sandbox JWTs. + pub jwt_signing_key_pem: String, + /// SPKI PEM Ed25519 public key, paired with `jwt_signing_key_pem`. + pub jwt_public_key_pem: String, + /// Stable identifier embedded in the `kid` header of every minted JWT. 
+ pub jwt_key_id: String, } /// Default SANs always included on the server certificate. @@ -95,6 +102,13 @@ pub fn generate_pki(extra_sans: &[String]) -> Result { .into_diagnostic() .wrap_err("failed to sign client certificate")?; + // --- JWT signing key (Ed25519, used to mint per-sandbox identity tokens) --- + let JwtKeyMaterial { + signing_key_pem: jwt_signing_key_pem, + public_key_pem: jwt_public_key_pem, + kid: jwt_key_id, + } = generate_jwt_key().wrap_err("failed to generate JWT signing key")?; + Ok(PkiBundle { ca_cert_pem: ca_cert.pem(), ca_key_pem: ca_key.serialize_pem(), @@ -102,6 +116,9 @@ pub fn generate_pki(extra_sans: &[String]) -> Result { server_key_pem: server_key.serialize_pem(), client_cert_pem: client_cert.pem(), client_key_pem: client_key.serialize_pem(), + jwt_signing_key_pem, + jwt_public_key_pem, + jwt_key_id, }) } @@ -144,6 +161,9 @@ mod tests { assert!(bundle.server_key_pem.contains("BEGIN PRIVATE KEY")); assert!(bundle.client_cert_pem.contains("BEGIN CERTIFICATE")); assert!(bundle.client_key_pem.contains("BEGIN PRIVATE KEY")); + assert!(bundle.jwt_signing_key_pem.contains("BEGIN PRIVATE KEY")); + assert!(bundle.jwt_public_key_pem.contains("BEGIN PUBLIC KEY")); + assert_eq!(bundle.jwt_key_id.len(), 32, "kid is 16 bytes hex-encoded"); } #[test] diff --git a/crates/openshell-cli/Cargo.toml b/crates/openshell-cli/Cargo.toml index 7dc0c0f22..dedd1f460 100644 --- a/crates/openshell-cli/Cargo.toml +++ b/crates/openshell-cli/Cargo.toml @@ -29,7 +29,7 @@ prost-types = { workspace = true } tokio = { workspace = true } # gRPC client -tonic = { workspace = true, features = ["tls", "tls-native-roots"] } +tonic = { workspace = true, features = ["tls-native-roots"] } # CLI clap = { workspace = true } diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 198cb4b0a..61c79af4f 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -739,6 +739,11 @@ fn import_local_package_mtls_bundle(name: &str) 
-> Result> { client_key_pem: std::fs::read_to_string(&key) .into_diagnostic() .wrap_err_with(|| format!("failed to read {}", key.display()))?, + // CLI never holds the gateway's JWT signing material — only the + // gateway needs it. Fill the JWT fields with placeholders. + jwt_signing_key_pem: String::new(), + jwt_public_key_pem: String::new(), + jwt_key_id: String::new(), }; openshell_bootstrap::mtls::store_pki_bundle(name, &bundle) .wrap_err_with(|| format!("failed to store mTLS bundle for gateway '{name}'"))?; diff --git a/crates/openshell-cli/tests/ensure_providers_integration.rs b/crates/openshell-cli/tests/ensure_providers_integration.rs index bd4262b31..96f173172 100644 --- a/crates/openshell-cli/tests/ensure_providers_integration.rs +++ b/crates/openshell-cli/tests/ensure_providers_integration.rs @@ -488,6 +488,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-cli/tests/mtls_integration.rs b/crates/openshell-cli/tests/mtls_integration.rs index 7102ed9b6..22de566bf 100644 --- a/crates/openshell-cli/tests/mtls_integration.rs +++ b/crates/openshell-cli/tests/mtls_integration.rs @@ -397,6 +397,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn 
connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-cli/tests/provider_commands_integration.rs b/crates/openshell-cli/tests/provider_commands_integration.rs index 49b933e67..96dce3a5b 100644 --- a/crates/openshell-cli/tests/provider_commands_integration.rs +++ b/crates/openshell-cli/tests/provider_commands_integration.rs @@ -620,6 +620,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs index 1ad00dd6e..6ae868487 100644 --- a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs +++ b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs @@ -574,6 +574,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs b/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs index 531599dcf..b9c52b685 100644 --- a/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs +++ b/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs @@ -409,6 +409,20 @@ 
impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-core/Cargo.toml b/crates/openshell-core/Cargo.toml index b03fb1494..014bf0dc7 100644 --- a/crates/openshell-core/Cargo.toml +++ b/crates/openshell-core/Cargo.toml @@ -14,6 +14,7 @@ repository.workspace = true prost = { workspace = true } prost-types = { workspace = true } tonic = { workspace = true } +tonic-prost = { workspace = true } thiserror = { workspace = true } miette = { workspace = true } serde = { workspace = true } @@ -28,7 +29,7 @@ ipnet = "2" dev-settings = [] [build-dependencies] -tonic-build = { workspace = true } +tonic-prost-build = { workspace = true } protobuf-src = { workspace = true } [dev-dependencies] diff --git a/crates/openshell-core/build.rs b/crates/openshell-core/build.rs index 7613c8754..98011c3c4 100644 --- a/crates/openshell-core/build.rs +++ b/crates/openshell-core/build.rs @@ -40,11 +40,12 @@ fn main() -> Result<(), Box> { collect_proto_files(&proto_root, &mut proto_files)?; proto_files.sort(); - // Configure tonic-build - tonic_build::configure() + // Configure tonic/prost protobuf code generation. 
+ let include_paths = [proto_root]; + tonic_prost_build::configure() .build_server(true) .build_client(true) - .compile_protos(&proto_files, &[proto_root.as_path()])?; + .compile_protos(&proto_files, &include_paths)?; Ok(()) } diff --git a/crates/openshell-core/src/config.rs b/crates/openshell-core/src/config.rs index e045d0a52..f8b6add49 100644 --- a/crates/openshell-core/src/config.rs +++ b/crates/openshell-core/src/config.rs @@ -205,6 +205,19 @@ pub struct Config { #[serde(default)] pub oidc: Option, + /// Gateway-minted sandbox JWT configuration. When `Some`, the gateway + /// loads the signing key from disk and accepts gateway-issued sandbox + /// JWTs as `Principal::Sandbox`. Required for the per-sandbox identity + /// flow (issue #1354). + #[serde(default)] + pub gateway_jwt: Option, + + /// SPIFFE Workload API configuration. When `Some`, the gateway validates + /// sandbox JWT-SVIDs through the local SPIFFE implementation and maps + /// matching SPIFFE IDs to sandbox principals. + #[serde(default)] + pub spiffe: Option, + /// Database URL for persistence. pub database_url: String, @@ -317,6 +330,65 @@ const fn default_jwks_ttl_secs() -> u64 { 3600 } +/// Gateway-minted sandbox JWT configuration. +/// +/// Points the gateway at the Ed25519 signing key (produced by `certgen`) +/// and identifies the issuer string embedded in every minted token. The +/// signing key never leaves the gateway process; the public key is loaded +/// by the same gateway so it can validate its own tokens. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GatewayJwtConfig { + /// Path to the Ed25519 signing key (PKCS#8 PEM). + pub signing_key_path: PathBuf, + /// Path to the matching public key (SPKI PEM). + pub public_key_path: PathBuf, + /// Path to the `kid` value (plain text, one line). + pub kid_path: PathBuf, + /// Stable gateway identity embedded in `iss`/`aud`. Defaults to the + /// hostname-or-`openshell` placeholder if unset. 
+ #[serde(default = "default_gateway_id")] + pub gateway_id: String, + /// Token lifetime in seconds. Defaults to 24 hours. + #[serde(default = "default_sandbox_token_ttl_secs")] + pub ttl_secs: u64, +} + +/// SPIFFE-based sandbox identity configuration. +/// +/// The gateway uses the local SPIFFE Workload API to validate JWT-SVIDs +/// presented by sandbox supervisors. Supervisors request those JWT-SVIDs +/// for the configured audience and use SPIFFE IDs shaped as +/// `spiffe://{trust_domain}{sandbox_id_prefix}{sandbox_uuid}`. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SpiffeConfig { + /// Path to the SPIFFE Workload API UNIX socket. + pub workload_api_socket_path: PathBuf, + /// Trust domain accepted for sandbox SPIFFE IDs. + pub trust_domain: String, + /// Audience expected in sandbox JWT-SVIDs. + #[serde(default = "default_spiffe_audience")] + pub audience: String, + /// Path prefix, below the trust domain, that precedes the sandbox UUID. + #[serde(default = "default_spiffe_sandbox_id_prefix")] + pub sandbox_id_prefix: String, +} + +fn default_gateway_id() -> String { + "openshell".to_string() +} + +const fn default_sandbox_token_ttl_secs() -> u64 { + 86_400 +} + +fn default_spiffe_audience() -> String { + "openshell-gateway".to_string() +} + +fn default_spiffe_sandbox_id_prefix() -> String { + "/openshell/sandbox/".to_string() +} + fn default_roles_claim() -> String { "realm_access.roles".to_string() } @@ -340,6 +412,8 @@ impl Config { log_level: default_log_level(), tls, oidc: None, + gateway_jwt: None, + spiffe: None, database_url: String::new(), compute_drivers: vec![], ssh_session_ttl_secs: default_ssh_session_ttl_secs(), diff --git a/crates/openshell-core/src/lib.rs b/crates/openshell-core/src/lib.rs index 037174221..4032e4659 100644 --- a/crates/openshell-core/src/lib.rs +++ b/crates/openshell-core/src/lib.rs @@ -26,7 +26,9 @@ pub mod sandbox_env; pub mod settings; pub mod time; -pub use config::{ComputeDriverKind, Config, OidcConfig, TlsConfig}; +pub use config::{ + 
ComputeDriverKind, Config, GatewayJwtConfig, OidcConfig, SpiffeConfig, TlsConfig, +}; pub use error::{ComputeDriverError, Error, Result}; pub use metadata::{ObjectId, ObjectLabels, ObjectName}; diff --git a/crates/openshell-core/src/sandbox_env.rs b/crates/openshell-core/src/sandbox_env.rs index d345762ca..6d89d3c36 100644 --- a/crates/openshell-core/src/sandbox_env.rs +++ b/crates/openshell-core/src/sandbox_env.rs @@ -34,3 +34,34 @@ pub const TLS_CERT: &str = "OPENSHELL_TLS_CERT"; /// Path to the private key for mTLS communication with the gateway. pub const TLS_KEY: &str = "OPENSHELL_TLS_KEY"; + +/// Raw gateway-minted JWT identifying this sandbox. Mutually exclusive with +/// [`SANDBOX_TOKEN_FILE`] / [`K8S_SA_TOKEN_FILE`]. Injected into the container +/// environment by the Docker and Podman drivers, and by test harnesses. +pub const SANDBOX_TOKEN: &str = "OPENSHELL_SANDBOX_TOKEN"; + +/// Path to the file holding a gateway-minted sandbox JWT. +/// +/// Set by drivers that write the token to a bundle file at sandbox-create +/// time; the Docker and Podman drivers pass [`SANDBOX_TOKEN`] instead. Read +/// once at supervisor startup; the token is held in process memory thereafter. +pub const SANDBOX_TOKEN_FILE: &str = "OPENSHELL_SANDBOX_TOKEN_FILE"; + +/// Path to the projected `ServiceAccount` JWT (Kubernetes driver). +/// +/// Used to bootstrap a gateway-minted JWT via `IssueSandboxToken`. Kubelet +/// writes and rotates this file; the supervisor exchanges its contents +/// for a gateway JWT at startup and on refresh. +pub const K8S_SA_TOKEN_FILE: &str = "OPENSHELL_K8S_SA_TOKEN_FILE"; + +/// Filesystem path to the SPIFFE Workload API UNIX socket. +/// +/// When set, the supervisor fetches a JWT-SVID from the local Workload API +/// and presents that token directly to the gateway. +pub const SPIFFE_WORKLOAD_API_SOCKET: &str = "OPENSHELL_SPIFFE_WORKLOAD_API_SOCKET"; + +/// Audience requested when fetching a SPIFFE JWT-SVID. 
+pub const SPIFFE_AUDIENCE: &str = "OPENSHELL_SPIFFE_AUDIENCE"; + +/// Optional exact SPIFFE ID requested from the Workload API. +pub const SPIFFE_ID: &str = "OPENSHELL_SPIFFE_ID"; diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index 30507422b..b0b06e2d1 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -988,6 +988,19 @@ fn build_environment(sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig ); } + // Gateway-minted sandbox JWT (PR 3 of the per-sandbox identity series). + // Passed via env var since Docker has no native secret mount that is + // simpler than the existing bind-mount pattern; the trust boundary + // (`docker inspect` access) is already equivalent to the TLS key mount. + if let Some(spec) = sandbox.spec.as_ref() + && !spec.sandbox_token.is_empty() + { + environment.insert( + openshell_core::sandbox_env::SANDBOX_TOKEN.to_string(), + spec.sandbox_token.clone(), + ); + } + let mut pairs = environment.into_iter().collect::>(); pairs.sort_by(|left, right| left.0.cmp(&right.0)); pairs diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index 62a6b89e4..c0ce10a04 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ b/crates/openshell-driver-docker/src/tests.rs @@ -33,6 +33,7 @@ fn test_sandbox() -> DriverSandbox { }), gpu: false, gpu_device: String::new(), + sandbox_token: String::new(), }), status: None, } diff --git a/crates/openshell-driver-kubernetes/src/config.rs b/crates/openshell-driver-kubernetes/src/config.rs index 28c04deb3..cb11d3aa5 100644 --- a/crates/openshell-driver-kubernetes/src/config.rs +++ b/crates/openshell-driver-kubernetes/src/config.rs @@ -64,8 +64,35 @@ pub struct KubernetesComputeConfig { pub client_tls_secret_name: String, pub host_gateway_ip: String, pub enable_user_namespaces: bool, + /// Lifetime (seconds) of the projected `ServiceAccount` token kubelet + 
/// writes into each sandbox pod. Used only for the one-shot + /// `IssueSandboxToken` bootstrap exchange — the gateway-minted JWT + /// that follows has its own TTL set via `gateway_jwt.ttl_secs`. + /// + /// Kubelet enforces a minimum of 600 seconds; the supervisor uses + /// this token within a few seconds of pod start, so any value at + /// the floor is sufficient. Default 3600. + pub sa_token_ttl_secs: i64, + /// SPIFFE Workload API socket path mounted into sandbox pods. Empty + /// disables SPIFFE identity material and keeps the `ServiceAccount` + /// bootstrap path active. + pub spiffe_workload_api_socket_path: String, + /// SPIFFE trust domain used for sandbox identities. + pub spiffe_trust_domain: String, + /// Audience requested by sandbox supervisors for JWT-SVIDs. + pub spiffe_audience: String, + /// Path prefix under the trust domain before the sandbox UUID. + pub spiffe_sandbox_id_prefix: String, } +/// Lower bound enforced by kubelet for projected SA tokens. +pub const MIN_SA_TOKEN_TTL_SECS: i64 = 600; + +/// Cap at 24h — operators who want longer-lived bootstrap tokens are +/// almost certainly misconfigured (the token is consumed seconds after +/// pod start). +pub const MAX_SA_TOKEN_TTL_SECS: i64 = 86_400; + impl Default for KubernetesComputeConfig { fn default() -> Self { Self { @@ -84,8 +111,35 @@ impl Default for KubernetesComputeConfig { client_tls_secret_name: String::new(), host_gateway_ip: String::new(), enable_user_namespaces: false, + sa_token_ttl_secs: 3600, + spiffe_workload_api_socket_path: String::new(), + spiffe_trust_domain: String::new(), + spiffe_audience: "openshell-gateway".to_string(), + spiffe_sandbox_id_prefix: "/openshell/sandbox/".to_string(), + } + } +} + +impl KubernetesComputeConfig { + /// Clamp `sa_token_ttl_secs` into the `[MIN_SA_TOKEN_TTL_SECS, + /// MAX_SA_TOKEN_TTL_SECS]` range used by the projected-volume spec. + /// Invalid (≤0) values fall back to the default 3600. 
+ #[must_use] + pub fn effective_sa_token_ttl_secs(&self) -> i64 { + if self.sa_token_ttl_secs <= 0 { + 3600 + } else { + self.sa_token_ttl_secs + .clamp(MIN_SA_TOKEN_TTL_SECS, MAX_SA_TOKEN_TTL_SECS) } } + + #[must_use] + pub fn spiffe_enabled(&self) -> bool { + !self.spiffe_workload_api_socket_path.trim().is_empty() + && !self.spiffe_trust_domain.trim().is_empty() + && !self.spiffe_audience.trim().is_empty() + } } fn default_sandbox_image() -> String { diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index 21ec7f5bf..33df91ada 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -74,6 +74,7 @@ const SANDBOX_MANAGED_LABEL: &str = "openshell.ai/managed-by"; const SANDBOX_MANAGED_VALUE: &str = "openshell"; const GPU_RESOURCE_NAME: &str = "nvidia.com/gpu"; const GPU_RESOURCE_QUANTITY: &str = "1"; +const SPIFFE_WORKLOAD_API_VOLUME_NAME: &str = "spiffe-workload-api"; // --------------------------------------------------------------------------- // Default workspace persistence (temporary — will be replaced by snapshotting) @@ -327,6 +328,12 @@ impl KubernetesComputeDriver { client_tls_secret_name: &self.config.client_tls_secret_name, host_gateway_ip: &self.config.host_gateway_ip, enable_user_namespaces: self.config.enable_user_namespaces, + sa_token_ttl_secs: self.config.effective_sa_token_ttl_secs(), + spiffe_enabled: self.config.spiffe_enabled(), + spiffe_workload_api_socket_path: &self.config.spiffe_workload_api_socket_path, + spiffe_trust_domain: &self.config.spiffe_trust_domain, + spiffe_audience: &self.config.spiffe_audience, + spiffe_sandbox_id_prefix: &self.config.spiffe_sandbox_id_prefix, }; obj.data = sandbox_to_k8s_spec(sandbox.spec.as_ref(), &params); let api = self.api(); @@ -1042,7 +1049,6 @@ fn default_workspace_volume_claim_templates() -> serde_json::Value { } /// Parameters shared by `sandbox_to_k8s_spec` and 
`sandbox_template_to_k8s`. -#[derive(Default)] struct SandboxPodParams<'a> { default_image: &'a str, image_pull_policy: &'a str, @@ -1056,6 +1062,39 @@ struct SandboxPodParams<'a> { client_tls_secret_name: &'a str, host_gateway_ip: &'a str, enable_user_namespaces: bool, + /// Lifetime (seconds) of the projected `ServiceAccount` token used + /// for the bootstrap `IssueSandboxToken` exchange. + sa_token_ttl_secs: i64, + spiffe_enabled: bool, + spiffe_workload_api_socket_path: &'a str, + spiffe_trust_domain: &'a str, + spiffe_audience: &'a str, + spiffe_sandbox_id_prefix: &'a str, +} + +impl Default for SandboxPodParams<'_> { + fn default() -> Self { + Self { + default_image: "", + image_pull_policy: "", + supervisor_image: "", + supervisor_image_pull_policy: "", + supervisor_sideload_method: SupervisorSideloadMethod::default(), + sandbox_id: "", + sandbox_name: "", + grpc_endpoint: "", + ssh_socket_path: "", + client_tls_secret_name: "", + host_gateway_ip: "", + enable_user_namespaces: false, + sa_token_ttl_secs: 3600, + spiffe_enabled: false, + spiffe_workload_api_socket_path: "", + spiffe_trust_domain: "", + spiffe_audience: "openshell-gateway", + spiffe_sandbox_id_prefix: "/openshell/sandbox/", + } + } } fn spec_pod_env(spec: Option<&SandboxSpec>) -> std::collections::HashMap { @@ -1144,11 +1183,54 @@ fn sandbox_template_to_k8s( params: &SandboxPodParams<'_>, ) -> serde_json::Value { let mut metadata = serde_json::Map::new(); - if !template.labels.is_empty() { - metadata.insert("labels".to_string(), serde_json::json!(template.labels)); + let mut pod_labels = template + .labels + .iter() + .map(|(key, value)| (key.clone(), serde_json::Value::String(value.clone()))) + .collect::>(); + if params.spiffe_enabled { + pod_labels.insert( + SANDBOX_MANAGED_LABEL.to_string(), + serde_json::Value::String(SANDBOX_MANAGED_VALUE.to_string()), + ); + if !params.sandbox_id.is_empty() { + pod_labels.insert( + SANDBOX_ID_LABEL.to_string(), + 
serde_json::Value::String(params.sandbox_id.to_string()), + ); + } + } + if !pod_labels.is_empty() { + metadata.insert("labels".to_string(), serde_json::Value::Object(pod_labels)); + } + // Carry the sandbox UUID as a pod annotation so the gateway can resolve + // a projected SA token claim (pod name + uid) back to a sandbox identity + // when the supervisor calls `IssueSandboxToken` at startup. The gateway's + // K8s Role does NOT grant `patch pods`, so this annotation is + // effectively immutable post-create (see plan §11.8). + let mut pod_annotations = platform_config_struct(template, "annotations") + .and_then(|v| match v { + serde_json::Value::Object(map) => Some(map), + _ => None, + }) + .unwrap_or_default(); + if !params.sandbox_id.is_empty() { + pod_annotations.insert( + "openshell.io/sandbox-id".to_string(), + serde_json::Value::String(params.sandbox_id.to_string()), + ); + } + if params.spiffe_enabled { + pod_annotations.insert( + "openshell.io/spiffe-id".to_string(), + serde_json::Value::String(sandbox_spiffe_id(params)), + ); } - if let Some(annotations) = platform_config_struct(template, "annotations") { - metadata.insert("annotations".to_string(), annotations); + if !pod_annotations.is_empty() { + metadata.insert( + "annotations".to_string(), + serde_json::Value::Object(pod_annotations), + ); } let mut spec = serde_json::Map::new(); @@ -1214,6 +1296,7 @@ fn sandbox_template_to_k8s( params.grpc_endpoint, params.ssh_socket_path, !params.client_tls_secret_name.is_empty(), + spiffe_env(params), ); container.insert("env".to_string(), serde_json::Value::Array(env)); @@ -1235,17 +1318,35 @@ fn sandbox_template_to_k8s( }), ); - // Mount client TLS secret for mTLS to the server. + // Mount client TLS secret for mTLS to the server, plus exactly one + // sandbox identity source: SPIFFE Workload API socket when configured, + // otherwise a projected ServiceAccount token for the gateway-JWT + // bootstrap path. 
+ let mut volume_mounts: Vec = Vec::new(); if !params.client_tls_secret_name.is_empty() { - container.insert( - "volumeMounts".to_string(), - serde_json::json!([{ - "name": "openshell-client-tls", - "mountPath": "/etc/openshell-tls/client", - "readOnly": true - }]), - ); + volume_mounts.push(serde_json::json!({ + "name": "openshell-client-tls", + "mountPath": "/etc/openshell-tls/client", + "readOnly": true + })); + } + if params.spiffe_enabled { + volume_mounts.push(serde_json::json!({ + "name": SPIFFE_WORKLOAD_API_VOLUME_NAME, + "mountPath": spiffe_socket_mount_path(params.spiffe_workload_api_socket_path), + "readOnly": true, + })); + } else { + volume_mounts.push(serde_json::json!({ + "name": "openshell-sa-token", + "mountPath": "/var/run/secrets/openshell", + "readOnly": true, + })); } + container.insert( + "volumeMounts".to_string(), + serde_json::Value::Array(volume_mounts), + ); if let Some(resources) = container_resources(template, gpu) { container.insert("resources".to_string(), resources); @@ -1257,15 +1358,41 @@ fn sandbox_template_to_k8s( // Add TLS secret volume. Mode 0400 (owner-read) prevents the // unprivileged sandbox user from reading the mTLS private key. + let mut volumes: Vec = Vec::new(); if !params.client_tls_secret_name.is_empty() { - spec.insert( - "volumes".to_string(), - serde_json::json!([{ - "name": "openshell-client-tls", - "secret": { "secretName": params.client_tls_secret_name, "defaultMode": 256 } - }]), - ); + volumes.push(serde_json::json!({ + "name": "openshell-client-tls", + "secret": { "secretName": params.client_tls_secret_name, "defaultMode": 256 } + })); + } + if params.spiffe_enabled { + volumes.push(serde_json::json!({ + "name": SPIFFE_WORKLOAD_API_VOLUME_NAME, + "csi": { + "driver": "csi.spiffe.io", + "readOnly": true + } + })); + } else { + // Projected ServiceAccountToken volume — kubelet writes a short-lived + // audience-bound JWT into /var/run/secrets/openshell/token and rotates + // it automatically. 
The supervisor exchanges this for a gateway-minted + // JWT via `IssueSandboxToken` once at startup. + volumes.push(serde_json::json!({ + "name": "openshell-sa-token", + "projected": { + "sources": [{ + "serviceAccountToken": { + "audience": "openshell-gateway", + "expirationSeconds": params.sa_token_ttl_secs, + "path": "token" + } + }], + "defaultMode": 256 + } + })); } + spec.insert("volumes".to_string(), serde_json::Value::Array(volumes)); // Add hostAliases so sandbox pods can reach the Docker host. if !params.host_gateway_ip.is_empty() { @@ -1376,6 +1503,7 @@ fn build_env_list( grpc_endpoint: &str, ssh_socket_path: &str, tls_enabled: bool, + spiffe_env: Option, ) -> Vec { let mut env = existing_env.cloned().unwrap_or_default(); apply_env_map(&mut env, template_environment); @@ -1387,6 +1515,7 @@ fn build_env_list( grpc_endpoint, ssh_socket_path, tls_enabled, + spiffe_env, ); env } @@ -1409,6 +1538,7 @@ fn apply_required_env( grpc_endpoint: &str, ssh_socket_path: &str, tls_enabled: bool, + spiffe_env: Option, ) { upsert_env(env, openshell_core::sandbox_env::SANDBOX_ID, sandbox_id); upsert_env(env, openshell_core::sandbox_env::SANDBOX, sandbox_name); @@ -1444,6 +1574,79 @@ fn apply_required_env( "/etc/openshell-tls/client/tls.key", ); } + if let Some(spiffe) = spiffe_env { + upsert_env( + env, + openshell_core::sandbox_env::SPIFFE_WORKLOAD_API_SOCKET, + &spiffe.socket_path, + ); + upsert_env( + env, + openshell_core::sandbox_env::SPIFFE_AUDIENCE, + &spiffe.audience, + ); + upsert_env(env, openshell_core::sandbox_env::SPIFFE_ID, &spiffe.id); + } else { + // Projected ServiceAccount token written by kubelet (see the volume + // definition in `sandbox_template_to_k8s`). The supervisor reads this + // and exchanges it for a gateway-minted JWT via `IssueSandboxToken`. 
+ upsert_env( + env, + openshell_core::sandbox_env::K8S_SA_TOKEN_FILE, + "/var/run/secrets/openshell/token", + ); + } +} + +#[derive(Clone)] +struct SpiffeEnv { + socket_path: String, + audience: String, + id: String, +} + +fn spiffe_env(params: &SandboxPodParams<'_>) -> Option { + params.spiffe_enabled.then(|| SpiffeEnv { + socket_path: params.spiffe_workload_api_socket_path.to_string(), + audience: params.spiffe_audience.to_string(), + id: sandbox_spiffe_id(params), + }) +} + +fn sandbox_spiffe_id(params: &SandboxPodParams<'_>) -> String { + format!( + "spiffe://{}{}{}", + params + .spiffe_trust_domain + .trim() + .trim_start_matches("spiffe://") + .trim_end_matches('/'), + normalize_spiffe_path_prefix(params.spiffe_sandbox_id_prefix), + params.sandbox_id, + ) +} + +fn normalize_spiffe_path_prefix(prefix: &str) -> String { + let trimmed = prefix.trim(); + let with_leading = if trimmed.starts_with('/') { + trimmed.to_string() + } else { + format!("/{trimmed}") + }; + if with_leading.ends_with('/') { + with_leading + } else { + format!("{with_leading}/") + } +} + +fn spiffe_socket_mount_path(socket_path: &str) -> String { + std::path::Path::new(socket_path) + .parent() + .and_then(std::path::Path::to_str) + .filter(|path| !path.is_empty()) + .unwrap_or("/spiffe-workload-api") + .to_string() } fn upsert_env(env: &mut Vec, name: &str, value: &str) { @@ -1863,6 +2066,7 @@ mod tests { "https://endpoint:8080", "0.0.0.0:2222", true, // tls_enabled + None, ); // Extract the TLS-related env vars @@ -2415,6 +2619,65 @@ mod tests { ); } + #[test] + fn spiffe_mode_mounts_csi_socket_and_sets_identity_env() { + let params = SandboxPodParams { + sandbox_id: "sandbox-123", + sandbox_name: "sandbox", + spiffe_enabled: true, + spiffe_workload_api_socket_path: "/spiffe-workload-api/spire-agent.sock", + spiffe_trust_domain: "openshell.local", + spiffe_audience: "openshell-gateway", + spiffe_sandbox_id_prefix: "/openshell/sandbox/", + ..SandboxPodParams::default() + }; + let 
pod_template = sandbox_template_to_k8s( + &SandboxTemplate::default(), + false, + &std::collections::HashMap::new(), + true, + &params, + ); + + let env = pod_template["spec"]["containers"][0]["env"] + .as_array() + .expect("env"); + assert!(env.iter().any(|e| { + e["name"] == openshell_core::sandbox_env::SPIFFE_WORKLOAD_API_SOCKET + && e["value"] == "/spiffe-workload-api/spire-agent.sock" + })); + assert!(env.iter().any(|e| { + e["name"] == openshell_core::sandbox_env::SPIFFE_ID + && e["value"] == "spiffe://openshell.local/openshell/sandbox/sandbox-123" + })); + assert!( + !env.iter() + .any(|e| e["name"] == openshell_core::sandbox_env::K8S_SA_TOKEN_FILE), + "SPIFFE mode must not expose the ServiceAccount bootstrap token" + ); + + let volumes = pod_template["spec"]["volumes"].as_array().expect("volumes"); + assert!(volumes.iter().any(|volume| { + volume["name"] == SPIFFE_WORKLOAD_API_VOLUME_NAME + && volume["csi"]["driver"] == "csi.spiffe.io" + })); + assert!( + !volumes + .iter() + .any(|volume| volume["name"] == "openshell-sa-token"), + "SPIFFE mode must not mount the ServiceAccount token volume" + ); + + assert_eq!( + pod_template["metadata"]["annotations"]["openshell.io/spiffe-id"], + serde_json::json!("spiffe://openshell.local/openshell/sandbox/sandbox-123") + ); + assert_eq!( + pod_template["metadata"]["labels"][SANDBOX_MANAGED_LABEL], + serde_json::json!(SANDBOX_MANAGED_VALUE) + ); + } + #[test] fn platform_config_bool_extracts_value() { let template = SandboxTemplate { diff --git a/crates/openshell-driver-kubernetes/src/main.rs b/crates/openshell-driver-kubernetes/src/main.rs index a170b5785..0f843a0d1 100644 --- a/crates/openshell-driver-kubernetes/src/main.rs +++ b/crates/openshell-driver-kubernetes/src/main.rs @@ -68,6 +68,33 @@ struct Args { #[arg(long, env = "OPENSHELL_ENABLE_USER_NAMESPACES")] enable_user_namespaces: bool, + + /// Lifetime (seconds) of the projected `ServiceAccount` token + /// kubelet writes into each sandbox pod for the 
`IssueSandboxToken` + /// bootstrap exchange. Kubelet enforces a minimum of 600s; the + /// gateway clamps values outside `[600, 86400]`. Default 3600. + #[arg(long, env = "OPENSHELL_K8S_SA_TOKEN_TTL_SECS", default_value_t = 3600)] + sa_token_ttl_secs: i64, + + #[arg(long, env = "OPENSHELL_SPIFFE_WORKLOAD_API_SOCKET")] + spiffe_workload_api_socket_path: Option, + + #[arg(long, env = "OPENSHELL_SPIFFE_TRUST_DOMAIN")] + spiffe_trust_domain: Option, + + #[arg( + long, + env = "OPENSHELL_SPIFFE_AUDIENCE", + default_value = "openshell-gateway" + )] + spiffe_audience: String, + + #[arg( + long, + env = "OPENSHELL_SPIFFE_SANDBOX_ID_PREFIX", + default_value = "/openshell/sandbox/" + )] + spiffe_sandbox_id_prefix: String, } #[tokio::main] @@ -93,6 +120,11 @@ async fn main() -> Result<()> { client_tls_secret_name: args.client_tls_secret_name.unwrap_or_default(), host_gateway_ip: args.host_gateway_ip.unwrap_or_default(), enable_user_namespaces: args.enable_user_namespaces, + sa_token_ttl_secs: args.sa_token_ttl_secs, + spiffe_workload_api_socket_path: args.spiffe_workload_api_socket_path.unwrap_or_default(), + spiffe_trust_domain: args.spiffe_trust_domain.unwrap_or_default(), + spiffe_audience: args.spiffe_audience, + spiffe_sandbox_id_prefix: args.spiffe_sandbox_id_prefix, }) .await .into_diagnostic()?; diff --git a/crates/openshell-driver-podman/src/container.rs b/crates/openshell-driver-podman/src/container.rs index 1cb58e338..e00439703 100644 --- a/crates/openshell-driver-podman/src/container.rs +++ b/crates/openshell-driver-podman/src/container.rs @@ -299,6 +299,17 @@ fn build_env( ); } + // 4. Gateway-minted sandbox JWT (PR 3 of the per-sandbox identity + // series). Passed via env var; the supervisor reads it directly. 
+ if let Some(s) = spec + && !s.sandbox_token.is_empty() + { + env.insert( + openshell_core::sandbox_env::SANDBOX_TOKEN.into(), + s.sandbox_token.clone(), + ); + } + env } diff --git a/crates/openshell-sandbox/Cargo.toml b/crates/openshell-sandbox/Cargo.toml index 29919ede4..be74827f2 100644 --- a/crates/openshell-sandbox/Cargo.toml +++ b/crates/openshell-sandbox/Cargo.toml @@ -24,8 +24,9 @@ openshell-router = { path = "../openshell-router" } tokio = { workspace = true } # gRPC -tonic = { workspace = true, features = ["channel", "tls"] } +tonic = { workspace = true, features = ["channel", "tls-native-roots"] } tokio-stream = { workspace = true } +spiffe = { workspace = true } # CLI clap = { workspace = true } diff --git a/crates/openshell-sandbox/src/debug_rpc.rs b/crates/openshell-sandbox/src/debug_rpc.rs new file mode 100644 index 000000000..013099198 --- /dev/null +++ b/crates/openshell-sandbox/src/debug_rpc.rs @@ -0,0 +1,236 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! One-shot debug RPCs exposed via `openshell-sandbox debug-rpc`. +//! +//! Designed for end-to-end verification of the per-sandbox identity +//! flow (issue #1354). A `docker exec` (or `kubectl exec`) into a +//! running sandbox can issue raw sandbox-class gRPC calls without +//! standing up a custom binary inside the sandbox image — useful for +//! confirming the cross-sandbox IDOR guard and refresh semantics. +//! +//! Subcommands: +//! - `get-sandbox-config --sandbox-id ` — call `GetSandboxConfig` +//! - `refresh` — call `RefreshSandboxToken` +//! - `show-token` — print the raw gateway JWT bytes +//! - `show-principal` — pretty-print the decoded JWT claims +//! (no signature verification — the supervisor already trusts the +//! 
token's origin) + +use base64::Engine as _; +use miette::{IntoDiagnostic, Result, WrapErr}; +use openshell_core::proto::{ + GetSandboxConfigRequest, RefreshSandboxTokenRequest, open_shell_client::OpenShellClient, +}; + +use crate::grpc_client::{AuthedChannel, connect_channel_pub}; + +/// Entry point for the `debug-rpc` subcommand. Returns the process exit +/// code; `main` propagates it. +pub async fn run(args: &[String]) -> Result { + let cmd = args + .first() + .map(String::as_str) + .ok_or_else(|| miette::miette!("{}", USAGE))?; + + match cmd { + "get-sandbox-config" => run_get_sandbox_config(&args[1..]).await, + "refresh" => run_refresh().await, + "show-token" => run_show_token(), + "show-principal" => run_show_principal(), + "--help" | "-h" => { + println!("{USAGE}"); + Ok(0) + } + other => Err(miette::miette!( + "unknown debug-rpc command '{other}'\n\n{USAGE}" + )), + } +} + +const USAGE: &str = "\ +usage: openshell-sandbox debug-rpc [options] + +commands: + get-sandbox-config --sandbox-id call GetSandboxConfig + refresh call RefreshSandboxToken + show-token print raw gateway JWT + show-principal print decoded JWT claims + +requires: OPENSHELL_ENDPOINT in env, plus one of OPENSHELL_SANDBOX_TOKEN, +OPENSHELL_SANDBOX_TOKEN_FILE, or OPENSHELL_K8S_SA_TOKEN_FILE so the +supervisor's normal token-acquisition path can resolve a JWT."; + +async fn open_client() -> Result> { + let endpoint = std::env::var(openshell_core::sandbox_env::ENDPOINT) + .into_diagnostic() + .wrap_err("OPENSHELL_ENDPOINT must be set")?; + let channel = connect_channel_pub(&endpoint).await?; + Ok(OpenShellClient::new(channel)) +} + +async fn run_get_sandbox_config(args: &[String]) -> Result { + let sandbox_id = parse_flag(args, "--sandbox-id") + .ok_or_else(|| miette::miette!("get-sandbox-config: --sandbox-id is required"))?; + let mut client = open_client().await?; + let resp = client + .get_sandbox_config(GetSandboxConfigRequest { + sandbox_id: sandbox_id.to_string(), + }) + .await; + match 
resp { + Ok(r) => { + let inner = r.into_inner(); + println!( + "version={} policy_hash={} config_revision={}", + inner.version, inner.policy_hash, inner.config_revision + ); + Ok(0) + } + Err(status) => { + eprintln!("{}: {}", code_name(status.code()), status.message()); + // Map gRPC status to a non-zero exit so callers can branch + // (e.g. expect-permission-denied in a shell test). + Ok(match status.code() { + tonic::Code::PermissionDenied => 7, + tonic::Code::Unauthenticated => 16, + tonic::Code::NotFound => 5, + _ => 1, + }) + } + } +} + +async fn run_refresh() -> Result { + let mut client = open_client().await?; + let resp = client + .refresh_sandbox_token(RefreshSandboxTokenRequest {}) + .await; + match resp { + Ok(r) => { + let inner = r.into_inner(); + println!( + "token={}\nexpires_at_ms={}", + inner.token, inner.expires_at_ms + ); + Ok(0) + } + Err(status) => { + eprintln!("{}: {}", code_name(status.code()), status.message()); + Ok(1) + } + } +} + +fn run_show_token() -> Result { + let token = read_local_token()?; + println!("{token}"); + Ok(0) +} + +fn run_show_principal() -> Result { + let token = read_local_token()?; + let payload_b64 = token + .split('.') + .nth(1) + .ok_or_else(|| miette::miette!("token has no payload segment"))?; + let payload = base64::engine::general_purpose::URL_SAFE_NO_PAD + .decode(payload_b64) + .into_diagnostic() + .wrap_err("failed to base64-decode token payload")?; + let claims: serde_json::Value = serde_json::from_slice(&payload) + .into_diagnostic() + .wrap_err("failed to parse token payload as JSON")?; + println!( + "{}", + serde_json::to_string_pretty(&claims).into_diagnostic()? + ); + Ok(0) +} + +/// Read the token from the env/file/SA-bootstrap chain, but only the +/// "already a gateway JWT" paths — show-token / show-principal don't +/// want to actually exchange an SA token. 
/// Look up the value of a flag given as `--name value` or `--name=value`.
///
/// `args` is scanned left to right and the first argument that matches
/// `name` decides the result:
///
/// - an argument exactly equal to `name` yields the argument that follows
///   it, or `None` when `name` is the last argument;
/// - an argument of the form `name=<value>` yields `<value>` (which may be
///   the empty string).
///
/// Arguments that merely start with `name` (e.g. `--sandbox-id-extra`) are
/// ignored. Returns `None` when no argument matches.
fn parse_flag<'a>(args: &'a [String], name: &str) -> Option<&'a str> {
    let mut iter = args.iter();
    while let Some(arg) = iter.next() {
        // `strip_prefix` borrows from `arg`, so no `format!("{name}=")`
        // String has to be built for every argument scanned.
        match arg.strip_prefix(name) {
            // Exact match: the value is the next argument, if any.
            Some("") => return iter.next().map(String::as_str),
            Some(rest) => {
                if let Some(value) = rest.strip_prefix('=') {
                    return Some(value);
                }
            }
            None => {}
        }
    }
    None
}
["--sandbox-id", "abc-123"] + .iter() + .map(ToString::to_string) + .collect(); + assert_eq!(parse_flag(&args, "--sandbox-id"), Some("abc-123")); + } + + #[test] + fn parse_flag_handles_equals_separated() { + let args: Vec = ["--sandbox-id=abc-123".to_string()].to_vec(); + assert_eq!(parse_flag(&args, "--sandbox-id"), Some("abc-123")); + } + + #[test] + fn parse_flag_returns_none_when_missing() { + let args: Vec = ["--other".to_string(), "x".to_string()].to_vec(); + assert!(parse_flag(&args, "--sandbox-id").is_none()); + } +} diff --git a/crates/openshell-sandbox/src/grpc_client.rs b/crates/openshell-sandbox/src/grpc_client.rs index 28492b543..dafd6901b 100644 --- a/crates/openshell-sandbox/src/grpc_client.rs +++ b/crates/openshell-sandbox/src/grpc_client.rs @@ -3,22 +3,122 @@ //! gRPC client for fetching sandbox policy, provider environment, and inference //! route bundles from `OpenShell` server. +//! +//! Every request carries a sandbox bearer credential in the `Authorization` +//! header. The token is resolved at startup from one of four sources: +//! +//! 1. `OPENSHELL_SANDBOX_TOKEN` — raw JWT in the env (test harness path). +//! 2. `OPENSHELL_SANDBOX_TOKEN_FILE` — file containing the JWT (Docker / +//! Podman / VM drivers write this to a bundle file at sandbox-create +//! time). +//! 3. `OPENSHELL_SPIFFE_WORKLOAD_API_SOCKET` — local SPIFFE Workload API +//! socket; the supervisor fetches a JWT-SVID and presents it directly. +//! 4. `OPENSHELL_K8S_SA_TOKEN_FILE` — projected `ServiceAccount` JWT; the +//! supervisor exchanges it for a gateway JWT via `IssueSandboxToken` +//! once at startup. +//! +//! The resolved bearer credential is held in process memory thereafter and +//! injected on every outbound call by [`AuthInterceptor`]. 
use std::collections::HashMap; -use std::time::Duration; +use std::path::PathBuf; +use std::sync::{Arc, OnceLock, RwLock}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; use miette::{IntoDiagnostic, Result, WrapErr}; use openshell_core::proto::{ DenialSummary, GetDraftPolicyRequest, GetInferenceBundleRequest, GetInferenceBundleResponse, - GetSandboxConfigRequest, GetSandboxProviderEnvironmentRequest, PolicyChunk, PolicySource, - PolicyStatus, ReportPolicyStatusRequest, SandboxPolicy as ProtoSandboxPolicy, - SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, UpdateConfigRequest, - inference_client::InferenceClient, open_shell_client::OpenShellClient, + GetSandboxConfigRequest, GetSandboxProviderEnvironmentRequest, IssueSandboxTokenRequest, + PolicyChunk, PolicySource, PolicyStatus, RefreshSandboxTokenRequest, ReportPolicyStatusRequest, + SandboxPolicy as ProtoSandboxPolicy, SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, + UpdateConfigRequest, inference_client::InferenceClient, open_shell_client::OpenShellClient, }; +use openshell_core::sandbox_env; +use spiffe::{SpiffeId, WorkloadApiClient}; +use tonic::Status; +use tonic::metadata::AsciiMetadataValue; +use tonic::service::interceptor::InterceptedService; use tonic::transport::{Certificate, Channel, ClientTlsConfig, Endpoint, Identity}; -use tracing::debug; +use tracing::{debug, info, warn}; -/// Create a channel to the `OpenShell` server. +/// Channel type after the [`AuthInterceptor`] is applied. Aliased so the +/// generated client type signatures stay readable. +pub type AuthedChannel = InterceptedService; + +/// Shared, refreshable Bearer header. All [`AuthInterceptor`] clones read +/// the same slot, so the PR-5 refresh task can rotate the token in place +/// without rebuilding the channel. +type TokenSlot = Arc>; + +/// Process-wide token slot. Initialized by the first [`connect_channel`] +/// call and shared with every subsequent client + the refresh loop. 
+static TOKEN_SLOT: OnceLock = OnceLock::new(); + +/// One-shot guard so the refresh loop spawns at most once per process. +static REFRESH_SPAWNED: OnceLock<()> = OnceLock::new(); + +#[derive(Clone, Debug)] +enum RefreshMode { + GatewayJwt, + Spiffe(SpiffeTokenSource), +} + +#[derive(Clone, Debug)] +struct SpiffeTokenSource { + socket_path: PathBuf, + audience: String, + spiffe_id: Option, +} + +#[derive(Debug)] +struct AcquiredToken { + token: String, + refresh_mode: RefreshMode, +} + +fn install_token_slot(token: &str) -> Result { + let bearer = AsciiMetadataValue::try_from(format!("Bearer {token}")) + .into_diagnostic() + .wrap_err("sandbox JWT contained characters not valid for a header value")?; + if let Some(existing) = TOKEN_SLOT.get() { + *existing.write().expect("token slot poisoned") = bearer; + return Ok(existing.clone()); + } + let slot: TokenSlot = Arc::new(RwLock::new(bearer)); + let _ = TOKEN_SLOT.set(slot.clone()); + Ok(TOKEN_SLOT.get().cloned().unwrap_or(slot)) +} + +/// gRPC interceptor that injects `authorization: Bearer ` on every +/// outbound request. The token lives in a shared [`TokenSlot`] so the +/// PR-5 refresh task can replace it without rebuilding clients. +#[derive(Clone)] +pub struct AuthInterceptor { + bearer: TokenSlot, +} + +impl AuthInterceptor { + fn new(bearer: TokenSlot) -> Self { + Self { bearer } + } +} + +impl tonic::service::Interceptor for AuthInterceptor { + fn call( + &mut self, + mut req: tonic::Request<()>, + ) -> std::result::Result, Status> { + let bearer = self + .bearer + .read() + .expect("auth interceptor token slot poisoned") + .clone(); + req.metadata_mut().insert("authorization", bearer); + Ok(req) + } +} + +/// Build the plain (un-intercepted) gRPC channel. 
/// /// When the endpoint uses `https://`, mTLS is configured using these env vars: /// - `OPENSHELL_TLS_CA` -- path to the CA certificate @@ -27,7 +127,7 @@ use tracing::debug; /// /// When the endpoint uses `http://`, a plaintext connection is used (for /// deployments where TLS is disabled, e.g. behind a Cloudflare Tunnel). -async fn connect_channel(endpoint: &str) -> Result { +async fn build_plain_channel(endpoint: &str) -> Result { let mut ep = Endpoint::from_shared(endpoint.to_string()) .into_diagnostic() .wrap_err("invalid gRPC endpoint")? @@ -43,13 +143,13 @@ async fn connect_channel(endpoint: &str) -> Result { let tls_enabled = endpoint.starts_with("https://"); if tls_enabled { - let ca_path = std::env::var(openshell_core::sandbox_env::TLS_CA) + let ca_path = std::env::var(sandbox_env::TLS_CA) .into_diagnostic() .wrap_err("OPENSHELL_TLS_CA is required")?; - let cert_path = std::env::var(openshell_core::sandbox_env::TLS_CERT) + let cert_path = std::env::var(sandbox_env::TLS_CERT) .into_diagnostic() .wrap_err("OPENSHELL_TLS_CERT is required")?; - let key_path = std::env::var(openshell_core::sandbox_env::TLS_KEY) + let key_path = std::env::var(sandbox_env::TLS_KEY) .into_diagnostic() .wrap_err("OPENSHELL_TLS_KEY is required")?; @@ -79,24 +179,367 @@ async fn connect_channel(endpoint: &str) -> Result { .wrap_err("failed to connect to OpenShell server") } -/// Create a channel to the `OpenShell` server (public for use by `supervisor_session`). -pub async fn connect_channel_pub(endpoint: &str) -> Result { +/// Build a Bearer-authenticated channel to the gateway. +/// +/// First call per process resolves the sandbox JWT via the three-step +/// lookup (env → file → K8s SA bootstrap exchange) and installs it into +/// the process-wide [`TOKEN_SLOT`]. Subsequent calls reuse the cached +/// slot — the refresh loop keeps the value fresh, so re-running the +/// bootstrap is both unnecessary and (on the K8s SA path) expensive +/// (one apiserver round-trip per call). 
The refresh loop itself is +/// spawned once per process via [`REFRESH_SPAWNED`]. +async fn connect_channel(endpoint: &str) -> Result { + let channel = build_plain_channel(endpoint).await?; + let slot = if let Some(existing) = TOKEN_SLOT.get() { + existing.clone() + } else { + let acquired = acquire_sandbox_token(endpoint, &channel).await?; + let slot = install_token_slot(&acquired.token)?; + if REFRESH_SPAWNED.set(()).is_ok() { + match acquired.refresh_mode { + RefreshMode::GatewayJwt => { + let refresh_channel = InterceptedService::new( + channel.clone(), + AuthInterceptor::new(slot.clone()), + ); + let refresh_slot = slot.clone(); + tokio::spawn(async move { + refresh_token_loop(refresh_channel, refresh_slot).await; + }); + } + RefreshMode::Spiffe(source) => { + let refresh_slot = slot.clone(); + tokio::spawn(async move { + refresh_spiffe_token_loop(source, refresh_slot).await; + }); + } + } + } + slot + }; + let intercepted = InterceptedService::new(channel, AuthInterceptor::new(slot)); + Ok(intercepted) +} + +/// Resolve the sandbox JWT used to authenticate every outbound RPC. +/// +/// `endpoint` is logged on errors but never used for transport here; the +/// actual network call lives inside this function only on the K8s +/// bootstrap path, which uses `plain_channel` to call `IssueSandboxToken` +/// once before the steady-state Bearer-authenticated channel is built. 
+async fn acquire_sandbox_token(endpoint: &str, plain_channel: &Channel) -> Result { + if let Ok(t) = std::env::var(sandbox_env::SANDBOX_TOKEN) + && !t.is_empty() + { + debug!(source = "env", "loaded sandbox token"); + return Ok(AcquiredToken { + token: t, + refresh_mode: RefreshMode::GatewayJwt, + }); + } + + if let Ok(path) = std::env::var(sandbox_env::SANDBOX_TOKEN_FILE) + && !path.is_empty() + { + let contents = std::fs::read_to_string(&path) + .into_diagnostic() + .wrap_err_with(|| format!("failed to read sandbox token from {path}"))?; + debug!(source = "file", path = %path, "loaded sandbox token"); + return Ok(AcquiredToken { + token: contents.trim().to_string(), + refresh_mode: RefreshMode::GatewayJwt, + }); + } + + if let Some(source) = spiffe_token_source_from_env()? { + info!( + socket = %source.socket_path.display(), + audience = %source.audience, + spiffe_id = source.spiffe_id.as_deref().unwrap_or(""), + "fetching SPIFFE JWT-SVID for sandbox gateway authentication" + ); + let token = fetch_spiffe_jwt_svid(&source).await?; + return Ok(AcquiredToken { + token, + refresh_mode: RefreshMode::Spiffe(source), + }); + } + + if let Ok(sa_path) = std::env::var(sandbox_env::K8S_SA_TOKEN_FILE) + && !sa_path.is_empty() + { + let sa_token = std::fs::read_to_string(&sa_path) + .into_diagnostic() + .wrap_err_with(|| format!("failed to read K8s SA token from {sa_path}"))? + .trim() + .to_string(); + info!(endpoint = %endpoint, "exchanging K8s ServiceAccount token for sandbox JWT"); + // The bootstrap exchange uses a one-off interceptor pinned to the + // SA token; the resulting gateway JWT becomes the value in the + // shared `TOKEN_SLOT` once `connect_channel` returns. 
+ let bootstrap_slot: TokenSlot = Arc::new(RwLock::new( + AsciiMetadataValue::try_from(format!("Bearer {sa_token}")) + .into_diagnostic() + .wrap_err("SA token contained characters not valid for a header value")?, + )); + let interceptor = AuthInterceptor::new(bootstrap_slot); + let bootstrap = InterceptedService::new(plain_channel.clone(), interceptor); + let mut client = OpenShellClient::new(bootstrap); + let resp = client + .issue_sandbox_token(IssueSandboxTokenRequest {}) + .await + .into_diagnostic() + .wrap_err("IssueSandboxToken bootstrap exchange failed")?; + return Ok(AcquiredToken { + token: resp.into_inner().token, + refresh_mode: RefreshMode::GatewayJwt, + }); + } + + Err(miette::miette!( + "no sandbox token source available — set one of {}, {}, {}, or {}", + sandbox_env::SANDBOX_TOKEN, + sandbox_env::SANDBOX_TOKEN_FILE, + sandbox_env::SPIFFE_WORKLOAD_API_SOCKET, + sandbox_env::K8S_SA_TOKEN_FILE, + )) +} + +fn spiffe_token_source_from_env() -> Result> { + let Ok(socket_path) = std::env::var(sandbox_env::SPIFFE_WORKLOAD_API_SOCKET) else { + return Ok(None); + }; + if socket_path.trim().is_empty() { + return Ok(None); + } + let audience = std::env::var(sandbox_env::SPIFFE_AUDIENCE) + .unwrap_or_else(|_| "openshell-gateway".to_string()); + if audience.trim().is_empty() { + return Err(miette::miette!( + "{} must not be empty when {} is set", + sandbox_env::SPIFFE_AUDIENCE, + sandbox_env::SPIFFE_WORKLOAD_API_SOCKET, + )); + } + let spiffe_id = std::env::var(sandbox_env::SPIFFE_ID) + .ok() + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()); + Ok(Some(SpiffeTokenSource { + socket_path: PathBuf::from(socket_path), + audience, + spiffe_id, + })) +} + +async fn fetch_spiffe_jwt_svid(source: &SpiffeTokenSource) -> Result { + let endpoint = spiffe_workload_api_endpoint(&source.socket_path); + let client = WorkloadApiClient::connect_to(&endpoint) + .await + .into_diagnostic() + .wrap_err_with(|| { + format!("failed to connect to SPIFFE 
/// Turn a configured socket location into a Workload API endpoint string.
///
/// A value that already starts with `unix:` or `tcp:` is returned as-is;
/// anything else is treated as a filesystem path and prefixed with
/// `unix:`. Non-UTF-8 path bytes are replaced lossily.
fn spiffe_workload_api_endpoint(path: &std::path::Path) -> String {
    let raw = path.to_string_lossy();
    let has_scheme = ["unix:", "tcp:"]
        .iter()
        .any(|scheme| raw.starts_with(*scheme));
    if has_scheme {
        return raw.into_owned();
    }
    format!("unix:{raw}")
}
+ tokio::time::sleep(Duration::from_secs(60)).await; + } + } + } +} + +async fn refresh_spiffe_token_loop(source: SpiffeTokenSource, slot: TokenSlot) { + loop { + let sleep = compute_refresh_delay(&slot); + tokio::time::sleep(sleep).await; + match fetch_spiffe_jwt_svid(&source).await { + Ok(new_token) => match AsciiMetadataValue::try_from(format!("Bearer {new_token}")) { + Ok(value) => { + if let Ok(mut guard) = slot.write() { + *guard = value; + info!("rotated SPIFFE JWT-SVID in-place"); + } + } + Err(e) => { + warn!(error = %e, "refreshed SPIFFE JWT-SVID contained invalid header bytes"); + } + }, + Err(err) => { + warn!(error = %err, "SPIFFE FetchJWTSVID failed; will retry"); + tokio::time::sleep(Duration::from_secs(60)).await; + } + } + } +} + +/// Compute the next refresh delay: 80 % of the time remaining until the +/// current token's `exp`, plus up to 10 % jitter, floored at 60 s and +/// capped at 12 h. If the token can't be parsed (legacy/non-JWT bearer) +/// default to 6 h. +fn compute_refresh_delay(slot: &TokenSlot) -> Duration { + let token = slot + .read() + .ok() + .and_then(|v| v.to_str().ok().map(str::to_string)) + .unwrap_or_default(); + let bearer = token.strip_prefix("Bearer ").unwrap_or(&token); + let now_ms = i64::try_from( + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_or(0, |d| d.as_millis()), + ) + .unwrap_or(i64::MAX); + let remaining_ms = parse_jwt_exp_ms(bearer).map_or(21_600_000, |exp| exp - now_ms); // 6 h fallback + let mut delay_ms = (remaining_ms.max(0) * 8 / 10).clamp(60_000, 43_200_000); + // Up to 10 % jitter, derived deterministically from token bytes so + // unit tests are reproducible without injecting an RNG. 
+ let jitter_pct = (token.len() % 10) as u64; + let jitter_ms = (u64::try_from(delay_ms).unwrap_or(0) * jitter_pct) / 100; + delay_ms = delay_ms.saturating_add(i64::try_from(jitter_ms).unwrap_or(0)); + Duration::from_millis(u64::try_from(delay_ms).unwrap_or(0)) +} + +/// Decode the `exp` claim from a JWT without verifying its signature. +/// Returns the expiry in milliseconds since the Unix epoch, or `None` if +/// the token is not a parseable JWT. +fn parse_jwt_exp_ms(jwt: &str) -> Option { + use base64::Engine; + let mut parts = jwt.splitn(3, '.'); + let _header = parts.next()?; + let payload_b64 = parts.next()?; + let decoded = base64::engine::general_purpose::URL_SAFE_NO_PAD + .decode(payload_b64) + .ok()?; + let value: serde_json::Value = serde_json::from_slice(&decoded).ok()?; + let exp_secs = value.get("exp")?.as_i64()?; + exp_secs.checked_mul(1000) +} + +#[cfg(test)] +mod auth_tests { + use super::*; + + #[test] + fn parse_jwt_exp_reads_unsigned_payload() { + use base64::Engine as _; + let payload = base64::engine::general_purpose::URL_SAFE_NO_PAD + .encode(br#"{"exp":1234567890,"sandbox_id":"sb-1"}"#); + let token = format!("h.{payload}.sig"); + assert_eq!(parse_jwt_exp_ms(&token), Some(1_234_567_890_000)); + } + + #[test] + fn parse_jwt_exp_returns_none_for_malformed_token() { + assert!(parse_jwt_exp_ms("not-a-jwt").is_none()); + assert!(parse_jwt_exp_ms("only.two").is_none()); + assert!(parse_jwt_exp_ms("a.!!!.c").is_none()); + } + + #[test] + fn compute_refresh_delay_uses_80_percent_when_token_present() { + // Build a JWT whose exp is 1000 seconds in the future. With 0-jitter + // the delay should be roughly 800 seconds. 
+ use base64::Engine as _; + let now_s = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs(); + let exp = now_s + 1000; + let payload_json = format!(r#"{{"exp":{exp}}}"#); + let payload = base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(payload_json); + let token = format!("h.{payload}.s"); + let bearer = AsciiMetadataValue::try_from(format!("Bearer {token}")).unwrap(); + let slot: TokenSlot = Arc::new(RwLock::new(bearer)); + let delay = compute_refresh_delay(&slot); + // 800 s baseline + up to 10 % jitter → 800..=880 s, with some slack + // for the 1-second resolution of the exp claim. + let secs = delay.as_secs(); + assert!( + (700..=900).contains(&secs), + "expected 80%-of-1000s delay, got {secs}s" + ); + } + + #[test] + fn compute_refresh_delay_floors_at_60_seconds() { + // Already-expired token still produces a 60 s floor so the loop + // doesn't busy-spin. + use base64::Engine as _; + let exp = 1; // past + let payload = + base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(format!(r#"{{"exp":{exp}}}"#)); + let token = format!("h.{payload}.s"); + let bearer = AsciiMetadataValue::try_from(format!("Bearer {token}")).unwrap(); + let slot: TokenSlot = Arc::new(RwLock::new(bearer)); + let delay = compute_refresh_delay(&slot); + assert!(delay.as_secs() >= 60); + } +} + /// Connect to the `OpenShell` server. -/// -/// Sandboxes authenticate to the gateway via the mTLS client certificate -/// configured by `connect_channel`. They do not present an OIDC Bearer -/// token; the gateway recognises sandbox-class callers by absence of a -/// Bearer header on the request. -async fn connect(endpoint: &str) -> Result> { +async fn connect(endpoint: &str) -> Result> { let channel = connect_channel(endpoint).await?; Ok(OpenShellClient::new(channel)) } /// Connect to the inference service. 
-async fn connect_inference(endpoint: &str) -> Result> { +async fn connect_inference(endpoint: &str) -> Result> { let channel = connect_channel(endpoint).await?; Ok(InferenceClient::new(channel)) } @@ -118,7 +561,7 @@ pub async fn fetch_policy(endpoint: &str, sandbox_id: &str) -> Result, + client: &mut OpenShellClient, sandbox_id: &str, ) -> Result> { let response = client @@ -142,7 +585,7 @@ async fn fetch_policy_with_client( /// Sync a locally-discovered policy using an existing client connection. async fn sync_policy_with_client( - client: &mut OpenShellClient, + client: &mut OpenShellClient, sandbox: &str, policy: &ProtoSandboxPolicy, ) -> Result<()> { @@ -236,7 +679,7 @@ pub async fn fetch_provider_environment( /// and status reporting, avoiding per-request TLS handshake overhead. #[derive(Clone)] pub struct CachedOpenShellClient { - client: OpenShellClient, + client: OpenShellClient, } /// Settings poll result returned by [`CachedOpenShellClient::poll_settings`]. @@ -266,7 +709,7 @@ impl CachedOpenShellClient { } /// Get a clone of the underlying tonic client for direct RPC calls. - pub fn raw_client(&self) -> OpenShellClient { + pub fn raw_client(&self) -> OpenShellClient { self.client.clone() } diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index e297b9262..a9ea57fe0 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -7,6 +7,7 @@ pub mod bypass_monitor; mod child_env; +pub mod debug_rpc; pub mod denial_aggregator; mod grpc_client; mod identity; diff --git a/crates/openshell-sandbox/src/main.rs b/crates/openshell-sandbox/src/main.rs index 4a6cb1955..3c9e21578 100644 --- a/crates/openshell-sandbox/src/main.rs +++ b/crates/openshell-sandbox/src/main.rs @@ -24,6 +24,15 @@ use openshell_sandbox::run_sandbox; /// performs the copy in pure Rust. const COPY_SELF_SUBCOMMAND: &str = "copy-self"; +/// Subcommand for one-shot debug RPCs from inside a sandbox container. 
+/// +/// Reads the same token sources as the supervisor (env, file, K8s SA +/// bootstrap) and issues a single gRPC call against the gateway. Useful +/// for end-to-end verification: e.g. `docker exec` into a sandbox, then +/// run `openshell-sandbox debug-rpc get-sandbox-config --sandbox-id ` +/// to confirm the cross-sandbox IDOR guard fires. +const DEBUG_RPC_SUBCOMMAND: &str = "debug-rpc"; + /// `OpenShell` Sandbox - process isolation and monitoring. #[derive(Parser, Debug)] #[command(name = "openshell-sandbox")] @@ -150,6 +159,20 @@ fn main() -> Result<()> { return copy_self(dest); } + // Handle `debug-rpc [args]` before clap. Uses a small + // dedicated runtime so we don't pay the supervisor's full startup cost. + if raw_args.get(1).map(String::as_str) == Some(DEBUG_RPC_SUBCOMMAND) { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .into_diagnostic()?; + return runtime.block_on(async move { + let _ = rustls::crypto::ring::default_provider().install_default(); + let exit = openshell_sandbox::debug_rpc::run(&raw_args[2..]).await?; + std::process::exit(exit); + }); + } + let args = Args::parse(); // Try to open a rolling log file; fall back to stderr-only logging if it fails diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-sandbox/src/process.rs index 3d2f6d576..8c6eb77f3 100644 --- a/crates/openshell-sandbox/src/process.rs +++ b/crates/openshell-sandbox/src/process.rs @@ -155,6 +155,15 @@ impl ProcessHandle { .kill_on_drop(true) .env(openshell_core::sandbox_env::SANDBOX, "1"); + // Strip supervisor-only credentials from the entrypoint's inherited + // environment. The entrypoint drops to the sandbox user before + // `exec`; without this strip, anything running as the sandbox user + // (e.g. an SSH-spawned shell) could read /proc//environ + // and recover the gateway-minted JWT. Issue #1354. 
+ cmd.env_remove(openshell_core::sandbox_env::SANDBOX_TOKEN) + .env_remove(openshell_core::sandbox_env::SANDBOX_TOKEN_FILE) + .env_remove(openshell_core::sandbox_env::K8S_SA_TOKEN_FILE); + inject_provider_env(&mut cmd, provider_env); if let Some(dir) = workdir { @@ -281,6 +290,15 @@ impl ProcessHandle { .kill_on_drop(true) .env(openshell_core::sandbox_env::SANDBOX, "1"); + // Strip supervisor-only credentials from the entrypoint's inherited + // environment. The entrypoint drops to the sandbox user before + // `exec`; without this strip, anything running as the sandbox user + // (e.g. an SSH-spawned shell) could read /proc//environ + // and recover the gateway-minted JWT. Issue #1354. + cmd.env_remove(openshell_core::sandbox_env::SANDBOX_TOKEN) + .env_remove(openshell_core::sandbox_env::SANDBOX_TOKEN_FILE) + .env_remove(openshell_core::sandbox_env::K8S_SA_TOKEN_FILE); + inject_provider_env(&mut cmd, provider_env); if let Some(dir) = workdir { diff --git a/crates/openshell-sandbox/src/supervisor_session.rs b/crates/openshell-sandbox/src/supervisor_session.rs index 6485dddf0..4d7392ee3 100644 --- a/crates/openshell-sandbox/src/supervisor_session.rs +++ b/crates/openshell-sandbox/src/supervisor_session.rs @@ -28,7 +28,6 @@ use openshell_ocsf::{ use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; use tokio::sync::mpsc; use tokio_stream::StreamExt; -use tonic::transport::Channel; use tracing::{debug, warn}; use crate::grpc_client; @@ -371,7 +370,7 @@ fn handle_gateway_message( sandbox_id: &str, ssh_socket_path: &std::path::Path, netns_fd: Option, - channel: &Channel, + channel: &grpc_client::AuthedChannel, tx: &mpsc::Sender, ) { match &msg.payload { @@ -436,7 +435,7 @@ async fn handle_relay_open( relay_open: RelayOpen, ssh_socket_path: &std::path::Path, netns_fd: Option, - channel: Channel, + channel: grpc_client::AuthedChannel, tx: mpsc::Sender, ) -> Result<(), Box> { let channel_id = relay_open.channel_id.clone(); diff --git 
a/crates/openshell-server/Cargo.toml b/crates/openshell-server/Cargo.toml index 4bbfe24fc..82920e1e1 100644 --- a/crates/openshell-server/Cargo.toml +++ b/crates/openshell-server/Cargo.toml @@ -33,7 +33,7 @@ k8s-openapi = { workspace = true } tokio = { workspace = true } # gRPC -tonic = { workspace = true, features = ["channel", "tls"] } +tonic = { workspace = true, features = ["channel", "tls-native-roots"] } prost = { workspace = true } prost-types = { workspace = true } @@ -82,6 +82,9 @@ uuid = { workspace = true } hmac = "0.12" sha2 = { workspace = true } jsonwebtoken = { workspace = true } +spiffe = { workspace = true } +async-trait = "0.1" +url = { workspace = true } hex = "0.4" russh = "0.57" rand = { workspace = true } diff --git a/crates/openshell-server/src/auth/authenticator.rs b/crates/openshell-server/src/auth/authenticator.rs new file mode 100644 index 000000000..ee11f8f35 --- /dev/null +++ b/crates/openshell-server/src/auth/authenticator.rs @@ -0,0 +1,277 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Pluggable authentication trait + chain dispatch. +//! +//! The gateway runs every authenticated request through an +//! [`AuthenticatorChain`] of [`Authenticator`] implementations. The chain +//! evaluates authenticators in order; the first one that recognizes the +//! caller produces the [`Principal`]. An authenticator that does not apply +//! (e.g. an OIDC authenticator seeing no Bearer header) returns `Ok(None)` +//! so the chain falls through to the next. An authenticator that *does* +//! apply but rejects the caller returns `Err(Status)`, which terminates +//! the chain — fail-closed. +//! +//! Live authenticators slotting into the chain: +//! - [`super::sandbox_jwt::SandboxJwtAuthenticator`] — gateway-minted JWTs +//! - [`super::k8s_sa::K8sServiceAccountAuthenticator`] — K8s projected SA +//! tokens (path-scoped to `IssueSandboxToken`) +//! 
- [`super::oidc::OidcAuthenticator`] — user OIDC Bearer tokens +//! - [`PermissiveUserAuthenticator`] — final-fallback dev-mode catch-all +//! that produces a synthetic user principal when no OIDC is +//! configured. Preserves the pre-PR-1 "no OIDC = open" posture for +//! singleplayer / helm-dev deployments. + +use super::identity::{Identity, IdentityProvider}; +use super::principal::{Principal, UserPrincipal}; +use async_trait::async_trait; +use std::sync::Arc; +use tonic::Status; + +/// Pluggable authentication step. +/// +/// Implementations are expected to be cheap to clone (they live behind +/// `Arc` inside an [`AuthenticatorChain`]). +#[async_trait] +pub trait Authenticator: Send + Sync + 'static { + /// Inspect an inbound request and return the authenticated principal. + /// + /// - `Ok(Some(principal))` — this authenticator recognized the caller. + /// The chain stops and the principal is inserted into request + /// extensions. + /// - `Ok(None)` — this authenticator does not apply (e.g. no Bearer + /// token for an OIDC authenticator). The chain falls through to + /// the next authenticator. + /// - `Err(status)` — this authenticator applies but rejected the + /// caller. The chain terminates and the status is returned to the + /// client. Fail-closed. + async fn authenticate( + &self, + headers: &http::HeaderMap, + path: &str, + ) -> Result, Status>; +} + +/// First-match-wins authenticator chain. +/// +/// The chain owns its authenticators behind `Arc` so the entire chain is +/// cheap to clone — required because `tower::Service::call` clones the +/// router on every request. +#[derive(Clone)] +pub struct AuthenticatorChain { + authenticators: Arc<[Arc]>, +} + +impl AuthenticatorChain { + /// Build a chain from an ordered list of authenticators. Earlier + /// entries are evaluated first. + pub fn new(authenticators: Vec>) -> Self { + Self { + authenticators: Arc::from(authenticators), + } + } + + /// Run the chain. Returns the first principal produced. 
If every + /// authenticator returns `Ok(None)`, the result is `Ok(None)` — the + /// router translates that to `unauthenticated`. + pub async fn authenticate( + &self, + headers: &http::HeaderMap, + path: &str, + ) -> Result, Status> { + for authenticator in self.authenticators.iter() { + if let Some(principal) = authenticator.authenticate(headers, path).await? { + return Ok(Some(principal)); + } + } + Ok(None) + } +} + +impl std::fmt::Debug for AuthenticatorChain { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AuthenticatorChain") + .field("len", &self.authenticators.len()) + .finish() + } +} + +/// Final-fallback authenticator that produces a synthetic user principal +/// for any request the earlier authenticators didn't claim. Used only +/// when no user-side authentication is configured (no OIDC, no fronting +/// proxy contract) — the pre-PR-1 gateway accepted such requests with +/// no auth at all; this preserves that posture in a principal-aware +/// way so handlers always see *some* principal in extensions. +/// +/// Producing a User principal (rather than Anonymous) means dev-mode +/// requests pass the per-handler IDOR guard via the User-bypass +/// branch — equivalent to "RBAC was the user's gate" with the dev +/// default of "every caller is a user." 
+pub struct PermissiveUserAuthenticator { + subject: String, +} + +impl PermissiveUserAuthenticator { + pub fn new(subject: impl Into) -> Self { + Self { + subject: subject.into(), + } + } +} + +#[async_trait] +impl Authenticator for PermissiveUserAuthenticator { + async fn authenticate( + &self, + _headers: &http::HeaderMap, + _path: &str, + ) -> Result, Status> { + Ok(Some(Principal::User(UserPrincipal { + identity: Identity { + subject: self.subject.clone(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Internal, + }, + }))) + } +} + +#[cfg(test)] +pub mod test_support { + use super::*; + use std::sync::Mutex; + + /// Authenticator that always returns the configured outcome. Used by + /// tests to inject a known principal (or rejection) without running real + /// crypto. Each call records the path it was invoked with so tests can + /// assert chain ordering. + pub struct MockAuthenticator { + pub outcome: Result, Status>, + pub calls: Mutex>, + } + + impl MockAuthenticator { + pub fn returning(outcome: Result, Status>) -> Self { + Self { + outcome, + calls: Mutex::new(Vec::new()), + } + } + + pub fn call_count(&self) -> usize { + self.calls.lock().unwrap().len() + } + } + + #[async_trait] + impl Authenticator for MockAuthenticator { + async fn authenticate( + &self, + _headers: &http::HeaderMap, + path: &str, + ) -> Result, Status> { + self.calls.lock().unwrap().push(path.to_string()); + self.outcome.clone() + } + } +} + +#[cfg(test)] +mod tests { + use super::test_support::MockAuthenticator; + use super::*; + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::UserPrincipal; + + fn user_principal(subject: &str) -> Principal { + Principal::User(UserPrincipal { + identity: Identity { + subject: subject.to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + }) + } + + #[tokio::test] + async fn chain_returns_first_match() { + let 
first = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "alice", + ))))); + let second = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "bob", + ))))); + let chain = AuthenticatorChain::new(vec![first.clone(), second.clone()]); + let result = chain + .authenticate(&http::HeaderMap::new(), "/some/path") + .await + .unwrap() + .expect("expected a principal"); + match result { + Principal::User(u) => assert_eq!(u.identity.subject, "alice"), + _ => panic!("expected user principal"), + } + assert_eq!(first.call_count(), 1); + assert_eq!( + second.call_count(), + 0, + "second authenticator must be skipped after first matches" + ); + } + + #[tokio::test] + async fn chain_falls_through_on_none() { + let first = Arc::new(MockAuthenticator::returning(Ok(None))); + let second = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "bob", + ))))); + let chain = AuthenticatorChain::new(vec![first.clone(), second.clone()]); + let result = chain + .authenticate(&http::HeaderMap::new(), "/some/path") + .await + .unwrap() + .expect("expected a principal"); + match result { + Principal::User(u) => assert_eq!(u.identity.subject, "bob"), + _ => panic!("expected user principal"), + } + assert_eq!(first.call_count(), 1); + assert_eq!(second.call_count(), 1); + } + + #[tokio::test] + async fn chain_fails_closed_on_first_error() { + let first = Arc::new(MockAuthenticator::returning(Err(Status::unauthenticated( + "bad token", + )))); + let second = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "bob", + ))))); + let chain = AuthenticatorChain::new(vec![first.clone(), second.clone()]); + let err = chain + .authenticate(&http::HeaderMap::new(), "/some/path") + .await + .expect_err("must short-circuit on error"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + assert_eq!(first.call_count(), 1); + assert_eq!( + second.call_count(), + 0, + "must not consult later authenticators after an error" + ); + } + + #[tokio::test] + 
async fn empty_chain_returns_none() { + let chain = AuthenticatorChain::new(vec![]); + let result = chain + .authenticate(&http::HeaderMap::new(), "/some/path") + .await + .unwrap(); + assert!(result.is_none()); + } +} diff --git a/crates/openshell-server/src/auth/guard.rs b/crates/openshell-server/src/auth/guard.rs new file mode 100644 index 000000000..f5cdb8131 --- /dev/null +++ b/crates/openshell-server/src/auth/guard.rs @@ -0,0 +1,137 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Per-handler sandbox-scope guards. +//! +//! Closes the IDOR half of issue #1354: a sandbox principal may only +//! reference its own sandbox, identified by its [`Principal::Sandbox`]'s +//! `sandbox_id`. User principals retain the broad scope the RBAC layer +//! already evaluated. + +use super::principal::Principal; +use tonic::Status; +use tracing::info; + +/// Reject a sandbox-class request whose body references a sandbox other +/// than the one the calling principal was authenticated against. +/// +/// - [`Principal::User`] passes through (RBAC has already evaluated user +/// scope at the router level). +/// - [`Principal::Sandbox`] must reference the same canonical UUID it +/// was authenticated with. +/// - [`Principal::Anonymous`] is rejected — sandbox-class methods are +/// never anonymously callable. +/// +/// `claimed_sandbox_id` is the canonical UUID the request is operating +/// on. Name-keyed handlers must resolve the name to a UUID via the +/// store before calling this guard. 
+#[allow(clippy::result_large_err)] +pub fn ensure_sandbox_scope(principal: &Principal, claimed_sandbox_id: &str) -> Result<(), Status> { + match principal { + Principal::User(_) => Ok(()), + Principal::Sandbox(p) => { + if p.sandbox_id == claimed_sandbox_id { + Ok(()) + } else { + info!( + principal_sandbox_id = %p.sandbox_id, + requested_sandbox_id = %claimed_sandbox_id, + "cross-sandbox access denied" + ); + Err(Status::permission_denied( + "cross-sandbox access denied: principal does not own this sandbox", + )) + } + } + Principal::Anonymous => Err(Status::unauthenticated( + "sandbox-scoped methods require an authenticated caller", + )), + } +} + +/// Convenience: read the `Principal` out of a request and apply +/// [`ensure_sandbox_scope`]. Returns the principal so callers can read it +/// further (e.g. for audit logging). +#[allow(clippy::result_large_err)] +pub fn enforce_sandbox_scope( + request: &tonic::Request, + claimed_sandbox_id: &str, +) -> Result { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; + ensure_sandbox_scope(&principal, claimed_sandbox_id)?; + Ok(principal) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::{SandboxIdentitySource, SandboxPrincipal, UserPrincipal}; + + fn user(subject: &str) -> Principal { + Principal::User(UserPrincipal { + identity: Identity { + subject: subject.to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + }) + } + + fn sandbox(id: &str) -> Principal { + Principal::Sandbox(SandboxPrincipal { + sandbox_id: id.to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test".to_string(), + jti: "j-1".to_string(), + }, + trust_domain: Some("openshell".to_string()), + }) + } + + #[test] + fn user_principal_bypasses_equality_check() { + // RBAC was the user's gate at 
the router layer. + assert!(ensure_sandbox_scope(&user("alice"), "any-sandbox").is_ok()); + } + + #[test] + fn sandbox_principal_matching_id_is_allowed() { + assert!(ensure_sandbox_scope(&sandbox("sbx-1"), "sbx-1").is_ok()); + } + + #[test] + fn sandbox_principal_mismatched_id_is_denied() { + let err = + ensure_sandbox_scope(&sandbox("sbx-1"), "sbx-2").expect_err("must deny cross-sandbox"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + + #[test] + fn anonymous_principal_is_rejected() { + let err = + ensure_sandbox_scope(&Principal::Anonymous, "sbx-1").expect_err("must reject anon"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + } + + #[test] + fn enforce_reads_from_request_extensions() { + let mut req = tonic::Request::new(()); + req.extensions_mut().insert(sandbox("sbx-1")); + let result = enforce_sandbox_scope(&req, "sbx-1").expect("scope OK"); + assert!(matches!(result, Principal::Sandbox(_))); + } + + #[test] + fn enforce_rejects_request_without_principal() { + let req = tonic::Request::new(()); + let err = enforce_sandbox_scope(&req, "sbx-1").expect_err("must require principal"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + } +} diff --git a/crates/openshell-server/src/auth/k8s_sa.rs b/crates/openshell-server/src/auth/k8s_sa.rs new file mode 100644 index 000000000..be625c703 --- /dev/null +++ b/crates/openshell-server/src/auth/k8s_sa.rs @@ -0,0 +1,598 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Kubernetes `ServiceAccount` bootstrap authenticator. +//! +//! Path-scoped to `IssueSandboxToken`. Validates a projected SA token +//! presented by a sandbox pod, reads the pod's `openshell.io/sandbox-id` +//! annotation, and returns a [`Principal::Sandbox`] with +//! [`SandboxIdentitySource::K8sServiceAccount`]. The `IssueSandboxToken` +//! handler then mints a gateway-signed JWT for that sandbox id; subsequent +//! 
gRPC calls from the supervisor use the gateway-minted JWT validated by +//! [`super::sandbox_jwt::SandboxJwtAuthenticator`]. +//! +//! This is the only authenticator that talks to the K8s apiserver. It is +//! optional — the gateway boots without it in singleplayer deployments. + +use super::authenticator::Authenticator; +use super::principal::{Principal, SandboxIdentitySource, SandboxPrincipal}; +use async_trait::async_trait; +use jsonwebtoken::{Algorithm, DecodingKey, Validation, decode, decode_header}; +use k8s_openapi::api::core::v1::Pod; +use kube::api::Api; +use serde::Deserialize; +use std::collections::HashMap; +use std::sync::Arc; +use tokio::sync::{Mutex, RwLock}; +use tonic::Status; +use tracing::{debug, info, warn}; + +/// gRPC method path that this authenticator accepts. All other paths fall +/// through (return `Ok(None)`) so a gateway-minted JWT is required there. +pub const ISSUE_SANDBOX_TOKEN_PATH: &str = "/openshell.v1.OpenShell/IssueSandboxToken"; + +/// Pod annotation that binds a sandbox pod to its UUID. Set by the +/// Kubernetes compute driver at pod-create time. The gateway treats this +/// annotation as authoritative; the K8s `Role` granted to the gateway must +/// not include `patch pods` (see plan §11.8). +pub const SANDBOX_ID_ANNOTATION: &str = "openshell.io/sandbox-id"; + +/// Resolved identity extracted from a validated SA token + pod lookup. +#[derive(Debug, Clone)] +pub struct ResolvedK8sIdentity { + pub sandbox_id: String, + pub pod_name: String, + pub pod_uid: String, +} + +/// Apiserver-facing operations the authenticator depends on. Split out so +/// tests can fake the apiserver without standing up a kube cluster. +#[async_trait] +pub trait K8sIdentityResolver: Send + Sync + 'static { + /// Validate `token` via `TokenReview` (`aud == openshell-gateway`), + /// extract the pod name/uid, then `GET` the pod and read + /// `openshell.io/sandbox-id`. 
Returns `Ok(None)` when the token is + /// well-formed but does not authenticate (e.g. wrong audience); returns + /// `Err` for transport/server errors. + async fn resolve(&self, token: &str) -> Result, Status>; +} + +/// Authenticator wrapper around a [`K8sIdentityResolver`]. +pub struct K8sServiceAccountAuthenticator { + resolver: Arc, +} + +impl std::fmt::Debug for K8sServiceAccountAuthenticator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("K8sServiceAccountAuthenticator") + .finish_non_exhaustive() + } +} + +impl K8sServiceAccountAuthenticator { + pub fn new(resolver: Arc) -> Self { + Self { resolver } + } +} + +#[async_trait] +impl Authenticator for K8sServiceAccountAuthenticator { + async fn authenticate( + &self, + headers: &http::HeaderMap, + path: &str, + ) -> Result, Status> { + // Scope: only the bootstrap RPC. Other paths fall through so the + // SandboxJwtAuthenticator (or OIDC) handles them. + if path != ISSUE_SANDBOX_TOKEN_PATH { + return Ok(None); + } + + let Some(token) = headers + .get("authorization") + .and_then(|v| v.to_str().ok()) + .and_then(|v| v.strip_prefix("Bearer ")) + else { + return Ok(None); + }; + + let Some(resolved) = self.resolver.resolve(token).await? else { + debug!("K8s SA token did not authenticate; falling through"); + return Ok(None); + }; + + if resolved.sandbox_id.is_empty() { + warn!( + pod = %resolved.pod_name, + "pod missing openshell.io/sandbox-id annotation; rejecting" + ); + return Err(Status::permission_denied( + "pod is not bound to a sandbox identity", + )); + } + + Ok(Some(Principal::Sandbox(SandboxPrincipal { + sandbox_id: resolved.sandbox_id, + source: SandboxIdentitySource::K8sServiceAccount { + pod_name: resolved.pod_name, + pod_uid: resolved.pod_uid, + }, + trust_domain: Some("openshell".to_string()), + }))) + } +} + +/// K8s apiserver discovery document (subset of fields used). 
+#[derive(Deserialize)] +struct ApiserverDiscovery { + issuer: String, + jwks_uri: String, +} + +/// JWKS key set returned by the apiserver's `/openid/v1/jwks` endpoint. +#[derive(Deserialize)] +struct JwkSet { + keys: Vec, +} + +#[derive(Deserialize)] +struct JwkKey { + kid: Option, + kty: String, + #[serde(default)] + n: String, + #[serde(default)] + e: String, + alg: Option, +} + +/// Claims subset extracted from a validated projected SA token. `exp` +/// and `aud` are validated by `jsonwebtoken` and `serviceaccount` is only +/// deserialized; none are read post-decode — dead-code-allowed so the +/// structural match against the token shape stays explicit. +#[derive(Debug, Deserialize)] +#[allow(dead_code)] +struct K8sSaClaims { + /// `system:serviceaccount:<namespace>:<name>` + sub: String, + iss: String, + /// The audience claim is always an array for projected SA tokens. + #[serde(default)] + aud: Vec, + exp: i64, + #[serde(rename = "kubernetes.io")] + kubernetes: K8sClaim, +} + +#[derive(Debug, Deserialize)] +#[allow(dead_code)] +struct K8sClaim { + namespace: String, + pod: K8sPodClaim, + #[serde(default)] + serviceaccount: Option, +} + +#[derive(Debug, Deserialize)] +struct K8sPodClaim { + name: String, + uid: String, +} + +#[derive(Debug, Deserialize)] +struct K8sSaClaim { + #[allow(dead_code)] + name: String, + #[allow(dead_code)] + uid: String, +} + +/// JWKS cache for the K8s apiserver's projected `ServiceAccount` token +/// issuer. Discovery and key fetch happen lazily on the first validation; +/// subsequent validations are in-process signature checks. Refreshes on +/// `kid` miss so apiserver key rotation propagates without a restart. 
+pub struct K8sApiserverJwks { + client: kube::Client, + expected_audience: String, + state: RwLock, + refresh: Mutex<()>, +} + +#[derive(Default)] +struct JwksState { + issuer: Option, + jwks_path: Option, + keys: HashMap, +} + +impl K8sApiserverJwks { + pub fn new(client: kube::Client, expected_audience: String) -> Self { + Self { + client, + expected_audience, + state: RwLock::new(JwksState::default()), + refresh: Mutex::new(()), + } + } + + /// Validate `token`, returning the parsed claims on success. + #[allow(clippy::result_large_err)] + async fn validate(&self, token: &str) -> Result { + // Decode the header to find the kid first; we lazily load on demand. + let header = decode_header(token).map_err(|e| { + debug!(error = %e, "K8s SA JWT header decode failed"); + Status::unauthenticated("invalid token") + })?; + let kid = header + .kid + .ok_or_else(|| Status::unauthenticated("invalid token: missing kid"))?; + + let (issuer, key) = if let Some(pair) = self.cached_key(&kid).await { + pair + } else { + self.refresh_keys().await?; + self.cached_key(&kid).await.ok_or_else(|| { + debug!(kid = %kid, "K8s SA JWT kid not found in apiserver JWKS"); + Status::unauthenticated("invalid token: unknown signing key") + })? 
+ }; + + let mut validation = Validation::new(Algorithm::RS256); + validation.algorithms = vec![Algorithm::RS256]; + validation.set_issuer(&[&issuer]); + validation.set_audience(&[&self.expected_audience]); + validation.set_required_spec_claims(&["iss", "aud", "exp", "sub"]); + + let data = decode::(token, &key, &validation).map_err(|e| { + debug!(error = %e, "K8s SA JWT validation failed"); + Status::unauthenticated(format!("invalid SA token: {e}")) + })?; + Ok(data.claims) + } + + async fn cached_key(&self, kid: &str) -> Option<(String, DecodingKey)> { + let state = self.state.read().await; + let issuer = state.issuer.clone()?; + let key = state.keys.get(kid).cloned()?; + Some((issuer, key)) + } + + /// Fetch the discovery document + JWKS and replace the cached state. + /// Coalesces concurrent refreshes so the apiserver sees one fetch. + #[allow(clippy::result_large_err)] + async fn refresh_keys(&self) -> Result<(), Status> { + let _guard = self.refresh.lock().await; + info!("refreshing K8s apiserver JWKS"); + let discovery: ApiserverDiscovery = self + .request_apiserver("/.well-known/openid-configuration") + .await?; + let jwks_path = jwks_path_from_uri(&discovery.jwks_uri).ok_or_else(|| { + Status::internal(format!( + "apiserver returned unusable jwks_uri '{}'", + discovery.jwks_uri + )) + })?; + let jwks: JwkSet = self.request_apiserver(&jwks_path).await?; + let mut keys = HashMap::new(); + for key in &jwks.keys { + if key.kty != "RSA" { + continue; + } + let Some(ref kid) = key.kid else { + continue; + }; + if let Some(alg) = key.alg.as_deref() + && alg != "RS256" + { + continue; + } + match DecodingKey::from_rsa_components(&key.n, &key.e) { + Ok(dk) => { + keys.insert(kid.clone(), dk); + } + Err(e) => warn!(kid = %kid, error = %e, "skipped malformed apiserver JWK"), + } + } + info!( + count = keys.len(), + issuer = %discovery.issuer, + "loaded apiserver JWKS" + ); + let mut state = self.state.write().await; + state.issuer = Some(discovery.issuer); + 
state.jwks_path = Some(jwks_path); + state.keys = keys; + Ok(()) + } + + #[allow(clippy::result_large_err)] + async fn request_apiserver( + &self, + path: &str, + ) -> Result { + let req = http::Request::builder() + .uri(path) + .body(Vec::new()) + .map_err(|e| Status::internal(format!("apiserver request build: {e}")))?; + self.client + .request::(req) + .await + .map_err(|e| Status::internal(format!("apiserver request failed: {e}"))) + } +} + +/// Pull a path-only URI out of the `jwks_uri` field. The apiserver's +/// discovery doc returns an absolute URL (e.g. +/// `https://kubernetes.default.svc.cluster.local/openid/v1/jwks`); we +/// strip to the path so `kube::Client::request` can be reused. +fn jwks_path_from_uri(uri: &str) -> Option { + if uri.starts_with('/') { + return Some(uri.to_string()); + } + let parsed = url::Url::parse(uri).ok()?; + let mut out = parsed.path().to_string(); + if let Some(q) = parsed.query() { + out.push('?'); + out.push_str(q); + } + Some(out) +} + +/// Resolver backed by the apiserver's JWKS endpoint (for SA-token +/// signature verification) and `kube::Client` (for the per-pod +/// annotation lookup). +pub struct LiveK8sResolver { + jwks: Arc, + pods_api: Api, +} + +impl LiveK8sResolver { + pub fn new(client: kube::Client, namespace: &str, expected_audience: String) -> Self { + let pods_api: Api = Api::namespaced(client.clone(), namespace); + let jwks = Arc::new(K8sApiserverJwks::new(client, expected_audience)); + Self { jwks, pods_api } + } +} + +#[async_trait] +impl K8sIdentityResolver for LiveK8sResolver { + async fn resolve(&self, token: &str) -> Result, Status> { + let claims = match self.jwks.validate(token).await { + Ok(c) => c, + Err(status) if status.code() == tonic::Code::Unauthenticated => { + // Returning Ok(None) lets the chain fall through; the + // outer router then returns Unauthenticated to the client. 
+ return Ok(None); + } + Err(other) => return Err(other), + }; + + debug!( + sub = %claims.sub, + iss = %claims.iss, + pod_name = %claims.kubernetes.pod.name, + "validated K8s SA token" + ); + + // Look up the pod and read its sandbox-id annotation. + let pod = self + .pods_api + .get_opt(&claims.kubernetes.pod.name) + .await + .map_err(|e| { + warn!( + pod = %claims.kubernetes.pod.name, + error = %e, + "failed to fetch sandbox pod for annotation lookup" + ); + Status::internal(format!("pod GET failed: {e}")) + })?; + let Some(pod) = pod else { + warn!( + pod = %claims.kubernetes.pod.name, + "sandbox pod referenced by SA token not found in this namespace" + ); + return Err(Status::not_found("sandbox pod not found")); + }; + + // Defense-in-depth: confirm the pod UID matches the SA token's + // `kubernetes.io.pod.uid`. Prevents a replayed token from a + // recreated pod with the same name. + let actual_uid = pod.metadata.uid.as_deref().unwrap_or_default(); + if actual_uid != claims.kubernetes.pod.uid { + warn!( + pod = %claims.kubernetes.pod.name, + claimed_uid = %claims.kubernetes.pod.uid, + actual_uid = %actual_uid, + "SA token pod UID does not match live pod; rejecting" + ); + return Err(Status::permission_denied("SA token pod UID mismatch")); + } + + let sandbox_id = pod + .metadata + .annotations + .as_ref() + .and_then(|a| a.get(SANDBOX_ID_ANNOTATION)) + .cloned() + .unwrap_or_default(); + + Ok(Some(ResolvedK8sIdentity { + sandbox_id, + pod_name: claims.kubernetes.pod.name, + pod_uid: claims.kubernetes.pod.uid, + })) + } +} + +#[cfg(test)] +pub mod test_support { + use super::*; + use std::sync::Mutex; + + /// Fake resolver for unit tests. Returns the configured outcome on + /// every call and records the tokens it observed. 
+ pub struct FakeResolver { + pub outcome: Result, Status>, + pub seen_tokens: Mutex>, + } + + impl FakeResolver { + pub fn returning(outcome: Result, Status>) -> Self { + Self { + outcome, + seen_tokens: Mutex::new(Vec::new()), + } + } + } + + #[async_trait] + impl K8sIdentityResolver for FakeResolver { + async fn resolve(&self, token: &str) -> Result, Status> { + self.seen_tokens.lock().unwrap().push(token.to_string()); + match &self.outcome { + Ok(opt) => Ok(opt.clone()), + Err(s) => Err(Status::new(s.code(), s.message())), + } + } + } +} + +#[cfg(test)] +mod tests { + use super::test_support::FakeResolver; + use super::*; + + fn bearer_headers(token: &str) -> http::HeaderMap { + let mut h = http::HeaderMap::new(); + h.insert( + "authorization", + http::HeaderValue::from_str(&format!("Bearer {token}")).unwrap(), + ); + h + } + + #[test] + fn jwks_path_extracts_absolute_url() { + let path = + jwks_path_from_uri("https://kubernetes.default.svc.cluster.local/openid/v1/jwks") + .expect("apiserver-style URL must parse"); + assert_eq!(path, "/openid/v1/jwks"); + } + + #[test] + fn jwks_path_preserves_relative_path() { + let path = jwks_path_from_uri("/openid/v1/jwks").expect("relative path must round-trip"); + assert_eq!(path, "/openid/v1/jwks"); + } + + #[test] + fn jwks_path_preserves_query_string() { + let path = jwks_path_from_uri("https://apiserver/openid/v1/jwks?version=v1") + .expect("query strings must be preserved"); + assert_eq!(path, "/openid/v1/jwks?version=v1"); + } + + #[test] + fn jwks_path_rejects_garbage() { + assert!(jwks_path_from_uri("not a url").is_none()); + } + + #[tokio::test] + async fn authenticates_on_issue_path_only() { + let resolved = ResolvedK8sIdentity { + sandbox_id: "sandbox-a".to_string(), + pod_name: "openshell-sandbox-a".to_string(), + pod_uid: "uid-a".to_string(), + }; + let fake = Arc::new(FakeResolver::returning(Ok(Some(resolved)))); + let auth = K8sServiceAccountAuthenticator::new(fake.clone()); + + let on_issue = auth + 
.authenticate(&bearer_headers("sa-jwt"), ISSUE_SANDBOX_TOKEN_PATH) + .await + .unwrap() + .expect("expected principal"); + match on_issue { + Principal::Sandbox(p) => { + assert_eq!(p.sandbox_id, "sandbox-a"); + assert!(matches!( + p.source, + SandboxIdentitySource::K8sServiceAccount { .. } + )); + } + _ => panic!("expected sandbox principal"), + } + + let off_issue = auth + .authenticate( + &bearer_headers("sa-jwt"), + "/openshell.v1.OpenShell/GetSandboxConfig", + ) + .await + .unwrap(); + assert!( + off_issue.is_none(), + "K8s SA authenticator must be scoped to IssueSandboxToken" + ); + assert_eq!( + fake.seen_tokens.lock().unwrap().len(), + 1, + "off-path call must not consult the apiserver" + ); + } + + #[tokio::test] + async fn missing_bearer_yields_none() { + let fake = Arc::new(FakeResolver::returning(Ok(None))); + let auth = K8sServiceAccountAuthenticator::new(fake); + let result = auth + .authenticate(&http::HeaderMap::new(), ISSUE_SANDBOX_TOKEN_PATH) + .await + .unwrap(); + assert!(result.is_none()); + } + + #[tokio::test] + async fn resolver_returning_none_falls_through() { + let fake = Arc::new(FakeResolver::returning(Ok(None))); + let auth = K8sServiceAccountAuthenticator::new(fake); + let result = auth + .authenticate( + &bearer_headers("not-a-real-sa-token"), + ISSUE_SANDBOX_TOKEN_PATH, + ) + .await + .unwrap(); + assert!(result.is_none(), "non-authenticating tokens fall through"); + } + + #[tokio::test] + async fn pod_without_annotation_is_rejected() { + let resolved = ResolvedK8sIdentity { + sandbox_id: String::new(), + pod_name: "stray-pod".to_string(), + pod_uid: "uid".to_string(), + }; + let fake = Arc::new(FakeResolver::returning(Ok(Some(resolved)))); + let auth = K8sServiceAccountAuthenticator::new(fake); + let err = auth + .authenticate(&bearer_headers("sa-jwt"), ISSUE_SANDBOX_TOKEN_PATH) + .await + .expect_err("unbound pod must be rejected"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + + #[tokio::test] + async fn 
resolver_error_propagates() { + let fake = Arc::new(FakeResolver::returning(Err(Status::unavailable( + "apiserver down", + )))); + let auth = K8sServiceAccountAuthenticator::new(fake); + let err = auth + .authenticate(&bearer_headers("sa-jwt"), ISSUE_SANDBOX_TOKEN_PATH) + .await + .expect_err("resolver error must propagate"); + assert_eq!(err.code(), tonic::Code::Unavailable); + } +} diff --git a/crates/openshell-server/src/auth/mod.rs b/crates/openshell-server/src/auth/mod.rs index 8e4f332d8..cff5508ad 100644 --- a/crates/openshell-server/src/auth/mod.rs +++ b/crates/openshell-server/src/auth/mod.rs @@ -8,9 +8,16 @@ //! - `identity`: Provider-agnostic identity representation //! - `http`: HTTP endpoints for auth discovery and token exchange +pub mod authenticator; pub mod authz; +pub mod guard; mod http; pub mod identity; +pub mod k8s_sa; pub mod oidc; +pub mod principal; +pub mod revocation; +pub mod sandbox_jwt; +pub mod spiffe; pub use http::router; diff --git a/crates/openshell-server/src/auth/oidc.rs b/crates/openshell-server/src/auth/oidc.rs index 92298579e..6c1339e4f 100644 --- a/crates/openshell-server/src/auth/oidc.rs +++ b/crates/openshell-server/src/auth/oidc.rs @@ -10,7 +10,10 @@ //! This module owns authentication (verifying who the caller is). //! Authorization (deciding what the caller can do) is in `authz.rs`. +use super::authenticator::Authenticator; use super::identity::{Identity, IdentityProvider}; +use super::principal::{Principal, UserPrincipal}; +use async_trait::async_trait; use jsonwebtoken::{Algorithm, DecodingKey, Validation, decode, decode_header}; use openshell_core::OidcConfig; use reqwest::Client; @@ -22,15 +25,6 @@ use tokio::sync::RwLock; use tonic::Status; use tracing::{debug, info, warn}; -/// Internal metadata header set by the auth middleware to mark a request as -/// originating from a sandbox. This is stripped from all incoming requests -/// first so external callers cannot spoof it. 
-pub const INTERNAL_AUTH_SOURCE_HEADER: &str = "x-openshell-auth-source"; -/// Internal auth-source marker for requests originating from a sandbox -/// (no OIDC Bearer; trust derives from the mTLS channel or operator's -/// fronting proxy). -pub const AUTH_SOURCE_SANDBOX: &str = "sandbox"; - /// Truly unauthenticated methods — health probes and infrastructure. const UNAUTHENTICATED_METHODS: &[&str] = &[ "/openshell.v1.OpenShell/Health", @@ -40,40 +34,6 @@ const UNAUTHENTICATED_METHODS: &[&str] = &[ /// Path prefixes that bypass OIDC validation (gRPC reflection, health probes). const UNAUTHENTICATED_PREFIXES: &[&str] = &["/grpc.reflection.", "/grpc.health."]; -/// Sandbox-to-server RPCs that are called by sandboxes instead of CLI -/// users. These do not require an OIDC Bearer token; the gRPC channel's -/// mTLS handshake (or the operator's fronting proxy when -/// `--disable-gateway-auth` is set) is the trust boundary. -const SANDBOX_METHODS: &[&str] = &[ - "/openshell.v1.OpenShell/ReportPolicyStatus", - "/openshell.v1.OpenShell/PushSandboxLogs", - "/openshell.v1.OpenShell/GetSandboxProviderEnvironment", - "/openshell.v1.OpenShell/SubmitPolicyAnalysis", - "/openshell.sandbox.v1.SandboxService/GetSandboxConfig", - "/openshell.inference.v1.Inference/GetInferenceBundle", -]; - -/// Methods that accept either an OIDC Bearer token (CLI users, full scope) -/// or no Bearer (sandbox supervisor, sandbox-restricted scope). -/// `UpdateConfig` is called by both CLI (policy/settings mutations) and the -/// sandbox supervisor (policy sync on startup). -/// `OpenShell/GetSandboxConfig` serves CLI settings reads while remaining -/// compatible with sandbox callers. -/// `GetDraftPolicy` serves CLI reviewer surfaces (`openshell rule get`, -/// TUI inbox) AND the sandbox-side `policy.local /wait` long-poll that -/// blocks on the agent's proposal until the developer decides. 
-const DUAL_AUTH_METHODS: &[&str] = &[ - "/openshell.v1.OpenShell/UpdateConfig", - "/openshell.v1.OpenShell/GetSandboxConfig", - "/openshell.v1.OpenShell/GetDraftPolicy", -]; - -/// Returns `true` if the method accepts either an OIDC Bearer token or a -/// sandbox-class caller (no Bearer). -pub fn is_dual_auth_method(path: &str) -> bool { - DUAL_AUTH_METHODS.contains(&path) -} - /// Returns `true` if the method needs no authentication at all. pub fn is_unauthenticated_method(path: &str) -> bool { UNAUTHENTICATED_METHODS.contains(&path) @@ -82,34 +42,6 @@ pub fn is_unauthenticated_method(path: &str) -> bool { .any(|prefix| path.starts_with(prefix)) } -/// Returns `true` if the method is an exclusively sandbox-class call (does -/// not accept OIDC Bearer). -pub fn is_sandbox_method(path: &str) -> bool { - SANDBOX_METHODS.contains(&path) -} - -/// Remove internal auth-source markers from the request before any auth -/// decision is made so external callers cannot spoof them. -pub fn clear_internal_auth_markers(headers: &mut http::HeaderMap) { - headers.remove(INTERNAL_AUTH_SOURCE_HEADER); -} - -/// Mark the request as originating from a sandbox caller. -pub fn mark_sandbox_caller(headers: &mut http::HeaderMap) { - headers.insert( - INTERNAL_AUTH_SOURCE_HEADER, - http::HeaderValue::from_static(AUTH_SOURCE_SANDBOX), - ); -} - -/// Returns `true` if the request metadata indicates a sandbox caller. -pub fn is_sandbox_caller(metadata: &tonic::metadata::MetadataMap) -> bool { - metadata - .get(INTERNAL_AUTH_SOURCE_HEADER) - .and_then(|v| v.to_str().ok()) - == Some(AUTH_SOURCE_SANDBOX) -} - /// Cached JWKS key set fetched from the OIDC issuer. /// /// A `refresh_mutex` ensures that only one refresh runs at a time, @@ -419,6 +351,42 @@ impl JwksCache { } } +/// Authenticator that validates `Authorization: Bearer <token>` headers against +/// the configured OIDC issuer. 
+/// +/// Returns `Ok(None)` when no Bearer header is present, so the chain can fall +/// through to other authenticators (e.g. the gateway-minted sandbox JWT +/// authenticator added in PR 2). +pub struct OidcAuthenticator { + cache: Arc, +} + +impl OidcAuthenticator { + pub fn new(cache: Arc) -> Self { + Self { cache } + } +} + +#[async_trait] +impl Authenticator for OidcAuthenticator { + async fn authenticate( + &self, + headers: &http::HeaderMap, + _path: &str, + ) -> Result, Status> { + let Some(token) = headers + .get("authorization") + .and_then(|v| v.to_str().ok()) + .and_then(|v| v.strip_prefix("Bearer ")) + else { + return Ok(None); + }; + + let identity = self.cache.validate_token(token).await?; + Ok(Some(Principal::User(UserPrincipal { identity }))) + } +} + #[cfg(test)] mod tests { use super::*; @@ -433,7 +401,6 @@ mod tests { assert!(!is_unauthenticated_method( "/openshell.v1.OpenShell/CreateSandbox" )); - assert!(!is_sandbox_method("/openshell.v1.OpenShell/CreateSandbox")); } #[test] @@ -451,74 +418,6 @@ mod tests { assert!(is_unauthenticated_method("/grpc.health.v1.Health/Check")); } - #[test] - fn sandbox_rpcs_are_sandbox_methods() { - assert!(is_sandbox_method( - "/openshell.sandbox.v1.SandboxService/GetSandboxConfig" - )); - assert!(is_sandbox_method( - "/openshell.v1.OpenShell/GetSandboxProviderEnvironment" - )); - assert!(is_sandbox_method( - "/openshell.v1.OpenShell/ReportPolicyStatus" - )); - assert!(is_sandbox_method("/openshell.v1.OpenShell/PushSandboxLogs")); - assert!(is_sandbox_method( - "/openshell.v1.OpenShell/SubmitPolicyAnalysis" - )); - assert!(is_sandbox_method( - "/openshell.inference.v1.Inference/GetInferenceBundle" - )); - } - - #[test] - fn openshell_get_sandbox_config_is_dual_auth() { - assert!(!is_sandbox_method( - "/openshell.v1.OpenShell/GetSandboxConfig" - )); - assert!(is_dual_auth_method( - "/openshell.v1.OpenShell/GetSandboxConfig" - )); - } - - #[test] - fn openshell_get_draft_policy_is_dual_auth() { - // policy.local 
calls GetDraftPolicy from inside the sandbox - // supervisor (no Bearer, authenticated via mTLS), and the CLI/TUI - // reviewer surfaces call it with an OIDC Bearer. Sandbox-only - // would lock CLI out; Bearer-only would 401 the /wait long-poll - // in OIDC-enabled deployments. - assert!(!is_sandbox_method("/openshell.v1.OpenShell/GetDraftPolicy")); - assert!(is_dual_auth_method( - "/openshell.v1.OpenShell/GetDraftPolicy" - )); - } - - #[test] - fn sandbox_caller_marker_round_trips_through_metadata() { - let mut headers = http::HeaderMap::new(); - mark_sandbox_caller(&mut headers); - let metadata = tonic::metadata::MetadataMap::from_headers(headers); - assert!(is_sandbox_caller(&metadata)); - } - - #[test] - fn unmarked_request_is_not_sandbox_caller() { - let metadata = tonic::metadata::MetadataMap::new(); - assert!(!is_sandbox_caller(&metadata)); - } - - #[test] - fn clear_internal_markers_strips_spoofed_header() { - let mut headers = http::HeaderMap::new(); - headers.insert( - INTERNAL_AUTH_SOURCE_HEADER, - http::HeaderValue::from_static(AUTH_SOURCE_SANDBOX), - ); - clear_internal_auth_markers(&mut headers); - assert!(headers.get(INTERNAL_AUTH_SOURCE_HEADER).is_none()); - } - #[test] fn extract_roles_keycloak_path() { let json = serde_json::json!({ diff --git a/crates/openshell-server/src/auth/principal.rs b/crates/openshell-server/src/auth/principal.rs new file mode 100644 index 000000000..fac3f6099 --- /dev/null +++ b/crates/openshell-server/src/auth/principal.rs @@ -0,0 +1,82 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Authenticated caller principals. +//! +//! A `Principal` is the result of running the [`super::authenticator::Authenticator`] +//! chain on an inbound request. It generalizes over the kinds of callers the +//! gateway recognizes — human users (OIDC), sandbox supervisors (gateway-minted +//! 
JWT or SPIFFE JWT-SVID), and anonymous callers (truly unauthenticated methods
+//! like health probes).
+//!
+//! Handlers read the principal from the gRPC `Request` extensions and gate
+//! access accordingly. Sandbox-class handlers MUST compare
+//! `Principal::Sandbox.sandbox_id` against the request body's `sandbox_id`
+//! to prevent cross-sandbox access (see issue #1354).
+
+use super::identity::Identity;
+
+/// Who is calling.
+///
+/// Inserted into `tonic::Request::extensions` by the auth router. Handlers
+/// retrieve it via `req.extensions().get::<Principal>()`.
+#[derive(Debug, Clone)]
+pub enum Principal {
+    /// Human caller authenticated via OIDC (Keycloak, Entra ID, Okta, etc.).
+    User(UserPrincipal),
+    /// Sandbox supervisor authenticated by an identity bound to a specific
+    /// sandbox UUID. The wrapped `sandbox_id` MUST match any sandbox referenced
+    /// in the request body for sandbox-class methods (PR-4 guard).
+    Sandbox(#[allow(dead_code)] SandboxPrincipal),
+    /// Truly unauthenticated caller (health probes, reflection). Sandbox-class
+    /// and user-class methods reject this variant.
+    #[allow(dead_code)]
+    Anonymous,
+}
+
+/// User caller — wraps the existing provider-agnostic [`Identity`].
+#[derive(Debug, Clone)]
+pub struct UserPrincipal {
+    /// The verified identity from the authentication provider.
+    pub identity: Identity,
+}
+
+/// Sandbox caller — bound to one specific sandbox UUID.
+///
+/// `sandbox_id` and `source` are consumed by the PR-4 handler guard; until
+/// then they only exist in the type so the trait shape is stable across the
+/// PR series.
+#[derive(Debug, Clone)]
+#[allow(dead_code)]
+pub struct SandboxPrincipal {
+    /// Canonical sandbox UUID. Empty string only for the PR-1 legacy marker;
+    /// PR 2 onwards always populates this from a verified credential.
+    pub sandbox_id: String,
+    /// How this principal was verified — used for audit logs and to gate the
+    /// PR-4 IDOR check against unverified sources.
+    pub source: SandboxIdentitySource,
+    /// SPIFFE trust domain. `openshell` for gateway-minted JWTs; the
+    /// configured trust domain for SPIFFE JWT-SVIDs.
+    pub trust_domain: Option<String>,
+}
+
+/// How a [`SandboxPrincipal`] was authenticated.
+///
+/// Variant fields are populated by the producing authenticator and consumed
+/// by audit logging + the PR-4 IDOR guard. Until PR 4 lands those readers
+/// they look unused to the dead-code lint.
+#[derive(Debug, Clone)]
+#[allow(dead_code)]
+pub enum SandboxIdentitySource {
+    /// Gateway-minted JWT validated against the gateway's signing key.
+    /// Produced by [`super::sandbox_jwt::SandboxJwtAuthenticator`].
+    BootstrapJwt { issuer: String, jti: String },
+    /// Per-sandbox client certificate. Reserved for the v2 channel-bound
+    /// identity follow-up.
+    BootstrapCert { fingerprint: String },
+    /// SPIFFE JWT-SVID. Produced by [`super::spiffe::SpiffeAuthenticator`].
+    SpiffeSvid { spiffe_id: String },
+    /// K8s `ServiceAccount` token used to bootstrap a gateway-minted JWT
+    /// via `IssueSandboxToken`. Populated only on that one RPC path.
+    K8sServiceAccount { pod_name: String, pod_uid: String },
+}
diff --git a/crates/openshell-server/src/auth/revocation.rs b/crates/openshell-server/src/auth/revocation.rs
new file mode 100644
index 000000000..3cca82211
--- /dev/null
+++ b/crates/openshell-server/src/auth/revocation.rs
@@ -0,0 +1,100 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Sandbox-JWT revocation set.
+//!
+//! Tracks `jti` claims that have been explicitly revoked (sandbox deleted
+//! or token refreshed). The validator consults this set on every sandbox
+//! JWT validation and rejects matches as `Unauthenticated`.
+//!
+//! PR-2 implementation is in-memory only; a gateway restart clears the
+//! set. The token TTL (24 h default) bounds the exposure window. PR 5
+//!
(refresh RPC) introduces persistence to `Store` so revocations survive
+//! restarts.
+
+use std::collections::HashMap;
+use std::sync::RwLock;
+use std::time::{SystemTime, UNIX_EPOCH};
+
+/// In-memory `jti` deny-list with TTL-based pruning.
+///
+/// Maps each revoked `jti` to the expiry (Unix milliseconds) of the token it
+/// belongs to. The expiry is only used by [`RevocationSet::prune_expired`].
+#[derive(Debug, Default)]
+pub struct RevocationSet {
+    entries: RwLock<HashMap<String, i64>>,
+}
+
+impl RevocationSet {
+    /// Create an empty revocation set.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Mark `jti` as revoked. `expires_at_ms` is the token's own expiry: past
+    /// it the token fails `exp` validation anyway, so `prune_expired` may drop
+    /// the entry. Revoking the same `jti` again replaces the stored expiry.
+    pub fn revoke(&self, jti: &str, expires_at_ms: i64) {
+        let mut entries = self.entries.write().expect("revocation lock poisoned");
+        entries.insert(jti.to_string(), expires_at_ms);
+    }
+
+    /// Returns true if `jti` is currently revoked.
+    ///
+    /// Membership only: the stored expiry is not consulted here, so an entry
+    /// stays revoked until `prune_expired` removes it.
+    pub fn is_revoked(&self, jti: &str) -> bool {
+        let entries = self.entries.read().expect("revocation lock poisoned");
+        entries.contains_key(jti)
+    }
+
+    /// Drop entries whose `exp` is in the past. Called periodically (or on
+    /// demand from tests) to bound memory growth. Returns how many entries
+    /// were removed.
+    pub fn prune_expired(&self) -> usize {
+        let now = now_ms();
+        let mut entries = self.entries.write().expect("revocation lock poisoned");
+        let before = entries.len();
+        // Keep only entries that expire strictly after `now`.
+        entries.retain(|_, exp| *exp > now);
+        before - entries.len()
+    }
+
+    /// Number of currently tracked revocations. Test/diagnostic only.
+    #[cfg(test)]
+    pub fn len(&self) -> usize {
+        self.entries
+            .read()
+            .expect("revocation lock poisoned")
+            .len()
+    }
+}
+
+/// Current Unix time in milliseconds: 0 if the clock reads before the epoch,
+/// `i64::MAX` if the value does not fit in `i64`.
+fn now_ms() -> i64 {
+    i64::try_from(
+        SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .map_or(0, |d| d.as_millis()),
+    )
+    .unwrap_or(i64::MAX)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn revoked_jti_is_detected() {
+        let set = RevocationSet::new();
+        let future = now_ms() + 60_000;
+        set.revoke("abc", future);
+        assert!(set.is_revoked("abc"));
+        assert!(!set.is_revoked("xyz"));
+    }
+
+    #[test]
+    fn prune_drops_expired_entries() {
+        let set = RevocationSet::new();
+        set.revoke("expired", now_ms() - 1_000);
+        set.revoke("future", now_ms() + 60_000);
+        let dropped = set.prune_expired();
+        assert_eq!(dropped, 1);
+        assert!(!set.is_revoked("expired"));
+        assert!(set.is_revoked("future"));
+    }
+
+    #[test]
+    fn re_revoking_overwrites_expiry() {
+        let set = RevocationSet::new();
+        set.revoke("dup", now_ms() + 1_000);
+        set.revoke("dup", now_ms() + 99_000);
+        assert_eq!(set.len(), 1);
+    }
+}
diff --git a/crates/openshell-server/src/auth/sandbox_jwt.rs b/crates/openshell-server/src/auth/sandbox_jwt.rs
new file mode 100644
index 000000000..6b1736dbe
--- /dev/null
+++ b/crates/openshell-server/src/auth/sandbox_jwt.rs
@@ -0,0 +1,397 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Gateway-minted per-sandbox JWTs.
+//!
+//! The gateway signs an Ed25519 JWT for each sandbox at create time and
+//! the sandbox supervisor presents it as `Authorization: Bearer <token>` on
+//! every gRPC call (PR 3). This module implements both sides of the
+//! gateway-controlled token:
+//! - [`SandboxJwtIssuer`] mints fresh tokens (called from
+//!   `handle_create_sandbox` and the `IssueSandboxToken` RPC).
+//! - [`SandboxJwtAuthenticator`] validates tokens on inbound requests and
+//!   produces a [`Principal::Sandbox`] with [`SandboxIdentitySource::BootstrapJwt`].
+//!
+//!
Algorithm: `EdDSA` (Ed25519). Pinned via `Validation::algorithms` to
+//! prevent algorithm-confusion attacks.
+
+use super::authenticator::Authenticator;
+use super::principal::{Principal, SandboxIdentitySource, SandboxPrincipal};
+use super::revocation::RevocationSet;
+use async_trait::async_trait;
+use jsonwebtoken::{
+    Algorithm, DecodingKey, EncodingKey, Header, Validation, decode, decode_header, encode,
+};
+use serde::{Deserialize, Serialize};
+use std::sync::Arc;
+use std::time::{Duration, SystemTime, UNIX_EPOCH};
+use tonic::Status;
+use tracing::{debug, warn};
+use uuid::Uuid;
+
+/// SPIFFE-shaped subject prefix. Embedded in the `sub` claim of every
+/// minted token so a future migration to per-sandbox certs or SPIRE can
+/// reuse the same subject namespace without breaking handler equality
+/// checks.
+const SPIFFE_SUBJECT_PREFIX: &str = "spiffe://openshell/sandbox/";
+
+/// JWT claim set serialized in every gateway-minted sandbox token.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct SandboxJwtClaims {
+    /// `spiffe://openshell/sandbox/<sandbox-id>`. SPIFFE-shaped for forward
+    /// compatibility with channel-bound identity (per-sandbox cert / SPIRE).
+    pub sub: String,
+    /// Gateway identity (`openshell-gateway:<gateway-id>`). Both `iss` and
+    /// `aud` use the same value so any future replicas of the same
+    /// deployment validate each other's tokens without configuration.
+    pub iss: String,
+    /// Intended audience; carries the same gateway identity string as `iss`.
+    pub aud: String,
+    /// Issued-at time, Unix seconds.
+    pub iat: i64,
+    /// Expiry time, Unix seconds.
+    pub exp: i64,
+    /// Unique token id; the key checked against the revocation set.
+    pub jti: String,
+    /// Canonical sandbox UUID, denormalized from `sub` for cheap parsing
+    /// without a SPIFFE library.
+    pub sandbox_id: String,
+}
+
+/// Mints fresh sandbox JWTs.
+pub struct SandboxJwtIssuer {
+    encoding_key: EncodingKey,
+    kid: String,
+    issuer: String,
+    audience: String,
+    ttl: Duration,
+}
+
+// Omits `encoding_key`, so signing-key material is never formatted.
+impl std::fmt::Debug for SandboxJwtIssuer {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("SandboxJwtIssuer")
+            .field("kid", &self.kid)
+            .field("issuer", &self.issuer)
+            .field("audience", &self.audience)
+            .field("ttl", &self.ttl)
+            .finish_non_exhaustive()
+    }
+}
+
+/// Outcome of a successful mint — caller persists the `jti` so the same
+/// token can be revoked on `DeleteSandbox` / refresh.
+#[derive(Debug, Clone)]
+pub struct MintedToken {
+    pub token: String,
+    pub jti: String,
+    pub expires_at_ms: i64,
+}
+
+impl SandboxJwtIssuer {
+    /// Build an issuer from an Ed25519 private key in PEM form.
+    ///
+    /// Minted tokens carry `openshell-gateway:{gateway_id}` as both `iss` and
+    /// `aud`, `kid` in the header, and expire `ttl` after minting. Returns a
+    /// description of the parse failure when the PEM is not a usable key.
+    pub fn from_pem(
+        signing_key_pem: &[u8],
+        kid: String,
+        gateway_id: &str,
+        ttl: Duration,
+    ) -> Result<Self, String> {
+        let encoding_key = EncodingKey::from_ed_pem(signing_key_pem)
+            .map_err(|e| format!("failed to parse Ed25519 signing key PEM: {e}"))?;
+        let identity = format!("openshell-gateway:{gateway_id}");
+        Ok(Self {
+            encoding_key,
+            kid,
+            issuer: identity.clone(),
+            audience: identity,
+            ttl,
+        })
+    }
+
+    /// Mint a fresh token for `sandbox_id`. The caller MUST track the
+    /// returned `jti` (in the `RevocationSet`'s mint-time index if we ever
+    /// need to revoke the most-recent token for a given sandbox).
+    ///
+    /// `exp` is `now + ttl`, saturating at `i64::MAX`. A TTL whose seconds do
+    /// not fit in `i64` falls back to 86 400 s (24 h). Returns
+    /// `Status::internal` when signing fails.
+    #[allow(clippy::result_large_err)] // `tonic::Status` is the natural error here
+    pub fn mint(&self, sandbox_id: &str) -> Result<MintedToken, Status> {
+        let now = now_secs();
+        let ttl_secs = i64::try_from(self.ttl.as_secs()).unwrap_or(86_400);
+        // Saturate instead of `now + ttl`: a very large configured TTL must not
+        // panic (debug) or wrap to a negative, already-expired `exp` (release).
+        let exp = now.saturating_add(ttl_secs);
+        let jti = Uuid::new_v4().to_string();
+        let claims = SandboxJwtClaims {
+            sub: format!("{SPIFFE_SUBJECT_PREFIX}{sandbox_id}"),
+            iss: self.issuer.clone(),
+            aud: self.audience.clone(),
+            iat: now,
+            exp,
+            jti: jti.clone(),
+            sandbox_id: sandbox_id.to_string(),
+        };
+        let mut header = Header::new(Algorithm::EdDSA);
+        header.kid = Some(self.kid.clone());
+        let token = encode(&header, &claims, &self.encoding_key).map_err(|e| {
+            warn!(error = %e, "failed to mint sandbox JWT");
+            Status::internal("failed to mint sandbox token")
+        })?;
+        Ok(MintedToken {
+            token,
+            jti,
+            expires_at_ms: exp.saturating_mul(1000),
+        })
+    }
+
+    /// Lifetime applied to every minted token.
+    pub fn ttl(&self) -> Duration {
+        self.ttl
+    }
+}
+
+/// Authenticator that validates gateway-minted sandbox JWTs.
+pub struct SandboxJwtAuthenticator {
+    decoding_key: DecodingKey,
+    kid: String,
+    issuer: String,
+    audience: String,
+    revocation: Arc<RevocationSet>,
+}
+
+impl std::fmt::Debug for SandboxJwtAuthenticator {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("SandboxJwtAuthenticator")
+            .field("kid", &self.kid)
+            .field("issuer", &self.issuer)
+            .field("audience", &self.audience)
+            .finish_non_exhaustive()
+    }
+}
+
+impl SandboxJwtAuthenticator {
+    /// Build an authenticator from an Ed25519 public key in PEM form.
+    ///
+    /// Accepts only tokens whose header `kid` equals `kid` and whose `iss` and
+    /// `aud` both equal `openshell-gateway:{gateway_id}`. `revocation` is
+    /// consulted on every successful validation.
+    pub fn from_pem(
+        public_key_pem: &[u8],
+        kid: String,
+        gateway_id: &str,
+        revocation: Arc<RevocationSet>,
+    ) -> Result<Self, String> {
+        let decoding_key = DecodingKey::from_ed_pem(public_key_pem)
+            .map_err(|e| format!("failed to parse Ed25519 public key PEM: {e}"))?;
+        let identity = format!("openshell-gateway:{gateway_id}");
+        Ok(Self {
+            decoding_key,
+            kid,
+            issuer: identity.clone(),
+            audience: identity,
+            revocation,
+        })
+    }
+
+    /// Validate `token` as a gateway-minted sandbox JWT.
+    ///
+    /// * `Ok(None)` — the header parses but its `kid` or `alg` is not ours, so
+    ///   the next authenticator in the chain gets a chance.
+    /// * `Ok(Some(..))` — signature, `iss`, `aud` and `exp` check out and the
+    ///   `jti` is not revoked.
+    /// * `Err(Unauthenticated)` — unparsable header, failed validation, or a
+    ///   revoked `jti`.
+    #[allow(clippy::result_large_err)]
+    fn validate_bearer(&self, token: &str) -> Result<Option<Principal>, Status> {
+        let header = decode_header(token).map_err(|e| {
+            debug!(error = %e, "sandbox JWT header decode failed");
+            Status::unauthenticated("invalid token")
+        })?;
+
+        // Fall through to other authenticators when the kid does not match —
+        // OIDC issuers may share the Bearer slot.
+        if header.kid.as_deref() != Some(self.kid.as_str()) {
+            return Ok(None);
+        }
+        if !matches!(header.alg, Algorithm::EdDSA) {
+            return Ok(None);
+        }
+
+        let mut validation = Validation::new(Algorithm::EdDSA);
+        validation.algorithms = vec![Algorithm::EdDSA];
+        validation.set_issuer(&[&self.issuer]);
+        validation.set_audience(&[&self.audience]);
+        validation.set_required_spec_claims(&["iss", "aud", "exp", "sub"]);
+
+        let data =
+            decode::<SandboxJwtClaims>(token, &self.decoding_key, &validation).map_err(|e| {
+                // The precise reason (expired, wrong audience, bad signature) is
+                // logged only; the caller gets the same generic message as the
+                // header-decode path so failures do not reveal which check tripped.
+                debug!(error = %e, "sandbox JWT validation failed");
+                Status::unauthenticated("invalid token")
+            })?;
+
+        let claims = data.claims;
+        if self.revocation.is_revoked(&claims.jti) {
+            debug!(jti = %claims.jti, "sandbox JWT rejected: jti revoked");
+            return Err(Status::unauthenticated("revoked token"));
+        }
+
+        Ok(Some(Principal::Sandbox(SandboxPrincipal {
+            sandbox_id: claims.sandbox_id,
+            source: SandboxIdentitySource::BootstrapJwt {
+                issuer: claims.iss,
+                jti: claims.jti,
+            },
+            trust_domain: Some("openshell".to_string()),
+        })))
+    }
+}
+
+#[async_trait]
+impl Authenticator for SandboxJwtAuthenticator {
+    /// Extract the `Authorization: Bearer` token and validate it; requests
+    /// without a Bearer header yield `Ok(None)`.
+    async fn authenticate(
+        &self,
+        headers: &http::HeaderMap,
+        _path: &str,
+    ) -> Result<Option<Principal>, Status> {
+        let Some(token) = headers
+            .get("authorization")
+            .and_then(|v| v.to_str().ok())
+            .and_then(|v| v.strip_prefix("Bearer "))
+        else {
+            return Ok(None);
+        };
+        self.validate_bearer(token)
+    }
+}
+
+/// Current Unix time in whole seconds; 0 if the clock reads before the epoch.
+fn now_secs() -> i64 {
+    i64::try_from(
+        SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .map_or(0, |d| d.as_secs()),
+    )
+    .unwrap_or(i64::MAX)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use openshell_bootstrap::jwt::generate_jwt_key;
+
+    fn header_map_with_bearer(token: &str) -> http::HeaderMap
{
+        let mut h = http::HeaderMap::new();
+        h.insert(
+            "authorization",
+            http::HeaderValue::from_str(&format!("Bearer {token}")).unwrap(),
+        );
+        h
+    }
+
+    /// Issuer and authenticator sharing one freshly generated key pair, plus
+    /// the revocation set the authenticator consults.
+    fn pair() -> (
+        SandboxJwtIssuer,
+        SandboxJwtAuthenticator,
+        Arc<RevocationSet>,
+    ) {
+        let mat = generate_jwt_key().expect("jwt key");
+        let revocation = Arc::new(RevocationSet::new());
+        let issuer = SandboxJwtIssuer::from_pem(
+            mat.signing_key_pem.as_bytes(),
+            mat.kid.clone(),
+            "test-gateway",
+            Duration::from_secs(3600),
+        )
+        .unwrap();
+        let auth = SandboxJwtAuthenticator::from_pem(
+            mat.public_key_pem.as_bytes(),
+            mat.kid,
+            "test-gateway",
+            revocation.clone(),
+        )
+        .unwrap();
+        (issuer, auth, revocation)
+    }
+
+    #[tokio::test]
+    async fn mint_and_validate_round_trip() {
+        let (issuer, auth, _) = pair();
+        let minted = issuer.mint("sandbox-a").unwrap();
+        let principal = auth
+            .authenticate(&header_map_with_bearer(&minted.token), "/anything")
+            .await
+            .unwrap()
+            .expect("expected principal");
+        match principal {
+            Principal::Sandbox(p) => {
+                assert_eq!(p.sandbox_id, "sandbox-a");
+                match p.source {
+                    SandboxIdentitySource::BootstrapJwt { issuer: iss, jti } => {
+                        assert_eq!(iss, "openshell-gateway:test-gateway");
+                        assert_eq!(jti, minted.jti);
+                    }
+                    other => panic!("unexpected source: {other:?}"),
+                }
+            }
+            _ => panic!("expected Sandbox principal"),
+        }
+    }
+
+    #[tokio::test]
+    async fn revoked_jti_is_rejected() {
+        let (issuer, auth, revocation) = pair();
+        let minted = issuer.mint("sandbox-a").unwrap();
+        revocation.revoke(&minted.jti, minted.expires_at_ms);
+        let err = auth
+            .authenticate(&header_map_with_bearer(&minted.token), "/anything")
+            .await
+            .expect_err("revoked must reject");
+        assert_eq!(err.code(), tonic::Code::Unauthenticated);
+    }
+
+    // Renamed from `token_signed_by_other_key_is_rejected`: the token is not
+    // rejected here, it falls through because its `kid` is unknown.
+    #[tokio::test]
+    async fn token_with_unknown_kid_falls_through() {
+        let (_, auth_a, _) = pair();
+        let (issuer_b, _, _) = pair(); // different keypair
+        let minted = issuer_b.mint("sandbox-b").unwrap();
+        // The token has a different `kid` than auth_a expects, so the
+        // authenticator yields None (lets the chain fall through). That is
+        // the documented behavior for cross-issuer Bearer headers.
+        let result = auth_a
+            .authenticate(&header_map_with_bearer(&minted.token), "/anything")
+            .await
+            .unwrap();
+        assert!(result.is_none(), "different kid must fall through");
+    }
+
+    #[tokio::test]
+    async fn forged_token_with_matching_kid_is_rejected() {
+        // Signature verification itself: the attacker signs with their own key
+        // but copies the trusted `kid`, iss and aud into the token.
+        let trusted = generate_jwt_key().unwrap();
+        let attacker = generate_jwt_key().unwrap();
+        let auth = SandboxJwtAuthenticator::from_pem(
+            trusted.public_key_pem.as_bytes(),
+            trusted.kid.clone(),
+            "g",
+            Arc::new(RevocationSet::new()),
+        )
+        .unwrap();
+        let forger = SandboxJwtIssuer::from_pem(
+            attacker.signing_key_pem.as_bytes(),
+            trusted.kid,
+            "g",
+            Duration::from_secs(3600),
+        )
+        .unwrap();
+        let forged = forger.mint("sandbox-x").unwrap();
+        let err = auth
+            .authenticate(&header_map_with_bearer(&forged.token), "/anything")
+            .await
+            .expect_err("forged signature must reject");
+        assert_eq!(err.code(), tonic::Code::Unauthenticated);
+    }
+
+    #[tokio::test]
+    async fn missing_bearer_yields_none() {
+        let (_, auth, _) = pair();
+        let result = auth
+            .authenticate(&http::HeaderMap::new(), "/anything")
+            .await
+            .unwrap();
+        assert!(result.is_none());
+    }
+
+    #[tokio::test]
+    async fn malformed_token_is_rejected() {
+        let (_, auth, _) = pair();
+        let err = auth
+            .authenticate(&header_map_with_bearer("not.a.jwt"), "/anything")
+            .await
+            .expect_err("malformed must reject");
+        assert_eq!(err.code(), tonic::Code::Unauthenticated);
+    }
+
+    #[tokio::test]
+    async fn expired_token_is_rejected() {
+        // Mint a token whose iat is far in the past so its TTL window is
+        // already closed by `now`. We sign the JWT directly with the same
+        // signing key to bypass the issuer's TTL-vs-now coupling.
+        let mat = generate_jwt_key().unwrap();
+        let issuer = SandboxJwtIssuer::from_pem(
+            mat.signing_key_pem.as_bytes(),
+            mat.kid.clone(),
+            "g",
+            Duration::from_secs(3600),
+        )
+        .unwrap();
+        let auth = SandboxJwtAuthenticator::from_pem(
+            mat.public_key_pem.as_bytes(),
+            mat.kid.clone(),
+            "g",
+            Arc::new(RevocationSet::new()),
+        )
+        .unwrap();
+        let claims = SandboxJwtClaims {
+            sub: format!("{SPIFFE_SUBJECT_PREFIX}sandbox-c"),
+            iss: "openshell-gateway:g".to_string(),
+            aud: "openshell-gateway:g".to_string(),
+            iat: now_secs() - 7200,
+            exp: now_secs() - 3600,
+            jti: Uuid::new_v4().to_string(),
+            sandbox_id: "sandbox-c".to_string(),
+        };
+        let mut header = Header::new(Algorithm::EdDSA);
+        header.kid = Some(mat.kid);
+        let token = encode(&header, &claims, &issuer.encoding_key).unwrap();
+        let err = auth
+            .authenticate(&header_map_with_bearer(&token), "/anything")
+            .await
+            .expect_err("expired token must reject");
+        assert_eq!(err.code(), tonic::Code::Unauthenticated);
+    }
+}
diff --git a/crates/openshell-server/src/auth/spiffe.rs b/crates/openshell-server/src/auth/spiffe.rs
new file mode 100644
index 000000000..c942a44b4
--- /dev/null
+++ b/crates/openshell-server/src/auth/spiffe.rs
@@ -0,0 +1,212 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! SPIFFE JWT-SVID authentication for sandbox supervisors.
+//!
+//! The gateway does not validate SPIFFE JWT-SVID signatures itself. Instead it
+//! delegates validation to the local SPIFFE Workload API, keeping algorithm and
+//! bundle handling inside the configured SPIFFE implementation.
+
+use super::authenticator::Authenticator;
+use super::principal::{Principal, SandboxIdentitySource, SandboxPrincipal};
+use async_trait::async_trait;
+use openshell_core::SpiffeConfig;
+use spiffe::{JwtSvid, WorkloadApiClient};
+use std::path::Path;
+use tonic::Status;
+use tracing::{debug, info, warn};
+
+/// Authenticator backed by the SPIFFE Workload API `ValidateJWTSVID` RPC.
+pub struct SpiffeAuthenticator {
+    client: WorkloadApiClient,
+    config: SpiffeConfig,
+}
+
+impl std::fmt::Debug for SpiffeAuthenticator {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("SpiffeAuthenticator")
+            .field("socket", &self.config.workload_api_socket_path)
+            .field("trust_domain", &self.config.trust_domain)
+            .field("audience", &self.config.audience)
+            .field("sandbox_id_prefix", &self.config.sandbox_id_prefix)
+            .finish_non_exhaustive()
+    }
+}
+
+impl SpiffeAuthenticator {
+    /// Connect to the Workload API endpoint named by
+    /// `config.workload_api_socket_path` and build the authenticator.
+    ///
+    /// Returns a description of the failure when the connection cannot be
+    /// established.
+    pub async fn new(config: SpiffeConfig) -> Result<Self, String> {
+        let endpoint = workload_api_endpoint(&config.workload_api_socket_path);
+        let client = WorkloadApiClient::connect_to(&endpoint)
+            .await
+            .map_err(|e| {
+                format!("failed to connect to SPIFFE Workload API endpoint {endpoint}: {e}")
+            })?;
+        info!(
+            socket = %config.workload_api_socket_path.display(),
+            trust_domain = %config.trust_domain,
+            audience = %config.audience,
+            "SPIFFE JWT-SVID sandbox authenticator enabled"
+        );
+        Ok(Self { client, config })
+    }
+
+    /// Validate `token` as a sandbox JWT-SVID.
+    ///
+    /// The unverified subject is inspected first: tokens that do not parse as
+    /// a JWT-SVID, or whose SPIFFE ID lies outside the configured sandbox
+    /// namespace, yield `Ok(None)` so other authenticators can try them. Only
+    /// in-namespace candidates are sent to the Workload API for validation.
+    #[allow(clippy::result_large_err)]
+    async fn validate_bearer(&self, token: &str) -> Result<Option<Principal>, Status> {
+        let Some(candidate_id) = candidate_spiffe_id(token) else {
+            return Ok(None);
+        };
+        if parse_sandbox_id_from_spiffe_id(
+            &candidate_id,
+            &self.config.trust_domain,
+            &self.config.sandbox_id_prefix,
+        )
+        .is_none()
+        {
+            return Ok(None);
+        }
+
+        let svid = self
+            .client
+            .validate_jwt_token(&self.config.audience, token)
+            .await
+            .map_err(|status| {
+                debug!(error = %status, "SPIFFE JWT-SVID validation failed");
+                Status::unauthenticated("invalid SPIFFE JWT-SVID")
+            })?;
+
+        self.principal_from_validated_svid(&svid)
+    }
+
+    /// Map a validated SVID to a sandbox principal.
+    ///
+    /// The namespace check is repeated on the validated SPIFFE ID rather than
+    /// trusting the unverified pre-check; a mismatch is `PermissionDenied`.
+    #[allow(clippy::result_large_err)]
+    fn principal_from_validated_svid(&self, svid: &JwtSvid) -> Result<Option<Principal>, Status> {
+        let spiffe_id = svid.spiffe_id().to_string();
+        let Some(sandbox_id) = parse_sandbox_id_from_spiffe_id(
+            &spiffe_id,
+            &self.config.trust_domain,
+            &self.config.sandbox_id_prefix,
+        ) else {
+            warn!(
+                spiffe_id = %spiffe_id,
+                trust_domain = %self.config.trust_domain,
+                prefix = %self.config.sandbox_id_prefix,
+                "validated SPIFFE ID is outside the configured sandbox identity namespace"
+            );
+            return Err(Status::permission_denied(
+                "SPIFFE ID is not authorized as an OpenShell sandbox",
+            ));
+        };
+
+        Ok(Some(Principal::Sandbox(SandboxPrincipal {
+            sandbox_id,
+            source: SandboxIdentitySource::SpiffeSvid { spiffe_id },
+            trust_domain: Some(self.config.trust_domain.clone()),
+        })))
+    }
+}
+
+#[async_trait]
+impl Authenticator for SpiffeAuthenticator {
+    /// Extract the `Authorization: Bearer` token and validate it; requests
+    /// without a Bearer header yield `Ok(None)`.
+    async fn authenticate(
+        &self,
+        headers: &http::HeaderMap,
+        _path: &str,
+    ) -> Result<Option<Principal>, Status> {
+        let Some(token) = headers
+            .get("authorization")
+            .and_then(|v| v.to_str().ok())
+            .and_then(|v| v.strip_prefix("Bearer "))
+        else {
+            return Ok(None);
+        };
+        self.validate_bearer(token).await
+    }
+}
+
+/// Extract the sandbox id from `spiffe_id` when it has the form
+/// `spiffe://{trust_domain}{sandbox_id_prefix}{sandbox-id}`.
+///
+/// `trust_domain` may be given with or without the `spiffe://` scheme. Returns
+/// `None` when the ID is outside the namespace, or when the remainder is empty
+/// or spans more than one path segment.
+fn parse_sandbox_id_from_spiffe_id(
+    spiffe_id: &str,
+    trust_domain: &str,
+    sandbox_id_prefix: &str,
+) -> Option<String> {
+    let trust_domain = trust_domain.trim().trim_start_matches("spiffe://");
+    let prefix = format!(
+        "spiffe://{}{}",
+        trust_domain.trim_end_matches('/'),
+        normalize_spiffe_path_prefix(sandbox_id_prefix)
+    );
+    let sandbox_id = spiffe_id.strip_prefix(&prefix)?;
+    (!sandbox_id.is_empty() && !sandbox_id.contains('/')).then(|| sandbox_id.to_string())
+}
+
+/// Canonicalise the configured sandbox path prefix to `/seg/.../seg/`.
+///
+/// A leading slash is added when missing and exactly one trailing slash is
+/// enforced, so the prefix always ends on a path-segment boundary. Without the
+/// trailing slash a configured prefix of `/openshell/sandbox` would reject
+/// `/openshell/sandbox/abc` (the remainder `/abc` contains a slash) while
+/// accepting the sibling path `/openshell/sandboxabc` as sandbox `abc`.
+fn normalize_spiffe_path_prefix(prefix: &str) -> String {
+    let segments = prefix.trim().trim_matches('/');
+    if segments.is_empty() {
+        "/".to_string()
+    } else {
+        format!("/{segments}/")
+    }
+}
+
+/// SPIFFE ID claimed by `jwt`, read WITHOUT verifying the signature. Used only
+/// to decide whether this authenticator should handle the token at all.
+fn candidate_spiffe_id(jwt: &str) -> Option<String> {
+    JwtSvid::parse_insecure(jwt)
+        .ok()
+        .map(|svid| svid.spiffe_id().to_string())
+}
+
+/// Turn the configured socket path into a Workload API endpoint string:
+/// values already starting with `unix:` or `tcp:` pass through, anything else
+/// is prefixed with `unix:`.
+fn workload_api_endpoint(path: &Path) -> String {
+    let path = path.to_string_lossy();
+    if path.starts_with("unix:") || path.starts_with("tcp:") {
+        path.into_owned()
+    } else {
+        format!("unix:{path}")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parses_sandbox_id_from_configured_spiffe_id() {
+        assert_eq!(
+            parse_sandbox_id_from_spiffe_id(
+                "spiffe://openshell.local/openshell/sandbox/abc",
+                "openshell.local",
+                "/openshell/sandbox/",
+            )
+            .as_deref(),
+            Some("abc")
+        );
+    }
+
+    #[test]
+    fn rejects_spiffe_id_outside_sandbox_namespace() {
+        assert!(
+            parse_sandbox_id_from_spiffe_id(
+                "spiffe://openshell.local/ns/openshell/sa/default",
+                "openshell.local",
+                "/openshell/sandbox/",
+            )
+            .is_none()
+        );
+        assert!(
+            parse_sandbox_id_from_spiffe_id(
+                "spiffe://other.local/openshell/sandbox/abc",
+                "openshell.local",
+                "/openshell/sandbox/",
+            )
+            .is_none()
+        );
+    }
+
+    #[test]
+    fn prefix_without_trailing_slash_matches_on_segment_boundary() {
+        assert_eq!(
+            parse_sandbox_id_from_spiffe_id(
+                "spiffe://openshell.local/openshell/sandbox/abc",
+                "openshell.local",
+                "/openshell/sandbox",
+            )
+            .as_deref(),
+            Some("abc")
+        );
+        assert!(
+            parse_sandbox_id_from_spiffe_id(
+                "spiffe://openshell.local/openshell/sandboxabc",
+                "openshell.local",
+                "/openshell/sandbox",
+            )
+            .is_none()
+        );
+    }
+
+    #[test]
+    fn prefixes_plain_socket_paths_as_unix_endpoints() {
+        assert_eq!(
+            workload_api_endpoint(Path::new("/spiffe-workload-api/spire-agent.sock")),
+            "unix:/spiffe-workload-api/spire-agent.sock"
+        );
+        assert_eq!(
+            workload_api_endpoint(Path::new("unix:/tmp/spire.sock")),
+            "unix:/tmp/spire.sock"
+        );
+    }
+}
diff --git a/crates/openshell-server/src/certgen.rs b/crates/openshell-server/src/certgen.rs
index f7dcc0803..9fa94e08d 100644
--- a/crates/openshell-server/src/certgen.rs
+++ b/crates/openshell-server/src/certgen.rs
@@ -52,6 +52,12 @@ pub struct CertgenArgs {
     #[arg(long, required_unless_present = "output_dir")]
     client_secret_name: Option<String>,
 
+    /// Name of the sandbox-JWT signing-key Secret (`Opaque`) to create.
+    /// Holds `signing.pem`, `public.pem`, and `kid` keys. Mounted on the
+    /// gateway pod (only) so it can mint and validate per-sandbox JWTs.
+ #[arg(long, required_unless_present = "output_dir")] + jwt_secret_name: Option, + /// Extra Subject Alternative Name for the server certificate. Repeatable. /// Auto-detected as an IP address or DNS name. #[arg(long = "server-san", value_name = "SAN")] @@ -93,10 +99,10 @@ enum K8sAction { Create, } -fn decide_k8s(server_exists: bool, client_exists: bool) -> K8sAction { - match (server_exists, client_exists) { - (true, true) => K8sAction::SkipExists, - (false, false) => K8sAction::Create, +fn decide_k8s(server_exists: bool, client_exists: bool, jwt_exists: bool) -> K8sAction { + match (server_exists, client_exists, jwt_exists) { + (true, true, true) => K8sAction::SkipExists, + (false, false, false) => K8sAction::Create, _ => K8sAction::PartialState, } } @@ -114,6 +120,10 @@ async fn run_kubernetes(args: &CertgenArgs, bundle: &PkiBundle) -> Result<()> { .client_secret_name .as_deref() .ok_or_else(|| miette::miette!("--client-secret-name is required"))?; + let jwt_name = args + .jwt_secret_name + .as_deref() + .ok_or_else(|| miette::miette!("--jwt-secret-name is required"))?; let client = Client::try_default() .await @@ -133,22 +143,29 @@ async fn run_kubernetes(args: &CertgenArgs, bundle: &PkiBundle) -> Result<()> { .into_diagnostic() .wrap_err_with(|| format!("failed to read secret {client_name}"))? .is_some(); + let jwt_exists = api + .get_opt(jwt_name) + .await + .into_diagnostic() + .wrap_err_with(|| format!("failed to read secret {jwt_name}"))? + .is_some(); - match decide_k8s(server_exists, client_exists) { + match decide_k8s(server_exists, client_exists, jwt_exists) { K8sAction::SkipExists => { info!( namespace = %namespace, server = %server_name, client = %client_name, + jwt = %jwt_name, "PKI secrets already exist, skipping." ); return Ok(()); } K8sAction::PartialState => { return Err(miette::miette!( - "partial PKI state in namespace {namespace}: exactly one of \ - {server_name} / {client_name} exists. 
Recover with: \ - kubectl delete secret -n {namespace} {server_name} {client_name}", + "partial PKI state in namespace {namespace}: only some of \ + {server_name} / {client_name} / {jwt_name} exist. Recover with: \ + kubectl delete secret -n {namespace} {server_name} {client_name} {jwt_name}", )); } K8sAction::Create => {} @@ -166,6 +183,12 @@ async fn run_kubernetes(args: &CertgenArgs, bundle: &PkiBundle) -> Result<()> { &bundle.client_key_pem, &bundle.ca_cert_pem, ); + let jwt_secret = jwt_signing_secret( + jwt_name, + &bundle.jwt_signing_key_pem, + &bundle.jwt_public_key_pem, + &bundle.jwt_key_id, + ); api.create(&PostParams::default(), &server_secret) .await @@ -175,11 +198,16 @@ async fn run_kubernetes(args: &CertgenArgs, bundle: &PkiBundle) -> Result<()> { .await .into_diagnostic() .wrap_err_with(|| format!("failed to create secret {client_name}"))?; + api.create(&PostParams::default(), &jwt_secret) + .await + .into_diagnostic() + .wrap_err_with(|| format!("failed to create secret {jwt_name}"))?; info!( namespace = %namespace, server = %server_name, client = %client_name, + jwt = %jwt_name, "PKI secrets created." ); Ok(()) @@ -207,6 +235,31 @@ fn tls_secret(name: &str, crt_pem: &str, key_pem: &str, ca_pem: &str) -> Secret } } +/// Build an `Opaque` Secret carrying the gateway-minted sandbox JWT +/// signing material. Mounted only on the gateway pod — sandbox pods +/// receive a per-pod gateway-signed token, never the signing key itself. 
+fn jwt_signing_secret(name: &str, signing_pem: &str, public_pem: &str, kid: &str) -> Secret { + let mut data = BTreeMap::new(); + data.insert( + "signing.pem".to_string(), + ByteString(signing_pem.as_bytes().to_vec()), + ); + data.insert( + "public.pem".to_string(), + ByteString(public_pem.as_bytes().to_vec()), + ); + data.insert("kid".to_string(), ByteString(kid.as_bytes().to_vec())); + Secret { + metadata: ObjectMeta { + name: Some(name.to_string()), + ..Default::default() + }, + type_: Some("Opaque".to_string()), + data: Some(data), + ..Default::default() + } +} + // ─────────────────────────────── Local mode ─────────────────────────────── #[derive(Debug, PartialEq, Eq)] @@ -235,12 +288,17 @@ struct LocalPaths { client_dir: PathBuf, client_crt: PathBuf, client_key: PathBuf, + jwt_dir: PathBuf, + jwt_signing: PathBuf, + jwt_public: PathBuf, + jwt_kid: PathBuf, } impl LocalPaths { fn resolve(dir: &Path) -> Self { let server_dir = dir.join("server"); let client_dir = dir.join("client"); + let jwt_dir = dir.join("jwt"); Self { ca_crt: dir.join("ca.crt"), ca_key: dir.join("ca.key"), @@ -250,10 +308,14 @@ impl LocalPaths { client_crt: client_dir.join("tls.crt"), client_key: client_dir.join("tls.key"), client_dir, + jwt_signing: jwt_dir.join("signing.pem"), + jwt_public: jwt_dir.join("public.pem"), + jwt_kid: jwt_dir.join("kid"), + jwt_dir, } } - fn all_files(&self) -> [&Path; 6] { + fn all_files(&self) -> [&Path; 9] { [ &self.ca_crt, &self.ca_key, @@ -261,6 +323,9 @@ impl LocalPaths { &self.server_key, &self.client_crt, &self.client_key, + &self.jwt_signing, + &self.jwt_public, + &self.jwt_kid, ] } @@ -271,7 +336,7 @@ impl LocalPaths { fn decide_local(present: usize) -> LocalAction { match present { - 6 => LocalAction::Skip, + 9 => LocalAction::Skip, 0 => LocalAction::Create, _ => LocalAction::PartialState, } @@ -318,6 +383,9 @@ fn read_local_bundle(paths: &LocalPaths) -> Result { server_key_pem: read_pem(&paths.server_key)?, client_cert_pem: 
read_pem(&paths.client_crt)?, client_key_pem: read_pem(&paths.client_key)?, + jwt_signing_key_pem: read_pem(&paths.jwt_signing)?, + jwt_public_key_pem: read_pem(&paths.jwt_public)?, + jwt_key_id: read_pem(&paths.jwt_kid)?.trim().to_string(), }) } @@ -339,9 +407,11 @@ fn write_local_bundle(dir: &Path, bundle: &PkiBundle, paths: &LocalPaths) -> Res let temp_server = temp.join("server"); let temp_client = temp.join("client"); + let temp_jwt = temp.join("jwt"); create_dir_restricted(&temp)?; create_dir_restricted(&temp_server)?; create_dir_restricted(&temp_client)?; + create_dir_restricted(&temp_jwt)?; write_pem(&temp.join("ca.crt"), &bundle.ca_cert_pem, false)?; write_pem(&temp.join("ca.key"), &bundle.ca_key_pem, true)?; @@ -349,19 +419,34 @@ fn write_local_bundle(dir: &Path, bundle: &PkiBundle, paths: &LocalPaths) -> Res write_pem(&temp_server.join("tls.key"), &bundle.server_key_pem, true)?; write_pem(&temp_client.join("tls.crt"), &bundle.client_cert_pem, false)?; write_pem(&temp_client.join("tls.key"), &bundle.client_key_pem, true)?; + write_pem( + &temp_jwt.join("signing.pem"), + &bundle.jwt_signing_key_pem, + true, + )?; + write_pem( + &temp_jwt.join("public.pem"), + &bundle.jwt_public_key_pem, + false, + )?; + write_pem(&temp_jwt.join("kid"), &bundle.jwt_key_id, false)?; // Final destination (might not exist yet on first run). 
create_dir_restricted(dir)?; create_dir_restricted(&paths.server_dir)?; create_dir_restricted(&paths.client_dir)?; + create_dir_restricted(&paths.jwt_dir)?; - let renames: [(PathBuf, &Path); 6] = [ + let renames: [(PathBuf, &Path); 9] = [ (temp.join("ca.crt"), paths.ca_crt.as_path()), (temp.join("ca.key"), paths.ca_key.as_path()), (temp_server.join("tls.crt"), paths.server_crt.as_path()), (temp_server.join("tls.key"), paths.server_key.as_path()), (temp_client.join("tls.crt"), paths.client_crt.as_path()), (temp_client.join("tls.key"), paths.client_key.as_path()), + (temp_jwt.join("signing.pem"), paths.jwt_signing.as_path()), + (temp_jwt.join("public.pem"), paths.jwt_public.as_path()), + (temp_jwt.join("kid"), paths.jwt_kid.as_path()), ]; for (from, to) in &renames { std::fs::rename(from, to) @@ -406,8 +491,8 @@ fn print_bundle(bundle: &PkiBundle) { #[cfg(test)] mod tests { use super::{ - K8sAction, LocalAction, LocalPaths, decide_k8s, decide_local, read_local_bundle, - sibling_temp_dir, tls_secret, write_local_bundle, + K8sAction, LocalAction, LocalPaths, decide_k8s, decide_local, jwt_signing_secret, + read_local_bundle, sibling_temp_dir, tls_secret, write_local_bundle, }; use openshell_bootstrap::pki::generate_pki; use std::path::Path; @@ -415,23 +500,32 @@ mod tests { // ── Kubernetes-mode decision ── #[test] - fn decide_k8s_skip_when_both_exist() { - assert_eq!(decide_k8s(true, true), K8sAction::SkipExists); + fn decide_k8s_skip_when_all_three_exist() { + assert_eq!(decide_k8s(true, true, true), K8sAction::SkipExists); } #[test] - fn decide_k8s_create_when_neither_exists() { - assert_eq!(decide_k8s(false, false), K8sAction::Create); + fn decide_k8s_create_when_none_exist() { + assert_eq!(decide_k8s(false, false, false), K8sAction::Create); } #[test] - fn decide_k8s_partial_when_only_server_exists() { - assert_eq!(decide_k8s(true, false), K8sAction::PartialState); - } - - #[test] - fn decide_k8s_partial_when_only_client_exists() { - assert_eq!(decide_k8s(false, 
true), K8sAction::PartialState); + fn decide_k8s_partial_for_any_mixed_state() { + let mixes = [ + (true, false, false), + (false, true, false), + (false, false, true), + (true, true, false), + (true, false, true), + (false, true, true), + ]; + for (s, c, j) in mixes { + assert_eq!( + decide_k8s(s, c, j), + K8sAction::PartialState, + "({s},{c},{j})" + ); + } } #[test] @@ -446,11 +540,23 @@ mod tests { assert_eq!(data["ca.crt"].0, b"CA-PEM"); } + #[test] + fn jwt_signing_secret_has_opaque_type_and_three_keys() { + let s = jwt_signing_secret("jwt", "SIGN", "PUB", "kid-1"); + assert_eq!(s.metadata.name.as_deref(), Some("jwt")); + assert_eq!(s.type_.as_deref(), Some("Opaque")); + let data = s.data.expect("data set"); + assert_eq!(data.len(), 3); + assert_eq!(data["signing.pem"].0, b"SIGN"); + assert_eq!(data["public.pem"].0, b"PUB"); + assert_eq!(data["kid"].0, b"kid-1"); + } + // ── Local-mode decision ── #[test] - fn decide_local_skip_when_all_six_present() { - assert_eq!(decide_local(6), LocalAction::Skip); + fn decide_local_skip_when_all_nine_present() { + assert_eq!(decide_local(9), LocalAction::Skip); } #[test] @@ -460,7 +566,7 @@ mod tests { #[test] fn decide_local_partial_for_any_count_in_between() { - for n in 1..=5 { + for n in 1..=8 { assert_eq!(decide_local(n), LocalAction::PartialState, "n = {n}"); } } diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index a2cfacde5..7d54872de 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -332,6 +332,24 @@ async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> { }); } + // PR-2 wires gateway_jwt via the config file only — there's no CLI + // flag yet because the standard deployments (helm chart + RPM init + // script) drop the keypair to a known path and pass that path through + // the TOML. A CLI shortcut can be added if a singleplayer operator + // needs to override. 
+ if let Some(jwt) = file + .as_ref() + .and_then(|f| f.openshell.gateway.gateway_jwt.clone()) + { + config.gateway_jwt = Some(jwt); + } + if let Some(spiffe) = file + .as_ref() + .and_then(|f| f.openshell.gateway.spiffe.clone()) + { + config.spiffe = Some(spiffe); + } + let vm_config = build_vm_config(file.as_ref())?; let docker_config = build_docker_config(file.as_ref())?; @@ -347,8 +365,11 @@ async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> { if has_oidc { info!("OIDC authentication enabled"); } + if config.spiffe.is_some() { + info!("SPIFFE sandbox authentication enabled"); + } - if !has_client_ca && !has_oidc { + if !has_client_ca && !has_oidc && config.spiffe.is_none() { warn!( "Neither mTLS (--tls-client-ca) nor OIDC (--oidc-issuer) is configured — \ the gateway has no authentication mechanism" @@ -746,6 +767,8 @@ mod tests { "openshell-server-tls", "--client-secret-name", "openshell-client-tls", + "--jwt-secret-name", + "openshell-jwt-keys", "--server-san", "openshell.example.com", "--server-san", diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index d8e823df9..427a219ce 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -421,7 +421,11 @@ impl ComputeRuntime { .map(|_| ()) } - pub async fn create_sandbox(&self, sandbox: Sandbox) -> Result { + pub async fn create_sandbox( + &self, + sandbox: Sandbox, + sandbox_token: Option, + ) -> Result { let existing = self .store .get_message_by_name::(sandbox.object_name()) @@ -440,7 +444,12 @@ impl ComputeRuntime { .await .map_err(|e| Status::internal(format!("persist sandbox failed: {e}")))?; - let driver_sandbox = driver_sandbox_from_public(&sandbox); + let mut driver_sandbox = driver_sandbox_from_public(&sandbox); + if let Some(token) = sandbox_token + && let Some(spec) = driver_sandbox.spec.as_mut() + { + spec.sandbox_token = token; + } match self .driver 
.create_sandbox(Request::new(CreateSandboxRequest { @@ -1131,6 +1140,7 @@ fn driver_sandbox_spec_from_public(spec: &SandboxSpec) -> DriverSandboxSpec { .map(driver_sandbox_template_from_public), gpu: spec.gpu, gpu_device: spec.gpu_device.clone(), + sandbox_token: String::new(), } } diff --git a/crates/openshell-server/src/config_file.rs b/crates/openshell-server/src/config_file.rs index 2a1320a55..dd9f8badf 100644 --- a/crates/openshell-server/src/config_file.rs +++ b/crates/openshell-server/src/config_file.rs @@ -25,7 +25,7 @@ use std::net::SocketAddr; use std::path::{Path, PathBuf}; use openshell_core::config::ComputeDriverKind; -use openshell_core::{OidcConfig, TlsConfig}; +use openshell_core::{GatewayJwtConfig, OidcConfig, SpiffeConfig, TlsConfig}; use serde::{Deserialize, Serialize}; /// Latest schema version this build understands. @@ -115,6 +115,11 @@ pub struct GatewayFileSection { pub host_gateway_ip: Option, #[serde(default)] pub enable_user_namespaces: Option, + /// Lifetime (seconds) of the projected `ServiceAccount` token kubelet + /// writes for the `IssueSandboxToken` bootstrap exchange. Driver + /// clamps to `[600, 86400]`. 
+ #[serde(default)] + pub sa_token_ttl_secs: Option, #[serde(default)] pub guest_tls_ca: Option, #[serde(default)] @@ -133,6 +138,10 @@ pub struct GatewayFileSection { pub tls: Option, #[serde(default)] pub oidc: Option, + #[serde(default)] + pub gateway_jwt: Option, + #[serde(default)] + pub spiffe: Option, // ── Disallowed-in-file fields ──────────────────────────────────────── // @@ -247,6 +256,11 @@ fn inheritable_keys(driver: ComputeDriverKind) -> &'static [&'static str] { "client_tls_secret_name", "host_gateway_ip", "enable_user_namespaces", + "sa_token_ttl_secs", + "spiffe_workload_api_socket_path", + "spiffe_trust_domain", + "spiffe_audience", + "spiffe_sandbox_id_prefix", ], ComputeDriverKind::Docker => &[ "sandbox_namespace", @@ -281,6 +295,23 @@ fn gateway_inherited_value(g: &GatewayFileSection, key: &str) -> Option g.client_tls_secret_name.as_deref().map(string_value), "host_gateway_ip" => g.host_gateway_ip.as_deref().map(string_value), "enable_user_namespaces" => g.enable_user_namespaces.map(toml::Value::Boolean), + "sa_token_ttl_secs" => g.sa_token_ttl_secs.map(toml::Value::Integer), + "spiffe_workload_api_socket_path" => g + .spiffe + .as_ref() + .map(|spiffe| path_value(&spiffe.workload_api_socket_path)), + "spiffe_trust_domain" => g + .spiffe + .as_ref() + .map(|spiffe| string_value(&spiffe.trust_domain)), + "spiffe_audience" => g + .spiffe + .as_ref() + .map(|spiffe| string_value(&spiffe.audience)), + "spiffe_sandbox_id_prefix" => g + .spiffe + .as_ref() + .map(|spiffe| string_value(&spiffe.sandbox_id_prefix)), "guest_tls_ca" => g.guest_tls_ca.as_deref().map(path_value), "guest_tls_cert" => g.guest_tls_cert.as_deref().map(path_value), "guest_tls_key" => g.guest_tls_key.as_deref().map(path_value), diff --git a/crates/openshell-server/src/grpc/auth_rpc.rs b/crates/openshell-server/src/grpc/auth_rpc.rs new file mode 100644 index 000000000..2519035be --- /dev/null +++ b/crates/openshell-server/src/grpc/auth_rpc.rs @@ -0,0 +1,307 @@ +// 
SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Authentication-related RPC handlers. +//! +//! Hosts the two sandbox-identity RPCs: +//! - `IssueSandboxToken` — bootstrap exchange (K8s SA token → gateway JWT) +//! - `RefreshSandboxToken` — rotate a still-valid gateway JWT +//! +//! Both end in a fresh gateway-signed JWT minted by +//! [`crate::auth::sandbox_jwt::SandboxJwtIssuer`]. `RefreshSandboxToken` +//! additionally revokes the previous JWT's `jti` so the old token +//! becomes unusable as soon as the new one is handed back. + +use crate::ServerState; +use crate::auth::principal::{Principal, SandboxIdentitySource}; +use openshell_core::proto::{ + IssueSandboxTokenRequest, IssueSandboxTokenResponse, RefreshSandboxTokenRequest, + RefreshSandboxTokenResponse, +}; +use std::sync::Arc; +use std::time::SystemTime; +use tonic::{Request, Response, Status}; +use tracing::{debug, info, warn}; + +#[allow(clippy::result_large_err, clippy::unused_async)] +pub async fn handle_issue_sandbox_token( + state: &Arc, + request: Request, +) -> Result, Status> { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; + + let Principal::Sandbox(sandbox) = principal else { + return Err(Status::permission_denied( + "IssueSandboxToken requires a sandbox principal", + )); + }; + + // Only the bootstrap K8s ServiceAccount path can mint a fresh + // gateway JWT via this RPC. Sandboxes already holding a gateway JWT + // use `RefreshSandboxToken` instead, which also revokes the old jti. + if !matches!( + sandbox.source, + SandboxIdentitySource::K8sServiceAccount { .. 
} + ) { + debug!( + sandbox_id = %sandbox.sandbox_id, + "IssueSandboxToken rejected: non-bootstrap principal source" + ); + return Err(Status::permission_denied( + "this principal cannot mint a sandbox token; use RefreshSandboxToken", + )); + } + + let issuer = state.sandbox_jwt_issuer.as_ref().ok_or_else(|| { + warn!( + sandbox_id = %sandbox.sandbox_id, + "IssueSandboxToken called but sandbox JWT issuer is not configured" + ); + Status::unavailable("sandbox JWT minting is not configured on this gateway") + })?; + + let minted = issuer.mint(&sandbox.sandbox_id)?; + info!( + sandbox_id = %sandbox.sandbox_id, + jti = %minted.jti, + "issued gateway sandbox JWT" + ); + Ok(Response::new(IssueSandboxTokenResponse { + token: minted.token, + expires_at_ms: minted.expires_at_ms, + })) +} + +#[allow(clippy::result_large_err, clippy::unused_async)] +pub async fn handle_refresh_sandbox_token( + state: &Arc, + request: Request, +) -> Result, Status> { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; + + let Principal::Sandbox(sandbox) = principal else { + return Err(Status::permission_denied( + "RefreshSandboxToken requires a sandbox principal", + )); + }; + + // Only callers already holding a gateway-minted JWT may refresh; the + // K8s bootstrap path must use `IssueSandboxToken`. + let SandboxIdentitySource::BootstrapJwt { jti: old_jti, .. 
} = &sandbox.source else { + debug!( + sandbox_id = %sandbox.sandbox_id, + "RefreshSandboxToken rejected: non-gateway-JWT principal source" + ); + return Err(Status::permission_denied( + "this principal cannot refresh; use IssueSandboxToken for bootstrap", + )); + }; + + let issuer = state.sandbox_jwt_issuer.as_ref().ok_or_else(|| { + warn!( + sandbox_id = %sandbox.sandbox_id, + "RefreshSandboxToken called but sandbox JWT issuer is not configured" + ); + Status::unavailable("sandbox JWT minting is not configured on this gateway") + })?; + + // Mint the new token first; only revoke the old jti after we have a + // replacement so a failure here doesn't leave the sandbox stranded. + let minted = issuer.mint(&sandbox.sandbox_id)?; + + // Best-effort revocation of the old token. The plan calls for the + // jti deny-list to live in memory in PR 2; PR 5 only needs to drop + // the old jti into it. We use the new token's expiry as a safe upper + // bound for the revocation entry — the old jti can't outlive its own + // `exp`, and on TTL pruning the entry drops out cleanly. 
+ state + .sandbox_jwt_revocation + .revoke(old_jti, minted.expires_at_ms.max(now_ms())); + info!( + sandbox_id = %sandbox.sandbox_id, + revoked_jti = %old_jti, + new_jti = %minted.jti, + "refreshed gateway sandbox JWT" + ); + + Ok(Response::new(RefreshSandboxTokenResponse { + token: minted.token, + expires_at_ms: minted.expires_at_ms, + })) +} + +fn now_ms() -> i64 { + i64::try_from( + SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .map_or(0, |d| d.as_millis()), + ) + .unwrap_or(i64::MAX) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::ServerState; + use crate::auth::principal::{Principal, SandboxPrincipal, UserPrincipal}; + use crate::auth::revocation::RevocationSet; + use crate::auth::sandbox_jwt::SandboxJwtIssuer; + use crate::compute::new_test_runtime; + use crate::persistence::Store; + use crate::sandbox_index::SandboxIndex; + use crate::sandbox_watch::SandboxWatchBus; + use crate::supervisor_session::SupervisorSessionRegistry; + use crate::tracing_bus::TracingLogBus; + use openshell_bootstrap::jwt::generate_jwt_key; + use openshell_core::Config; + use std::time::Duration; + + async fn state_with_issuer() -> (Arc, SandboxJwtIssuer, Arc) { + let mat = generate_jwt_key().expect("jwt key"); + let revocation = Arc::new(RevocationSet::new()); + let issuer = SandboxJwtIssuer::from_pem( + mat.signing_key_pem.as_bytes(), + mat.kid, + "test-gateway", + Duration::from_secs(3600), + ) + .expect("issuer"); + let store = Arc::new( + Store::connect("sqlite::memory:?cache=shared") + .await + .unwrap(), + ); + let compute = new_test_runtime(store.clone()).await; + let mut state = ServerState::new( + Config::new(None).with_database_url("sqlite::memory:?cache=shared"), + store, + compute, + SandboxIndex::new(), + SandboxWatchBus::new(), + TracingLogBus::new(), + Arc::new(SupervisorSessionRegistry::new()), + None, + ); + state.sandbox_jwt_revocation = revocation.clone(); + // We don't need the authenticator for these tests; only the issuer. 
+ // The handler tests only exercise the mint+revoke path; they + // don't need the issuer to be the same instance that produced + // `issuer` above. A fresh keypair is fine. + let issuer_clone = SandboxJwtIssuer::from_pem( + generate_jwt_key().unwrap().signing_key_pem.as_bytes(), + "kid".to_string(), + "test-gateway", + Duration::from_secs(3600), + ) + .unwrap(); + state.sandbox_jwt_issuer = Some(Arc::new(issuer_clone)); + (Arc::new(state), issuer, revocation) + } + + fn sandbox_principal(sandbox_id: &str, jti: &str) -> Principal { + use crate::auth::principal::SandboxIdentitySource; + Principal::Sandbox(SandboxPrincipal { + sandbox_id: sandbox_id.to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test-gateway".to_string(), + jti: jti.to_string(), + }, + trust_domain: Some("openshell".to_string()), + }) + } + + #[tokio::test] + async fn refresh_revokes_old_jti_and_returns_new_token() { + let (state, _issuer, revocation) = state_with_issuer().await; + let old_jti = "j-original"; + let mut req = Request::new(RefreshSandboxTokenRequest {}); + req.extensions_mut() + .insert(sandbox_principal("sandbox-a", old_jti)); + let resp = handle_refresh_sandbox_token(&state, req) + .await + .expect("refresh OK") + .into_inner(); + assert!(!resp.token.is_empty()); + assert!(revocation.is_revoked(old_jti), "old jti must be revoked"); + } + + #[tokio::test] + async fn refresh_rejects_user_principal() { + use crate::auth::identity::{Identity, IdentityProvider}; + let (state, _, _) = state_with_issuer().await; + let mut req = Request::new(RefreshSandboxTokenRequest {}); + req.extensions_mut().insert(Principal::User(UserPrincipal { + identity: Identity { + subject: "alice".to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + })); + let err = handle_refresh_sandbox_token(&state, req) + .await + .expect_err("user must not refresh"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); 
+ } + + #[tokio::test] + async fn refresh_rejects_k8s_sa_principal() { + // K8s SA-bootstrap principals must use IssueSandboxToken, not + // RefreshSandboxToken — the refresh path assumes a still-valid + // gateway-minted JWT exists. + use crate::auth::principal::SandboxIdentitySource; + let (state, _, _) = state_with_issuer().await; + let mut req = Request::new(RefreshSandboxTokenRequest {}); + req.extensions_mut() + .insert(Principal::Sandbox(SandboxPrincipal { + sandbox_id: "sandbox-a".to_string(), + source: SandboxIdentitySource::K8sServiceAccount { + pod_name: "pod-a".to_string(), + pod_uid: "uid-a".to_string(), + }, + trust_domain: Some("openshell".to_string()), + })); + let err = handle_refresh_sandbox_token(&state, req) + .await + .expect_err("K8s SA principal must not refresh"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + + #[tokio::test] + async fn refresh_fails_when_issuer_not_configured() { + // Build a ServerState without the issuer to confirm the handler + // returns Unavailable. 
+ let store = Arc::new( + Store::connect("sqlite::memory:?cache=shared") + .await + .unwrap(), + ); + let compute = new_test_runtime(store.clone()).await; + let state = Arc::new(ServerState::new( + Config::new(None).with_database_url("sqlite::memory:?cache=shared"), + store, + compute, + SandboxIndex::new(), + SandboxWatchBus::new(), + TracingLogBus::new(), + Arc::new(SupervisorSessionRegistry::new()), + None, + )); + let mut req = Request::new(RefreshSandboxTokenRequest {}); + req.extensions_mut() + .insert(sandbox_principal("sandbox-a", "j-1")); + let err = handle_refresh_sandbox_token(&state, req) + .await + .expect_err("missing issuer must yield unavailable"); + assert_eq!(err.code(), tonic::Code::Unavailable); + } +} diff --git a/crates/openshell-server/src/grpc/mod.rs b/crates/openshell-server/src/grpc/mod.rs index 9ea8d7ece..85c47a19a 100644 --- a/crates/openshell-server/src/grpc/mod.rs +++ b/crates/openshell-server/src/grpc/mod.rs @@ -3,6 +3,7 @@ //! gRPC service implementation. 
+mod auth_rpc; pub mod policy; mod provider; mod sandbox; @@ -25,15 +26,16 @@ use openshell_core::proto::{ GetSandboxLogsResponse, GetSandboxPolicyStatusRequest, GetSandboxPolicyStatusResponse, GetSandboxProviderEnvironmentRequest, GetSandboxProviderEnvironmentResponse, GetSandboxRequest, GetServiceRequest, HealthRequest, HealthResponse, ImportProviderProfilesRequest, - ImportProviderProfilesResponse, LintProviderProfilesRequest, LintProviderProfilesResponse, - ListProviderProfilesRequest, ListProviderProfilesResponse, ListProvidersRequest, - ListProvidersResponse, ListSandboxPoliciesRequest, ListSandboxPoliciesResponse, - ListSandboxProvidersRequest, ListSandboxProvidersResponse, ListSandboxesRequest, - ListSandboxesResponse, ListServicesRequest, ListServicesResponse, ProviderProfileResponse, - ProviderResponse, PushSandboxLogsRequest, PushSandboxLogsResponse, RejectDraftChunkRequest, - RejectDraftChunkResponse, RelayFrame, ReportPolicyStatusRequest, ReportPolicyStatusResponse, - RevokeSshSessionRequest, RevokeSshSessionResponse, SandboxResponse, SandboxStreamEvent, - ServiceEndpointResponse, ServiceStatus, SubmitPolicyAnalysisRequest, + ImportProviderProfilesResponse, IssueSandboxTokenRequest, IssueSandboxTokenResponse, + LintProviderProfilesRequest, LintProviderProfilesResponse, ListProviderProfilesRequest, + ListProviderProfilesResponse, ListProvidersRequest, ListProvidersResponse, + ListSandboxPoliciesRequest, ListSandboxPoliciesResponse, ListSandboxProvidersRequest, + ListSandboxProvidersResponse, ListSandboxesRequest, ListSandboxesResponse, ListServicesRequest, + ListServicesResponse, ProviderProfileResponse, ProviderResponse, PushSandboxLogsRequest, + PushSandboxLogsResponse, RefreshSandboxTokenRequest, RefreshSandboxTokenResponse, + RejectDraftChunkRequest, RejectDraftChunkResponse, RelayFrame, ReportPolicyStatusRequest, + ReportPolicyStatusResponse, RevokeSshSessionRequest, RevokeSshSessionResponse, SandboxResponse, + SandboxStreamEvent, 
ServiceEndpointResponse, ServiceStatus, SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, SupervisorMessage, TcpForwardFrame, UndoDraftChunkRequest, UndoDraftChunkResponse, UpdateConfigRequest, UpdateConfigResponse, UpdateProviderRequest, WatchSandboxRequest, open_shell_server::OpenShell, @@ -510,6 +512,22 @@ impl OpenShell for OpenShellService { policy::handle_get_draft_history(&self.state, request).await } + // --- Sandbox identity --- + + async fn issue_sandbox_token( + &self, + request: Request, + ) -> Result, Status> { + auth_rpc::handle_issue_sandbox_token(&self.state, request).await + } + + async fn refresh_sandbox_token( + &self, + request: Request, + ) -> Result, Status> { + auth_rpc::handle_refresh_sandbox_token(&self.state, request).await + } + // --- Supervisor session --- type ConnectSupervisorStream = diff --git a/crates/openshell-server/src/grpc/policy.rs b/crates/openshell-server/src/grpc/policy.rs index 315b06f3c..edee228cc 100644 --- a/crates/openshell-server/src/grpc/policy.rs +++ b/crates/openshell-server/src/grpc/policy.rs @@ -10,9 +10,9 @@ #![allow(clippy::cast_precision_loss)] // f64->f32 for confidence scores #![allow(clippy::items_after_statements)] // DB_PORTS const inside function +use crate::ServerState; use crate::persistence::{DraftChunkRecord, ObjectId, ObjectName, ObjectType, PolicyRecord, Store}; use crate::policy_store::PolicyStoreExt; -use crate::{ServerState, auth::oidc}; use openshell_core::proto::policy_merge_operation; use openshell_core::proto::setting_value; use openshell_core::proto::{ @@ -314,8 +314,14 @@ fn truncate_for_log(input: &str, max_chars: usize) -> String { } } +#[cfg(test)] fn is_sandbox_caller(request: &Request) -> bool { - oidc::is_sandbox_caller(request.metadata()) + matches!( + request + .extensions() + .get::(), + Some(crate::auth::principal::Principal::Sandbox(_)) + ) } /// Sandbox-class callers may only perform sandbox-scoped policy sync. 
They @@ -352,7 +358,9 @@ pub(super) async fn handle_get_sandbox_config( state: &Arc, request: Request, ) -> Result, Status> { - let sandbox_id = request.into_inner().sandbox_id; + let sandbox_id = request.get_ref().sandbox_id.clone(); + crate::auth::guard::enforce_sandbox_scope(&request, &sandbox_id)?; + drop(request); let sandbox = state .store @@ -609,7 +617,9 @@ pub(super) async fn handle_get_sandbox_provider_environment( state: &Arc, request: Request, ) -> Result, Status> { - let sandbox_id = request.into_inner().sandbox_id; + let sandbox_id = request.get_ref().sandbox_id.clone(); + crate::auth::guard::enforce_sandbox_scope(&request, &sandbox_id)?; + drop(request); let sandbox = state .store @@ -651,10 +661,32 @@ pub(super) async fn handle_update_config( state: &Arc, request: Request, ) -> Result, Status> { - let sandbox_caller = is_sandbox_caller(&request); + let principal = request + .extensions() + .get::() + .cloned(); + let sandbox_caller = matches!( + principal, + Some(crate::auth::principal::Principal::Sandbox(_)) + ); let req = request.into_inner(); if sandbox_caller { validate_sandbox_caller_update(&req)?; + // Resolve req.name to a sandbox UUID and verify the calling + // sandbox principal owns it. User callers (CLI / TUI) bypass + // this check because RBAC was their gate. + let sandbox = state + .store + .get_message_by_name::(&req.name) + .await + .map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))? 
+ .ok_or_else(|| Status::not_found("sandbox not found"))?; + crate::auth::guard::ensure_sandbox_scope( + principal + .as_ref() + .expect("sandbox_caller implies principal"), + sandbox.object_id(), + )?; } let key = req.setting_key.trim(); let has_policy = req.policy.is_some(); @@ -1180,6 +1212,8 @@ pub(super) async fn handle_report_policy_status( state: &Arc, request: Request, ) -> Result, Status> { + let sandbox_id = request.get_ref().sandbox_id.clone(); + crate::auth::guard::enforce_sandbox_scope(&request, &sandbox_id)?; let req = request.into_inner(); if req.sandbox_id.is_empty() { return Err(Status::invalid_argument("sandbox_id is required")); @@ -1294,6 +1328,11 @@ pub(super) async fn handle_push_sandbox_logs( state: &Arc, request: Request>, ) -> Result, Status> { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; let mut stream = request.into_inner(); let mut validated = false; @@ -1307,6 +1346,10 @@ pub(super) async fn handle_push_sandbox_logs( } if !validated { + // The streaming RPC carries the sandbox_id in every frame, but + // the equality check only needs to run once on the first frame + // — the principal is stable across the stream. + crate::auth::guard::ensure_sandbox_scope(&principal, &batch.sandbox_id)?; state .store .get_message::(&batch.sandbox_id) @@ -1335,6 +1378,11 @@ pub(super) async fn handle_submit_policy_analysis( state: &Arc, request: Request, ) -> Result, Status> { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; let req = request.into_inner(); if req.name.is_empty() { return Err(Status::invalid_argument("name is required")); @@ -1347,6 +1395,9 @@ pub(super) async fn handle_submit_policy_analysis( .map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))? 
.ok_or_else(|| Status::not_found("sandbox not found"))?; let sandbox_id = sandbox.object_id().to_string(); + // Name → id resolved; now enforce that a sandbox principal only acts + // on its own sandbox. User principals are unaffected. + crate::auth::guard::ensure_sandbox_scope(&principal, &sandbox_id)?; let current_version = state .store @@ -1463,6 +1514,11 @@ pub(super) async fn handle_get_draft_policy( state: &Arc, request: Request, ) -> Result, Status> { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; let req = request.into_inner(); if req.name.is_empty() { return Err(Status::invalid_argument("name is required")); @@ -1475,6 +1531,7 @@ pub(super) async fn handle_get_draft_policy( .map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))? .ok_or_else(|| Status::not_found("sandbox not found"))?; let sandbox_id = sandbox.object_id().to_string(); + crate::auth::guard::ensure_sandbox_scope(&principal, &sandbox_id)?; let status_filter = if req.status_filter.is_empty() { None @@ -2789,6 +2846,10 @@ fn materialize_global_settings( mod tests { use super::*; use crate::ServerState; + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::{ + Principal, SandboxIdentitySource, SandboxPrincipal, UserPrincipal, + }; use crate::compute::new_test_runtime; use crate::persistence::Store; use crate::sandbox_index::SandboxIndex; @@ -2800,6 +2861,41 @@ mod tests { use std::sync::Arc; use tonic::Code; + /// Wrap a request with a user `Principal` so handlers' scope guards + /// (introduced in PR 4) treat the test caller as a CLI user — equivalent + /// to the pre-PR-4 behavior where all tests effectively ran as user. 
+ fn with_user(mut request: Request) -> Request { + request + .extensions_mut() + .insert(Principal::User(UserPrincipal { + identity: Identity { + subject: "test-user".to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + })); + request + } + + /// Wrap a request with a sandbox `Principal` bound to `sandbox_id`. + /// Use for tests that exercise sandbox-caller code paths. + #[allow(dead_code)] + fn with_sandbox(mut request: Request, sandbox_id: &str) -> Request { + request + .extensions_mut() + .insert(Principal::Sandbox(SandboxPrincipal { + sandbox_id: sandbox_id.to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test".to_string(), + jti: "j-test".to_string(), + }, + trust_domain: Some("openshell".to_string()), + })); + request + } + #[test] fn sandbox_caller_update_validation_allows_sandbox_policy_sync() { let req = UpdateConfigRequest { @@ -2834,15 +2930,201 @@ mod tests { } #[test] - fn sandbox_caller_marker_detected_from_metadata() { + fn sandbox_caller_detected_from_principal_extension() { + use crate::auth::principal::{Principal, SandboxIdentitySource, SandboxPrincipal}; let mut req = Request::new(()); - req.metadata_mut().insert( - oidc::INTERNAL_AUTH_SOURCE_HEADER, - oidc::AUTH_SOURCE_SANDBOX.parse().unwrap(), - ); + req.extensions_mut() + .insert(Principal::Sandbox(SandboxPrincipal { + sandbox_id: "test-sandbox".to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test".to_string(), + jti: "j-1".to_string(), + }, + trust_domain: None, + })); assert!(is_sandbox_caller(&req)); } + #[test] + fn user_principal_not_treated_as_sandbox_caller() { + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::{Principal, UserPrincipal}; + let mut req = Request::new(()); + req.extensions_mut().insert(Principal::User(UserPrincipal { + identity: Identity { + subject: "alice".to_string(), + display_name: 
None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + })); + assert!(!is_sandbox_caller(&req)); + } + + // ---- PR-4 IDOR guard (issue #1354) ---- + + #[tokio::test] + async fn cross_sandbox_get_sandbox_config_denied() { + use openshell_core::proto::{SandboxPhase, SandboxSpec}; + let state = test_server_state().await; + // Two sandboxes; the caller is principal of A, the request body + // references B. + for (id, name) in [("sb-a", "sandbox-a"), ("sb-b", "sandbox-b")] { + let sandbox = Sandbox { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: id.to_string(), + name: name.to_string(), + created_at_ms: 1_000_000, + labels: HashMap::new(), + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Provisioning as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + } + let req = with_sandbox( + Request::new(GetSandboxConfigRequest { + sandbox_id: "sb-b".to_string(), + }), + "sb-a", + ); + let err = handle_get_sandbox_config(&state, req) + .await + .expect_err("cross-sandbox call must be denied"); + assert_eq!(err.code(), Code::PermissionDenied); + } + + #[tokio::test] + async fn same_sandbox_get_sandbox_config_allowed() { + use openshell_core::proto::{SandboxPhase, SandboxSpec}; + let state = test_server_state().await; + let sandbox = Sandbox { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: "sb-self".to_string(), + name: "self".to_string(), + created_at_ms: 1_000_000, + labels: HashMap::new(), + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Provisioning as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + let req = with_sandbox( + Request::new(GetSandboxConfigRequest { + sandbox_id: "sb-self".to_string(), + }), + "sb-self", + ); + handle_get_sandbox_config(&state, req) + .await + .expect("matching principal must be 
allowed"); + } + + #[tokio::test] + async fn cross_sandbox_submit_policy_analysis_denied() { + use openshell_core::proto::{SandboxPhase, SandboxSpec}; + let state = test_server_state().await; + for (id, name) in [("sb-a", "sandbox-a"), ("sb-b", "sandbox-b")] { + let sandbox = Sandbox { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: id.to_string(), + name: name.to_string(), + created_at_ms: 1_000_000, + labels: HashMap::new(), + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Provisioning as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + } + let req = with_sandbox( + Request::new(SubmitPolicyAnalysisRequest { + name: "sandbox-b".to_string(), + ..Default::default() + }), + "sb-a", + ); + let err = handle_submit_policy_analysis(&state, req) + .await + .expect_err("cross-sandbox submit must be denied"); + assert_eq!(err.code(), Code::PermissionDenied); + } + + #[tokio::test] + async fn cross_sandbox_get_draft_policy_denied() { + use openshell_core::proto::{SandboxPhase, SandboxSpec}; + let state = test_server_state().await; + for (id, name) in [("sb-a", "sandbox-a"), ("sb-b", "sandbox-b")] { + let sandbox = Sandbox { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: id.to_string(), + name: name.to_string(), + created_at_ms: 1_000_000, + labels: HashMap::new(), + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Provisioning as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + } + let req = with_sandbox( + Request::new(GetDraftPolicyRequest { + name: "sandbox-b".to_string(), + status_filter: String::new(), + }), + "sb-a", + ); + let err = handle_get_draft_policy(&state, req) + .await + .expect_err("cross-sandbox draft read must be denied"); + assert_eq!(err.code(), Code::PermissionDenied); + } + + #[tokio::test] + async fn 
user_principal_can_read_any_sandbox_config() { + // RBAC was the user gate; the IDOR guard must NOT trip for users. + use openshell_core::proto::{SandboxPhase, SandboxSpec}; + let state = test_server_state().await; + let sandbox = Sandbox { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: "sb-x".to_string(), + name: "x".to_string(), + created_at_ms: 1_000_000, + labels: HashMap::new(), + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Provisioning as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + let req = with_user(Request::new(GetSandboxConfigRequest { + sandbox_id: "sb-x".to_string(), + })); + handle_get_sandbox_config(&state, req) + .await + .expect("user principal must succeed"); + } + // ---- Sandbox without policy ---- #[tokio::test] @@ -2951,9 +3233,9 @@ mod tests { async fn get_sandbox_policy(state: &Arc, sandbox_id: &str) -> ProtoSandboxPolicy { handle_get_sandbox_config( state, - Request::new(GetSandboxConfigRequest { + with_user(Request::new(GetSandboxConfigRequest { sandbox_id: sandbox_id.to_string(), - }), + })), ) .await .unwrap() @@ -3391,9 +3673,9 @@ mod tests { let legacy_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-provider-env".to_string(), - }), + })), ) .await .unwrap() @@ -3403,9 +3685,9 @@ mod tests { enable_providers_v2(&state).await; let v2_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-provider-env".to_string(), - }), + })), ) .await .unwrap() @@ -3437,9 +3719,9 @@ mod tests { let first = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + 
with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-provider-revision".to_string(), - }), + })), ) .await .unwrap() @@ -3453,9 +3735,9 @@ mod tests { let second = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-provider-revision".to_string(), - }), + })), ) .await .unwrap() @@ -3507,9 +3789,9 @@ mod tests { ); let baseline_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3517,10 +3799,10 @@ mod tests { handle_attach_sandbox_provider( &state, - Request::new(AttachSandboxProviderRequest { + with_user(Request::new(AttachSandboxProviderRequest { sandbox_name: "attach-lifecycle".to_string(), provider_name: "work-github".to_string(), - }), + })), ) .await .unwrap(); @@ -3534,9 +3816,9 @@ mod tests { let attached_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3569,9 +3851,9 @@ mod tests { let detached_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3665,9 +3947,9 @@ mod tests { ); let baseline_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-custom-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3675,10 +3957,10 @@ mod tests { 
handle_attach_sandbox_provider( &state, - Request::new(AttachSandboxProviderRequest { + with_user(Request::new(AttachSandboxProviderRequest { sandbox_name: "custom-attach-lifecycle".to_string(), provider_name: "work-custom".to_string(), - }), + })), ) .await .unwrap(); @@ -3695,9 +3977,9 @@ mod tests { let attached_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-custom-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3729,9 +4011,9 @@ mod tests { ); let detached_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-custom-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3827,9 +4109,9 @@ mod tests { let response = handle_get_sandbox_config( &state, - Request::new(GetSandboxConfigRequest { + with_user(Request::new(GetSandboxConfigRequest { sandbox_id: "sb-global-profile".to_string(), - }), + })), ) .await .unwrap() @@ -3972,7 +4254,7 @@ mod tests { let submit = handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_name.clone(), proposed_chunks: vec![PolicyChunk { rule_name: "allow_example".to_string(), @@ -3986,7 +4268,7 @@ mod tests { ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap() @@ -3998,10 +4280,10 @@ mod tests { let draft_policy = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name.clone(), status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4068,10 +4350,10 @@ mod tests { let draft_policy_after_undo = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: 
sandbox_name.clone(), status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4120,10 +4402,10 @@ mod tests { let draft_policy_after_clear = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name.clone(), status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4181,7 +4463,7 @@ mod tests { let submit = handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_name.clone(), proposed_chunks: vec![PolicyChunk { rule_name: "allow_example".to_string(), @@ -4190,7 +4472,7 @@ mod tests { ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap() @@ -4211,10 +4493,10 @@ mod tests { let draft = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name, status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4289,7 +4571,7 @@ mod tests { async move { handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_name, analysis_mode: "agent_authored".to_string(), proposed_chunks: vec![PolicyChunk { @@ -4298,7 +4580,7 @@ mod tests { ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap() @@ -4318,10 +4600,10 @@ mod tests { let draft = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name.clone(), status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4395,7 +4677,7 @@ mod tests { async move { handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_name, analysis_mode: "mechanistic".to_string(), proposed_chunks: vec![PolicyChunk { @@ -4404,7 +4686,7 @@ mod tests { 
..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap() @@ -4418,10 +4700,10 @@ mod tests { let draft = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name, status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4490,7 +4772,7 @@ mod tests { let submit = handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_name.clone(), proposed_chunks: vec![PolicyChunk { rule_name: "allow_example".to_string(), @@ -4498,7 +4780,7 @@ mod tests { ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap() @@ -4538,10 +4820,10 @@ mod tests { let draft = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name, status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4610,7 +4892,7 @@ mod tests { handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_a.object_name().to_string(), proposed_chunks: vec![PolicyChunk { rule_name: "allow_example".to_string(), @@ -4624,17 +4906,17 @@ mod tests { ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap(); let draft_policy = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_a.object_name().to_string(), status_filter: String::new(), - }), + })), ) .await .unwrap() diff --git a/crates/openshell-server/src/grpc/sandbox.rs b/crates/openshell-server/src/grpc/sandbox.rs index 5c523b10e..a66a48202 100644 --- a/crates/openshell-server/src/grpc/sandbox.rs +++ b/crates/openshell-server/src/grpc/sandbox.rs @@ -128,7 +128,28 @@ pub(super) async fn handle_create_sandbox( status })?; - let sandbox = 
state.compute.create_sandbox(sandbox).await?; + // Mint the gateway JWT for singleplayer drivers. K8s sandboxes skip + // this mint and bootstrap via `IssueSandboxToken` at supervisor + // startup; identifying "is this K8s?" lives in the compute layer, so + // we mint unconditionally here when the issuer is configured and let + // the K8s driver simply ignore the field. + let sandbox_token = state.sandbox_jwt_issuer.as_ref().map(|issuer| { + issuer.mint(&id).map(|minted| { + tracing::info!( + sandbox_id = %id, + jti = %minted.jti, + "minted sandbox JWT" + ); + minted.token + }) + }); + let sandbox_token = match sandbox_token { + Some(Ok(token)) => Some(token), + Some(Err(status)) => return Err(status), + None => None, + }; + + let sandbox = state.compute.create_sandbox(sandbox, sandbox_token).await?; info!( sandbox_id = %id, diff --git a/crates/openshell-server/src/inference.rs b/crates/openshell-server/src/inference.rs index 50d1e8df3..c219fc5ce 100644 --- a/crates/openshell-server/src/inference.rs +++ b/crates/openshell-server/src/inference.rs @@ -57,8 +57,25 @@ impl ObjectType for InferenceRoute { impl Inference for InferenceService { async fn get_inference_bundle( &self, - _request: Request, + request: Request, ) -> Result, Status> { + // GetInferenceBundle is gateway-wide (no per-sandbox routes yet), + // so it has no `sandbox_id` to compare against. Just reject + // anonymous callers; both user and sandbox principals are allowed. 
+ match request + .extensions() + .get::() + { + Some( + crate::auth::principal::Principal::User(_) + | crate::auth::principal::Principal::Sandbox(_), + ) => {} + Some(crate::auth::principal::Principal::Anonymous) | None => { + return Err(Status::unauthenticated( + "GetInferenceBundle requires an authenticated caller", + )); + } + } resolve_inference_bundle(self.state.store.as_ref()) .await .map(Response::new) diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index a6e337dec..7f4b5f7ba 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -103,6 +103,30 @@ pub struct ServerState { /// OIDC JWKS cache for JWT validation. `None` when OIDC is not configured. pub oidc_cache: Option>, + + /// Gateway-minted sandbox JWT issuer. `None` when `config.gateway_jwt` + /// is not configured; in that mode `IssueSandboxToken` returns + /// `Status::unavailable`. Populated at startup from the on-disk key + /// material that `certgen` writes. + pub sandbox_jwt_issuer: Option>, + + /// Authenticator that validates gateway-minted sandbox JWTs on every + /// inbound request. Always set when `sandbox_jwt_issuer` is, so callers + /// presenting a freshly minted token are recognized. + pub sandbox_jwt_authenticator: Option>, + + /// Authenticator that validates SPIFFE JWT-SVIDs through the local SPIFFE + /// Workload API and maps authorized SPIFFE IDs to sandbox principals. + pub spiffe_authenticator: Option>, + + /// Optional K8s `ServiceAccount` authenticator that backs the + /// `IssueSandboxToken` bootstrap path. Only present when the gateway + /// runs in-cluster. + pub k8s_sa_authenticator: Option>, + + /// In-memory revocation set for gateway-minted sandbox JWTs. + /// Populated by `DeleteSandbox` and (in PR 5) `RefreshSandboxToken`. 
+ pub sandbox_jwt_revocation: Arc, } fn is_benign_tls_handshake_failure(error: &std::io::Error) -> bool { @@ -147,6 +171,11 @@ impl ServerState { settings_mutex: tokio::sync::Mutex::new(()), supervisor_sessions, oidc_cache, + sandbox_jwt_issuer: None, + sandbox_jwt_authenticator: None, + spiffe_authenticator: None, + k8s_sa_authenticator: None, + sandbox_jwt_revocation: Arc::new(auth::revocation::RevocationSet::new()), } } } @@ -204,7 +233,7 @@ pub async fn run_server( supervisor_sessions.clone(), ) .await?; - let state = Arc::new(ServerState::new( + let mut state = ServerState::new( config.clone(), store.clone(), compute, @@ -213,7 +242,108 @@ pub async fn run_server( tracing_log_bus, supervisor_sessions, oidc_cache, - )); + ); + + // Load the gateway-minted sandbox JWT signing key when configured. + // Optional in PR 2 so single-driver dev deployments without certgen + // continue to start. The helm-deployed gateway and the RPM init script + // populate `gateway_jwt` once `certgen` has produced the on-disk + // material. + if let Some(ref jwt) = config.gateway_jwt { + let signing_pem = std::fs::read(&jwt.signing_key_path).map_err(|e| { + Error::config(format!( + "failed to read sandbox JWT signing key from {}: {e}", + jwt.signing_key_path.display() + )) + })?; + let public_pem = std::fs::read(&jwt.public_key_path).map_err(|e| { + Error::config(format!( + "failed to read sandbox JWT public key from {}: {e}", + jwt.public_key_path.display() + )) + })?; + let kid = std::fs::read_to_string(&jwt.kid_path) + .map_err(|e| { + Error::config(format!( + "failed to read sandbox JWT kid from {}: {e}", + jwt.kid_path.display() + )) + })? 
+ .trim() + .to_string(); + if kid.is_empty() { + return Err(Error::config(format!( + "sandbox JWT kid file {} is empty", + jwt.kid_path.display() + ))); + } + let issuer = auth::sandbox_jwt::SandboxJwtIssuer::from_pem( + &signing_pem, + kid.clone(), + &jwt.gateway_id, + Duration::from_secs(jwt.ttl_secs), + ) + .map_err(Error::config)?; + let authenticator = auth::sandbox_jwt::SandboxJwtAuthenticator::from_pem( + &public_pem, + kid, + &jwt.gateway_id, + state.sandbox_jwt_revocation.clone(), + ) + .map_err(Error::config)?; + info!( + gateway_id = %jwt.gateway_id, + ttl_secs = jwt.ttl_secs, + "gateway-minted sandbox JWT enabled" + ); + state.sandbox_jwt_issuer = Some(Arc::new(issuer)); + state.sandbox_jwt_authenticator = Some(Arc::new(authenticator)); + } + + if let Some(ref spiffe) = config.spiffe { + let authenticator = auth::spiffe::SpiffeAuthenticator::new(spiffe.clone()) + .await + .map_err(|e| Error::config(format!("SPIFFE initialization failed: {e}")))?; + state.spiffe_authenticator = Some(Arc::new(authenticator)); + } + + // K8s ServiceAccount bootstrap authenticator. Only constructed when + // the gateway is running in-cluster (kubelet provides the API host + // env var) and has a sandbox JWT issuer to mint replacements against; + // outside the cluster we can't talk to the apiserver's JWKS endpoint, + // and without the issuer there's nothing to exchange the SA token + // for. + if state.sandbox_jwt_issuer.is_some() && std::env::var_os("KUBERNETES_SERVICE_HOST").is_some() { + // Pod lookups must target the sandbox namespace (where the K8s + // driver places sandbox pods), not the gateway's own pod + // namespace. Sourced from the merged + // `[openshell.drivers.kubernetes].namespace` config, falling + // back to "default" only if the driver config can't be parsed. 
+ let sandbox_namespace = kubernetes_config_from_file(config_file.as_ref()) + .map_or_else(|_| "default".to_string(), |cfg| cfg.namespace); + match kube::Client::try_default().await { + Ok(client) => { + let resolver = Arc::new(auth::k8s_sa::LiveK8sResolver::new( + client, + &sandbox_namespace, + "openshell-gateway".to_string(), + )); + let authenticator = auth::k8s_sa::K8sServiceAccountAuthenticator::new(resolver); + state.k8s_sa_authenticator = Some(Arc::new(authenticator)); + info!( + namespace = %sandbox_namespace, + "K8s ServiceAccount bootstrap authenticator enabled" + ); + } + Err(e) => warn!( + error = %e, + "in-cluster K8s client construction failed; \ + K8s ServiceAccount bootstrap is disabled" + ), + } + } + + let state = Arc::new(state); // Resume sandboxes that were stopped during the previous gateway // shutdown so the running compute state matches the persisted store. diff --git a/crates/openshell-server/src/multiplex.rs b/crates/openshell-server/src/multiplex.rs index deac9ee78..69a06a644 100644 --- a/crates/openshell-server/src/multiplex.rs +++ b/crates/openshell-server/src/multiplex.rs @@ -31,8 +31,15 @@ use tower_http::request_id::{MakeRequestId, RequestId}; use tracing::Span; use crate::{ - OpenShellService, ServerState, auth::authz::AuthzPolicy, auth::identity::Identity, auth::oidc, - http_router, inference::InferenceService, service_http_router, + OpenShellService, ServerState, + auth::authenticator::{AuthenticatorChain, PermissiveUserAuthenticator}, + auth::authz::AuthzPolicy, + auth::identity::Identity, + auth::oidc::{self, OidcAuthenticator}, + auth::principal::{Principal, UserPrincipal}, + http_router, + inference::InferenceService, + service_http_router, }; /// Request-ID generator that produces a UUID v4 for each inbound request. 
@@ -153,17 +160,11 @@ impl MultiplexService { user_role: oidc.user_role.clone(), scopes_enabled: !oidc.scopes_claim.is_empty(), }); - let has_client_ca = self - .state - .config - .tls - .as_ref() - .is_some_and(|tls| tls.client_ca_path.is_some()); - let grpc_service = AuthGrpcRouter::new( + let authenticator_chain = build_authenticator_chain(&self.state); + let grpc_service = AuthGrpcRouter::with_peer_identity( GrpcRouter::new(openshell, inference), - self.state.oidc_cache.clone(), + authenticator_chain, authz_policy, - has_client_ca, peer_identity, ); let http_service = http_router(self.state.clone()); @@ -256,56 +257,109 @@ where } } -/// gRPC router wrapper that authenticates and authorizes requests. +/// Assemble the authenticator chain for the gateway. /// -/// When `oidc_cache` is `Some`, extracts the `authorization: Bearer ` -/// header, validates the JWT (authentication), then checks RBAC roles -/// (authorization) before forwarding to the inner gRPC router. +/// Chain order (first-match-wins): +/// 1. `K8sServiceAccountAuthenticator` (path-scoped to `IssueSandboxToken`) +/// — exchanges a projected SA token for a `Principal::Sandbox` so the +/// `IssueSandboxToken` handler can mint a gateway JWT. No-op on every +/// other path; only present when the gateway runs in-cluster. +/// 2. `SandboxJwtAuthenticator` — validates gateway-minted JWTs. Recognized +/// via a distinctive `kid` so non-matching Bearer tokens fall through. +/// 3. `SpiffeAuthenticator` — validates SPIFFE JWT-SVIDs through the +/// local SPIFFE Workload API and maps sandbox SPIFFE IDs to +/// `Principal::Sandbox`. +/// 4. `OidcAuthenticator` — validates user Bearer tokens against the +/// configured OIDC issuer. Returns `Unauthenticated` for missing +/// Bearer headers so non-OIDC clients can't sneak through. +/// 5. `PermissiveUserAuthenticator` — installed only when no OIDC is +/// configured (singleplayer / helm-dev). 
Catches anything the +/// sandbox authenticators didn't claim and produces a synthetic +/// user principal, preserving the pre-PR-1 "no OIDC = open" posture. /// -/// Authentication is provider-specific (currently OIDC via `oidc.rs`). -/// Authorization is provider-agnostic (via `authz.rs`). This separation -/// aligns with RFC 0001's control-plane identity design. +/// When neither OIDC nor gateway-minted JWTs are configured (a barebones +/// dev gateway), the chain is left as `None` so the router short-circuits +/// to pass-through. +fn build_authenticator_chain(state: &ServerState) -> Option { + let mut authenticators: Vec> = Vec::new(); + if let Some(k8s) = state.k8s_sa_authenticator.clone() { + authenticators.push(k8s); + } + if let Some(jwt) = state.sandbox_jwt_authenticator.clone() { + authenticators.push(jwt); + } + if let Some(spiffe) = state.spiffe_authenticator.clone() { + authenticators.push(spiffe); + } + if let Some(cache) = state.oidc_cache.clone() { + authenticators.push(Arc::new(OidcAuthenticator::new(cache))); + } else if !authenticators.is_empty() { + // No OIDC, but sandbox-side authentication IS configured — + // user CLI calls must still pass through, so install a + // permissive final fallback. Production deployments configure + // OIDC and this branch is unused. + authenticators.push(Arc::new(PermissiveUserAuthenticator::new("dev-anonymous"))); + } + if authenticators.is_empty() { + return None; + } + Some(AuthenticatorChain::new(authenticators)) +} + +/// gRPC router wrapper that runs the [`AuthenticatorChain`] and inserts the +/// resulting [`Principal`] into the request's extensions. /// -/// Sandbox-class methods (`oidc::is_sandbox_method`) accept callers without -/// a Bearer token: the gRPC channel's mTLS handshake is the trust -/// boundary. The router marks such requests with the -/// `INTERNAL_AUTH_SOURCE_HEADER` so handlers (`policy.rs`) can apply -/// sandbox-restricted scope. 
+/// Behavior: +/// - Strip any external `x-openshell-auth-source` marker first (so callers +/// cannot spoof a sandbox identity). +/// - Health probes / reflection bypass the chain entirely. +/// - When no chain is configured (OIDC not configured), forward without +/// authentication — preserves today's pass-through behavior. +/// - Otherwise, run the chain. The first match produces a `Principal`. +/// `Principal::User` is gated by the RBAC `AuthzPolicy`. The legacy +/// sandbox marker also inserts the metadata marker for backwards-compat +/// with handlers that still consume it (PR-1 only; removed in PR 3). #[derive(Clone)] pub struct AuthGrpcRouter { inner: S, - oidc_cache: Option>, + authenticator_chain: Option, authz_policy: Option, - /// Whether a client CA is configured (mTLS is a valid auth mechanism). - has_client_ca: bool, /// mTLS peer identity extracted from the TLS handshake. peer_identity: Option, } impl AuthGrpcRouter { + #[cfg(test)] fn new( inner: S, - oidc_cache: Option>, + authenticator_chain: Option, + authz_policy: Option, + ) -> Self { + Self::with_peer_identity(inner, authenticator_chain, authz_policy, None) + } + + fn with_peer_identity( + inner: S, + authenticator_chain: Option, authz_policy: Option, - has_client_ca: bool, peer_identity: Option, ) -> Self { Self { inner, - oidc_cache, + authenticator_chain, authz_policy, - has_client_ca, peer_identity, } } } +fn status_response(status: tonic::Status) -> Response { + status.into_http() +} + impl tower::Service> for AuthGrpcRouter where - S: tower::Service, Response = Response> - + Clone - + Send - + 'static, + S: tower::Service, Response = Response> + Clone + Send + 'static, S::Future: Send, S::Error: Send + Into>, B: Send + 'static, @@ -319,28 +373,21 @@ where } fn call(&mut self, req: Request) -> Self::Future { - let oidc_cache = self.oidc_cache.clone(); + let chain = self.authenticator_chain.clone(); let authz_policy = self.authz_policy.clone(); - let has_client_ca = self.has_client_ca; 
let peer_identity = self.peer_identity.clone(); let mut inner = self.inner.clone(); Box::pin(async move { let mut req = req; - oidc::clear_internal_auth_markers(req.headers_mut()); - // No auth configured — pass through. - if oidc_cache.is_none() && !has_client_ca { + // No chain configured — pass through. Preserves today's + // "auth not configured means open" behavior for dev / + // fronting-proxy deployments. + let Some(chain) = chain else { return inner.ready().await?.call(req).await; - } - - // mTLS-only (no OIDC) — TLS layer already enforced client certs, - // so if we got here the peer is authenticated. - if oidc_cache.is_none() && has_client_ca { - return inner.ready().await?.call(req).await; - } + }; - let cache = oidc_cache.expect("checked above"); let path = req.uri().path().to_string(); // Health probes and reflection — truly unauthenticated. @@ -348,72 +395,32 @@ where return inner.ready().await?.call(req).await; } - // Sandbox-class RPCs — no Bearer expected. The gRPC channel's - // mTLS handshake (or the operator's fronting proxy when - // `--disable-gateway-auth` is set) is the trust boundary. - if oidc::is_sandbox_method(&path) { - oidc::mark_sandbox_caller(req.headers_mut()); - return inner.ready().await?.call(req).await; - } - - // Dual-auth methods (e.g. UpdateConfig) — Bearer present grants - // full scope (CLI users); Bearer absent marks the caller as - // sandbox-class for restricted scope downstream. - if oidc::is_dual_auth_method(&path) && !has_bearer_token(req.headers()) { - oidc::mark_sandbox_caller(req.headers_mut()); - return inner.ready().await?.call(req).await; - } - - // Extract Bearer token from the authorization header. 
- let token = req - .headers() - .get("authorization") - .and_then(|v| v.to_str().ok()) - .and_then(|v| v.strip_prefix("Bearer ")); - - let Some(token) = token else { - // No bearer token — fall back to mTLS if a client cert was - // presented (only possible when both OIDC and client CA are - // configured and require_client_auth is false). - if let Some(ref identity) = peer_identity { - if let Some(ref policy) = authz_policy - && let Err(status) = policy.check(identity, &path) - { - let response = status.into_http(); - let (parts, body) = response.into_parts(); - let body = tonic::body::BoxBody::new(body); - return Ok(Response::from_parts(parts, body)); + let principal = match chain.authenticate(req.headers(), &path).await { + Ok(Some(p)) => p, + Ok(None) => { + if let Some(identity) = peer_identity { + Principal::User(UserPrincipal { identity }) + } else { + return Ok(status_response(tonic::Status::unauthenticated( + "missing authorization header", + ))); } - return inner.ready().await?.call(req).await; - } - let status = tonic::Status::unauthenticated("missing authorization header"); - let response = status.into_http(); - let (parts, body) = response.into_parts(); - let body = tonic::body::BoxBody::new(body); - return Ok(Response::from_parts(parts, body)); - }; - - // Authenticate: validate the JWT and produce an Identity. - let identity = match cache.validate_token(token).await { - Ok(id) => id, - Err(status) => { - let response = status.into_http(); - let (parts, body) = response.into_parts(); - let body = tonic::body::BoxBody::new(body); - return Ok(Response::from_parts(parts, body)); } + Err(status) => return Ok(status_response(status)), }; - // Authorize: check RBAC roles against the method. - if let Some(ref policy) = authz_policy - && let Err(status) = policy.check(&identity, &path) + // Authorize user principals via RBAC. 
Sandbox principals get + // a per-handler `sandbox_id` equality check in PR 4; right now + // they bypass RBAC because the public sandbox-class methods + // they call were path-bypassed before this refactor too. + if let Principal::User(ref user) = principal + && let Some(ref policy) = authz_policy + && let Err(status) = policy.check(&user.identity, &path) { - let response = status.into_http(); - let (parts, body) = response.into_parts(); - let body = tonic::body::BoxBody::new(body); - return Ok(Response::from_parts(parts, body)); + return Ok(status_response(status)); } + req.extensions_mut().insert(principal); inner.ready().await?.call(req).await }) } @@ -513,13 +520,6 @@ where } } -fn has_bearer_token(headers: &http::HeaderMap) -> bool { - headers - .get("authorization") - .and_then(|v| v.to_str().ok()) - .is_some_and(|v| v.starts_with("Bearer ")) -} - fn grpc_method_from_path(path: &str) -> String { path.rsplit('/').next().unwrap_or(path).to_string() } @@ -860,4 +860,180 @@ mod tests { fn normalize_root_path() { assert_eq!(normalize_http_path("/"), "unknown"); } + + mod auth_router { + use super::*; + use crate::auth::authenticator::test_support::MockAuthenticator; + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::{ + Principal, SandboxIdentitySource, SandboxPrincipal, UserPrincipal, + }; + use http_body_util::Full; + use std::sync::Arc; + use std::sync::Mutex; + use tower::Service; + + type RecordedPrincipal = Arc>>; + + /// Service that snapshots the `Principal` from request extensions + /// and returns 200 OK. Used by router-level tests to assert the + /// chain's effect on the downstream service. 
+ #[derive(Clone)] + struct PrincipalRecorder { + recorded: RecordedPrincipal, + } + + impl PrincipalRecorder { + fn new() -> (Self, RecordedPrincipal) { + let recorded = Arc::new(Mutex::new(None)); + ( + Self { + recorded: recorded.clone(), + }, + recorded, + ) + } + } + + impl Service> for PrincipalRecorder { + type Response = Response; + type Error = std::convert::Infallible; + type Future = Pin> + Send>>; + + fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn call(&mut self, req: Request) -> Self::Future { + let principal = req.extensions().get::().cloned(); + *self.recorded.lock().unwrap() = principal; + Box::pin(async move { Ok(Response::new(tonic::body::Body::empty())) }) + } + } + + fn empty_request(path: &str) -> Request> { + Request::builder() + .uri(path) + .body(Full::new(Bytes::new())) + .unwrap() + } + + fn user_principal(subject: &str) -> Principal { + Principal::User(UserPrincipal { + identity: Identity { + subject: subject.to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + }) + } + + fn sandbox_principal() -> Principal { + Principal::Sandbox(SandboxPrincipal { + sandbox_id: "sandbox-a".to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test".to_string(), + jti: "j-1".to_string(), + }, + trust_domain: Some("openshell".to_string()), + }) + } + + #[tokio::test] + async fn user_principal_lands_in_request_extensions() { + let mock = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "alice", + ))))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let _ = router + .call(empty_request("/openshell.v1.OpenShell/ListSandboxes")) + .await + .unwrap(); + let principal = seen.lock().unwrap().clone().expect("principal"); + match principal { + Principal::User(u) => 
assert_eq!(u.identity.subject, "alice"), + _ => panic!("expected user principal"), + } + } + + #[tokio::test] + async fn sandbox_principal_lands_in_request_extensions() { + let mock = Arc::new(MockAuthenticator::returning(Ok(Some(sandbox_principal())))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let _ = router + .call(empty_request("/openshell.v1.OpenShell/ReportPolicyStatus")) + .await + .unwrap(); + let captured = seen.lock().unwrap().clone(); + match captured { + Some(Principal::Sandbox(p)) => assert_eq!(p.sandbox_id, "sandbox-a"), + other => panic!("expected sandbox principal, got {other:?}"), + } + } + + #[tokio::test] + async fn missing_principal_returns_unauthenticated() { + let mock = Arc::new(MockAuthenticator::returning(Ok(None))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let res = router + .call(empty_request("/openshell.v1.OpenShell/ListSandboxes")) + .await + .unwrap(); + assert!(seen.lock().unwrap().is_none()); + // tonic sets grpc-status=16 (UNAUTHENTICATED) in trailers. 
+ let grpc_status = res + .headers() + .get("grpc-status") + .map(|v| v.to_str().unwrap().to_string()); + assert_eq!(grpc_status.as_deref(), Some("16")); + } + + #[tokio::test] + async fn authenticator_error_short_circuits() { + let mock = Arc::new(MockAuthenticator::returning(Err( + tonic::Status::unauthenticated("forged"), + ))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let res = router + .call(empty_request("/openshell.v1.OpenShell/ListSandboxes")) + .await + .unwrap(); + assert!(seen.lock().unwrap().is_none()); + assert_eq!( + res.headers() + .get("grpc-status") + .map(|v| v.to_str().unwrap().to_string()) + .as_deref(), + Some("16") + ); + } + + #[tokio::test] + async fn health_methods_bypass_chain() { + // Authenticator is wired to fail-closed; the request still gets + // through because the path is exempt. + let mock = Arc::new(MockAuthenticator::returning(Err( + tonic::Status::unauthenticated("would reject"), + ))); + let chain = AuthenticatorChain::new(vec![mock.clone()]); + let (recorder, _) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let res = router + .call(empty_request("/openshell.v1.OpenShell/Health")) + .await + .unwrap(); + assert_eq!(res.status(), 200); + assert_eq!(mock.call_count(), 0, "health must not consult the chain"); + } + } } diff --git a/crates/openshell-server/tests/auth_endpoint_integration.rs b/crates/openshell-server/tests/auth_endpoint_integration.rs index 59c2a23f6..bed244145 100644 --- a/crates/openshell-server/tests/auth_endpoint_integration.rs +++ b/crates/openshell-server/tests/auth_endpoint_integration.rs @@ -779,6 +779,22 @@ impl openshell_core::proto::open_shell_server::OpenShell for TestOpenShell { Err(tonic::Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> 
Result, tonic::Status> + { + Err(tonic::Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, tonic::Status> + { + Err(tonic::Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-server/tests/edge_tunnel_auth.rs b/crates/openshell-server/tests/edge_tunnel_auth.rs index 73ad0aff0..fc676ae7b 100644 --- a/crates/openshell-server/tests/edge_tunnel_auth.rs +++ b/crates/openshell-server/tests/edge_tunnel_auth.rs @@ -409,6 +409,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-server/tests/multiplex_integration.rs b/crates/openshell-server/tests/multiplex_integration.rs index 14a63c566..572308d1e 100644 --- a/crates/openshell-server/tests/multiplex_integration.rs +++ b/crates/openshell-server/tests/multiplex_integration.rs @@ -378,6 +378,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-server/tests/multiplex_tls_integration.rs b/crates/openshell-server/tests/multiplex_tls_integration.rs index 
00ed1657f..17e045a9e 100644 --- a/crates/openshell-server/tests/multiplex_tls_integration.rs +++ b/crates/openshell-server/tests/multiplex_tls_integration.rs @@ -391,6 +391,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-server/tests/supervisor_relay_integration.rs b/crates/openshell-server/tests/supervisor_relay_integration.rs index d82c9c261..0e3de66d1 100644 --- a/crates/openshell-server/tests/supervisor_relay_integration.rs +++ b/crates/openshell-server/tests/supervisor_relay_integration.rs @@ -367,6 +367,18 @@ impl OpenShell for RelayGateway { ) -> Result, Status> { Err(Status::unimplemented("unused")) } + async fn issue_sandbox_token( + &self, + _: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("unused")) + } + async fn refresh_sandbox_token( + &self, + _: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("unused")) + } } // --------------------------------------------------------------------------- diff --git a/crates/openshell-server/tests/ws_tunnel_integration.rs b/crates/openshell-server/tests/ws_tunnel_integration.rs index 277cffb51..28b615c2f 100644 --- a/crates/openshell-server/tests/ws_tunnel_integration.rs +++ b/crates/openshell-server/tests/ws_tunnel_integration.rs @@ -404,6 +404,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( 
+ &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-tui/Cargo.toml b/crates/openshell-tui/Cargo.toml index b0ac0c7ca..723528cd7 100644 --- a/crates/openshell-tui/Cargo.toml +++ b/crates/openshell-tui/Cargo.toml @@ -21,7 +21,7 @@ ratatui = { workspace = true } crossterm = { workspace = true } terminal-colorsaurus = { workspace = true } tokio = { workspace = true } -tonic = { workspace = true, features = ["tls"] } +tonic = { workspace = true, features = ["tls-native-roots"] } miette = { workspace = true } owo-colors = { workspace = true } serde = { workspace = true } diff --git a/deploy/helm/openshell/README.md b/deploy/helm/openshell/README.md index cc856731d..f27fd5da1 100644 --- a/deploy/helm/openshell/README.md +++ b/deploy/helm/openshell/README.md @@ -52,6 +52,8 @@ See [`values.yaml`](values.yaml) for configurable values. Selected overlays: - [`ci/values-gateway.yaml`](ci/values-gateway.yaml) — gateway-only configuration - [`ci/values-cert-manager.yaml`](ci/values-cert-manager.yaml) — cert-manager integration - [`ci/values-keycloak.yaml`](ci/values-keycloak.yaml) — Keycloak OIDC integration +- [`ci/values-spire.yaml`](ci/values-spire.yaml) — SPIFFE/SPIRE sandbox supervisor authentication +- [`ci/values-spire-stack.yaml`](ci/values-spire-stack.yaml) — SPIRE hardened chart values for local development ## PKI bootstrap @@ -70,3 +72,13 @@ The Job is idempotent: Disable with `--set pkiInitJob.enabled=false` when bringing your own PKI (cert-manager, external CA, or pre-created Secrets). See `certManager.*` in `values.yaml` for the cert-manager alternative. + +## SPIFFE/SPIRE sandbox identity + +Set `server.spiffe.enabled=true` to use SPIFFE JWT-SVIDs for sandbox supervisor +authentication instead of gateway-minted sandbox JWTs. 
The chart mounts the +SPIFFE CSI Workload API socket into the gateway pod and configures sandbox pods +to request `spiffe://<trust-domain>/openshell/sandbox/<sandbox-id>` JWT-SVIDs. + +For local development, uncomment the SPIRE Helm releases in `skaffold.yaml` and +add `ci/values-spire.yaml` to the OpenShell release values files. diff --git a/deploy/helm/openshell/ci/values-spire-stack.yaml b/deploy/helm/openshell/ci/values-spire-stack.yaml new file mode 100644 index 000000000..018b441d6 --- /dev/null +++ b/deploy/helm/openshell/ci/values-spire-stack.yaml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# SPIRE hardened chart values for the local Helm dev environment. +global: + spire: + clusterName: openshell-dev + trustDomain: openshell.local + +spire-server: + controllerManager: + identities: + clusterSPIFFEIDs: + openshell-sandboxes: + enabled: true + spiffeIDTemplate: 'spiffe://{{ .TrustDomain }}/openshell/sandbox/{{ index .PodMeta.Annotations "openshell.io/sandbox-id" }}' + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: openshell + podSelector: + matchLabels: + openshell.ai/managed-by: openshell diff --git a/deploy/helm/openshell/ci/values-spire.yaml b/deploy/helm/openshell/ci/values-spire.yaml new file mode 100644 index 000000000..cf4dd1104 --- /dev/null +++ b/deploy/helm/openshell/ci/values-spire.yaml @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# OpenShell overlay for local SPIRE-backed supervisor authentication. 
+server: + spiffe: + enabled: true + trustDomain: openshell.local + audience: openshell-gateway + workloadApiSocketPath: /spiffe-workload-api/spire-agent.sock + sandboxIdPrefix: /openshell/sandbox/ diff --git a/deploy/helm/openshell/skaffold.yaml b/deploy/helm/openshell/skaffold.yaml index 779211877..9905c4b10 100644 --- a/deploy/helm/openshell/skaffold.yaml +++ b/deploy/helm/openshell/skaffold.yaml @@ -79,6 +79,24 @@ deploy: # # wait ensures Gateway API CRDs are registered before the openshell # # release attempts to create Gateway and HTTPRoute resources. # wait: true + # SPIRE — installs SPIRE Server, Agent, Controller Manager, CSI Driver, + # and OIDC Discovery Provider using the SPIFFE hardened charts. + # Uncomment both releases and ci/values-spire.yaml below to use + # SPIFFE JWT-SVIDs for sandbox supervisor authentication. + #- name: spire-crds + # repo: https://spiffe.github.io/helm-charts-hardened/ + # remoteChart: spire-crds + # namespace: spire + # createNamespace: true + # wait: true + #- name: spire + # repo: https://spiffe.github.io/helm-charts-hardened/ + # remoteChart: spire + # namespace: spire + # createNamespace: true + # valuesFiles: + # - ci/values-spire-stack.yaml + # wait: true - name: openshell chartPath: . 
namespace: openshell @@ -95,6 +113,9 @@ deploy: #- ci/values-keycloak.yaml # To enable the Gateway API HTTPRoute (requires Envoy Gateway above): #- ci/values-gateway.yaml + # To enable SPIFFE/SPIRE sandbox supervisor authentication (requires + # the spire-crds and spire releases above): + #- ci/values-spire.yaml setValueTemplates: image.repository: '{{.IMAGE_REPO_openshell_gateway}}' image.tag: '{{.IMAGE_TAG_openshell_gateway}}' diff --git a/deploy/helm/openshell/templates/_helpers.tpl b/deploy/helm/openshell/templates/_helpers.tpl index 00925d2d3..7f3fe4cc2 100644 --- a/deploy/helm/openshell/templates/_helpers.tpl +++ b/deploy/helm/openshell/templates/_helpers.tpl @@ -125,3 +125,11 @@ init-container {{- printf "%s://%s.%s.svc.cluster.local:%d" $scheme (include "openshell.fullname" .) .Release.Namespace (int .Values.service.port) -}} {{- end -}} {{- end }} + +{{/* +Directory mounted for the SPIFFE Workload API CSI volume. The socket itself +lives at server.spiffe.workloadApiSocketPath. +*/}} +{{- define "openshell.spiffeWorkloadApiMountPath" -}} +{{- dir .Values.server.spiffe.workloadApiSocketPath -}} +{{- end }} diff --git a/deploy/helm/openshell/templates/certgen.yaml b/deploy/helm/openshell/templates/certgen.yaml index ef4500db6..61203760b 100644 --- a/deploy/helm/openshell/templates/certgen.yaml +++ b/deploy/helm/openshell/templates/certgen.yaml @@ -100,6 +100,7 @@ spec: - generate-certs - --server-secret-name={{ .Values.server.tls.certSecretName }} - --client-secret-name={{ .Values.server.tls.clientTlsSecretName }} + - --jwt-secret-name={{ .Values.server.sandboxJwt.signingSecretName | default (printf "%s-jwt-keys" (include "openshell.fullname" .)) }} {{- range .Values.pkiInitJob.serverDnsNames }} - --server-san={{ . 
}} {{- end }} diff --git a/deploy/helm/openshell/templates/gateway-config.yaml b/deploy/helm/openshell/templates/gateway-config.yaml index 9d95e45c1..3dc92e095 100644 --- a/deploy/helm/openshell/templates/gateway-config.yaml +++ b/deploy/helm/openshell/templates/gateway-config.yaml @@ -64,6 +64,21 @@ data: {{- end }} {{- end }} + {{- if .Values.server.spiffe.enabled }} + [openshell.gateway.spiffe] + workload_api_socket_path = {{ .Values.server.spiffe.workloadApiSocketPath | quote }} + trust_domain = {{ .Values.server.spiffe.trustDomain | quote }} + audience = {{ .Values.server.spiffe.audience | quote }} + sandbox_id_prefix = {{ .Values.server.spiffe.sandboxIdPrefix | quote }} + {{- else }} + [openshell.gateway.gateway_jwt] + signing_key_path = "/etc/openshell-jwt/signing.pem" + public_key_path = "/etc/openshell-jwt/public.pem" + kid_path = "/etc/openshell-jwt/kid" + gateway_id = {{ .Values.server.sandboxJwt.gatewayId | default (include "openshell.fullname" .) | quote }} + ttl_secs = {{ .Values.server.sandboxJwt.ttlSecs | default 86400 }} + {{- end }} + {{- if .Values.server.oidc.issuer }} [openshell.gateway.oidc] @@ -87,6 +102,14 @@ data: [openshell.drivers.kubernetes] grpc_endpoint = {{ include "openshell.grpcEndpoint" . | quote }} supervisor_sideload_method = {{ include "openshell.supervisorSideloadMethod" . 
| quote }} + {{- if .Values.server.spiffe.enabled }} + spiffe_workload_api_socket_path = {{ .Values.server.spiffe.workloadApiSocketPath | quote }} + spiffe_trust_domain = {{ .Values.server.spiffe.trustDomain | quote }} + spiffe_audience = {{ .Values.server.spiffe.audience | quote }} + spiffe_sandbox_id_prefix = {{ .Values.server.spiffe.sandboxIdPrefix | quote }} + {{- else }} + sa_token_ttl_secs = {{ .Values.server.sandboxJwt.k8sSaTokenTtlSecs | default 3600 }} + {{- end }} {{- if .Values.server.sandboxImagePullPolicy }} image_pull_policy = {{ .Values.server.sandboxImagePullPolicy | quote }} {{- end }} diff --git a/deploy/helm/openshell/templates/role.yaml b/deploy/helm/openshell/templates/role.yaml index 1d756117c..4d26451bf 100644 --- a/deploy/helm/openshell/templates/role.yaml +++ b/deploy/helm/openshell/templates/role.yaml @@ -29,3 +29,14 @@ rules: - get - list - watch + # Per-sandbox identity (issue #1354): the gateway resolves a sandbox + # pod's projected SA token to its `openshell.io/sandbox-id` annotation + # via a pod GET when the supervisor calls IssueSandboxToken. patch is + # intentionally NOT granted — the annotation is set once at pod create + # and must remain immutable for the lifetime of the sandbox. + - apiGroups: + - "" + resources: + - pods + verbs: + - get diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index c6ff21491..47d8555cc 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -75,6 +75,15 @@ spec: - name: gateway-config mountPath: /etc/openshell readOnly: true + {{- if .Values.server.spiffe.enabled }} + - name: spiffe-workload-api + mountPath: {{ include "openshell.spiffeWorkloadApiMountPath" . 
| quote }} + readOnly: true + {{- else }} + - name: sandbox-jwt + mountPath: /etc/openshell-jwt + readOnly: true + {{- end }} {{- if not .Values.server.disableTls }} - name: tls-cert mountPath: /etc/openshell-tls/server @@ -84,12 +93,12 @@ spec: mountPath: /etc/openshell-tls/client-ca readOnly: true {{- end }} + {{- end }} {{- if and .Values.server.oidc.issuer .Values.server.oidc.caConfigMapName }} - name: oidc-ca mountPath: /etc/openshell-tls/oidc-ca readOnly: true {{- end }} - {{- end }} ports: - name: grpc containerPort: {{ .Values.service.port }} @@ -131,6 +140,17 @@ spec: - name: gateway-config configMap: name: {{ include "openshell.fullname" . }}-config + {{- if .Values.server.spiffe.enabled }} + - name: spiffe-workload-api + csi: + driver: csi.spiffe.io + readOnly: true + {{- else }} + - name: sandbox-jwt + secret: + secretName: {{ .Values.server.sandboxJwt.signingSecretName | default (printf "%s-jwt-keys" (include "openshell.fullname" .)) }} + defaultMode: 0400 + {{- end }} {{- if not .Values.server.disableTls }} - name: tls-cert secret: @@ -147,12 +167,12 @@ spec: secretName: {{ .Values.server.tls.clientCaSecretName }} {{- end }} {{- end }} + {{- end }} {{- if and .Values.server.oidc.issuer .Values.server.oidc.caConfigMapName }} - name: oidc-ca configMap: name: {{ .Values.server.oidc.caConfigMapName }} {{- end }} - {{- end }} {{- with .Values.nodeSelector }} nodeSelector: {{- toYaml . 
| nindent 8 }} diff --git a/deploy/helm/openshell/tests/gateway_config_test.yaml b/deploy/helm/openshell/tests/gateway_config_test.yaml index 2d464b8e6..c8856900d 100644 --- a/deploy/helm/openshell/tests/gateway_config_test.yaml +++ b/deploy/helm/openshell/tests/gateway_config_test.yaml @@ -66,3 +66,30 @@ tests: - matchRegex: path: data["gateway.toml"] pattern: 'server_sans\s*=\s*\["openshell", "\*\.dev\.openshell\.localhost"\]' + + - it: renders SPIFFE sandbox auth instead of gateway JWT when enabled + set: + server.spiffe.enabled: true + template: templates/gateway-config.yaml + asserts: + - matchRegex: + path: data["gateway.toml"] + pattern: '\[openshell\.gateway\.spiffe\]' + - matchRegex: + path: data["gateway.toml"] + pattern: 'spiffe_workload_api_socket_path\s*=\s*"/spiffe-workload-api/spire-agent\.sock"' + - notMatchRegex: + path: data["gateway.toml"] + pattern: '\[openshell\.gateway\.gateway_jwt\]' + + - it: mounts the SPIFFE Workload API socket when SPIFFE is enabled + set: + server.spiffe.enabled: true + template: templates/statefulset.yaml + asserts: + - matchRegex: + path: spec.template.spec.volumes[1].name + pattern: '^spiffe-workload-api$' + - matchRegex: + path: spec.template.spec.volumes[1].csi.driver + pattern: '^csi\.spiffe\.io$' diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index c7fa50296..623422b74 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -128,6 +128,33 @@ server: clientCaSecretName: openshell-server-client-ca # K8s secret mounted into sandbox pods for mTLS to the server clientTlsSecretName: openshell-client-tls + # Gateway-minted sandbox JWT signing keys. The pre-install certgen hook + # generates an Ed25519 keypair and writes it to a secret containing + # signing.pem (PKCS#8), public.pem (SPKI), and kid (plain text). + sandboxJwt: + # Name of the Opaque Secret holding the signing key material. Empty + # falls back to "<fullname>-jwt-keys". 
+ signingSecretName: "" + # Stable gateway identity embedded in iss/aud of every minted token. + # Defaults to the chart fullname (`openshell.fullname`) so HA replicas share identity. + gatewayId: "" + # Token TTL in seconds. Defaults to 86400 (24h). + ttlSecs: 86400 + # Lifetime (seconds) of the projected ServiceAccount token kubelet + # writes into each sandbox pod for the IssueSandboxToken bootstrap + # exchange. Kubelet enforces a minimum of 600s; the driver clamps + # values outside [600, 86400]. Default 3600 — generous, since the + # supervisor consumes the token within seconds of pod start. + k8sSaTokenTtlSecs: 3600 + # SPIFFE/SPIRE sandbox identity. When enabled, sandbox supervisors fetch a + # JWT-SVID from the SPIFFE Workload API and present it directly to the + # gateway instead of bootstrapping a gateway-minted sandbox JWT. + spiffe: + enabled: false + trustDomain: openshell.local + audience: openshell-gateway + workloadApiSocketPath: /spiffe-workload-api/spire-agent.sock + sandboxIdPrefix: /openshell/sandbox/ # OIDC (OpenID Connect) configuration for JWT-based authentication. # When issuer is set, the server validates Bearer tokens on gRPC requests. oidc: diff --git a/docs/kubernetes/access-control.mdx b/docs/kubernetes/access-control.mdx index d66dc528d..0c53d0ace 100644 --- a/docs/kubernetes/access-control.mdx +++ b/docs/kubernetes/access-control.mdx @@ -19,6 +19,14 @@ The Helm chart always generates mTLS certificates at install time. The gateway u For how the CLI resolves gateways and stores credentials, refer to [Gateway Authentication](/reference/gateway-auth). +## Sandbox Supervisor Identity + +Kubernetes sandbox supervisors authenticate back to the gateway as sandbox workloads. By default, the gateway mints its own sandbox JWTs and Kubernetes sandboxes bootstrap them with a projected ServiceAccount token. + +Set `server.spiffe.enabled=true` to use SPIFFE JWT-SVIDs instead. 
In this mode, sandbox pods mount the SPIFFE CSI Workload API socket, request a JWT-SVID for `server.spiffe.audience`, and present that token directly to the gateway. The gateway validates the token through its local SPIFFE Workload API socket and accepts SPIFFE IDs under `spiffe://<trust-domain>/openshell/sandbox/`. + +SPIFFE mode requires a SPIFFE implementation such as SPIRE and a `ClusterSPIFFEID` that assigns per-sandbox IDs from the pod's `openshell.io/sandbox-id` annotation. + ## OIDC User Authentication Set `server.oidc.issuer` to enable OIDC. The gateway validates the `Authorization: Bearer <token>` header on every request against the issuer's JWKS endpoint. diff --git a/proto/compute_driver.proto b/proto/compute_driver.proto index 3c4308f3f..6de13f3e5 100644 --- a/proto/compute_driver.proto +++ b/proto/compute_driver.proto @@ -90,6 +90,13 @@ message DriverSandboxSpec { // (e.g. "0", "1"). When empty with gpu=true, the driver assigns the // first available GPU. string gpu_device = 10; + // Gateway-minted JWT identifying this sandbox to the gateway. Set by + // the gateway on create; the driver materialises it via its native + // secret mechanism (Docker/Podman/VM bind-mount a per-sandbox file; + // the Kubernetes driver ignores this field and relies on its projected + // ServiceAccount token bootstrap instead). Never echoed to the public + // Sandbox proto. + string sandbox_token = 11; } // Driver-owned runtime template consumed by the compute platform. diff --git a/proto/openshell.proto b/proto/openshell.proto index e4a1b0673..0ca74d4f0 100644 --- a/proto/openshell.proto +++ b/proto/openshell.proto @@ -208,6 +208,51 @@ service OpenShell { // Get decision history for a sandbox's draft policy. rpc GetDraftHistory(GetDraftHistoryRequest) returns (GetDraftHistoryResponse); + + // Exchange a sandbox-bootstrap credential (e.g. a Kubernetes projected + // ServiceAccount token) for a gateway-minted JWT bound to the calling + // sandbox's UUID. 
Used by the Kubernetes driver path; singleplayer + // drivers receive the gateway JWT directly from the create-sandbox flow + // and never call this RPC. + rpc IssueSandboxToken(IssueSandboxTokenRequest) returns (IssueSandboxTokenResponse); + + // Rotate the calling sandbox's gateway JWT. The previously-issued + // token is revoked (its jti added to the gateway's deny list) and a + // fresh token bound to the same sandbox UUID is returned. The + // supervisor calls this from a background task at ~80% of the token's + // lifetime; the new token is cached in memory only — the on-disk + // bootstrap file is intentionally not rewritten. + rpc RefreshSandboxToken(RefreshSandboxTokenRequest) + returns (RefreshSandboxTokenResponse); +} + +// IssueSandboxToken request. Empty body; identity is established by the +// authentication credentials carried in the request headers (a projected +// Kubernetes ServiceAccount JWT in the K8s driver path). +message IssueSandboxTokenRequest {} + +// IssueSandboxToken response. The supervisor caches the returned token in +// memory and presents it as `Authorization: Bearer` on every subsequent +// gateway RPC. +message IssueSandboxTokenResponse { + // Gateway-minted JWT bound to the calling sandbox's UUID. + string token = 1; + // Absolute expiry of the issued token, milliseconds since the epoch. + int64 expires_at_ms = 2; +} + +// RefreshSandboxToken request. Empty body; the calling principal must +// already be a sandbox principal (i.e. the request carries a still-valid +// gateway-minted JWT in its Authorization header). +message RefreshSandboxTokenRequest {} + +// RefreshSandboxToken response. The previous token is revoked server-side +// before this response is sent. +message RefreshSandboxTokenResponse { + // Fresh gateway-minted JWT bound to the same sandbox UUID. + string token = 1; + // Absolute expiry of the new token, milliseconds since the epoch. + int64 expires_at_ms = 2; } // Health check request.