Skip to content

Commit eea2fd7

Browse files
committed
fix: gate CUDA directory checks on GPU vendor to prevent false CUDA detection
Container images that install CUDA runtime libraries (e.g., cuda-cudart-12-5 via apt) create /usr/local/cuda-12 directories as a side effect. The previous code checked for these directories before checking whether a GPU was present, causing CPU-only hosts to select a CUDA backend that crashes because libcuda.so.1 is absent. Reorder checks so CUDA directory existence only refines the capability when an NVIDIA GPU is actually detected, consistent with the arm64 L4T code path. Signed-off-by: Sertac Ozercan <sozercan@gmail.com>
1 parent 89076ba commit eea2fd7

File tree

3 files changed

+156
-11
lines changed

3 files changed

+156
-11
lines changed

pkg/system/capabilities.go

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -132,29 +132,32 @@ func (s *SystemState) getSystemCapabilities() string {
132132
}
133133
}
134134

135-
if cuda13DirExists {
136-
s.systemCapabilities = nvidiaCuda13
137-
return s.systemCapabilities
138-
}
139-
140-
if cuda12DirExists {
141-
s.systemCapabilities = nvidiaCuda12
142-
return s.systemCapabilities
143-
}
144-
135+
// No GPU detected → default capability
145136
if s.GPUVendor == "" {
146137
xlog.Info("Default capability (no GPU detected)", "env", capabilityEnv)
147138
s.systemCapabilities = defaultCapability
148139
return s.systemCapabilities
149140
}
150141

151-
// If vram is less than 4GB, let's default to CPU but warn the user that they can override that via env
142+
// GPU detected but insufficient VRAM → default with warning
152143
if s.VRAM <= 4*1024*1024*1024 {
153144
xlog.Warn("VRAM is less than 4GB, defaulting to CPU", "env", capabilityEnv)
154145
s.systemCapabilities = defaultCapability
155146
return s.systemCapabilities
156147
}
157148

149+
// CUDA directories refine capability only for NVIDIA GPUs
150+
if s.GPUVendor == Nvidia {
151+
if cuda13DirExists {
152+
s.systemCapabilities = nvidiaCuda13
153+
return s.systemCapabilities
154+
}
155+
if cuda12DirExists {
156+
s.systemCapabilities = nvidiaCuda12
157+
return s.systemCapabilities
158+
}
159+
}
160+
158161
s.systemCapabilities = s.GPUVendor
159162
return s.systemCapabilities
160163
}

pkg/system/capabilities_test.go

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
package system
2+
3+
import (
4+
"os"
5+
"runtime"
6+
7+
. "github.com/onsi/ginkgo/v2"
8+
. "github.com/onsi/gomega"
9+
)
10+
11+
var _ = Describe("getSystemCapabilities", func() {
12+
const eightGB = 8 * 1024 * 1024 * 1024
13+
const twoGB = 2 * 1024 * 1024 * 1024
14+
15+
var (
16+
origEnv string
17+
origCuda12 bool
18+
origCuda13 bool
19+
)
20+
21+
BeforeEach(func() {
22+
if runtime.GOOS == "darwin" {
23+
Skip("darwin short-circuits before reaching CUDA logic")
24+
}
25+
26+
origEnv = os.Getenv(capabilityEnv)
27+
os.Unsetenv(capabilityEnv)
28+
29+
origCuda12 = cuda12DirExists
30+
origCuda13 = cuda13DirExists
31+
})
32+
33+
AfterEach(func() {
34+
cuda12DirExists = origCuda12
35+
cuda13DirExists = origCuda13
36+
37+
if origEnv != "" {
38+
os.Setenv(capabilityEnv, origEnv)
39+
}
40+
})
41+
42+
type testCase struct {
43+
gpuVendor string
44+
vram uint64
45+
cuda12 bool
46+
cuda13 bool
47+
wantCapability string
48+
wantTokens []string
49+
}
50+
51+
DescribeTable("capability detection",
52+
func(tc testCase) {
53+
cuda12DirExists = tc.cuda12
54+
cuda13DirExists = tc.cuda13
55+
56+
s := &SystemState{
57+
GPUVendor: tc.gpuVendor,
58+
VRAM: tc.vram,
59+
}
60+
61+
Expect(s.getSystemCapabilities()).To(Equal(tc.wantCapability))
62+
Expect(s.BackendPreferenceTokens()).To(Equal(tc.wantTokens))
63+
},
64+
Entry("CUDA dir present but no GPU", testCase{
65+
gpuVendor: "",
66+
vram: 0,
67+
cuda12: true,
68+
cuda13: false,
69+
wantCapability: "default",
70+
wantTokens: []string{"cpu"},
71+
}),
72+
Entry("CUDA 12 with NVIDIA GPU", testCase{
73+
gpuVendor: Nvidia,
74+
vram: eightGB,
75+
cuda12: true,
76+
cuda13: false,
77+
wantCapability: "nvidia-cuda-12",
78+
wantTokens: []string{"cuda", "vulkan", "cpu"},
79+
}),
80+
Entry("CUDA 13 with NVIDIA GPU", testCase{
81+
gpuVendor: Nvidia,
82+
vram: eightGB,
83+
cuda12: false,
84+
cuda13: true,
85+
wantCapability: "nvidia-cuda-13",
86+
wantTokens: []string{"cuda", "vulkan", "cpu"},
87+
}),
88+
Entry("Both CUDA dirs with NVIDIA GPU prefers 13", testCase{
89+
gpuVendor: Nvidia,
90+
vram: eightGB,
91+
cuda12: true,
92+
cuda13: true,
93+
wantCapability: "nvidia-cuda-13",
94+
wantTokens: []string{"cuda", "vulkan", "cpu"},
95+
}),
96+
Entry("CUDA dir with AMD GPU ignored", testCase{
97+
gpuVendor: AMD,
98+
vram: eightGB,
99+
cuda12: true,
100+
cuda13: false,
101+
wantCapability: "amd",
102+
wantTokens: []string{"rocm", "hip", "vulkan", "cpu"},
103+
}),
104+
Entry("No CUDA dir and no GPU", testCase{
105+
gpuVendor: "",
106+
vram: 0,
107+
cuda12: false,
108+
cuda13: false,
109+
wantCapability: "default",
110+
wantTokens: []string{"cpu"},
111+
}),
112+
Entry("No CUDA dir with NVIDIA GPU", testCase{
113+
gpuVendor: Nvidia,
114+
vram: eightGB,
115+
cuda12: false,
116+
cuda13: false,
117+
wantCapability: "nvidia",
118+
wantTokens: []string{"cuda", "vulkan", "cpu"},
119+
}),
120+
Entry("CUDA dir with NVIDIA GPU but low VRAM", testCase{
121+
gpuVendor: Nvidia,
122+
vram: twoGB,
123+
cuda12: true,
124+
cuda13: false,
125+
wantCapability: "default",
126+
wantTokens: []string{"cpu"},
127+
}),
128+
)
129+
})

pkg/system/system_suite_test.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
package system
2+
3+
import (
4+
"testing"
5+
6+
. "github.com/onsi/ginkgo/v2"
7+
. "github.com/onsi/gomega"
8+
)
9+
10+
// TestSystem is the `go test` entry point for this package's Ginkgo specs.
// It registers Ginkgo's Fail as the Gomega failure handler and then runs the
// specs under the suite description "System test suite".
func TestSystem(t *testing.T) {
11+
// The handler must be registered before the specs run so that failed
// Gomega assertions are reported through Ginkgo.
RegisterFailHandler(Fail)
12+
RunSpecs(t, "System test suite")
13+
}

0 commit comments

Comments
 (0)