diff --git a/beacon_node/client/src/builder.rs b/beacon_node/client/src/builder.rs index 865599b9bd2..0595665ce04 100644 --- a/beacon_node/client/src/builder.rs +++ b/beacon_node/client/src/builder.rs @@ -668,6 +668,7 @@ where chain: self.beacon_chain.clone(), db_path: self.db_path.clone(), freezer_db_path: self.freezer_db_path.clone(), + data_dir: Some(self.http_api_config.data_dir.clone()), gossipsub_registry: self.libp2p_registry.take().map(std::sync::Mutex::new), }); diff --git a/beacon_node/http_api/src/lib.rs b/beacon_node/http_api/src/lib.rs index 74710c4ed20..7633e750956 100644 --- a/beacon_node/http_api/src/lib.rs +++ b/beacon_node/http_api/src/lib.rs @@ -59,7 +59,6 @@ use eth2::types::{ ForkChoiceNode, LightClientUpdatesQuery, PublishBlockRequest, ValidatorId, }; use eth2::{CONSENSUS_VERSION_HEADER, CONTENT_TYPE_HEADER, SSZ_CONTENT_TYPE_HEADER}; -use health_metrics::observe::Observe; use lighthouse_network::Enr; use lighthouse_network::NetworkGlobals; use lighthouse_network::PeerId; @@ -2754,9 +2753,10 @@ pub fn serve( .and(warp::path("health")) .and(warp::path::end()) .and(task_spawner_filter.clone()) - .then(|task_spawner: TaskSpawner| { + .and(data_dir_filter.clone()) + .then(|task_spawner: TaskSpawner, data_dir: PathBuf| { task_spawner.blocking_json_task(Priority::P0, move || { - eth2::lighthouse::Health::observe() + health_metrics::observe::observe_health_with_data_dir(&data_dir) .map(api_types::GenericResponse::from) .map_err(warp_utils::reject::custom_bad_request) }) diff --git a/beacon_node/http_metrics/src/lib.rs b/beacon_node/http_metrics/src/lib.rs index cfa55b54eba..811ce92e551 100644 --- a/beacon_node/http_metrics/src/lib.rs +++ b/beacon_node/http_metrics/src/lib.rs @@ -41,6 +41,7 @@ pub struct Context { pub chain: Option>>, pub db_path: Option, pub freezer_db_path: Option, + pub data_dir: Option, pub gossipsub_registry: Option>, } diff --git a/beacon_node/http_metrics/src/metrics.rs b/beacon_node/http_metrics/src/metrics.rs index c19fa8fd3b2..7e23689b980 100644 --- a/beacon_node/http_metrics/src/metrics.rs +++ b/beacon_node/http_metrics/src/metrics.rs @@ -39,7 +39,10 @@ pub fn gather_prometheus_metrics( network_utils::discovery_metrics::scrape_discovery_metrics(); - health_metrics::metrics::scrape_health_metrics(); + match ctx.data_dir.as_ref() { + Some(data_dir) => health_metrics::metrics::scrape_health_metrics_for_data_dir(data_dir), + None => health_metrics::metrics::scrape_health_metrics(), + }; // It's important to ensure these metrics are explicitly enabled in the case that users aren't // using glibc and this function causes panics. diff --git a/beacon_node/src/config.rs b/beacon_node/src/config.rs index 0a52bcef06a..9746fe3c430 100644 --- a/beacon_node/src/config.rs +++ b/beacon_node/src/config.rs @@ -250,6 +250,7 @@ pub fn get_config( client_config.monitoring_api = Some(monitoring_api::Config { db_path: None, freezer_db_path: None, + data_dir: Some(client_config.data_dir().clone()), update_period_secs, monitoring_endpoint: monitoring_endpoint.to_string(), }); diff --git a/common/health_metrics/src/metrics.rs b/common/health_metrics/src/metrics.rs index c216426b7d3..ad3b866fad1 100644 --- a/common/health_metrics/src/metrics.rs +++ b/common/health_metrics/src/metrics.rs @@ -1,6 +1,7 @@ -use crate::observe::Observe; +use crate::observe::{Observe, observe_system_health_with_data_dir}; use eth2::lighthouse::{ProcessHealth, SystemHealth}; use metrics::*; +use std::path::Path; use std::sync::LazyLock; pub static PROCESS_NUM_THREADS: LazyLock> = LazyLock::new(|| { @@ -124,7 +125,12 @@ pub static BOOT_TIME: LazyLock> = LazyLock::new(|| { pub fn scrape_health_metrics() { scrape_process_health_metrics(); - scrape_system_health_metrics(); + scrape_system_health_metrics(None); +} + +pub fn scrape_health_metrics_for_data_dir(data_dir: &Path) { + scrape_process_health_metrics(); + scrape_system_health_metrics(Some(data_dir)); } pub fn scrape_process_health_metrics() { @@ -139,10 +145,14 @@ pub fn scrape_process_health_metrics() { } } -pub fn scrape_system_health_metrics() { +pub fn scrape_system_health_metrics(data_dir: Option<&Path>) { // This will silently fail if we are unable to observe the health. This is desired behaviour // since we don't support `Health` for all platforms. - if let Ok(health) = SystemHealth::observe() { + let health_result = match data_dir { + Some(dir) => observe_system_health_with_data_dir(dir), + None => SystemHealth::observe(), + }; + if let Ok(health) = health_result { set_gauge(&SYSTEM_VIRT_MEM_TOTAL, health.sys_virt_mem_total as i64); set_gauge( &SYSTEM_VIRT_MEM_AVAILABLE, diff --git a/common/health_metrics/src/observe.rs b/common/health_metrics/src/observe.rs index 5bc37703016..295b8609a28 100644 --- a/common/health_metrics/src/observe.rs +++ b/common/health_metrics/src/observe.rs @@ -1,4 +1,5 @@ use eth2::lighthouse::{Health, ProcessHealth, SystemHealth}; +use std::path::Path; #[cfg(target_os = "linux")] use { @@ -10,6 +11,21 @@ pub trait Observe: Sized { fn observe() -> Result; } +/// Observe health metrics, reporting disk usage for the filesystem containing `data_dir` +/// instead of the root filesystem. +#[cfg(not(target_os = "linux"))] +pub fn observe_health_with_data_dir(_data_dir: &Path) -> Result { + Err("Health is only available on Linux".into()) +} + +#[cfg(target_os = "linux")] +pub fn observe_health_with_data_dir(data_dir: &Path) -> Result { + Ok(Health { + process: ProcessHealth::observe()?, + system: observe_system_health_with_data_dir(data_dir)?, + }) +} + impl Observe for Health { #[cfg(not(target_os = "linux"))] fn observe() -> Result { @@ -18,13 +34,71 @@ impl Observe for Health { #[cfg(target_os = "linux")] fn observe() -> Result { - Ok(Self { - process: ProcessHealth::observe()?, - system: SystemHealth::observe()?, - }) + observe_health_with_data_dir(Path::new("/")) } } +/// Observe system health metrics, reporting disk usage for the filesystem containing +/// `data_dir` instead of the root filesystem. +#[cfg(not(target_os = "linux"))] +pub fn observe_system_health_with_data_dir(_data_dir: &Path) -> Result { + Err("Health is only available on Linux".into()) +} + +#[cfg(target_os = "linux")] +pub fn observe_system_health_with_data_dir(data_dir: &Path) -> Result { + let vm = psutil::memory::virtual_memory() + .map_err(|e| format!("Unable to get virtual memory: {:?}", e))?; + let loadavg = psutil::host::loadavg().map_err(|e| format!("Unable to get loadavg: {:?}", e))?; + + let cpu = psutil::cpu::cpu_times().map_err(|e| format!("Unable to get cpu times: {:?}", e))?; + + let disk_usage = psutil::disk::disk_usage(data_dir) + .map_err(|e| format!("Unable to get disk usage info: {:?}", e))?; + + let disk = psutil::disk::DiskIoCountersCollector::default() + .disk_io_counters() + .map_err(|e| format!("Unable to get disk counters: {:?}", e))?; + + let net = psutil::network::NetIoCountersCollector::default() + .net_io_counters() + .map_err(|e| format!("Unable to get network io counters: {:?}", e))?; + + let boot_time = psutil::host::boot_time() + .map_err(|e| format!("Unable to get system boot time: {:?}", e))? + .duration_since(std::time::UNIX_EPOCH) + .map_err(|e| format!("Boot time is lower than unix epoch: {}", e))? + .as_secs(); + + Ok(SystemHealth { + sys_virt_mem_total: vm.total(), + sys_virt_mem_available: vm.available(), + sys_virt_mem_used: vm.used(), + sys_virt_mem_free: vm.free(), + sys_virt_mem_cached: vm.cached(), + sys_virt_mem_buffers: vm.buffers(), + sys_virt_mem_percent: vm.percent(), + sys_loadavg_1: loadavg.one, + sys_loadavg_5: loadavg.five, + sys_loadavg_15: loadavg.fifteen, + cpu_cores: psutil::cpu::cpu_count_physical(), + cpu_threads: psutil::cpu::cpu_count(), + system_seconds_total: cpu.system().as_secs(), + cpu_time_total: cpu.total().as_secs(), + user_seconds_total: cpu.user().as_secs(), + iowait_seconds_total: cpu.iowait().as_secs(), + idle_seconds_total: cpu.idle().as_secs(), + disk_node_bytes_total: disk_usage.total(), + disk_node_bytes_free: disk_usage.free(), + disk_node_reads_total: disk.read_count(), + disk_node_writes_total: disk.write_count(), + network_node_bytes_total_received: net.bytes_recv(), + network_node_bytes_total_transmit: net.bytes_sent(), + misc_node_boot_ts_seconds: boot_time, + misc_os: std::env::consts::OS.to_string(), + }) +} + impl Observe for SystemHealth { #[cfg(not(target_os = "linux"))] fn observe() -> Result { @@ -33,58 +107,7 @@ impl Observe for SystemHealth { #[cfg(target_os = "linux")] fn observe() -> Result { - let vm = psutil::memory::virtual_memory() - .map_err(|e| format!("Unable to get virtual memory: {:?}", e))?; - let loadavg = - psutil::host::loadavg().map_err(|e| format!("Unable to get loadavg: {:?}", e))?; - - let cpu = - psutil::cpu::cpu_times().map_err(|e| format!("Unable to get cpu times: {:?}", e))?; - - let disk_usage = psutil::disk::disk_usage("/") - .map_err(|e| format!("Unable to disk usage info: {:?}", e))?; - - let disk = psutil::disk::DiskIoCountersCollector::default() - .disk_io_counters() - .map_err(|e| format!("Unable to get disk counters: {:?}", e))?; - - let net = psutil::network::NetIoCountersCollector::default() - .net_io_counters() - .map_err(|e| format!("Unable to get network io counters: {:?}", e))?; - - let boot_time = psutil::host::boot_time() - .map_err(|e| format!("Unable to get system boot time: {:?}", e))? - .duration_since(std::time::UNIX_EPOCH) - .map_err(|e| format!("Boot time is lower than unix epoch: {}", e))? - .as_secs(); - - Ok(Self { - sys_virt_mem_total: vm.total(), - sys_virt_mem_available: vm.available(), - sys_virt_mem_used: vm.used(), - sys_virt_mem_free: vm.free(), - sys_virt_mem_cached: vm.cached(), - sys_virt_mem_buffers: vm.buffers(), - sys_virt_mem_percent: vm.percent(), - sys_loadavg_1: loadavg.one, - sys_loadavg_5: loadavg.five, - sys_loadavg_15: loadavg.fifteen, - cpu_cores: psutil::cpu::cpu_count_physical(), - cpu_threads: psutil::cpu::cpu_count(), - system_seconds_total: cpu.system().as_secs(), - cpu_time_total: cpu.total().as_secs(), - user_seconds_total: cpu.user().as_secs(), - iowait_seconds_total: cpu.iowait().as_secs(), - idle_seconds_total: cpu.idle().as_secs(), - disk_node_bytes_total: disk_usage.total(), - disk_node_bytes_free: disk_usage.free(), - disk_node_reads_total: disk.read_count(), - disk_node_writes_total: disk.write_count(), - network_node_bytes_total_received: net.bytes_recv(), - network_node_bytes_total_transmit: net.bytes_sent(), - misc_node_boot_ts_seconds: boot_time, - misc_os: std::env::consts::OS.to_string(), - }) + observe_system_health_with_data_dir(Path::new("/")) } } diff --git a/common/monitoring_api/src/lib.rs b/common/monitoring_api/src/lib.rs index 03b93f2faae..1d73acb6e97 100644 --- a/common/monitoring_api/src/lib.rs +++ b/common/monitoring_api/src/lib.rs @@ -4,7 +4,7 @@ use std::{path::PathBuf, time::Duration}; use eth2::lighthouse::SystemHealth; use gather::{gather_beacon_metrics, gather_validator_metrics}; -use health_metrics::observe::Observe; +use health_metrics::observe::{Observe, observe_system_health_with_data_dir}; use reqwest::{IntoUrl, Response}; pub use reqwest::{StatusCode, Url}; use sensitive_url::SensitiveUrl; @@ -56,6 +56,8 @@ pub struct Config { /// Path for the cold database required for fetching beacon db size metrics. /// Note: not relevant for validator and system metrics. pub freezer_db_path: Option, + /// Data directory path used for reporting disk usage of the correct filesystem. + pub data_dir: Option, /// User-defined update period in seconds. pub update_period_secs: Option, } @@ -67,6 +69,8 @@ pub struct MonitoringHttpClient { db_path: Option, /// Path to the freezer database. freezer_db_path: Option, + /// Data directory path used for reporting disk usage of the correct filesystem. + data_dir: Option, update_period: Duration, monitoring_endpoint: SensitiveUrl, } @@ -77,6 +81,7 @@ impl MonitoringHttpClient { client: reqwest::Client::new(), db_path: config.db_path.clone(), freezer_db_path: config.freezer_db_path.clone(), + data_dir: config.data_dir.clone(), update_period: Duration::from_secs( config.update_period_secs.unwrap_or(DEFAULT_UPDATE_DURATION), ), @@ -158,8 +163,13 @@ impl MonitoringHttpClient { } /// Gets system metrics by observing capturing the SystemHealth metrics. + /// Reports disk usage for the datadir filesystem when available. pub fn get_system_metrics(&self) -> Result { - let system_health = SystemHealth::observe().map_err(Error::SystemMetricsFailed)?; + let system_health = match &self.data_dir { + Some(dir) => observe_system_health_with_data_dir(dir), + None => SystemHealth::observe(), + } + .map_err(Error::SystemMetricsFailed)?; Ok(MonitoringMetrics { metadata: Metadata::new(ProcessType::System), process_metrics: Process::System(system_health.into()), diff --git a/validator_client/http_api/src/lib.rs b/validator_client/http_api/src/lib.rs index a35b4ec6c6d..4b12a01adbc 100644 --- a/validator_client/http_api/src/lib.rs +++ b/validator_client/http_api/src/lib.rs @@ -226,6 +226,11 @@ pub fn serve( }) }); + // Optional data_dir filter for health endpoints that should not reject when validator_dir + // is unavailable. Falls back to root filesystem disk reporting. + let inner_data_dir = ctx.validator_dir.clone(); + let data_dir_filter = warp::any().map(move || inner_data_dir.clone()); + let inner_secrets_dir = ctx.secrets_dir.clone(); let secrets_dir_filter = warp::any().map(move || inner_secrets_dir.clone()).and_then( |secrets_dir: Option<_>| async move { @@ -301,9 +306,14 @@ pub fn serve( let get_lighthouse_health = warp::path("lighthouse") .and(warp::path("health")) .and(warp::path::end()) - .then(|| { + .and(data_dir_filter.clone()) + .then(|data_dir: Option| { blocking_json_task(move || { - eth2::lighthouse::Health::observe() + let health = match data_dir { + Some(ref dir) => health_metrics::observe::observe_health_with_data_dir(dir), + None => eth2::lighthouse::Health::observe(), + }; + health .map(api_types::GenericResponse::from) .map_err(warp_utils::reject::custom_bad_request) }) diff --git a/validator_client/http_metrics/src/lib.rs b/validator_client/http_metrics/src/lib.rs index 70b447a4939..c2b7a1d0b79 100644 --- a/validator_client/http_metrics/src/lib.rs +++ b/validator_client/http_metrics/src/lib.rs @@ -11,6 +11,7 @@ use serde::{Deserialize, Serialize}; use slot_clock::{SlotClock, SystemTimeSlotClock}; use std::future::Future; use std::net::{IpAddr, Ipv4Addr, SocketAddr}; +use std::path::PathBuf; use std::sync::Arc; use std::time::{SystemTime, UNIX_EPOCH}; use tracing::info; @@ -51,6 +52,7 @@ pub struct Shared { pub struct Context { pub config: Config, pub shared: RwLock>, + pub data_dir: Option, } /// Configuration for the HTTP server. @@ -206,7 +208,10 @@ pub fn gather_prometheus_metrics( scrape_allocator_metrics(); } - health_metrics::metrics::scrape_health_metrics(); + match ctx.data_dir.as_ref() { + Some(data_dir) => health_metrics::metrics::scrape_health_metrics_for_data_dir(data_dir), + None => health_metrics::metrics::scrape_health_metrics(), + }; encoder .encode(&metrics::gather(), &mut buffer) diff --git a/validator_client/src/config.rs b/validator_client/src/config.rs index d68a78b705f..b8dd61ac8cb 100644 --- a/validator_client/src/config.rs +++ b/validator_client/src/config.rs @@ -362,6 +362,7 @@ impl Config { config.monitoring_api = Some(monitoring_api::Config { db_path: None, freezer_db_path: None, + data_dir: Some(config.validator_dir.clone()), update_period_secs, monitoring_endpoint: monitoring_endpoint.to_string(), }); diff --git a/validator_client/src/lib.rs b/validator_client/src/lib.rs index f70d5830ec7..ec346bae565 100644 --- a/validator_client/src/lib.rs +++ b/validator_client/src/lib.rs @@ -143,6 +143,7 @@ impl ProductionValidatorClient { Arc::new(validator_http_metrics::Context { config: config.http_metrics.clone(), shared: RwLock::new(shared), + data_dir: Some(config.validator_dir.clone()), }); let exit = context.executor.exit();