Skip to content

Commit 0aea754

Browse files
authored
Merge 8063f60 into 9994c8f
2 parents 9994c8f + 8063f60 commit 0aea754

7 files changed

Lines changed: 717 additions & 3 deletions

File tree

README.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,31 @@ deepnote-toolkit config set server.jupyter_port 9000
6363

6464
**Security note**: The CLI will warn if Jupyter runs without authentication. For local development only. Set `DEEPNOTE_JUPYTER_TOKEN` for shared environments.
6565

66+
## Environment Variables
67+
68+
### Debugging and Logging
69+
70+
The following environment variables control debug logging and diagnostic output:
71+
72+
- **`DEEPNOTE_ENABLE_DEBUG_LOGGING`**: Set to `true` to enable verbose DEBUG-level logs for tornado, jupyter_server, and jupyter_client. This increases log verbosity, which can help troubleshoot server-related issues. Default: `false` (INFO level)
73+
74+
- **`DEEPNOTE_ENABLE_ZMQ_DEBUG`**: Set to `true` to enable detailed ZMQ message flow logging for kernel communication debugging. This logs all messages exchanged between the Jupyter server and kernel, which is useful for diagnosing stuck execution or kernel communication issues. Default: `false`
75+
76+
**Example Usage**:
77+
78+
```bash
79+
# Enable debug logging
80+
DEEPNOTE_ENABLE_DEBUG_LOGGING=true deepnote-toolkit server
81+
82+
# Enable ZMQ message debugging
83+
DEEPNOTE_ENABLE_ZMQ_DEBUG=true deepnote-toolkit server
84+
85+
# Enable both
86+
DEEPNOTE_ENABLE_DEBUG_LOGGING=true DEEPNOTE_ENABLE_ZMQ_DEBUG=true deepnote-toolkit server
87+
```
88+
89+
**Note**: Debug logging can significantly increase log volume and may impact performance. Only enable in development or when troubleshooting specific issues.
90+
6691
## Need help?
6792

6893
- Join our [Community](https://github.com/deepnote/deepnote/discussions)!

deepnote_core/resources/jupyter/jupyter_server_config.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@
1010
# do not import explicitly it will break the config loading
1111
c = get_config() # pylint: disable=E0602; # noqa: F821
1212

13+
# Environment variable-based debug logging configuration
14+
# Set DEEPNOTE_ENABLE_DEBUG_LOGGING=true to enable verbose DEBUG logs
15+
# Set DEEPNOTE_ENABLE_ZMQ_DEBUG=true to enable detailed ZMQ message logging
16+
debug_logging_enabled = os.getenv("DEEPNOTE_ENABLE_DEBUG_LOGGING", "false").lower() == "true"
17+
log_level = "DEBUG" if debug_logging_enabled else "INFO"
1318

1419
# ------------------------------------------------------------------------------
1520
# Application(SingletonConfigurable) configuration
@@ -27,7 +32,8 @@
2732
## Set the log level by value or name.
2833
# Choices: any of [0, 10, 20, 30, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL']
2934
# Default: 30
30-
c.Application.log_level = 10
35+
# Conditional based on DEEPNOTE_ENABLE_DEBUG_LOGGING environment variable
36+
c.Application.log_level = 10 if debug_logging_enabled else 20 # DEBUG or INFO
3137

3238
## Configure additional log handlers.
3339
#
@@ -414,7 +420,15 @@
414420

415421
##
416422
# See also: Application.logging_config
417-
# c.ServerApp.logging_config = {}
423+
# Enhanced logging configuration for debugging
424+
# Uses debug_logging_enabled and log_level variables defined at the top of this file
425+
c.ServerApp.logging_config = {
426+
"loggers": {
427+
"tornado.access": {"level": log_level},
428+
"jupyter_server.serverapp": {"level": log_level},
429+
"jupyter_client.session": {"level": log_level},
430+
}
431+
}
418432

419433
## The login handler class to use.
420434
# Default: 'notebook.auth.login.LoginHandler'
@@ -820,7 +834,10 @@
820834

821835
## Debug output in the Session
822836
# Default: False
823-
# c.Session.debug = False
837+
# Enable ZMQ message flow debugging for troubleshooting kernel communication
838+
# Set DEEPNOTE_ENABLE_ZMQ_DEBUG=true to enable detailed ZMQ message logging
839+
if os.getenv("DEEPNOTE_ENABLE_ZMQ_DEBUG", "false").lower() == "true":
840+
c.Session.debug = True
824841

825842
## The maximum number of digests to remember.
826843
#
Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
"""
2+
This module provides execution timeout monitoring for Jupyter notebook cells.
3+
It can detect long-running executions and optionally send warnings or interrupt them.
4+
"""
5+
6+
import os
7+
import signal
8+
import threading
9+
import time
10+
from typing import Optional
11+
12+
import requests
13+
from IPython.core.interactiveshell import ExecutionInfo, ExecutionResult
14+
15+
from .get_webapp_url import get_absolute_userpod_api_url
16+
from .logging import LoggerManager
17+
18+
19+
class ExecutionTimeoutMonitor:
    """
    Monitors notebook cell execution duration via IPython pre/post-execute hooks.

    A warning timer (and, optionally, an interrupt timer) is armed when a cell
    starts executing and cancelled when it finishes. If a cell outlives the
    warning threshold, a warning is logged and reported to the webapp; if
    auto-interrupt is enabled and the timeout threshold is exceeded, SIGINT is
    sent to the process to abort the execution (simulates Ctrl+C).
    """

    def __init__(
        self,
        warning_threshold_seconds: int = 240,
        timeout_seconds: int = 300,
        enable_auto_interrupt: bool = False,
    ):
        """
        Initialize the execution timeout monitor.

        Args:
            warning_threshold_seconds: Seconds after which to send a warning
                (default: 240s = 4min). A value <= 0 disables the warning timer.
            timeout_seconds: Seconds after which to consider execution stuck
                (default: 300s = 5min). A value <= 0 disables the interrupt timer.
            enable_auto_interrupt: Whether to automatically interrupt stuck
                executions (default: False).
        """
        self.logger = LoggerManager().get_logger()
        self.warning_threshold = warning_threshold_seconds
        self.timeout_threshold = timeout_seconds
        self.enable_auto_interrupt = enable_auto_interrupt
        # Metadata for the in-flight execution; None when nothing is running.
        self.current_execution: Optional[dict] = None
        self.warning_timer: Optional[threading.Timer] = None
        self.timeout_timer: Optional[threading.Timer] = None
        # Guards current_execution and the timer attributes: timer callbacks
        # fire on background threads while the kernel thread mutates state.
        self._execution_lock = threading.Lock()

    def on_pre_execute(self, info: ExecutionInfo) -> None:
        """
        Called before executing a cell. Starts timers for warning and timeout.

        Args:
            info: IPython execution info; only ``raw_cell`` is read, to build
                a short preview used in logs and webapp reports.
        """
        cell_preview = info.raw_cell[:100] if info.raw_cell else "<empty>"

        with self._execution_lock:
            # BUGFIX: if a previous pre_execute was never paired with a
            # post_execute (e.g. another hook raised), stale timers would leak
            # and could fire a bogus warning/SIGINT against this new cell.
            # Cancel any leftovers before arming fresh timers.
            self._cancel_timers()

            self.current_execution = {
                "code": cell_preview,
                "start": time.time(),
            }

            # Warning timer: log/report only, never interrupts.
            if self.warning_threshold > 0:
                self.warning_timer = threading.Timer(
                    self.warning_threshold, self._send_warning
                )
                self.warning_timer.daemon = True
                self.warning_timer.start()

            # Interrupt timer: only armed when auto-interrupt is opted in.
            if self.enable_auto_interrupt and self.timeout_threshold > 0:
                self.timeout_timer = threading.Timer(
                    self.timeout_threshold, self._interrupt_execution
                )
                self.timeout_timer.daemon = True
                self.timeout_timer.start()

        self.logger.debug(
            "Timeout monitoring started: warning=%ds, timeout=%ds, auto_interrupt=%s",
            self.warning_threshold,
            self.timeout_threshold,
            self.enable_auto_interrupt,
        )

    def on_post_execute(self, result: ExecutionResult) -> None:
        """
        Called after executing a cell. Cancels any pending timers.

        Args:
            result: IPython execution result (unused; required by the event API).
        """
        with self._execution_lock:
            self._cancel_timers()
            self.current_execution = None

    def _cancel_timers(self) -> None:
        """Cancel all active timers. Caller must hold ``_execution_lock``."""
        if self.warning_timer:
            self.warning_timer.cancel()
            self.warning_timer = None
        if self.timeout_timer:
            self.timeout_timer.cancel()
            self.timeout_timer = None

    def _send_warning(self) -> None:
        """Timer callback: log/report an execution exceeding the warning threshold."""
        # Capture execution data while holding the lock; the cell may finish
        # (and clear state) concurrently on the kernel thread.
        with self._execution_lock:
            if not self.current_execution:
                return
            execution_data = self.current_execution.copy()

        # Process outside the lock to avoid blocking the kernel thread.
        duration = time.time() - execution_data["start"]
        code_preview = execution_data["code"][:50]

        self.logger.warning(
            "LONG_EXECUTION | duration=%.1fs | preview=%s",
            duration,
            code_preview.replace("\n", "\\n"),
        )

        # Best-effort report; failures are logged, never raised.
        self._report_to_webapp(duration, code_preview, warning=True)

    def _interrupt_execution(self) -> None:
        """Timer callback: interrupt execution after the timeout threshold is exceeded."""
        # Capture execution data while holding the lock.
        with self._execution_lock:
            if not self.current_execution:
                return
            execution_data = self.current_execution.copy()

        # Process outside the lock to avoid blocking.
        duration = time.time() - execution_data["start"]
        code_preview = execution_data["code"][:50]

        self.logger.error(
            "TIMEOUT_INTERRUPT | duration=%.1fs | Sending SIGINT to interrupt execution",
            duration,
        )

        # Report to webapp before interrupting.
        self._report_to_webapp(duration, code_preview, warning=False)

        # SIGINT is delivered to the main thread, raising KeyboardInterrupt in
        # the running cell (simulates Ctrl+C).
        try:
            os.kill(os.getpid(), signal.SIGINT)
        except Exception as e:  # pylint: disable=broad-exception-caught
            self.logger.error("Failed to send SIGINT: %s", e)

    def _report_to_webapp(
        self, duration: float, code_preview: str, warning: bool
    ) -> None:
        """
        Report execution warning/timeout to webapp.

        Args:
            duration: Execution duration in seconds.
            code_preview: Preview of the code being executed.
            warning: Whether this is a warning (True) or timeout (False).
        """
        try:
            endpoint = "warning" if warning else "timeout"
            url = get_absolute_userpod_api_url(f"execution/{endpoint}")

            payload = {
                "duration": duration,
                "code_preview": code_preview,
                "threshold": (
                    self.warning_threshold if warning else self.timeout_threshold
                ),
            }

            # Short timeout so a slow webapp cannot stall the timer thread.
            response = requests.post(url, json=payload, timeout=2)
            response.raise_for_status()

            self.logger.debug("Successfully reported %s to webapp", endpoint)

        except Exception as e:  # pylint: disable=broad-exception-caught
            self.logger.error("Failed to report to webapp: %s", e)
178+
179+
180+
# Global instance so repeated setup calls replace (not stack) the monitor.
_timeout_monitor: Optional[ExecutionTimeoutMonitor] = None


def setup_execution_timeout_monitor(
    warning_threshold_seconds: int = 240,
    timeout_seconds: int = 300,
    enable_auto_interrupt: bool = False,
) -> None:
    """
    Set up execution timeout monitoring.

    This is optional and should be called during runtime initialization if needed.
    Registers pre/post-execute hooks on the current IPython instance; no-op
    (with a warning) when no IPython instance is available. Any setup failure
    is logged rather than raised, so initialization never breaks the runtime.

    Args:
        warning_threshold_seconds: Seconds after which to send a warning (default: 240s = 4min)
        timeout_seconds: Seconds after which to consider execution stuck (default: 300s = 5min)
        enable_auto_interrupt: Whether to automatically interrupt stuck executions (default: False)
    """
    global _timeout_monitor  # pylint: disable=global-statement

    # Fetch the logger once instead of constructing LoggerManager() anew for
    # every message.
    logger = LoggerManager().get_logger()

    try:
        # Imported lazily: this module must stay importable outside IPython.
        from IPython import get_ipython

        ip = get_ipython()
        if ip is None:
            logger.warning(
                "IPython instance not available, skipping timeout monitor setup"
            )
            return

        _timeout_monitor = ExecutionTimeoutMonitor(
            warning_threshold_seconds=warning_threshold_seconds,
            timeout_seconds=timeout_seconds,
            enable_auto_interrupt=enable_auto_interrupt,
        )

        # Register event handlers on the interactive shell.
        ip.events.register("pre_execute", _timeout_monitor.on_pre_execute)
        ip.events.register("post_execute", _timeout_monitor.on_post_execute)

        logger.info(
            "Execution timeout monitor initialized: warning=%ds, timeout=%ds, auto_interrupt=%s",
            warning_threshold_seconds,
            timeout_seconds,
            enable_auto_interrupt,
        )

    except Exception as e:  # pylint: disable=broad-exception-caught
        logger.error(
            "Failed to set up timeout monitor: %s", e
        )

0 commit comments

Comments
 (0)