deepnote
diff --git a/‎README.md‎
Lines changed: 25 additions & 0 deletions b/‎README.md‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎deepnote_core/resources/jupyter/jupyter_server_config.py‎
Lines changed: 20 additions & 3 deletions b/‎deepnote_core/resources/jupyter/jupyter_server_config.py‎
Lines changed: 20 additions & 3 deletions
diff --git a/‎deepnote_toolkit/execution_timeout.py‎
Lines changed: 231 additions & 0 deletions b/‎deepnote_toolkit/execution_timeout.py‎
Lines changed: 231 additions & 0 deletions
@@ -63,6 +63,31 @@ deepnote-toolkit config set server.jupyter_port 9000
 
 **Security note**: The CLI will warn if Jupyter runs without authentication. For local development only. Set `DEEPNOTE_JUPYTER_TOKEN` for shared environments.
 
+## Environment Variables
+
+### Debugging and Logging
+
+The following environment variables control debug logging and diagnostic output:
+
+- **`DEEPNOTE_ENABLE_DEBUG_LOGGING`**: Set to `true` (case-insensitive) to enable verbose DEBUG-level logs for tornado, jupyter_server, and jupyter_client. The extra verbosity can help when troubleshooting server-related issues. Default: `false` (INFO level).
+
+- **`DEEPNOTE_ENABLE_ZMQ_DEBUG`**: Set to `true` to enable detailed ZMQ message flow logging for kernel communication debugging. This logs all messages exchanged between the Jupyter server and kernel, which is useful for diagnosing stuck execution or kernel communication issues. Default: `false`
+
+**Example Usage**:
+
+```bash
+# Enable debug logging
+DEEPNOTE_ENABLE_DEBUG_LOGGING=true deepnote-toolkit server
+
+# Enable ZMQ message debugging
+DEEPNOTE_ENABLE_ZMQ_DEBUG=true deepnote-toolkit server
+
+# Enable both
+DEEPNOTE_ENABLE_DEBUG_LOGGING=true DEEPNOTE_ENABLE_ZMQ_DEBUG=true deepnote-toolkit server
+```
+
+**Note**: Debug logging can significantly increase log volume and may impact performance. Enable it only in development or while troubleshooting a specific issue.
+
 ## Need help?
 
 - Join our [Community](https://github.com/deepnote/deepnote/discussions)!
 
@@ -10,6 +10,11 @@
 # do not import explicitly it will break the config loading
 c = get_config()  # pylint: disable=E0602; # noqa: F821
 
+# Debug logging is opt-in through environment variables:
+#   DEEPNOTE_ENABLE_DEBUG_LOGGING=true  -> verbose DEBUG logs
+#   DEEPNOTE_ENABLE_ZMQ_DEBUG=true      -> detailed ZMQ message logging
+# The value is compared case-insensitively; anything other than "true"
+# leaves the flag off.
+debug_logging_enabled = (
+    os.getenv("DEEPNOTE_ENABLE_DEBUG_LOGGING", "false").lower() == "true"
+)
+log_level = "INFO"
+if debug_logging_enabled:
+    log_level = "DEBUG"
 
 # ------------------------------------------------------------------------------
 # Application(SingletonConfigurable) configuration
@@ -27,7 +32,8 @@
 ## Set the log level by value or name.
 #  Choices: any of [0, 10, 20, 30, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL']
 #  Default: 30
-c.Application.log_level = 10
+# DEEPNOTE_ENABLE_DEBUG_LOGGING selects between DEBUG (10) and INFO (20).
+if debug_logging_enabled:
+    c.Application.log_level = 10  # DEBUG
+else:
+    c.Application.log_level = 20  # INFO
 
 ## Configure additional log handlers.
 #
@@ -414,7 +420,15 @@
 
 ##
 #  See also: Application.logging_config
-# c.ServerApp.logging_config = {}
+# Per-logger levels for the server-side loggers that matter when debugging.
+# Every listed logger gets the level held in the log_level variable defined
+# at the top of this file.
+c.ServerApp.logging_config = {
+    "loggers": {
+        logger_name: {"level": log_level}
+        for logger_name in (
+            "tornado.access",
+            "jupyter_server.serverapp",
+            "jupyter_client.session",
+        )
+    }
+}
 
 ## The login handler class to use.
 #  Default: 'notebook.auth.login.LoginHandler'
@@ -820,7 +834,10 @@
 
 ## Debug output in the Session
 #  Default: False
-# c.Session.debug = False
+# ZMQ message-flow debugging, used to troubleshoot kernel communication.
+# Opt in with DEEPNOTE_ENABLE_ZMQ_DEBUG=true; otherwise Session.debug is
+# left at its default.
+if "true" == os.getenv("DEEPNOTE_ENABLE_ZMQ_DEBUG", "false").lower():
+    c.Session.debug = True
 
 ## The maximum number of digests to remember.
 #
 
@@ -0,0 +1,231 @@
+"""
+This module provides execution timeout monitoring for Jupyter notebook cells.
+It can detect long-running executions and optionally send warnings or interrupt them.
+"""
+
+import os
+import signal
+import threading
+import time
+from typing import Optional
+
+import requests
+from IPython.core.interactiveshell import ExecutionInfo, ExecutionResult
+
+from .get_webapp_url import get_absolute_userpod_api_url
+from .logging import LoggerManager
+
+
+class ExecutionTimeoutMonitor:
+    """
+    Monitors execution duration and can send warnings or interrupt stuck executions.
+
+    A warning timer and, when auto-interrupt is enabled, an interrupt timer are
+    started when a cell begins executing and cancelled when it finishes. The
+    timer callbacks run on ``threading.Timer`` threads, so the state shared
+    with the execution hooks is guarded by ``_execution_lock``.
+    """
+
+    def __init__(
+        self,
+        warning_threshold_seconds: int = 240,
+        timeout_seconds: int = 300,
+        enable_auto_interrupt: bool = False,
+    ):
+        """
+        Initialize the execution timeout monitor.
+
+        Args:
+            warning_threshold_seconds: Seconds after which to send a warning (default: 240s = 4min).
+                A value <= 0 disables the warning timer.
+            timeout_seconds: Seconds after which to consider execution stuck (default: 300s = 5min).
+                A value <= 0 disables the interrupt timer.
+            enable_auto_interrupt: Whether to automatically interrupt stuck executions (default: False)
+        """
+        self.logger = LoggerManager().get_logger()
+        self.warning_threshold = warning_threshold_seconds
+        self.timeout_threshold = timeout_seconds
+        self.enable_auto_interrupt = enable_auto_interrupt
+        # {"code": <preview>, "start": <epoch seconds>} while a cell runs, else None.
+        self.current_execution: Optional[dict] = None
+        self.warning_timer: Optional[threading.Timer] = None
+        self.timeout_timer: Optional[threading.Timer] = None
+        self._execution_lock = threading.Lock()
+
+    def on_pre_execute(self, info: Optional[ExecutionInfo] = None) -> None:
+        """
+        Called before executing a cell.
+        Starts timers for warning and timeout.
+
+        Args:
+            info: Execution info as passed by IPython's ``pre_run_cell`` event.
+                It is optional because the ``pre_execute`` event calls its
+                callbacks without arguments; the code preview is then
+                recorded as ``"<empty>"``.
+        """
+        raw_cell = getattr(info, "raw_cell", None)
+        cell_preview = raw_cell[:100] if raw_cell else "<empty>"
+
+        with self._execution_lock:
+            # A previous execution may never have reached on_post_execute;
+            # drop its timers so they cannot fire against this execution.
+            self._cancel_timers()
+
+            self.current_execution = {
+                "code": cell_preview,
+                "start": time.time(),
+            }
+
+            # Start warning timer
+            if self.warning_threshold > 0:
+                self.warning_timer = threading.Timer(
+                    self.warning_threshold, self._send_warning
+                )
+                self.warning_timer.daemon = True
+                self.warning_timer.start()
+
+            # Start timeout timer
+            if self.enable_auto_interrupt and self.timeout_threshold > 0:
+                self.timeout_timer = threading.Timer(
+                    self.timeout_threshold, self._interrupt_execution
+                )
+                self.timeout_timer.daemon = True
+                self.timeout_timer.start()
+
+        self.logger.debug(
+            "Timeout monitoring started: warning=%ds, timeout=%ds, auto_interrupt=%s",
+            self.warning_threshold,
+            self.timeout_threshold,
+            self.enable_auto_interrupt,
+        )
+
+    def on_post_execute(self, result: Optional[ExecutionResult] = None) -> None:
+        """
+        Called after executing a cell.
+        Cancels any pending timers.
+
+        Args:
+            result: Execution result as passed by IPython's ``post_run_cell``
+                event. Unused; optional so the method also works as a
+                ``post_execute`` callback, which receives no arguments.
+        """
+        with self._execution_lock:
+            self._cancel_timers()
+            self.current_execution = None
+
+    def _cancel_timers(self) -> None:
+        """Cancel all active timers. Callers are expected to hold the lock."""
+        if self.warning_timer:
+            self.warning_timer.cancel()
+            self.warning_timer = None
+        if self.timeout_timer:
+            self.timeout_timer.cancel()
+            self.timeout_timer = None
+
+    def _send_warning(self) -> None:
+        """Send warning when execution is running longer than threshold."""
+        # Capture execution data while holding lock
+        with self._execution_lock:
+            if not self.current_execution:
+                return
+            execution_data = self.current_execution.copy()
+
+        # Process outside lock to avoid blocking
+        duration = time.time() - execution_data["start"]
+        code_preview = execution_data["code"][:50]
+
+        self.logger.warning(
+            "LONG_EXECUTION | duration=%.1fs | preview=%s",
+            duration,
+            code_preview.replace("\n", "\\n"),
+        )
+
+        # Try to report to webapp
+        self._report_to_webapp(duration, code_preview, warning=True)
+
+    def _interrupt_execution(self) -> None:
+        """Interrupt execution after timeout threshold is exceeded."""
+        # Capture execution data while holding lock
+        with self._execution_lock:
+            if not self.current_execution:
+                return
+            execution_data = self.current_execution.copy()
+
+        # Process outside lock to avoid blocking
+        duration = time.time() - execution_data["start"]
+        code_preview = execution_data["code"][:50]
+
+        self.logger.error(
+            "TIMEOUT_INTERRUPT | duration=%.1fs | Sending SIGINT to interrupt execution",
+            duration,
+        )
+
+        # Report to webapp before interrupting
+        self._report_to_webapp(duration, code_preview, warning=False)
+
+        # Send SIGINT to interrupt the execution (simulates Ctrl+C)
+        try:
+            os.kill(os.getpid(), signal.SIGINT)
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            self.logger.error("Failed to send SIGINT: %s", e)
+
+    def _report_to_webapp(
+        self, duration: float, code_preview: str, warning: bool
+    ) -> None:
+        """
+        Report execution warning/timeout to webapp.
+
+        Reporting is best-effort: any failure is logged and swallowed so it
+        never prevents the caller from continuing.
+
+        Args:
+            duration: Execution duration in seconds
+            code_preview: Preview of the code being executed
+            warning: Whether this is a warning (True) or timeout (False)
+        """
+        try:
+            endpoint = "warning" if warning else "timeout"
+            url = get_absolute_userpod_api_url(f"execution/{endpoint}")
+
+            payload = {
+                "duration": duration,
+                "code_preview": code_preview,
+                "threshold": (
+                    self.warning_threshold if warning else self.timeout_threshold
+                ),
+            }
+
+            # Short timeout: this call runs before the SIGINT is sent on the
+            # timeout path, so it must not hold the interrupt up for long.
+            response = requests.post(url, json=payload, timeout=2)
+            response.raise_for_status()
+
+            self.logger.debug("Successfully reported %s to webapp", endpoint)
+
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            self.logger.error("Failed to report to webapp: %s", e)
+
+
+# Global instance of the monitor. It is None until
+# setup_execution_timeout_monitor() creates one.
+_timeout_monitor: Optional[ExecutionTimeoutMonitor] = None
+
+
+def setup_execution_timeout_monitor(
+    warning_threshold_seconds: int = 240,
+    timeout_seconds: int = 300,
+    enable_auto_interrupt: bool = False,
+) -> None:
+    """
+    Set up execution timeout monitoring.
+
+    This is optional and should be called during runtime initialization if needed.
+    The monitor is attached to IPython's ``pre_run_cell`` / ``post_run_cell``
+    events, which pass the ``info`` / ``result`` objects the monitor's
+    handlers accept (``pre_execute`` / ``post_execute`` pass no arguments).
+    Calling this function again replaces the previously installed monitor.
+    Setup failures are logged and never raised to the caller.
+
+    Args:
+        warning_threshold_seconds: Seconds after which to send a warning (default: 240s = 4min)
+        timeout_seconds: Seconds after which to consider execution stuck (default: 300s = 5min)
+        enable_auto_interrupt: Whether to automatically interrupt stuck executions (default: False)
+    """
+    global _timeout_monitor  # pylint: disable=global-statement
+
+    logger = LoggerManager().get_logger()
+
+    try:
+        from IPython import get_ipython
+
+        ip = get_ipython()
+        if ip is None:
+            logger.warning(
+                "IPython instance not available, skipping timeout monitor setup"
+            )
+            return
+
+        previous = _timeout_monitor
+        if previous is not None:
+            # Re-initialisation: detach the old monitor so a cell is not
+            # tracked (and reported) twice.
+            for event, handler in (
+                ("pre_run_cell", previous.on_pre_execute),
+                ("post_run_cell", previous.on_post_execute),
+            ):
+                try:
+                    ip.events.unregister(event, handler)
+                except ValueError:
+                    # The handler was not registered on this shell.
+                    pass
+            # Cancels any timers the old monitor still has pending.
+            previous.on_post_execute(None)
+
+        monitor = ExecutionTimeoutMonitor(
+            warning_threshold_seconds=warning_threshold_seconds,
+            timeout_seconds=timeout_seconds,
+            enable_auto_interrupt=enable_auto_interrupt,
+        )
+
+        # Register event handlers
+        ip.events.register("pre_run_cell", monitor.on_pre_execute)
+        ip.events.register("post_run_cell", monitor.on_post_execute)
+        _timeout_monitor = monitor
+
+        logger.info(
+            "Execution timeout monitor initialized: warning=%ds, timeout=%ds, auto_interrupt=%s",
+            warning_threshold_seconds,
+            timeout_seconds,
+            enable_auto_interrupt,
+        )
+
+    except Exception as e:  # pylint: disable=broad-exception-caught
+        logger.error("Failed to set up timeout monitor: %s", e)