[RELEASE] v0.10.0 #190

Merged: 4 commits, Apr 15, 2024
4 changes: 4 additions & 0 deletions .github/ops-bot.yaml
@@ -0,0 +1,4 @@
+# This file controls which features from the `ops-bot` repository below are enabled.
+# - https://github.com/rapidsai/ops-bot
+
+forward_merger: true
2 changes: 1 addition & 1 deletion ci/check_style.sh
@@ -11,7 +11,7 @@ rapids-dependency-file-generator \
   --file_key checks \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml

-rapids-mamba-retry env create --force -f env.yaml -n checks
+rapids-mamba-retry env create --yes -f env.yaml -n checks
 conda activate checks

 rapids-logger "Run pre-commit checks - Python backend"
4 changes: 2 additions & 2 deletions ci/test_python.sh
@@ -11,14 +11,14 @@ rapids-dependency-file-generator \
   --file_key test_python \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml

-rapids-mamba-retry env create --force -f env.yaml -n test
+rapids-mamba-retry env create --yes -f env.yaml -n test

 # Temporarily allow unbound variables for conda activation.
 set +u
 conda activate test
 set -u

-# rapids-logger "Downloading artifacts from previous jobs"
+rapids-logger "Downloading artifacts from previous jobs"
 PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)

 rapids-print-env
2 changes: 2 additions & 0 deletions conda/environments/all_arch-any.yaml
@@ -12,6 +12,8 @@ dependencies:
 - psutil
 - pynvml
 - pytest
+- pytest-asyncio
 - pytest-jupyter[server]>=0.6.0
 - python>=3.8
+- websockets
 name: all_arch-any
2 changes: 2 additions & 0 deletions dependencies.yaml
@@ -80,3 +80,5 @@ dependencies:
 packages:
   - pytest
   - pytest-jupyter[server]>=0.6.0
+  - pytest-asyncio
+  - websockets
11 changes: 4 additions & 7 deletions jupyterlab_nvdashboard/apps/cpu.py
@@ -1,13 +1,11 @@
 import json
 import psutil
 import time
-import tornado
-from jupyter_server.base.handlers import APIHandler
+from jupyterlab_nvdashboard.apps.utils import CustomWebSocketHandler


-class CPUResourceHandler(APIHandler):
-    @tornado.web.authenticated
-    def get(self):
+class CPUResourceWebSocketHandler(CustomWebSocketHandler):
+    def send_data(self):
         now = time.time()
         stats = {
             "time": now * 1000,
@@ -18,5 +16,4 @@ def get(self):
             "network_read": psutil.net_io_counters().bytes_recv,
             "network_write": psutil.net_io_counters().bytes_sent,
         }
-        self.set_header("Content-Type", "application/json")
-        self.write(json.dumps(stats))
+        self.write_message(json.dumps(stats))
41 changes: 16 additions & 25 deletions jupyterlab_nvdashboard/apps/gpu.py
@@ -1,8 +1,7 @@
 import json
+from jupyterlab_nvdashboard.apps.utils import CustomWebSocketHandler
 import pynvml
 import time
-import tornado
-from jupyter_server.base.handlers import APIHandler

 try:
     pynvml.nvmlInit()
@@ -41,19 +40,17 @@
     pci_gen = None


-class GPUUtilizationHandler(APIHandler):
-    @tornado.web.authenticated
-    def get(self):
+class GPUUtilizationWebSocketHandler(CustomWebSocketHandler):
+    def send_data(self):
         gpu_utilization = [
             pynvml.nvmlDeviceGetUtilizationRates(gpu_handles[i]).gpu
             for i in range(ngpus)
         ]
-        self.finish(json.dumps({"gpu_utilization": gpu_utilization}))
+        self.write_message(json.dumps({"gpu_utilization": gpu_utilization}))


-class GPUUsageHandler(APIHandler):
-    @tornado.web.authenticated
-    def get(self):
+class GPUUsageWebSocketHandler(CustomWebSocketHandler):
+    def send_data(self):
         memory_usage = [
             pynvml.nvmlDeviceGetMemoryInfo(handle).used
             for handle in gpu_handles
@@ -64,16 +61,15 @@ def get(self):
             for handle in gpu_handles
         ]

-        self.finish(
+        self.write_message(
             json.dumps(
                 {"memory_usage": memory_usage, "total_memory": total_memory}
             )
         )


-class GPUResourceHandler(APIHandler):
-    @tornado.web.authenticated
-    def get(self):
+class GPUResourceWebSocketHandler(CustomWebSocketHandler):
+    def send_data(self):
         now = time.time()
         stats = {
             "time": now * 1000,
@@ -118,15 +114,13 @@ def get(self):
         stats["gpu_memory_total"] = round(
             (stats["gpu_memory_total"] / gpu_mem_sum) * 100, 2
         )
-        self.set_header("Content-Type", "application/json")
-        self.write(json.dumps(stats))
+        self.write_message(json.dumps(stats))


-class NVLinkThroughputHandler(APIHandler):
+class NVLinkThroughputWebSocketHandler(CustomWebSocketHandler):
     prev_throughput = None

-    @tornado.web.authenticated
-    def get(self):
+    def send_data(self):
         throughput = [
             pynvml.nvmlDeviceGetFieldValues(
                 handle,
@@ -162,9 +156,8 @@ def get(self):
         # Store the current throughput for the next request
         self.prev_throughput = throughput

-        self.set_header("Content-Type", "application/json")
         # Send the change in throughput as part of the response
-        self.write(
+        self.write_message(
             json.dumps(
                 {
                     "nvlink_rx": [
@@ -191,9 +184,8 @@
         )


-class PCIStatsHandler(APIHandler):
-    @tornado.web.authenticated
-    def get(self):
+class PCIStatsWebSocketHandler(CustomWebSocketHandler):
+    def send_data(self):
         # Use device-0 to get "upper bound"
         pci_width = pynvml.nvmlDeviceGetMaxPcieLinkWidth(gpu_handles[0])
         pci_bw = {
@@ -231,5 +223,4 @@ def get(self):
             "max_rxtx_tp": max_rxtx_tp,
         }

-        self.set_header("Content-Type", "application/json")
-        self.write(json.dumps(stats))
+        self.write_message(json.dumps(stats))
31 changes: 31 additions & 0 deletions jupyterlab_nvdashboard/apps/utils.py
@@ -0,0 +1,31 @@
+from tornado.websocket import WebSocketHandler
+import tornado
+import json
+
+
+class CustomWebSocketHandler(WebSocketHandler):
+    def open(self):
+        self.write_message(json.dumps({"status": "connected"}))
+        self.set_nodelay(True)
+        # Start a periodic callback that sends data every 1000 ms by default
+        self.callback = tornado.ioloop.PeriodicCallback(self.send_data, 1000)
+        self.callback.start()
+
+    def on_message(self, message):
+        message_data = json.loads(message)
+        # Update the periodic callback frequency
+        new_frequency = message_data["updateFrequency"]
+        if hasattr(self, "callback"):
+            self.callback.stop()
+        self.callback = tornado.ioloop.PeriodicCallback(
+            self.send_data, new_frequency
+        )
+        if not message_data["isPaused"]:
+            self.callback.start()
+
+    def on_close(self):
+        if hasattr(self, "callback") and self.callback.is_running():
+            self.callback.stop()
+
+    def send_data(self):
+        pass
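For orientation, here is a hypothetical client-side sketch (not part of this PR) that exercises the protocol defined above: on connect the handler sends {"status": "connected"}, and any JSON message carrying updateFrequency and isPaused reconfigures the periodic callback. It uses the websockets package added to the test dependencies in this PR; the URL, port, and path are assumptions, since the real route is built from the Jupyter server's base URL in handlers.py.

import asyncio
import json

import websockets  # test dependency added in this PR


async def main():
    # Assumed endpoint; the actual path depends on the Jupyter server's
    # base URL and the extension's URL prefix.
    uri = "ws://localhost:8888/nvdashboard/cpu_resource"
    async with websockets.connect(uri) as ws:
        print(await ws.recv())  # expect {"status": "connected"}
        # Ask the server to push stats every 500 ms and keep streaming.
        await ws.send(json.dumps({"updateFrequency": 500, "isPaused": False}))
        for _ in range(5):
            print(await ws.recv())


asyncio.run(main())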
12 changes: 6 additions & 6 deletions jupyterlab_nvdashboard/handlers.py
@@ -26,13 +26,13 @@ def setup_handlers(web_app):
         base_url, URL_PATH, "nvlink_throughput"
     )
     handlers += [
-        (route_pattern_gpu_util, apps.gpu.GPUUtilizationHandler),
-        (route_pattern_gpu_usage, apps.gpu.GPUUsageHandler),
-        (route_pattern_gpu_resource, apps.gpu.GPUResourceHandler),
-        (route_pattern_pci_stats, apps.gpu.PCIStatsHandler),
+        (route_pattern_gpu_util, apps.gpu.GPUUtilizationWebSocketHandler),
+        (route_pattern_gpu_usage, apps.gpu.GPUUsageWebSocketHandler),
+        (route_pattern_gpu_resource, apps.gpu.GPUResourceWebSocketHandler),
+        (route_pattern_pci_stats, apps.gpu.PCIStatsWebSocketHandler),
         (
             route_pattern_nvlink_throughput,
-            apps.gpu.NVLinkThroughputHandler,
+            apps.gpu.NVLinkThroughputWebSocketHandler,
         ),
     ]

@@ -41,7 +41,7 @@ def setup_handlers(web_app):
     )

     handlers += [
-        (route_pattern_cpu_resource, apps.cpu.CPUResourceHandler),
+        (route_pattern_cpu_resource, apps.cpu.CPUResourceWebSocketHandler),
     ]

     web_app.add_handlers(host_pattern, handlers)
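The tuples above follow Tornado's standard (pattern, handler) routing convention, so a handler can also be mounted in a bare Tornado application for quick local experimentation. Below is a minimal, hypothetical sketch, not something this PR adds: the "/cpu_resource" path and port 8888 are illustrative stand-ins for the real route, which the extension builds with url_path_join from the Jupyter base URL.

import tornado.ioloop
import tornado.web

from jupyterlab_nvdashboard.apps.cpu import CPUResourceWebSocketHandler


def make_app():
    # Same (route, handler) tuple shape as in setup_handlers above.
    return tornado.web.Application(
        [(r"/cpu_resource", CPUResourceWebSocketHandler)]
    )


if __name__ == "__main__":
    make_app().listen(8888)
    tornado.ioloop.IOLoop.current().start()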
2 changes: 2 additions & 0 deletions jupyterlab_nvdashboard/tests/conftest.py
@@ -0,0 +1,2 @@
+def pytest_configure(config):
+    config.addinivalue_line("markers", "asyncio: mark test as asyncio")
37 changes: 37 additions & 0 deletions jupyterlab_nvdashboard/tests/test_cpu_handlers.py
@@ -0,0 +1,37 @@
+import json
+import pytest
+from unittest.mock import MagicMock, patch
+
+from jupyterlab_nvdashboard.apps.cpu import CPUResourceWebSocketHandler
+
+
+@pytest.fixture
+def mock_handler(monkeypatch):
+    mock = MagicMock()
+    monkeypatch.setattr(
+        "jupyterlab_nvdashboard.apps.cpu.CustomWebSocketHandler.write_message",
+        mock,
+    )
+    return mock
+
+
+@pytest.fixture
+def handler_args():
+    with patch("tornado.web.Application") as mock_application, patch(
+        "tornado.httputil.HTTPServerRequest"
+    ) as mock_request:
+        yield mock_application, mock_request
+
+
+def test_cpu_resource_handler(mock_handler, handler_args):
+    handler = CPUResourceWebSocketHandler(*handler_args)
+    handler.send_data()
+    args, _ = mock_handler.call_args
+    data = json.loads(args[0])
+    assert "time" in data
+    assert "cpu_utilization" in data
+    assert "memory_usage" in data
+    assert "disk_read" in data
+    assert "disk_write" in data
+    assert "network_read" in data
+    assert "network_write" in data
80 changes: 80 additions & 0 deletions jupyterlab_nvdashboard/tests/test_gpu_handlers.py
@@ -0,0 +1,80 @@
+import json
+import pytest
+from unittest.mock import MagicMock, patch
+
+from jupyterlab_nvdashboard.apps.gpu import (
+    GPUUtilizationWebSocketHandler,
+    GPUUsageWebSocketHandler,
+    GPUResourceWebSocketHandler,
+    NVLinkThroughputWebSocketHandler,
+    PCIStatsWebSocketHandler,
+)
+
+
+@pytest.fixture
+def mock_handler(monkeypatch):
+    mock = MagicMock()
+    monkeypatch.setattr(
+        "jupyterlab_nvdashboard.apps.gpu.CustomWebSocketHandler.write_message",
+        mock,
+    )
+    return mock
+
+
+@pytest.fixture
+def handler_args():
+    with patch("tornado.web.Application") as mock_application, patch(
+        "tornado.httputil.HTTPServerRequest"
+    ) as mock_request:
+        yield mock_application, mock_request
+
+
+def test_gpu_utilization_handler(mock_handler, handler_args):
+    handler = GPUUtilizationWebSocketHandler(*handler_args)
+    handler.send_data()
+    args, _ = mock_handler.call_args
+    data = json.loads(args[0])
+    assert "gpu_utilization" in data
+
+
+def test_gpu_usage_handler(mock_handler, handler_args):
+    handler = GPUUsageWebSocketHandler(*handler_args)
+    handler.send_data()
+    args, _ = mock_handler.call_args
+    data = json.loads(args[0])
+    assert "memory_usage" in data
+    assert "total_memory" in data
+
+
+def test_gpu_resource_handler(mock_handler, handler_args):
+    handler = GPUResourceWebSocketHandler(*handler_args)
+    handler.send_data()
+    args, _ = mock_handler.call_args
+    data = json.loads(args[0])
+    assert "time" in data
+    assert "gpu_utilization_total" in data
+    assert "gpu_memory_total" in data
+    assert "rx_total" in data
+    assert "tx_total" in data
+    assert "gpu_memory_individual" in data
+    assert "gpu_utilization_individual" in data
+
+
+def test_nvlink_throughput_handler(mock_handler, handler_args):
+    handler = NVLinkThroughputWebSocketHandler(*handler_args)
+    handler.send_data()
+    args, _ = mock_handler.call_args
+    data = json.loads(args[0])
+    assert "nvlink_rx" in data
+    assert "nvlink_tx" in data
+    assert "max_rxtx_bw" in data
+
+
+def test_pci_stats_handler(mock_handler, handler_args):
+    handler = PCIStatsWebSocketHandler(*handler_args)
+    handler.send_data()
+    args, _ = mock_handler.call_args
+    data = json.loads(args[0])
+    assert "pci_tx" in data
+    assert "pci_rx" in data
+    assert "max_rxtx_tp" in data