2 changes: 1 addition & 1 deletion .github/workflows/test-python-package.yml
@@ -28,7 +28,7 @@ jobs:
      - name: Setup Nox
        uses: fjwillemsen/setup-nox2@v3.0.0
      - name: Setup Poetry
        uses: Gr1N/setup-poetry@v8
        uses: Gr1N/setup-poetry@v9
      - run: poetry self add poetry-plugin-export
      - name: Run tests with Nox
        run: |
8 changes: 8 additions & 0 deletions doc/source/observers.rst
@@ -112,3 +112,11 @@ More information about PMT can be found here: https://git.astron.nl/RD/pmt/



NCUObserver
~~~~~~~~~~~

The NCUObserver can be used to automatically extract performance counters during tuning using Nvidia's NsightCompute profiler.
The NCUObserver relies on an intermediate library, which can be found here: https://github.com/nlesc-recruit/nvmetrics
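
A minimal usage sketch, condensed from the example added in this PR (metric names and availability vary per GPU, so treat these as placeholders):

import numpy
from kernel_tuner import tune_kernel
from kernel_tuner.observers.ncu import NCUObserver

kernel_string = """
__global__ void vector_add(float *c, float *a, float *b, int n) {
    int i = blockIdx.x * block_size_x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}
"""

size = 1000000
a = numpy.random.randn(size).astype(numpy.float32)
b = numpy.random.randn(size).astype(numpy.float32)
c = numpy.zeros_like(a)
args = [c, a, b, numpy.int32(size)]
tune_params = {"block_size_x": [128, 256, 512]}

# observe how many bytes the kernel reads from and writes to DRAM
ncuobserver = NCUObserver(metrics=["dram__bytes_read.sum", "dram__bytes_write.sum"])

# the measured counters are added to the results of every benchmarked configuration
results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params,
                           observers=[ncuobserver])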
Comment on lines +118 to +119
Collaborator Author:

Suggested change
The NCUObserver can be used to automatically extract performance counters during tuning using Nvidia's NsightCompute profiler.
The NCUObserver relies on an intermediate library, which can be found here: https://github.com/nlesc-recruit/nvmetrics
The NCUObserver can be used to automatically extract performance counters during tuning using Nvidia's CUDA Profiling Tools Interface (CUPTI) library, to offer insights into performance characteristics like memory access patterns and instruction execution statistics. It behaves much like the Nvidia Nsight Compute CLI (ncu) application, hence the name NCUObserver. This observer relies on an intermediate library, which can be found here: https://github.com/nlesc-recruit/nvmetrics

Collaborator Author:

I wonder whether we should change the name NCUObserver into something like MetricsObserver. Eventually, we may want to also support measuring metrics on non-NVIDIA GPUs, and the observer could then pick the right backend based on whatever runner is used.

Now that I mention runners, I never tested this observer with anything other than pycuda. The observer needs CUDA to be at least somewhat initialized. When testing nvmetrics with pycuda everything worked fine, but not when using cupy, for instance. Maybe we still need to test this (and add the tests?), or at least mention it somewhere.

Collaborator:

In terms of naming, I think I would prefer to call a more generalized NCUObserver a "ProfilingObserver", because we already use the term 'metrics' for the user-defined derived metrics. For now, I think we can keep the name NCUObserver until we have another "ProfilingObserver"; when we do, it would make sense to think about a more generic type.

I will test with cupy and cuda-python.

Collaborator:

Works with CuPy but not yet with cuda-python.

This is the error I get with cuda-python:

Using: NVIDIA GeForce RTX 3050 Ti Laptop GPU
Error while benchmarking: vector_add
Error while compiling or benchmarking, see source files:  kernel_tuner/examples/cuda/temp_oq2ba5hb.c
Traceback (most recent call last):
  File " kernel_tuner/examples/cuda/vector_add_observers_ncu.py", line 57, in <module>
    tune()
  File " kernel_tuner/examples/cuda/vector_add_observers_ncu.py", line 51, in tune
    results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, observers=[ncuobserver], metrics=metrics, iterations=7, lang='nvcuda')
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File " kernel_tuner/kernel_tuner/interface.py", line 678, in tune_kernel
    results = strategy.tune(searchspace, runner, tuning_options)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File " kernel_tuner/kernel_tuner/strategies/brute_force.py", line 10, in tune
    return runner.run(searchspace.sorted_list(), tuning_options)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File " kernel_tuner/kernel_tuner/runners/sequential.py", line 87, in run
    self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options)
  File " kernel_tuner/kernel_tuner/core.py", line 614, in compile_and_benchmark
    raise e
  File " kernel_tuner/kernel_tuner/core.py", line 603, in compile_and_benchmark
    self.benchmark(func, gpu_args, instance, verbose, to.objective, skip_nvml_setting=False)
  File " kernel_tuner/kernel_tuner/core.py", line 472, in benchmark
    raise e
  File " kernel_tuner/kernel_tuner/core.py", line 437, in benchmark
    self.benchmark_prologue(func, gpu_args, instance.threads, instance.grid, result)
  File " kernel_tuner/kernel_tuner/core.py", line 355, in benchmark_prologue
    obs.before_start()
  File " kernel_tuner/kernel_tuner/observers/ncu.py", line 35, in before_start
    nvmetrics.measureMetricsStart(self.metrics, self.device)
RuntimeError: /home/ben/documents/kernel_tuner/nvmetrics/lib/nv_metrics.cpp:147: error: function cuDeviceGet(&cuDevice, deviceNum) failed with error

Collaborator:

Not sure how this can happen. The initialization of the nvcuda backend is very similar to that of pycuda, and by the time we call before_start() it should have been initialized for a while already.

Collaborator:

In the nvcuda backend, I tried to create a new context and destroy it atexit, as pycuda does, but that doesn't change anything.

Collaborator Author:

I think it's fixed in the nvmetrics version that I just pushed (revision c7c42130). It now calls cuInit, which doesn't seem to hurt, even if pycuda has already initialized CUDA before. To get a valid CUDA context, cuDevicePrimaryCtxRetain is used.
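
For reference, the initialization pattern described above looks roughly like this when sketched with the cuda-python driver bindings (an illustration of the approach, not the actual nvmetrics C++ code):

from cuda import cuda

# cuInit is safe to call even when another component (such as pycuda)
# has already initialized the driver
err, = cuda.cuInit(0)

# retaining the primary context yields a valid CUDA context without
# creating (and later having to destroy) a new one
err, device = cuda.cuDeviceGet(0)
err, context = cuda.cuDevicePrimaryCtxRetain(device)

# ... measure metrics here ...

# release the primary context when done
err, = cuda.cuDevicePrimaryCtxRelease(device)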

Collaborator:

Awesome! I just pulled and rebuilt the nvmetrics library and tested it with every CUDA backend. It all works now!

Collaborator Author:

That's great! I plan to make a 1.0 release of nvmetrics to go along with this PR.


.. autoclass:: kernel_tuner.observers.ncu.NCUObserver

57 changes: 57 additions & 0 deletions examples/cuda/vector_add_observers_ncu.py
@@ -0,0 +1,57 @@
#!/usr/bin/env python
"""Example that demonstrates how to use the NCUObserver with a simple vector add kernel."""
import numpy
from kernel_tuner import tune_kernel
from kernel_tuner.observers.ncu import NCUObserver

def tune():

    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * block_size_x + threadIdx.x;
        if (i<n) {
            c[i] = a[i] + b[i];
        }
    }
    """

    size = 80000000

    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)

    args = [c, a, b, n]

    tune_params = dict()
    tune_params["block_size_x"] = [128 + 64 * i for i in range(15)]

    ncu_metrics = ["dram__bytes.sum",                                      # Counter byte # of bytes accessed in DRAM
                   "dram__bytes_read.sum",                                 # Counter byte # of bytes read from DRAM
                   "dram__bytes_write.sum",                                # Counter byte # of bytes written to DRAM
                   "smsp__sass_thread_inst_executed_op_fadd_pred_on.sum",  # Counter inst # of FADD thread instructions executed where all predicates were true
                   "smsp__sass_thread_inst_executed_op_ffma_pred_on.sum",  # Counter inst # of FFMA thread instructions executed where all predicates were true
                   "smsp__sass_thread_inst_executed_op_fmul_pred_on.sum",  # Counter inst # of FMUL thread instructions executed where all predicates were true
                  ]

    ncuobserver = NCUObserver(metrics=ncu_metrics)

    def total_fp32_flops(p):
        return p["smsp__sass_thread_inst_executed_op_fadd_pred_on.sum"] + 2 * p["smsp__sass_thread_inst_executed_op_ffma_pred_on.sum"] + p["smsp__sass_thread_inst_executed_op_fmul_pred_on.sum"]

    metrics = dict()
    metrics["GFLOP/s"] = lambda p: (total_fp32_flops(p) / 1e9) / (p["time"] / 1e3)
    metrics["Expected GFLOP/s"] = lambda p: (size / 1e9) / (p["time"] / 1e3)
    metrics["GB/s"] = lambda p: (p["dram__bytes.sum"] / 1e9) / (p["time"] / 1e3)
    metrics["Expected GB/s"] = lambda p: (size * 4 * 3 / 1e9) / (p["time"] / 1e3)

    results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, observers=[ncuobserver], metrics=metrics, iterations=7)

    return results


if __name__ == "__main__":
    tune()
43 changes: 28 additions & 15 deletions kernel_tuner/core.py
@@ -23,7 +23,7 @@
from kernel_tuner.backends.opencl import OpenCLFunctions
from kernel_tuner.backends.hip import HipFunctions
from kernel_tuner.observers.nvml import NVMLObserver
from kernel_tuner.observers.observer import ContinuousObserver, OutputObserver
from kernel_tuner.observers.observer import ContinuousObserver, OutputObserver, PrologueObserver

try:
    import torch
@@ -314,11 +314,13 @@ def __init__(
            )
        else:
            raise ValueError("Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet")
        self.dev = dev

        # look for NVMLObserver in observers, if present, enable special tunable parameters through nvml
        self.use_nvml = False
        self.continuous_observers = []
        self.output_observers = []
        self.prologue_observers = []
        if observers:
            for obs in observers:
                if isinstance(obs, NVMLObserver):
@@ -328,49 +330,61 @@
                    self.continuous_observers.append(obs.continuous_observer)
                if isinstance(obs, OutputObserver):
                    self.output_observers.append(obs)
                if isinstance(obs, PrologueObserver):
                    self.prologue_observers.append(obs)

        # Take list of observers from self.dev because Backends tend to add their own observer
        self.benchmark_observers = [
            obs for obs in self.dev.observers if not isinstance(obs, (ContinuousObserver, PrologueObserver))
        ]

        self.iterations = iterations

        self.lang = lang
        self.dev = dev
        self.units = dev.units
        self.name = dev.name
        self.max_threads = dev.max_threads
        if not quiet:
            print("Using: " + self.dev.name)

    def benchmark_prologue(self, func, gpu_args, threads, grid, result):
        """Benchmark prologue: run one kernel execution per PrologueObserver."""
        for obs in self.prologue_observers:
            self.dev.synchronize()
            obs.before_start()
            self.dev.run_kernel(func, gpu_args, threads, grid)
            self.dev.synchronize()
            obs.after_finish()
            result.update(obs.get_results())

    def benchmark_default(self, func, gpu_args, threads, grid, result):
        """Benchmark one kernel execution at a time."""
        observers = [
            obs for obs in self.dev.observers if not isinstance(obs, ContinuousObserver)
        ]
        """Benchmark 'iterations' kernel executions, one at a time."""

        self.dev.synchronize()
        for _ in range(self.iterations):
            for obs in observers:
            for obs in self.benchmark_observers:
                obs.before_start()
            self.dev.synchronize()
            self.dev.start_event()
            self.dev.run_kernel(func, gpu_args, threads, grid)
            self.dev.stop_event()
            for obs in observers:
            for obs in self.benchmark_observers:
                obs.after_start()
            while not self.dev.kernel_finished():
                for obs in observers:
                for obs in self.benchmark_observers:
                    obs.during()
                time.sleep(1e-6)  # one microsecond
            self.dev.synchronize()
            for obs in observers:
            for obs in self.benchmark_observers:
                obs.after_finish()

        for obs in observers:
        for obs in self.benchmark_observers:
            result.update(obs.get_results())

    def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
        """Benchmark continuously for at least 'duration' seconds"""
        iterations = int(np.ceil(duration / (result["time"] / 1000)))
        # print(f"{iterations=} {(result['time']/1000)=}")
        self.dev.synchronize()
        for obs in self.continuous_observers:
            obs.before_start()
@@ -420,9 +434,8 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett

        result = {}
        try:
            self.benchmark_default(
                func, gpu_args, instance.threads, instance.grid, result
            )
            self.benchmark_prologue(func, gpu_args, instance.threads, instance.grid, result)
            self.benchmark_default(func, gpu_args, instance.threads, instance.grid, result)

            if self.continuous_observers:
                duration = 1
2 changes: 1 addition & 1 deletion kernel_tuner/observers/__init__.py
@@ -1 +1 @@
from .observer import BenchmarkObserver, IterationObserver, ContinuousObserver, OutputObserver
from .observer import BenchmarkObserver, IterationObserver, ContinuousObserver, OutputObserver, PrologueObserver
41 changes: 41 additions & 0 deletions kernel_tuner/observers/ncu.py
@@ -0,0 +1,41 @@
from kernel_tuner.observers import PrologueObserver

try:
    import nvmetrics
except ImportError:
    nvmetrics = None

class NCUObserver(PrologueObserver):
"""``NCUObserver`` measures performance counters.

The exact performance counters supported differ per GPU, some examples:

* "dram__bytes.sum", # Counter byte # of bytes accessed in DRAM
* "dram__bytes_read.sum", # Counter byte # of bytes read from DRAM
* "dram__bytes_write.sum", # Counter byte # of bytes written to DRAM
* "smsp__sass_thread_inst_executed_op_fadd_pred_on.sum", # Counter inst # of FADD thread instructions executed where all predicates were true
* "smsp__sass_thread_inst_executed_op_ffma_pred_on.sum", # Counter inst # of FFMA thread instructions executed where all predicates were true
* "smsp__sass_thread_inst_executed_op_fmul_pred_on.sum", # Counter inst # of FMUL thread instructions executed where all predicates were true

:param metrics: The metrics to observe. This should be a list of strings.
You can use ``ncu --query-metrics`` to get a list of valid metrics.
Member:

Great! Here is an idea: would it be much work to add a way to get the list of supported metrics from nvmetrics? Something like NCUObserver.get_available_metrics() could be useful to check which metrics are supported by the current GPU. Not a critical issue, just a thought.

Collaborator Author:

A simple way to implement this is to let NCUObserver call the ncu command-line profiler with --query-metrics. It would arguably be better if nvmetrics could do this, as it links to CUPTI etc. anyway. I will try to figure out whether this is possible.

I have also thought about making this observer more user-friendly by providing presets of metrics. It is very tricky, though: many metrics have mandatory suffixes like .sum or .average, we don't know what kind of metrics users are interested in, and the available metrics may differ per architecture. I think this observer should mainly be used by expert users who have profiled their code before (using ncu or the GUI) and found some metrics that they would like to measure across a larger parameter space.

Collaborator:

Aren't there at least some metrics that are available on all GPUs that could be listed? I mean, to get people started with the NCUObserver; if they want more, they can still query ncu --query-metrics.

Collaborator Author:

I have added the functionality to query metrics to nvmetrics, including a Python interface. You can now do:

nvmetrics.queryMetrics(nvmetrics.NVPW_METRIC_TYPE_COUNTER)
nvmetrics.queryMetrics(nvmetrics.NVPW_METRIC_TYPE_RATIO)
nvmetrics.queryMetrics(nvmetrics.NVPW_METRIC_TYPE_THROUGHPUT)

These different metric types should be passed with a suffix; the suffixes are called NVPW_RollupOp and NVPW_Submetric, but not all combinations are valid, see this struct:

    typedef struct NVPW_MetricEvalRequest
    {
        /// the metric index as in 'NVPW_MetricsEvaluator_GetMetricNames'
        size_t metricIndex;
        /// one of 'NVPW_MetricType'
        uint8_t metricType;
        /// one of 'NVPW_RollupOp', required for Counter and Throughput, doesn't apply to Ratio
        uint8_t rollupOp;
        /// one of 'NVPW_Submetric', required for Ratio and Throughput, optional for Counter
        uint16_t submetric;
    } NVPW_MetricEvalRequest;

Collaborator Author:

@stijnh, how would you suggest proceeding here?
We could for instance use the same metrics that the example uses as default.
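
A possible shape for that helper, building on the queryMetrics interface shown above (the function itself is hypothetical and not part of this PR):

import nvmetrics

def get_available_metrics():
    """Hypothetical helper that lists the base metric names nvmetrics can evaluate."""
    metric_types = [
        nvmetrics.NVPW_METRIC_TYPE_COUNTER,
        nvmetrics.NVPW_METRIC_TYPE_RATIO,
        nvmetrics.NVPW_METRIC_TYPE_THROUGHPUT,
    ]
    available = []
    for metric_type in metric_types:
        # assumes queryMetrics returns a list of metric name strings
        available.extend(nvmetrics.queryMetrics(metric_type))
    return available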

    :type metrics: list[str]
    """

    def __init__(self, metrics=None, device=0):
        if not nvmetrics:
            raise ImportError("could not import nvmetrics")

        self.metrics = metrics
        self.device = device
        self.results = dict()

    def before_start(self):
        nvmetrics.measureMetricsStart(self.metrics, self.device)

    def after_finish(self):
        self.results = nvmetrics.measureMetricsStop()

    def get_results(self):
        return dict(zip(self.metrics, self.results))
11 changes: 11 additions & 0 deletions kernel_tuner/observers/observer.py
@@ -57,4 +57,15 @@ def process_output(self, answer, output):
"""
pass

class PrologueObserver(BenchmarkObserver):
    """Observer that measures something in a separate kernel invocation prior to the normal benchmark."""

    @abstractmethod
    def before_start(self):
"""prologue start is called before the kernel starts"""
pass

    @abstractmethod
    def after_finish(self):
"""prologue finish is called after the kernel has finished execution"""
pass
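
For illustration, a minimal concrete PrologueObserver might look as follows. This hypothetical observer is not part of this PR; it just shows how before_start() and after_finish() bracket the extra kernel invocation, and how get_results() feeds a measurement back into the benchmark results:

import time

from kernel_tuner.observers import PrologueObserver

class WallClockObserver(PrologueObserver):
    """Hypothetical observer that wall-clock times the prologue kernel invocation."""

    def before_start(self):
        self.t_start = time.perf_counter()

    def after_finish(self):
        self.t_end = time.perf_counter()

    def get_results(self):
        return {"prologue_wall_time": self.t_end - self.t_start}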