From 327fcaa00d42d69c2fb3e4a8251ccb96439609d6 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tkonolige@octoml.ai>
Date: Thu, 11 Nov 2021 21:20:56 -0800
Subject: [PATCH 1/3] [PROFILING] Add ability to profile a single
 function_profiling

Add a new function `tvm.runtime.profiling.profile_function` which
collects performance metrics for a single function in an IRModule. For
example, collecting performance counters using `PAPIMetricCollector`.
This is helpful for optimizing kernels and schedules for a single
operator.
---
 include/tvm/runtime/profiling.h               | 31 ++++++++++++
 python/tvm/runtime/profiling/__init__.py      | 43 ++++++++++++++++
 src/runtime/profiling.cc                      | 50 +++++++++++++++++++
 .../python/unittest/test_runtime_profiling.py | 47 +++++++++++++++++
 4 files changed, 171 insertions(+)
diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h
index 366f4f1deed1..7d08e690f75c 100644
--- a/include/tvm/runtime/profiling.h
+++ b/include/tvm/runtime/profiling.h
@@ -477,6 +477,37 @@ String ShapeString(NDArray shape, DLDataType dtype);
  */
 String ShapeString(const std::vector<int64_t>& shape, DLDataType dtype);
 
+/*! \brief Collect performance information of a function execution. Usually
+ * used with a compiled PrimFunc (via tvm.build).
+ *
+ * This information can include performance counters like cache hits and FLOPs
+ * that are useful in debugging performance issues of individual PrimFuncs.
+ * Different metrics can be collected depending on which MetricCollector is
+ * used.
+ *
+ * Example usage:
+ * \code{.cpp}
+ * // Use PAPI to measure the number of floating point operations.
+ * PackedFunc profiler = ProfileModule(
+ *     mod, "main", kDLCPU, 0, {CreatePAPIMetricCollector({{kDLCPU, 0}, {"PAPI_FP_OPS"}})});
+ * Report r = profiler(arg1, arg2, arg);
+ * std::cout << r << std::endl;
+ * \endcode
+ *
+ * \param mod Module to profile. Usually a PrimFunc that has been compiled to machine code.
+ * \param func_name Name of function to run in the module.
+ * \param device_type Device type to run on. Profiling will include performance
+ *                    metrics specific to this device type.
+ * \param device_id Id of device to run on.
+ * \param collectors List of different
+ *                   ways to collect metrics. See @MetricCollector.
+ * \returns A PackedFunc which takes the same arguments as the `mod[func_name]`
+ *          and returns performance metrics as a `Map<String, ObjectRef>` where
+ *          values can be `CountNode`, `DurationNode`, `PercentNode`.
+ */
+PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, int device_id,
+                           Array<MetricCollector> collectors);
+
 }  // namespace profiling
 }  // namespace runtime
 }  // namespace tvm
diff --git a/python/tvm/runtime/profiling/__init__.py b/python/tvm/runtime/profiling/__init__.py
index 7d40a81e498a..6a69152a31b8 100644
--- a/python/tvm/runtime/profiling/__init__.py
+++ b/python/tvm/runtime/profiling/__init__.py
@@ -163,6 +163,49 @@ def __init__(self, dev: Device):
         self.__init_handle_by_constructor__(_ffi_api.DeviceWrapper, dev)
 
 
+def profile_function(mod, dev, collectors, func_name="main"):
+    """Collect performance information of a function execution. Usually used with
+    a compiled PrimFunc.
+
+    This information can include performance counters like cache hits and FLOPs
+    that are useful in debugging performance issues of individual PrimFuncs.
+    Different metrics can be collected depending on which MetricCollector is
+    used.
+
+    Example
+    -------
+
+    .. code-block: python
+        f = tvm.build(my_func, target="llvm", name="my_func")
+        prof = tvm.runtime.profiling.profile_function(
+            f,
+            tvm.cpu(),
+            [tvm.runtime.profiling.PAPIMetricCollector({tvm.cpu(): ["PAPI_FP_OPS"]}),
+        )
+        counters = prof(*args)
+        print(counters)
+
+    Parameters
+    ----------
+    mod: Module
+        Module containing the function to profile.
+    dev: Device
+        Device to run the function on.
+    collectors: List[MetricCollector]
+        :py:class:`MetricCollector`s which will collect performance information.
+    func_name: str
+        Name of the function in `mod` to profile. Defaults to "main".
+
+    Returns
+    -------
+    prof: PackedFunc[args, Dict[str, ObjectRef]]
+        PackedFunc which takes the same arguments as the `mod[func_name]` and
+        returns performance metrics as a `Dict[str, ObjectRef]` where values
+        can be `CountNode`, `DurationNode`, `PercentNode`.
+    """
+    return _ffi_api.ProfileFunction(mod, func_name, dev.device_type, dev.device_id, collectors)
+
+
 # We only enable this class when TVM is build with PAPI support
 if _ffi.get_global_func("runtime.profiling.PAPIMetricCollector", allow_missing=True) is not None:
 
diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc
index 90d4ac64238f..4b5f8e806f2b 100644
--- a/src/runtime/profiling.cc
+++ b/src/runtime/profiling.cc
@@ -677,6 +677,56 @@ TVM_REGISTER_GLOBAL("runtime.profiling.FromJSON").set_body_typed(Report::FromJSO
 TVM_REGISTER_GLOBAL("runtime.profiling.DeviceWrapper").set_body_typed([](Device dev) {
   return DeviceWrapper(dev);
 });
+
+PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, int device_id,
+                           Array<MetricCollector> collectors) {
+  // Module::GetFunction is not const, so this lambda has to be mutable
+  return PackedFunc([=](TVMArgs args, TVMRetValue* ret) mutable {
+    PackedFunc f = mod.GetFunction(func_name);
+    Device dev{static_cast<DLDeviceType>(device_type), device_id};
+
+    // warmup
+    for (size_t i = 0; i < 10; i++) {
+      f.CallPacked(args, ret);
+    }
+
+    for (auto& collector : collectors) {
+      collector->Init({DeviceWrapper(dev)});
+    }
+    std::vector<ObjectRef> collector_data;
+    for (auto& collector : collectors) {
+      collector_data.push_back(collector->Start(dev));
+    }
+    // TODO(tkonolige): repeated calls if the runtime is small?
+    f.CallPacked(args, ret);
+    std::unordered_map<String, ObjectRef> results;
+    for (size_t i = 0; i < collectors.size(); i++) {
+      auto r = collectors[i]->Stop(collector_data[i]);
+      // We might want to do this in a separate loop to avoid unnecessary time
+      // spent before stopping subsequent collectors.
+      for (auto kv : r) {
+        results[kv.first] = kv.second;
+      }
+    }
+    *ret = Map<String, ObjectRef>(results);
+  });
+}
+
+TVM_REGISTER_GLOBAL("runtime.profiling.ProfileFunction")
+    .set_body_typed<PackedFunc(Module, String, int, int,
+                               Array<MetricCollector>)>([](Module mod, String func_name,
+                                                           int device_type, int device_id,
+                                                           Array<MetricCollector> collectors) {
+      if (mod->type_key() == std::string("rpc")) {
+        LOG(FATAL)
+            << "Profiling a module over RPC is not yet supported";  // because we can't send
+                                                                    // MetricCollectors over rpc.
+        throw;
+      } else {
+        return ProfileFunction(mod, func_name, device_type, device_id, collectors);
+      }
+    });
+
 }  // namespace profiling
 }  // namespace runtime
 }  // namespace tvm
diff --git a/tests/python/unittest/test_runtime_profiling.py b/tests/python/unittest/test_runtime_profiling.py
index 4e777435429b..7fa40ea29663 100644
--- a/tests/python/unittest/test_runtime_profiling.py
+++ b/tests/python/unittest/test_runtime_profiling.py
@@ -29,6 +29,7 @@
 from tvm import rpc
 from tvm.contrib import utils
 from tvm.runtime.profiling import Report
+from tvm.script import tir as T
 
 
 def read_csv(report):
@@ -195,6 +196,52 @@ def test_report_serialization():
     )
 
 
+@T.prim_func
+def axpy_cpu(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, [10], "float64")
+    B = T.match_buffer(b, [10], "float64")
+    C = T.match_buffer(c, [10], "float64")
+    for i in range(10):
+        C[i] = A[i] + B[i]
+
+
+@T.prim_func
+def axpy_gpu(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, [10], "float64")
+    B = T.match_buffer(b, [10], "float64")
+    C = T.match_buffer(c, [10], "float64")
+    for i in T.thread_binding(0, 10, "threadIdx.x"):
+        C[i] = A[i] + B[i]
+
+
+@tvm.testing.parametrize_targets("cuda", "llvm")
+@pytest.mark.skipif(
+    tvm.get_global_func("runtime.profiling.PAPIMetricCollector", allow_missing=True) is None,
+    reason="PAPI profiling not enabled",
+)
+def test_profile_function(target, dev):
+    target = tvm.target.Target(target)
+    if str(target.kind) == "llvm":
+        metric = "PAPI_FP_OPS"
+        func = axpy_cpu
+    elif str(target.kind) == "cuda":
+        metric = (
+            "cuda:::gpu__compute_memory_access_throughput.max.pct_of_peak_sustained_region:device=0"
+        )
+        func = axpy_gpu
+    else:
+        pytest.skip(f"Target {target.kind} not supported by this test")
+    f = tvm.build(func, target=target)
+    a = tvm.nd.array(np.ones(10), device=dev)
+    b = tvm.nd.array(np.ones(10), device=dev)
+    c = tvm.nd.array(np.zeros(10), device=dev)
+    report = tvm.runtime.profiling.profile_function(
+        f, dev, [tvm.runtime.profiling.PAPIMetricCollector({dev: [metric]})]
+    )(a, b, c)
+    assert metric in report.keys()
+    assert report[metric].value > 0
+
+
 if __name__ == "__main__":
     import sys
     import pytest

From afe9720a192980a1c717f5389e324d17aab1013e Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tkonolige@octoml.ai>
Date: Tue, 23 Nov 2021 10:02:56 -0800
Subject: [PATCH 2/3] fix docs

---
 include/tvm/runtime/profiling.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h
index 7d08e690f75c..40e1099e3657 100644
--- a/include/tvm/runtime/profiling.h
+++ b/include/tvm/runtime/profiling.h
@@ -500,7 +500,7 @@ String ShapeString(const std::vector<int64_t>& shape, DLDataType dtype);
  *                    metrics specific to this device type.
  * \param device_id Id of device to run on.
  * \param collectors List of different
- *                   ways to collect metrics. See @MetricCollector.
+ *                   ways to collect metrics. See MetricCollector.
  * \returns A PackedFunc which takes the same arguments as the `mod[func_name]`
  *          and returns performance metrics as a `Map<String, ObjectRef>` where
  *          values can be `CountNode`, `DurationNode`, `PercentNode`.

From 83fde5576b0da65f3c5c657ca420d8ce7955ac7b Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tkonolige@octoml.ai>
Date: Mon, 29 Nov 2021 15:16:11 -0800
Subject: [PATCH 3/3] configurable number of warmup iterations. avoid
 allocating when stopping collectors

---
 include/tvm/runtime/profiling.h          |  5 +++-
 python/tvm/runtime/profiling/__init__.py | 11 +++++++--
 src/runtime/profiling.cc                 | 29 +++++++++++++++---------
 3 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h
index 40e1099e3657..606bf502c195 100644
--- a/include/tvm/runtime/profiling.h
+++ b/include/tvm/runtime/profiling.h
@@ -499,6 +499,9 @@ String ShapeString(const std::vector<int64_t>& shape, DLDataType dtype);
  * \param device_type Device type to run on. Profiling will include performance
  *                    metrics specific to this device type.
  * \param device_id Id of device to run on.
+ * \param warmup_iters Number of iterations of the function to run before collecting
+ *                     performance information. Recommend to set this larger
+ *                     than 0 so that cache effects are consistent.
  * \param collectors List of different
  *                   ways to collect metrics. See MetricCollector.
  * \returns A PackedFunc which takes the same arguments as the `mod[func_name]`
@@ -506,7 +509,7 @@ String ShapeString(const std::vector<int64_t>& shape, DLDataType dtype);
  *          values can be `CountNode`, `DurationNode`, `PercentNode`.
  */
 PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, int device_id,
-                           Array<MetricCollector> collectors);
+                           int warmup_iters, Array<MetricCollector> collectors);
 
 }  // namespace profiling
 }  // namespace runtime
diff --git a/python/tvm/runtime/profiling/__init__.py b/python/tvm/runtime/profiling/__init__.py
index 6a69152a31b8..86145ce6242f 100644
--- a/python/tvm/runtime/profiling/__init__.py
+++ b/python/tvm/runtime/profiling/__init__.py
@@ -163,7 +163,7 @@ def __init__(self, dev: Device):
         self.__init_handle_by_constructor__(_ffi_api.DeviceWrapper, dev)
 
 
-def profile_function(mod, dev, collectors, func_name="main"):
+def profile_function(mod, dev, collectors, func_name="main", warmup_iters=10):
     """Collect performance information of a function execution. Usually used with
     a compiled PrimFunc.
 
@@ -191,10 +191,15 @@ def profile_function(mod, dev, collectors, func_name="main"):
         Module containing the function to profile.
     dev: Device
         Device to run the function on.
+
     collectors: List[MetricCollector]
         :py:class:`MetricCollector`s which will collect performance information.
     func_name: str
         Name of the function in `mod` to profile. Defaults to "main".
+    warmup_iters: int
+        Number of iterations to run the function before collecting performance
+        information. Recommended to set this larger than 0 for consistent cache
+        effects. Defaults to 10.
 
     Returns
     -------
@@ -203,7 +208,9 @@ def profile_function(mod, dev, collectors, func_name="main"):
         returns performance metrics as a `Dict[str, ObjectRef]` where values
         can be `CountNode`, `DurationNode`, `PercentNode`.
     """
-    return _ffi_api.ProfileFunction(mod, func_name, dev.device_type, dev.device_id, collectors)
+    return _ffi_api.ProfileFunction(
+        mod, func_name, dev.device_type, dev.device_id, warmup_iters, collectors
+    )
 
 
 # We only enable this class when TVM is build with PAPI support
diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc
index 4b5f8e806f2b..000f6eac27ae 100644
--- a/src/runtime/profiling.cc
+++ b/src/runtime/profiling.cc
@@ -679,43 +679,50 @@ TVM_REGISTER_GLOBAL("runtime.profiling.DeviceWrapper").set_body_typed([](Device
 });
 
 PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, int device_id,
-                           Array<MetricCollector> collectors) {
+                           int warmup_iters, Array<MetricCollector> collectors) {
   // Module::GetFunction is not const, so this lambda has to be mutable
   return PackedFunc([=](TVMArgs args, TVMRetValue* ret) mutable {
     PackedFunc f = mod.GetFunction(func_name);
     Device dev{static_cast<DLDeviceType>(device_type), device_id};
 
     // warmup
-    for (size_t i = 0; i < 10; i++) {
+    for (int i = 0; i < warmup_iters; i++) {
       f.CallPacked(args, ret);
     }
 
     for (auto& collector : collectors) {
       collector->Init({DeviceWrapper(dev)});
     }
+    std::vector<Map<String, ObjectRef>> results;
+    results.reserve(collectors.size());
     std::vector<ObjectRef> collector_data;
+    collector_data.reserve(collectors.size());
     for (auto& collector : collectors) {
       collector_data.push_back(collector->Start(dev));
     }
+
     // TODO(tkonolige): repeated calls if the runtime is small?
     f.CallPacked(args, ret);
-    std::unordered_map<String, ObjectRef> results;
+
     for (size_t i = 0; i < collectors.size(); i++) {
-      auto r = collectors[i]->Stop(collector_data[i]);
-      // We might want to do this in a separate loop to avoid unnecessary time
-      // spent before stopping subsequent collectors.
-      for (auto kv : r) {
-        results[kv.first] = kv.second;
+      results.push_back(collectors[i]->Stop(collector_data[i]));
+    }
+    Map<String, ObjectRef> combined_results;
+    for (auto m : results) {
+      for (auto p : m) {
+        // assume that there is no shared metric name between collectors
+        combined_results.Set(p.first, p.second);
       }
     }
-    *ret = Map<String, ObjectRef>(results);
+    *ret = combined_results;
   });
 }
 
 TVM_REGISTER_GLOBAL("runtime.profiling.ProfileFunction")
-    .set_body_typed<PackedFunc(Module, String, int, int,
+    .set_body_typed<PackedFunc(Module, String, int, int, int,
                                Array<MetricCollector>)>([](Module mod, String func_name,
                                                            int device_type, int device_id,
+                                                           int warmup_iters,
                                                            Array<MetricCollector> collectors) {
       if (mod->type_key() == std::string("rpc")) {
         LOG(FATAL)
@@ -723,7 +730,7 @@ TVM_REGISTER_GLOBAL("runtime.profiling.ProfileFunction")
                                                                     // MetricCollectors over rpc.
         throw;
       } else {
-        return ProfileFunction(mod, func_name, device_type, device_id, collectors);
+        return ProfileFunction(mod, func_name, device_type, device_id, warmup_iters, collectors);
       }
     });