From 327fcaa00d42d69c2fb3e4a8251ccb96439609d6 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Thu, 11 Nov 2021 21:20:56 -0800 Subject: [PATCH 1/3] [PROFILING] Add ability to profile a single function_profiling Add a new function `tvm.runtime.profiling.profile_function` which collects performance metrics for a single function in an IRModule. For example, collecting performance counters using `PAPIMetricCollector`. This is helpful for optimizing kernels and schedules for a single operator. --- include/tvm/runtime/profiling.h | 31 ++++++++++++ python/tvm/runtime/profiling/__init__.py | 43 ++++++++++++++++ src/runtime/profiling.cc | 50 +++++++++++++++++++ .../python/unittest/test_runtime_profiling.py | 47 +++++++++++++++++ 4 files changed, 171 insertions(+) diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h index 366f4f1deed1..7d08e690f75c 100644 --- a/include/tvm/runtime/profiling.h +++ b/include/tvm/runtime/profiling.h @@ -477,6 +477,37 @@ String ShapeString(NDArray shape, DLDataType dtype); */ String ShapeString(const std::vector& shape, DLDataType dtype); +/*! \brief Collect performance information of a function execution. Usually + * used with a compiled PrimFunc (via tvm.build). + * + * This information can include performance counters like cache hits and FLOPs + * that are useful in debugging performance issues of individual PrimFuncs. + * Different metrics can be collected depending on which MetricCollector is + * used. + * + * Example usage: + * \code{.cpp} + * // Use PAPI to measure the number of floating point operations. + * PackedFunc profiler = ProfileModule( + * mod, "main", kDLCPU, 0, {CreatePAPIMetricCollector({{kDLCPU, 0}, {"PAPI_FP_OPS"}})}); + * Report r = profiler(arg1, arg2, arg); + * std::cout << r << std::endl; + * \endcode + * + * \param mod Module to profile. Usually a PrimFunc that has been compiled to machine code. + * \param func_name Name of function to run in the module. + * \param device_type Device type to run on. Profiling will include performance + * metrics specific to this device type. + * \param device_id Id of device to run on. + * \param collectors List of different + * ways to collect metrics. See @MetricCollector. + * \returns A PackedFunc which takes the same arguments as the `mod[func_name]` + * and returns performance metrics as a `Map` where + * values can be `CountNode`, `DurationNode`, `PercentNode`. + */ +PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, int device_id, + Array collectors); + } // namespace profiling } // namespace runtime } // namespace tvm diff --git a/python/tvm/runtime/profiling/__init__.py b/python/tvm/runtime/profiling/__init__.py index 7d40a81e498a..6a69152a31b8 100644 --- a/python/tvm/runtime/profiling/__init__.py +++ b/python/tvm/runtime/profiling/__init__.py @@ -163,6 +163,49 @@ def __init__(self, dev: Device): self.__init_handle_by_constructor__(_ffi_api.DeviceWrapper, dev) +def profile_function(mod, dev, collectors, func_name="main"): + """Collect performance information of a function execution. Usually used with + a compiled PrimFunc. + + This information can include performance counters like cache hits and FLOPs + that are useful in debugging performance issues of individual PrimFuncs. + Different metrics can be collected depending on which MetricCollector is + used. + + Example + ------- + + .. code-block: python + f = tvm.build(my_func, target="llvm", name="my_func") + prof = tvm.runtime.profiling.profile_function( + f, + tvm.cpu(), + [tvm.runtime.profiling.PAPIMetricCollector({tvm.cpu(): ["PAPI_FP_OPS"]}), + ) + counters = prof(*args) + print(counters) + + Parameters + ---------- + mod: Module + Module containing the function to profile. + dev: Device + Device to run the function on. + collectors: List[MetricCollector] + :py:class:`MetricCollector`s which will collect performance information. + func_name: str + Name of the function in `mod` to profile. Defaults to "main". + + Returns + ------- + prof: PackedFunc[args, Dict[str, ObjectRef]] + PackedFunc which takes the same arguments as the `mod[func_name]` and + returns performance metrics as a `Dict[str, ObjectRef]` where values + can be `CountNode`, `DurationNode`, `PercentNode`. + """ + return _ffi_api.ProfileFunction(mod, func_name, dev.device_type, dev.device_id, collectors) + + # We only enable this class when TVM is build with PAPI support if _ffi.get_global_func("runtime.profiling.PAPIMetricCollector", allow_missing=True) is not None: diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index 90d4ac64238f..4b5f8e806f2b 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -677,6 +677,56 @@ TVM_REGISTER_GLOBAL("runtime.profiling.FromJSON").set_body_typed(Report::FromJSO TVM_REGISTER_GLOBAL("runtime.profiling.DeviceWrapper").set_body_typed([](Device dev) { return DeviceWrapper(dev); }); + +PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, int device_id, + Array collectors) { + // Module::GetFunction is not const, so this lambda has to be mutable + return PackedFunc([=](TVMArgs args, TVMRetValue* ret) mutable { + PackedFunc f = mod.GetFunction(func_name); + Device dev{static_cast(device_type), device_id}; + + // warmup + for (size_t i = 0; i < 10; i++) { + f.CallPacked(args, ret); + } + + for (auto& collector : collectors) { + collector->Init({DeviceWrapper(dev)}); + } + std::vector collector_data; + for (auto& collector : collectors) { + collector_data.push_back(collector->Start(dev)); + } + // TODO(tkonolige): repeated calls if the runtime is small? + f.CallPacked(args, ret); + std::unordered_map results; + for (size_t i = 0; i < collectors.size(); i++) { + auto r = collectors[i]->Stop(collector_data[i]); + // We might want to do this in a separate loop to avoid unnecessary time + // spent before stopping subsequent collectors. + for (auto kv : r) { + results[kv.first] = kv.second; + } + } + *ret = Map(results); + }); +} + +TVM_REGISTER_GLOBAL("runtime.profiling.ProfileFunction") + .set_body_typed)>([](Module mod, String func_name, + int device_type, int device_id, + Array collectors) { + if (mod->type_key() == std::string("rpc")) { + LOG(FATAL) + << "Profiling a module over RPC is not yet supported"; // because we can't send + // MetricCollectors over rpc. + throw; + } else { + return ProfileFunction(mod, func_name, device_type, device_id, collectors); + } + }); + } // namespace profiling } // namespace runtime } // namespace tvm diff --git a/tests/python/unittest/test_runtime_profiling.py b/tests/python/unittest/test_runtime_profiling.py index 4e777435429b..7fa40ea29663 100644 --- a/tests/python/unittest/test_runtime_profiling.py +++ b/tests/python/unittest/test_runtime_profiling.py @@ -29,6 +29,7 @@ from tvm import rpc from tvm.contrib import utils from tvm.runtime.profiling import Report +from tvm.script import tir as T def read_csv(report): @@ -195,6 +196,52 @@ def test_report_serialization(): ) +@T.prim_func +def axpy_cpu(a: T.handle, b: T.handle, c: T.handle) -> None: + A = T.match_buffer(a, [10], "float64") + B = T.match_buffer(b, [10], "float64") + C = T.match_buffer(c, [10], "float64") + for i in range(10): + C[i] = A[i] + B[i] + + +@T.prim_func +def axpy_gpu(a: T.handle, b: T.handle, c: T.handle) -> None: + A = T.match_buffer(a, [10], "float64") + B = T.match_buffer(b, [10], "float64") + C = T.match_buffer(c, [10], "float64") + for i in T.thread_binding(0, 10, "threadIdx.x"): + C[i] = A[i] + B[i] + + +@tvm.testing.parametrize_targets("cuda", "llvm") +@pytest.mark.skipif( + tvm.get_global_func("runtime.profiling.PAPIMetricCollector", allow_missing=True) is None, + reason="PAPI profiling not enabled", +) +def test_profile_function(target, dev): + target = tvm.target.Target(target) + if str(target.kind) == "llvm": + metric = "PAPI_FP_OPS" + func = axpy_cpu + elif str(target.kind) == "cuda": + metric = ( + "cuda:::gpu__compute_memory_access_throughput.max.pct_of_peak_sustained_region:device=0" + ) + func = axpy_gpu + else: + pytest.skip(f"Target {target.kind} not supported by this test") + f = tvm.build(func, target=target) + a = tvm.nd.array(np.ones(10), device=dev) + b = tvm.nd.array(np.ones(10), device=dev) + c = tvm.nd.array(np.zeros(10), device=dev) + report = tvm.runtime.profiling.profile_function( + f, dev, [tvm.runtime.profiling.PAPIMetricCollector({dev: [metric]})] + )(a, b, c) + assert metric in report.keys() + assert report[metric].value > 0 + + if __name__ == "__main__": import sys import pytest From afe9720a192980a1c717f5389e324d17aab1013e Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Tue, 23 Nov 2021 10:02:56 -0800 Subject: [PATCH 2/3] fix docs --- include/tvm/runtime/profiling.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h index 7d08e690f75c..40e1099e3657 100644 --- a/include/tvm/runtime/profiling.h +++ b/include/tvm/runtime/profiling.h @@ -500,7 +500,7 @@ String ShapeString(const std::vector& shape, DLDataType dtype); * metrics specific to this device type. * \param device_id Id of device to run on. * \param collectors List of different - * ways to collect metrics. See @MetricCollector. + * ways to collect metrics. See MetricCollector. * \returns A PackedFunc which takes the same arguments as the `mod[func_name]` * and returns performance metrics as a `Map` where * values can be `CountNode`, `DurationNode`, `PercentNode`. From 83fde5576b0da65f3c5c657ca420d8ce7955ac7b Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Mon, 29 Nov 2021 15:16:11 -0800 Subject: [PATCH 3/3] configurable number of warmup iterations. avoid allocating when stopping collectors --- include/tvm/runtime/profiling.h | 5 +++- python/tvm/runtime/profiling/__init__.py | 11 +++++++-- src/runtime/profiling.cc | 29 +++++++++++++++--------- 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h index 40e1099e3657..606bf502c195 100644 --- a/include/tvm/runtime/profiling.h +++ b/include/tvm/runtime/profiling.h @@ -499,6 +499,9 @@ String ShapeString(const std::vector& shape, DLDataType dtype); * \param device_type Device type to run on. Profiling will include performance * metrics specific to this device type. * \param device_id Id of device to run on. + * \param warmup_iters Number of iterations of the function to run before collecting + * performance information. Recommend to set this larger + * than 0 so that cache effects are consistent. * \param collectors List of different * ways to collect metrics. See MetricCollector. * \returns A PackedFunc which takes the same arguments as the `mod[func_name]` @@ -506,7 +509,7 @@ String ShapeString(const std::vector& shape, DLDataType dtype); * values can be `CountNode`, `DurationNode`, `PercentNode`. */ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, int device_id, - Array collectors); + int warmup_iters, Array collectors); } // namespace profiling } // namespace runtime diff --git a/python/tvm/runtime/profiling/__init__.py b/python/tvm/runtime/profiling/__init__.py index 6a69152a31b8..86145ce6242f 100644 --- a/python/tvm/runtime/profiling/__init__.py +++ b/python/tvm/runtime/profiling/__init__.py @@ -163,7 +163,7 @@ def __init__(self, dev: Device): self.__init_handle_by_constructor__(_ffi_api.DeviceWrapper, dev) -def profile_function(mod, dev, collectors, func_name="main"): +def profile_function(mod, dev, collectors, func_name="main", warmup_iters=10): """Collect performance information of a function execution. Usually used with a compiled PrimFunc. @@ -191,10 +191,15 @@ def profile_function(mod, dev, collectors, func_name="main"): Module containing the function to profile. dev: Device Device to run the function on. + collectors: List[MetricCollector] :py:class:`MetricCollector`s which will collect performance information. func_name: str Name of the function in `mod` to profile. Defaults to "main". + warmup_iters: int + Number of iterations to run the function before collecting performance + information. Recommended to set this larger than 0 for consistent cache + effects. Defaults to 10. Returns ------- @@ -203,7 +208,9 @@ def profile_function(mod, dev, collectors, func_name="main"): returns performance metrics as a `Dict[str, ObjectRef]` where values can be `CountNode`, `DurationNode`, `PercentNode`. """ - return _ffi_api.ProfileFunction(mod, func_name, dev.device_type, dev.device_id, collectors) + return _ffi_api.ProfileFunction( + mod, func_name, dev.device_type, dev.device_id, warmup_iters, collectors + ) # We only enable this class when TVM is build with PAPI support diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index 4b5f8e806f2b..000f6eac27ae 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -679,43 +679,50 @@ TVM_REGISTER_GLOBAL("runtime.profiling.DeviceWrapper").set_body_typed([](Device }); PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, int device_id, - Array collectors) { + int warmup_iters, Array collectors) { // Module::GetFunction is not const, so this lambda has to be mutable return PackedFunc([=](TVMArgs args, TVMRetValue* ret) mutable { PackedFunc f = mod.GetFunction(func_name); Device dev{static_cast(device_type), device_id}; // warmup - for (size_t i = 0; i < 10; i++) { + for (int i = 0; i < warmup_iters; i++) { f.CallPacked(args, ret); } for (auto& collector : collectors) { collector->Init({DeviceWrapper(dev)}); } + std::vector> results; + results.reserve(collectors.size()); std::vector collector_data; + collector_data.reserve(collectors.size()); for (auto& collector : collectors) { collector_data.push_back(collector->Start(dev)); } + // TODO(tkonolige): repeated calls if the runtime is small? f.CallPacked(args, ret); - std::unordered_map results; + for (size_t i = 0; i < collectors.size(); i++) { - auto r = collectors[i]->Stop(collector_data[i]); - // We might want to do this in a separate loop to avoid unnecessary time - // spent before stopping subsequent collectors. - for (auto kv : r) { - results[kv.first] = kv.second; + results.push_back(collectors[i]->Stop(collector_data[i])); + } + Map combined_results; + for (auto m : results) { + for (auto p : m) { + // assume that there is no shared metric name between collectors + combined_results.Set(p.first, p.second); } } - *ret = Map(results); + *ret = combined_results; }); } TVM_REGISTER_GLOBAL("runtime.profiling.ProfileFunction") - .set_body_typed)>([](Module mod, String func_name, int device_type, int device_id, + int warmup_iters, Array collectors) { if (mod->type_key() == std::string("rpc")) { LOG(FATAL) @@ -723,7 +730,7 @@ TVM_REGISTER_GLOBAL("runtime.profiling.ProfileFunction") // MetricCollectors over rpc. throw; } else { - return ProfileFunction(mod, func_name, device_type, device_id, collectors); + return ProfileFunction(mod, func_name, device_type, device_id, warmup_iters, collectors); } });