From 3cbf3c4b57d390337bd0eec861d451b039c9beab Mon Sep 17 00:00:00 2001 From: Jacob Bohlin Date: Fri, 7 Jan 2022 17:28:34 +0100 Subject: [PATCH 1/2] [microNPU] Performance model bugfixes * Fixed incorrect num_blocks calculations for both BufferModes. * Fixed similar issues with Read/Write byte calculations. * Fixed an issue where the 'partkernel' flag was not propagated to the performance estimation code. * Fixed the single buffering check, which incorrectly used the output shape and block rather than the input shape and block. * Fixed the block config not being aligned to the micro block for Elementwise. Change-Id: Ide6b231bc1a17c65bed20129d2179a215ada14b2 --- .../contrib/ethosu/cascader/device_config.py | 49 ++++++++++--------- src/contrib/ethosu/cascader/parts/ethosu.cc | 24 ++++----- .../cascader/test_ethosu_block_config.py | 4 +- .../test_ethosu/cascader/test_ethosu_part.py | 2 + .../cascader/test_ethosu_part_performance.py | 1 + 5 files changed, 45 insertions(+), 35 deletions(-) diff --git a/python/tvm/contrib/ethosu/cascader/device_config.py b/python/tvm/contrib/ethosu/cascader/device_config.py index 5abdb302234b..39d171c87d39 100644 --- a/python/tvm/contrib/ethosu/cascader/device_config.py +++ b/python/tvm/contrib/ethosu/cascader/device_config.py @@ -288,7 +288,7 @@ def _get_input_block( input_shape: _Shape, dtype: str, op_type: str, - is_partkernel: bool, + partkernel: bool, stride_h: int, stride_w: int, dilated_kernel_h: int, @@ -310,7 +310,7 @@ def _get_input_block( if op_type == "ethosu_conv2d": if dtype == "int8": - if is_partkernel: + if partkernel: depth = self._align(min(32, input_shape.depth), 8) else: depth = self._align(min(16, input_shape.depth), 8) @@ -336,7 +336,7 @@ def get_kernel_steps( dilated_kernel_h: int, dilated_kernel_w: int, ifm_dtype: str, - is_partkernel: bool = False, + partkernel: bool = False, ) -> List[int]: """Calculate the total number of subkernels and their sizes @@ -351,7 +351,7 @@ def get_kernel_steps( Width of dilated kernel ifm_dtype: str Datatype of 
the Input Feature Map tensor (IFM) - is_partkernel: bool + partkernel: bool Flag showing whether part-kernel first traversal is used Returns @@ -368,7 +368,7 @@ def get_kernel_steps( kernel_steps = [] for y, x in subkernels: subkernel_elements = x * y - if op_type == "ethosu_conv2d" and is_partkernel: + if op_type == "ethosu_conv2d" and partkernel: # Part-kernel-first traversal conv2d divisor = 4 if ifm_dtype == "int8" else 2 kernel_steps.append(int(_round_up_div(subkernel_elements, divisor))) @@ -509,29 +509,31 @@ def get_elementwise_block_config( banks_available -= 2 # Split the block in half until it fits into SHRAM + max_height, max_width, max_depth = self._max_block_shape.as_list()[1:] if output_layout == "NHCWB16": split_order = (a for a in [1, 3, 2]) output_block = [ output_shape[0], - min(output_shape[1], self._max_block_shape.height), - min(output_shape[2] * output_shape[4], self._max_block_shape.depth), - min(output_shape[3], self._max_block_shape.width), + _round_up(min(output_shape[1], max_height), self._micro_block.height), + min(output_shape[2] * output_shape[4], max_width), + _round_up(min(output_shape[3], max_width), self._micro_block.width), 16, ] else: split_order = (a for a in [1, 2, 3]) output_block = [ output_shape[0], - min(output_shape[1], self._max_block_shape.height), - min(output_shape[2], self._max_block_shape.width), - min(output_shape[3], self._max_block_shape.depth), + _round_up(min(output_shape[1], max_height), self._micro_block.height), + _round_up(min(output_shape[2], max_width), self._micro_block.width), + _round_up(min(output_shape[3], max_depth), self._micro_block.depth), ] split_axis = next(split_order) + + offset = [0] * len(output_block) + stripes = [1] * len(output_block) + order = [1, 2, 4, 3, 0] if output_layout == "NHCWB16" else [1, 2, 3, 4] while True: # Create stripe config for output block - offset = [0] * len(output_block) - stripes = [1] * len(output_block) - order = [1, 2, 4, 3, 0] if output_layout == "NHCWB16" else 
[1, 2, 3, 4] output_stripe_config = StripeConfig( output_block, output_block, output_block, order, stripes, offset ) @@ -564,10 +566,12 @@ def get_elementwise_block_config( block_config.append(BlockConfig(output_block, output_block, 0, output_cycles)) break - if output_block[split_axis] == 1: + if output_block[split_axis] == self._micro_block.as_list()[split_axis]: split_axis = next(split_order) - output_block[split_axis] = _round_up_div(output_block[split_axis], 2) + output_block[split_axis] = _round_up( + _round_up_div(output_block[split_axis], 2), self._micro_block.as_list()[split_axis] + ) return block_config @@ -670,9 +674,9 @@ def get_valid_block_configs( # Input block depth has additional limitations for operators that require full input depth input_block_depth = 0 - is_partkernel = self.is_partkernel(op_type, ifm_channels, ifm_dtype, kernel_h * kernel_w) + partkernel = self.is_partkernel(op_type, ifm_channels, ifm_dtype, kernel_h * kernel_w) if op_type == "ethosu_conv2d": - if is_partkernel: + if partkernel: input_block_depth = min(ifm_channels, 16) else: input_block_depth = min(ifm_channels, 32) @@ -745,7 +749,8 @@ def get_valid_block_configs( kernel_h, kernel_w, ifm_channels, - is_partkernel, + "int8", + partkernel, ) block_config = BlockConfig( input_block_shape.as_list(), output_block, compute_cycles, output_cycles @@ -767,7 +772,7 @@ def _estimate_compute_cycles_per_block( kernel_w: int, input_channels: int, ifm_dtype: str, - is_partkernel: bool = False, + partkernel: bool = False, ) -> Tuple[int, int]: # Calculate the amount of micro blocks per block, per axis num_quantum_x = _round_up_div(block_shape.width, self._micro_block.width) @@ -775,7 +780,7 @@ def _estimate_compute_cycles_per_block( num_quantum_z = _round_up_div(block_shape.depth, self._micro_block.depth) num_quantum_xy = num_quantum_x * num_quantum_y - kernel_steps = self.get_kernel_steps(op_type, kernel_h, kernel_w, ifm_dtype, is_partkernel) + kernel_steps = self.get_kernel_steps(op_type, 
kernel_h, kernel_w, ifm_dtype, partkernel) wd_cycles = self._get_weight_decoder_cycles(op_type) delay_cycles = self._get_delay_cycles(op_type, ifm_dtype) @@ -794,7 +799,7 @@ def _estimate_compute_cycles_per_block( elif subkernel_steps > 1: compute_cycles += delay_cycles * (subkernel_steps - 1) * num_quantum_z - if is_partkernel: + if partkernel: compute_cycles *= _round_up_div(input_block_shape.depth, 8) if op_type == "ethosu_conv2d": diff --git a/src/contrib/ethosu/cascader/parts/ethosu.cc b/src/contrib/ethosu/cascader/parts/ethosu.cc index 4bc270750f1a..f9c5a8409fae 100644 --- a/src/contrib/ethosu/cascader/parts/ethosu.cc +++ b/src/contrib/ethosu/cascader/parts/ethosu.cc @@ -74,6 +74,8 @@ const BlockConfig EthosuPartNode::GetBlockConfig(const StripeConfig& output_stri BlockConfig best_block_config; float best_cost = std::numeric_limits::infinity(); std::vector output_stripe_shape = output_stripe_config->GetShape(); + auto input_stripe_configs = CalculateInputStripeConfigs(output_stripe_config); + std::vector input_stripe_shape = input_stripe_configs[0]->GetShape(); for (const auto& block_config : valid_block_configs_) { std::vector output_block = block_config->GetOutputBlockShape(); @@ -86,7 +88,7 @@ const BlockConfig EthosuPartNode::GetBlockConfig(const StripeConfig& output_stri mul_reduce(output_stripe_shape); // Single buffering hardware optimization - if (mul_reduce(output_stripe_shape) <= 2 * mul_reduce(output_block)) { + if (mul_reduce(input_stripe_shape) <= 2 * mul_reduce(block_config->GetInputBlockShape())) { relative_cost /= 2; } @@ -107,25 +109,25 @@ const PerformanceInfo EthosuPartNode::GetPerformanceInfo(const StripeConfig& out std::vector bytes_per_input = GetBytesRead(block_shape, output_stripe_config->GetShape()); - int elements_per_block = mul_reduce(block_shape); - int bytes_per_output = elements_per_block; float num_blocks = 1.0f; for (size_t i = 0; i < block_shape.size(); i++) { if (buffer_mode == BufferMode::RECOMPUTE) { - num_blocks *= 
static_cast(output_stripe_config->GetShape()[i] * - output_stripe_config->GetStripes()[i]) / - block_shape[i]; + num_blocks *= std::max(static_cast(output_stripe_config->GetShape()[i]) / + block_shape[i] * output_stripe_config->GetStripes()[i], + 1.0f); } else { num_blocks *= - std::max(static_cast(output_stripe_config->GetExtent()[i]) / block_shape[i], 1.0f); + std::max(static_cast(output_tensor_->GetShape()[i]) / block_shape[i], 1.0f); } } - float num_stripes = mul_reduce(output_stripe_config->GetStripes()) - 1.0f; + + float num_stripes = mul_reduce(output_stripe_config->GetStripes()); std::vector read_bytes; - for (int block_bytes : bytes_per_input) { - read_bytes.push_back((num_blocks + num_stripes) * block_bytes); + for (int64_t stripe_bytes : bytes_per_input) { + read_bytes.push_back(num_stripes * stripe_bytes); } - int64_t write_bytes = (num_blocks + num_stripes) * bytes_per_output; + int64_t write_bytes = + num_blocks * mul_reduce(block_shape) * output_tensor_->GetDataType().bytes(); int block_output_cycles = block_config->GetOutputCycles(); int block_compute_cycles = block_config->GetComputeCycles(); diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py index 09fd056ce794..ee416a12e158 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py @@ -207,7 +207,7 @@ ((1, 7, 10, 16), (1, 7, 1, 10, 16)), ((1, 7, 6, 16), (1, 7, 1, 6, 16)), # Pooling - ((1, 1, 2, 80), (1, 1, 5, 2, 16)), + ((1, 1, 2, 16), (1, 1, 1, 2, 16)), ((1, 10, 6, 16), (1, 10, 1, 6, 16)), ], ), @@ -225,7 +225,7 @@ ((1, 8, 20, 16), (1, 8, 1, 20, 16)), ((1, 14, 6, 16), (1, 14, 1, 6, 16)), # Pooling - ((1, 2, 2, 48), (1, 2, 3, 2, 16)), + ((1, 2, 2, 16), (1, 2, 1, 2, 16)), ((1, 10, 12, 16), (1, 10, 1, 12, 16)), ], ), diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py 
b/tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py index bf6fb4579bd1..105b6722e8c6 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py @@ -47,6 +47,8 @@ def test_ethosu_part(): ) input_tensor = cs.Tensor(shape=[1, 66, 74, 16], dtype="int8") part.set_input(0, input_tensor) + output_tensor = cs.Tensor(shape=[1, 66, 74, 16], dtype="int8") + part.set_output(output_tensor) assert part.get_stripe_align_hint() == output_quantum # Check that the performance model runs, don't verify output diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py index 60d5fa2a463d..437b0a9ead9d 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py @@ -216,6 +216,7 @@ def test_conv_performance( ) part.set_input(0, cs.Tensor(in_shape, "int8")) part.set_input(1, cs.Tensor([ifm_channels, kernel[0], kernel[1], out_shape[-1]], "int8")) + part.set_output(cs.Tensor(out_shape, "int8")) stripes = [1] * len(output_quantum) offset = [0] * len(output_quantum) From 07e76036733ea3ea14e3b2d401df75a451fb2ff3 Mon Sep 17 00:00:00 2001 From: Jacob Bohlin Date: Wed, 6 Apr 2022 10:28:58 +0200 Subject: [PATCH 2/2] Address review comment Changed incorrect usage of 'max_width' to 'max_depth'. 
--- python/tvm/contrib/ethosu/cascader/device_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/contrib/ethosu/cascader/device_config.py b/python/tvm/contrib/ethosu/cascader/device_config.py index 39d171c87d39..ac20e4a29c18 100644 --- a/python/tvm/contrib/ethosu/cascader/device_config.py +++ b/python/tvm/contrib/ethosu/cascader/device_config.py @@ -515,7 +515,7 @@ def get_elementwise_block_config( output_block = [ output_shape[0], _round_up(min(output_shape[1], max_height), self._micro_block.height), - min(output_shape[2] * output_shape[4], max_width), + min(output_shape[2] * output_shape[4], max_depth), _round_up(min(output_shape[3], max_width), self._micro_block.width), 16, ]