From 2abce99330cd0edd8c2567da315e25be28c1b81d Mon Sep 17 00:00:00 2001
From: Masahiro Masuda <masahi129@gmail.com>
Date: Thu, 26 May 2022 08:48:29 +0900
Subject: [PATCH 1/4] Allow dynamic shmem of size > 48K

---
 src/runtime/cuda/cuda_module.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc
index 7d6879a62aba..2471b3110b1a 100644
--- a/src/runtime/cuda/cuda_module.cc
+++ b/src/runtime/cuda/cuda_module.cc
@@ -164,11 +164,18 @@ class CUDAWrappedFunc {
   void operator()(TVMArgs args, TVMRetValue* rv, void** void_args) const {
     int device_id;
     CUDA_CALL(cudaGetDevice(&device_id));
+    ThreadWorkLoad wl = launch_param_config_.Extract(args);
+
     if (fcache_[device_id] == nullptr) {
       fcache_[device_id] = m_->GetFunc(device_id, func_name_);
+      if (wl.dyn_shmem_size >= (48 << 10)) {
+        // Assumption: dyn_shmem_size doesn't change across different invocations of
+        // fcache_[device_id]
+        cuFuncSetAttribute(fcache_[device_id], CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
+                           wl.dyn_shmem_size);
+      }
     }
     CUstream strm = static_cast<CUstream>(CUDAThreadEntry::ThreadLocal()->stream);
-    ThreadWorkLoad wl = launch_param_config_.Extract(args);
     CUresult result = cuLaunchKernel(fcache_[device_id], wl.grid_dim(0), wl.grid_dim(1),
                                      wl.grid_dim(2), wl.block_dim(0), wl.block_dim(1),
                                      wl.block_dim(2), wl.dyn_shmem_size, strm, void_args, nullptr);

From 5b52cc91d55610b3271b903da22078a53e02bee6 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda <masahi129@gmail.com>
Date: Fri, 27 May 2022 06:40:23 +0900
Subject: [PATCH 2/4] add error msg

---
 src/runtime/cuda/cuda_module.cc | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc
index 2471b3110b1a..739875fe850f 100644
--- a/src/runtime/cuda/cuda_module.cc
+++ b/src/runtime/cuda/cuda_module.cc
@@ -171,8 +171,12 @@ class CUDAWrappedFunc {
       if (wl.dyn_shmem_size >= (48 << 10)) {
         // Assumption: dyn_shmem_size doesn't change across different invocations of
         // fcache_[device_id]
-        cuFuncSetAttribute(fcache_[device_id], CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
-                           wl.dyn_shmem_size);
+        CUresult result = cuFuncSetAttribute(
+            fcache_[device_id], CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, wl.dyn_shmem_size);
+        if (result != CUDA_SUCCESS) {
+          LOG(FATAL) << "Failed to set the allowed dynamic shared memory size to "
+                     << wl.dyn_shmem_size;
+        }
       }
     }
     CUstream strm = static_cast<CUstream>(CUDAThreadEntry::ThreadLocal()->stream);

From 10f01818598d5c03c896ae70fa5d7f2167b82606 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda <masahi129@gmail.com>
Date: Fri, 27 May 2022 19:48:37 +0900
Subject: [PATCH 3/4] skip cascader test

---
 tests/python/contrib/test_ethosu/cascader/test_scheduler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/python/contrib/test_ethosu/cascader/test_scheduler.py b/tests/python/contrib/test_ethosu/cascader/test_scheduler.py
index c97cfeb7a991..b3610315441e 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_scheduler.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_scheduler.py
@@ -48,6 +48,7 @@ def test_cascade(SRAM, FLASH, TwoConv2DWithSliceTE, TwoConv2DTE, MobileNetv1Star
         cs.cascade(sch, te_graph, const_dict, options, SRAM, FLASH, [SRAM], device_config)
 
 
+@pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/11483")
 def test_compute_cycles_annotation(SRAM, FLASH, TwoConv2DTE):
     device_config = cs.EthosuDeviceConfig("ethos-u55-256")
     options = infra.make_options(

From 4aa41f86e106b0d73467dfb633146729bb888d1f Mon Sep 17 00:00:00 2001
From: Masahiro Masuda <masahi129@gmail.com>
Date: Fri, 27 May 2022 19:48:22 +0900
Subject: [PATCH 4/4] suppress logging

---
 python/tvm/contrib/cc.py   | 3 ---
 python/tvm/contrib/nvcc.py | 2 --
 2 files changed, 5 deletions(-)

diff --git a/python/tvm/contrib/cc.py b/python/tvm/contrib/cc.py
index ec40ef3189d1..867cbd601256 100644
--- a/python/tvm/contrib/cc.py
+++ b/python/tvm/contrib/cc.py
@@ -19,7 +19,6 @@
 import sys
 import os
 import subprocess
-import logging
 
 from .._ffi.base import py_str
 
@@ -239,7 +238,6 @@ def _linux_compile(output, objects, options, compile_cmd, compile_shared=False):
         cmd += objects
     if options:
         cmd += options
-    logging.info("invoking '%s'", cmd)
     proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
     (out, _) = proc.communicate()
     if proc.returncode != 0:
@@ -266,7 +264,6 @@ def _windows_compile(output, objects, options):
         cmd += options
 
     try:
-        logging.info("invoking '%s'", cmd)
         proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
         (out, _) = proc.communicate()
     except FileNotFoundError:
diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py
index 33a32c9c0047..5a104be9966d 100644
--- a/python/tvm/contrib/nvcc.py
+++ b/python/tvm/contrib/nvcc.py
@@ -21,7 +21,6 @@
 import subprocess
 import os
 import warnings
-import logging
 
 import tvm._ffi
 from tvm.target import Target
@@ -103,7 +102,6 @@ def compile_cuda(code, target_format="ptx", arch=None, options=None, path_target
     # if cxx_compiler_path != "":
     #    cmd += ["-ccbin", cxx_compiler_path]
 
-    logging.info("invoking '%s'", cmd)
     proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
 
     (out, _) = proc.communicate()