apache · junrushao · May 27, 2022 · May 25, 2022 · May 26, 2022 · May 27, 2022
diff --git a/python/tvm/contrib/cc.py b/python/tvm/contrib/cc.py
@@ -19,7 +19,6 @@
 import sys
 import os
 import subprocess
-import logging
 
 from .._ffi.base import py_str
 
@@ -239,7 +238,6 @@ def _linux_compile(output, objects, options, compile_cmd, compile_shared=False):
         cmd += objects
     if options:
         cmd += options
-    logging.info("invoking '%s'", cmd)
     proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
     (out, _) = proc.communicate()
     if proc.returncode != 0:
@@ -266,7 +264,6 @@ def _windows_compile(output, objects, options):
         cmd += options
 
     try:
-        logging.info("invoking '%s'", cmd)
         proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
         (out, _) = proc.communicate()
     except FileNotFoundError:

diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py
@@ -21,7 +21,6 @@
 import subprocess
 import os
 import warnings
-import logging
 
 import tvm._ffi
 from tvm.target import Target
@@ -103,7 +102,6 @@ def compile_cuda(code, target_format="ptx", arch=None, options=None, path_target
     # if cxx_compiler_path != "":
     #    cmd += ["-ccbin", cxx_compiler_path]
 
-    logging.info("invoking '%s'", cmd)
     proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
 
     (out, _) = proc.communicate()

diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc
@@ -164,11 +164,22 @@ class CUDAWrappedFunc {
   void operator()(TVMArgs args, TVMRetValue* rv, void** void_args) const {
     int device_id;
     CUDA_CALL(cudaGetDevice(&device_id));
+    ThreadWorkLoad wl = launch_param_config_.Extract(args);
+
     if (fcache_[device_id] == nullptr) {
       fcache_[device_id] = m_->GetFunc(device_id, func_name_);
+      if (wl.dyn_shmem_size >= (48 << 10)) {
+        // Assumption: dyn_shmem_size doesn't change across different invocations of
+        // fcache_[device_id]
+        CUresult result = cuFuncSetAttribute(
+            fcache_[device_id], CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, wl.dyn_shmem_size);
+        if (result != CUDA_SUCCESS) {
+          LOG(FATAL) << "Failed to set the allowed dynamic shared memory size to "
+                     << wl.dyn_shmem_size;
+        }
+      }
     }
     CUstream strm = static_cast<CUstream>(CUDAThreadEntry::ThreadLocal()->stream);
-    ThreadWorkLoad wl = launch_param_config_.Extract(args);
     CUresult result = cuLaunchKernel(fcache_[device_id], wl.grid_dim(0), wl.grid_dim(1),
                                      wl.grid_dim(2), wl.block_dim(0), wl.block_dim(1),
                                      wl.block_dim(2), wl.dyn_shmem_size, strm, void_args, nullptr);

diff --git a/tests/python/contrib/test_ethosu/cascader/test_scheduler.py b/tests/python/contrib/test_ethosu/cascader/test_scheduler.py
@@ -48,6 +48,7 @@ def test_cascade(SRAM, FLASH, TwoConv2DWithSliceTE, TwoConv2DTE, MobileNetv1Star
         cs.cascade(sch, te_graph, const_dict, options, SRAM, FLASH, [SRAM], device_config)
 
 
+@pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/11483")
 def test_compute_cycles_annotation(SRAM, FLASH, TwoConv2DTE):
     device_config = cs.EthosuDeviceConfig("ethos-u55-256")
     options = infra.make_options(