diff --git a/include/tvm/runtime/debug.h b/include/tvm/runtime/debug.h
new file mode 100644
index 000000000000..29d812b74dd8
--- /dev/null
+++ b/include/tvm/runtime/debug.h
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm/runtime/debug.h
+ * \brief Helpers for debugging at runtime.
+ */
+#ifndef TVM_RUNTIME_DEBUG_H_
+#define TVM_RUNTIME_DEBUG_H_
+
+#include <tvm/runtime/container/adt.h>
+#include <tvm/runtime/ndarray.h>
+
+#include <ostream>
+#include <string>
+
+namespace tvm {
+namespace runtime {
+
+/*!
+ * \brief Helpers to describe runtime objects in human-friendly form. For \p nd_array we show its
+ * shape, dtype and device, but also its contents if \p show_contents, 'small' and on the
+ * \p host_device (mostly so that we can see dynamic shapes as they are computed). For \p adt we
+ * show the ADT tag and fields. For \p object we dispatch to one of the above as appropriate.
+ */
+void AppendNDArray(std::ostream& os, const NDArray& nd_array, const DLDevice& host_device,
+                   bool show_contents = true);
+void AppendADT(std::ostream& os, const ADT& adt, const DLDevice& host_device,
+               bool show_contents = true);
+void AppendRuntimeObject(std::ostream& os, const ObjectRef& object, const DLDevice& host_device,
+                         bool show_contents = true);
+std::string RuntimeObject2String(const ObjectRef& object, const DLDevice& host_device,
+                                 bool show_contents = true);
+
+}  // namespace runtime
+}  // namespace tvm
+
+#endif  // TVM_RUNTIME_DEBUG_H_
diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
index 20b883ba2616..963bb3d55693 100644
--- a/python/tvm/relay/op/tensor.py
+++ b/python/tvm/relay/op/tensor.py
@@ -1178,8 +1178,18 @@ def copy(data):
 
 
 @script
-def _copy_shape_func(data_shape):
-    return data_shape
+def _copy_shape_func_tensor(data_shape):
+    ndim = data_shape.shape[0]  # length of data_shape along its first axis
+    out = output_tensor((ndim,), "int64")  # int64 vector with one entry per entry of data_shape
+    for i in const_range(ndim):
+        out[i] = data_shape[i]  # element-wise copy of data_shape into the output
+    return out
+
+
+@script
+def _copy_shape_func_scalar(data_shape):
+    out = output_tensor((), "int64")  # rank-0 int64 output; data_shape is not read here
+    return out
 
 
 @reg.register_shape_func("copy", False)
@@ -1187,7 +1197,10 @@ def copy_shape_func(attrs, inputs, _):
     """
     Shape function for copy op.
     """
-    return [_copy_shape_func(inputs[0])]
+    input = inputs[0]
+    if len(input.shape) == 0:
+        return [_copy_shape_func_scalar(input)]
+    return [_copy_shape_func_tensor(input)]
 
 
 def device_copy(data, src_device, dst_device):
diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc
index 901661dd87a3..3ff6076473f1 100644
--- a/src/relay/backend/te_compiler.cc
+++ b/src/relay/backend/te_compiler.cc
@@ -350,7 +350,7 @@ class TECompilerImpl : public TECompilerNode {
 
   // implement lowered shape func
   CCacheValue LowerShapeFuncInternal(const CCacheKey& key) {
-    VLOG(1) << "lowering dynamic shape function:" << std::endl
+    VLOG(1) << "lowering dynamic shape function for:" << std::endl
             << PrettyPrint(key->source_func) << std::endl
             << "for target:" << std::endl
             << key->target->ToDebugString();
diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc
index f028c3da02ab..32164f3fdf20 100644
--- a/src/relay/backend/te_compiler_cache.cc
+++ b/src/relay/backend/te_compiler_cache.cc
@@ -145,7 +145,7 @@ class ScheduleBuilder : public backend::MemoizedExprTranslator<Array<te::Tensor>
       candidate_name = truncated_name.str();
     }
 
-    // TODO(mbs): This should be the definititive global by which the PrimFunc is known and
+    // TODO(mbs): This should be the definitive global by which the PrimFunc is known and
     // no other GlobalVar ctors should appear inside the lowering machinery.
     auto prim_fn_var = GlobalVar(renamer(candidate_name));
     prim_fn_var->checked_type_ = relay_func->checked_type();
@@ -371,6 +371,7 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator<Array<te::Tensor>>
 
   CachedFunc Create(const Function& prim_func, const Target& target,
                     std::function<std::string(std::string)> renamer) {
+    VLOG_CONTEXT << "MakeShapeFunc";
     TShapeDataDependent shape_func_param_states;
 
     for (auto param : prim_func->params) {
@@ -399,11 +400,12 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator<Array<te::Tensor>>
     // Setup the name;
     readable_name_stream_ << "shape_func";
 
-    // Create the `te::Tensor`s which represent the output.
-    auto outputs = VisitExpr(prim_func->body);
+    // Create the tensor expressions representing the output shapes.
+    Array<te::Tensor> outputs = VisitExpr(prim_func->body);
 
     // Generate a name.
     auto candidate_name = readable_name_stream_.str();
+
     constexpr static size_t kMaxFuncNameLength = 80;
     // WARNING: Please make sure to also update TVM_CRT_MAX_STRLEN_FUNCTION_NAME
     //          whenever the value of kMaxFuncNameLength changes
@@ -463,7 +465,7 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator<Array<te::Tensor>>
     for (auto t : outputs) {
       out_ops.push_back(t->op);
     }
-    auto schedule = te::create_schedule(out_ops);
+    te::Schedule schedule = te::create_schedule(out_ops);
     tvm::te::AutoInlineInjective(schedule);
     for (const auto& scalar : scalars_) {
       auto scalar_op = scalar->op;
@@ -589,12 +591,15 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator<Array<te::Tensor>>
   }
 
   Array<te::Tensor> VisitExpr_(const CallNode* call_node) final {
+    VLOG(1) << "considering call:" << std::endl << PrettyPrint(GetRef<Call>(call_node));
     if (auto* func = call_node->op.as<FunctionNode>()) {
+      VLOG(1) << "user function";
       for (size_t i = 0; i < func->params.size(); ++i) {
         param_arg_map_[func->params[i]] = call_node->args[i];
       }
       return VisitExpr(func->body);
     }
+
     static auto fshape_func = Op::GetAttrMap<FShapeFunc>("FShapeFunc");
     static auto tshape_data_dependent = Op::GetAttrMap<TShapeDataDependent>("TShapeDataDependent");
     ICHECK(call_node->op.as<OpNode>()) << "Primitive function only allows call into primitive ops";
@@ -635,20 +640,16 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator<Array<te::Tensor>>
     // Get output ndims
     auto ret_type = call_node->checked_type();
     Array<IndexExpr> out_ndims;
-    if (const auto* ttype = ret_type.as<TensorTypeNode>()) {
+    for (const auto& ttype : FlattenTupleType(ret_type)) {
       out_ndims.push_back(IntImm(DataType::Int(32), ttype->shape.size()));
-    } else {
-      auto rtype = ret_type.as<TupleTypeNode>();
-      // TODO(@icemelon): Allow recursive tuple
-      ICHECK(rtype);
-      for (size_t i = 0; i < rtype->fields.size(); ++i) {
-        auto ttype = rtype->fields[i].as<TensorTypeNode>();
-        ICHECK(ttype);
-        out_ndims.push_back(IntImm(DataType::Int(32), ttype->shape.size()));
-      }
     }
+
     // Call shape function
-    auto outputs = fshape_func[op](call_node->attrs, inputs, out_ndims);
+    Array<te::Tensor> outputs = fshape_func[op](call_node->attrs, inputs, out_ndims);
+    VLOG(1) << "shape function for '" << op->name << "' with inputs:" << std::endl
+            << inputs << std::endl
+            << "yielded outputs:" << std::endl
+            << outputs;
     readable_name_stream_ << "_" << op->name;
     return outputs;
   }
diff --git a/src/runtime/debug.cc b/src/runtime/debug.cc
new file mode 100644
index 000000000000..e5d9f0ead09e
--- /dev/null
+++ b/src/runtime/debug.cc
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/runtime/debug.cc
+ * \brief Helpers for debugging at runtime.
+ */
+
+#include <tvm/runtime/debug.h>
+
+namespace tvm {
+namespace runtime {
+
+// Appends "=[e0,e1,...]" with the first dim0 elements of nd_array, read on the host as T.
+template <typename T>
+void AppendMembers(std::ostream& os, const NDArray& nd_array, int64_t dim0) {
+  // Per DLPack, the elements start byte_offset bytes past the data pointer.
+  const char* base = static_cast<const char*>(nd_array->data) + nd_array->byte_offset;
+  const T* elems = reinterpret_cast<const T*>(base);
+  os << "=[";
+  // Unary plus promotes (u)int8_t to int so those elements print as numbers, not characters.
+  for (int64_t i = 0; i < dim0; ++i) os << (i > 0 ? "," : "") << +elems[i];
+  os << "]";
+}
+
+void AppendNDArray(std::ostream& os, const NDArray& nd_array, const DLDevice& host_device,
+                   bool show_contents) {
+  // Pairs a dtype name, as spelled by DLDataType2String, with the element printer for it.
+  struct ElementPrinter {
+    const char* dtype_name;
+    void (*append)(std::ostream&, const NDArray&, int64_t);
+  };
+  static const ElementPrinter kElementPrinters[] = {
+      {"bool", AppendMembers<bool>},
+      {"int8", AppendMembers<int8_t>},
+      {"int16", AppendMembers<int16_t>},
+      {"int32", AppendMembers<int32_t>},
+      {"int64", AppendMembers<int64_t>},
+      {"uint8", AppendMembers<uint8_t>},
+      {"uint16", AppendMembers<uint16_t>},
+      {"uint32", AppendMembers<uint32_t>},
+      {"uint64", AppendMembers<uint64_t>},
+      {"float32", AppendMembers<float>},
+      {"float64", AppendMembers<double>},
+  };
+
+  // Always emit the summary "NDArray[(<dims>),<dtype>,(<device type>,<device id>)]".
+  os << "NDArray[" << "(";
+  const char* separator = "";
+  for (int dim = 0; dim < nd_array->ndim; ++dim) {
+    os << separator << nd_array->shape[dim];
+    separator = ",";
+  }
+  const std::string basic_type = DLDataType2String(nd_array->dtype);
+  os << ")," << basic_type << ",(" << nd_array->device.device_type << ","
+     << nd_array->device.device_id << ")]";
+
+  // Elements are appended only on request and only when the array is on the host device.
+  const bool on_host = nd_array->device.device_type == host_device.device_type &&
+                       nd_array->device.device_id == host_device.device_id;
+  if (!show_contents || !on_host) {
+    return;
+  }
+
+  // A rank-0 array has one element; a rank-1 array is listed only if it has at most 10.
+  int64_t num_shown = 0;
+  if (nd_array->ndim == 0) {
+    num_shown = 1;
+  } else if (nd_array->ndim == 1 && nd_array->shape[0] <= 10) {
+    num_shown = nd_array->shape[0];
+  }
+  if (num_shown <= 0) {
+    return;
+  }
+  // Dtypes without an entry in the table get no element listing.
+  for (const ElementPrinter& printer : kElementPrinters) {
+    if (basic_type == printer.dtype_name) {
+      printer.append(os, nd_array, num_shown);
+      return;
+    }
+  }
+}
+
+void AppendADT(std::ostream& os, const ADT& adt, const DLDevice& host_device, bool show_contents) {
+  // Rendered as "ADT(<tag>,<field 0>,<field 1>,...)": every field is preceded by a comma.
+  os << "ADT(" << adt->tag;
+  for (size_t field = 0; field < adt->size; ++field) {
+    AppendRuntimeObject(os << ",", adt[field], host_device, show_contents);
+  }
+  os << ")";
+}
+
+void AppendRuntimeObject(std::ostream& os, const ObjectRef& object, const DLDevice& host_device,
+                         bool show_contents) {
+  if (const auto* adt_node = object.as<ADTObj>()) {
+    return AppendADT(os, GetRef<ADT>(adt_node), host_device, show_contents);
+  }
+  if (const auto* array_node = object.as<NDArray::Container>()) {
+    return AppendNDArray(os, GetRef<NDArray>(array_node), host_device, show_contents);
+  }
+  os << "?";  // neither an ADT nor an NDArray
+}
+
+std::string RuntimeObject2String(const ObjectRef& object, const DLDevice& host_device,
+                                 bool show_contents) {
+  std::ostringstream buffer;  // collects whatever AppendRuntimeObject writes for the object
+  AppendRuntimeObject(buffer, object, host_device, show_contents);
+  return buffer.str();
+}
+
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc
index 76c385ae9918..e2fe867630b0 100644
--- a/src/runtime/vm/executable.cc
+++ b/src/runtime/vm/executable.cc
@@ -24,6 +24,7 @@
 
 #include <dmlc/memory_io.h>
 #include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/debug.h>
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/vm/executable.h>
 #include <tvm/runtime/vm/vm.h>
@@ -171,27 +172,13 @@ std::string Executable::GetBytecode() const {
   return oss.str();
 }
 
-namespace {
-String ShapeString(const ShapeTuple& shape_tuple, DLDataType dtype) {
-  std::stringstream sizes;
-  sizes << DLDataType2String(dtype) << "[";
-  for (size_t i = 0; i < shape_tuple.size(); i++) {
-    if (i != 0) {
-      sizes << ", ";
-    }
-    sizes << shape_tuple.data()[i];
-  }
-  sizes << "]";
-  return String(sizes.str());
-}
-}  // namespace
-
 std::string Executable::GetConstants() const {
   std::ostringstream oss;
   for (size_t i = 0; i < constants.size(); ++i) {
     const auto& constant = constants[i];
     auto ndarray = Downcast<NDArray>(constant);
-    oss << "VM Const[" << i << "]: has shape " << ShapeString(ndarray.Shape(), ndarray->dtype)
+    oss << "VM Const[" << i
+        << "]: " << RuntimeObject2String(ndarray, virtual_devices[host_device_index])
         << " on device index " << const_device_indexes[i] << std::endl;
   }
   return oss.str();
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index acbbec0d2991..7a83c9acb906 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -24,6 +24,8 @@
 
 #include <dmlc/memory_io.h>
 #include <tvm/runtime/container/adt.h>
+#include <tvm/runtime/data_type.h>
+#include <tvm/runtime/debug.h>
 #include <tvm/runtime/logging.h>
 #include <tvm/runtime/memory.h>
 #include <tvm/runtime/object.h>
@@ -292,13 +294,14 @@ Index VirtualMachine::PopFrame() {
 }
 
 void VirtualMachine::InvokeGlobal(const VMFunction& func, const std::vector<ObjectRef>& args) {
-  VLOG(2) << "Invoking global " << func.name << " " << args.size();
+  VLOG(2) << "Invoking global " << func.name << " with " << args.size() << " args";
 
   PushFrame(func.params.size(), this->pc_ + 1, func);
   for (size_t i = 0; i < args.size(); ++i) {
     WriteRegister(i, args[i]);
+    VLOG(2) << "arg " << i << " = "
+            << RuntimeObject2String(args[i], GetDevice(exec_->host_device_index));
   }
-  VLOG(2) << "func.params= " << func.params.size();
 
   code_ = func.instructions.data();
   pc_ = 0;
@@ -527,20 +530,35 @@ void VirtualMachine::RunLoop() {
         goto main_loop;
       }
       case Opcode::InvokePacked: {
-        VLOG(2) << "InvokedPacked " << instr.packed_index << " arity=" << instr.arity;
         ICHECK_LE(instr.packed_index, packed_funcs_.size());
         const auto& func = packed_funcs_[instr.packed_index];
         const auto& arity = instr.arity;
         std::vector<ObjectRef> args;
         for (Index i = 0; i < arity; ++i) {
-          VLOG(2) << "arg" << i << " $" << instr.packed_args[i];
           auto arg = ReadRegister(instr.packed_args[i]);
           args.push_back(arg);
+#if TVM_LOG_DEBUG
+          if (i < arity) {
+            const bool is_input = i < arity - instr.output_size;
+            VLOG(2) << (is_input ? "input" : "placeholder") << " arg " << i << " = "
+                    << RuntimeObject2String(arg, GetDevice(exec_->host_device_index),
+                                            /*show_contents=*/is_input);
+          }
+#endif
         }
 
         // We no longer need to write the registers back, we write directly
         // through the registers mutably.
         InvokePacked(instr.packed_index, func, arity, instr.output_size, args);
+
+#if TVM_LOG_DEBUG
+        for (Index i = arity - instr.output_size; i < arity; ++i) {
+          auto arg = ReadRegister(instr.packed_args[i]);
+          VLOG(2) << "output arg " << i << " = "
+                  << RuntimeObject2String(arg, GetDevice(exec_->host_device_index));
+        }
+#endif
+
         pc_++;
         goto main_loop;
       }
@@ -606,19 +624,10 @@ void VirtualMachine::RunLoop() {
         auto storage_obj = ReadRegister(instr.alloc_tensor.storage);
         auto offset = LoadScalarInt(instr.alloc_tensor.offset);
         auto storage = Downcast<Storage>(storage_obj);
-#if TVM_LOG_DEBUG
-        std::ostringstream os;
-        os << "AllocTensor: ";
-        os << "offset=" << offset;
-        os << ", shape=[";
-        for (auto i : shape) {
-          os << i << ",";
-        }
-        os << "]";
-        os << ", dtype=" << DLDataType2String(instr.alloc_tensor.dtype);
-        VLOG(2) << os.str();
-#endif
         auto obj = storage->AllocNDArray(offset, shape, instr.alloc_tensor.dtype);
+        VLOG(2) << "allocated "
+                << RuntimeObject2String(obj, GetDevice(exec_->host_device_index),
+                                        /*show_contents=*/false);
 
         WriteRegister(instr.dst, obj);
         OpStopHook();
@@ -635,6 +644,9 @@ void VirtualMachine::RunLoop() {
         auto storage = Downcast<Storage>(storage_obj);
         auto offset = LoadScalarInt(instr.alloc_tensor.offset);
         auto obj = storage->AllocNDArray(offset, shape, instr.alloc_tensor_reg.dtype);
+        VLOG(2) << "allocated "
+                << RuntimeObject2String(obj, GetDevice(exec_->host_device_index),
+                                        /*show_contents=*/false);
 
         WriteRegister(instr.dst, obj);
         OpStopHook();
@@ -668,7 +680,7 @@ void VirtualMachine::RunLoop() {
         auto storage_obj = SimpleObjAllocator().make_object<StorageObj>();
         Allocator* allocator = GetAllocator(instr.alloc_storage.device_index);
         ICHECK(allocator) << "Did you forget to init the VirtualMachine with devices?";
-        VLOG(2) << "AllocStorage: allocation_size=" << size << ", alignment=" << alignment
+        VLOG(2) << "allocating with allocation_size=" << size << ", alignment=" << alignment
                 << ", dtype_hint=" << DLDataType2String(instr.alloc_storage.dtype_hint)
                 << ", device_index=" << instr.alloc_storage.device_index;
 
@@ -688,6 +700,8 @@ void VirtualMachine::RunLoop() {
         for (int i = 0; i < ndim; ++i) {
           reinterpret_cast<int64_t*>(out_tensor->data)[i] = input_array->shape[i];
         }
+        VLOG(2) << "shape = "
+                << RuntimeObject2String(out_tensor, GetDevice(exec_->host_device_index));
         WriteRegister(instr.dst, out_tensor);
         pc_++;
         goto main_loop;
@@ -722,18 +736,10 @@ void VirtualMachine::RunLoop() {
         int64_t ndim = shape_tensor->shape[0];
         std::vector<int64_t> shape(dims, dims + ndim);
         // Reshape the input tensor
-#if TVM_LOG_DEBUG
-        std::ostringstream os;
-        os << "ReshapeTensor: ";
-        os << "shape=[";
-        for (auto i : shape) {
-          os << i << ",";
-        }
-        os << "]";
-        os << ", dtype=" << DLDataType2String(tensor_arr->dtype);
-        VLOG(2) << os.str();
-#endif
         auto out_tensor = tensor_arr.CreateView(shape, tensor_arr->dtype);
+        VLOG(2) << "reshaped "
+                << RuntimeObject2String(tensor_obj, GetDevice(exec_->host_device_index)) << " to "
+                << RuntimeObject2String(out_tensor, GetDevice(exec_->host_device_index));
         WriteRegister(instr.dst, out_tensor);
         OpStopHook();
         pc_++;
diff --git a/src/target/compilation_config.cc b/src/target/compilation_config.cc
index 0401eebe51ef..a56e0ad0777c 100644
--- a/src/target/compilation_config.cc
+++ b/src/target/compilation_config.cc
@@ -62,31 +62,31 @@ void CompilationConfigNode::EstablishDefaultVirtualDevices(const transform::Pass
   if (host_target.defined()) {
     CHECK(!host_target->host.defined()) << "Host targets are not expected to have hosts";
     host_device_type = static_cast<DLDeviceType>(host_target->kind->device_type);
-    DLOG(INFO) << "Using the given host target " << host_target->ToDebugString()
-               << " of device type " << host_device_type << " for the host target";
+    VLOG(1) << "Using the given host target " << host_target->ToDebugString() << " of device type "
+            << host_device_type << " for the host target";
     for (const auto& primitive_target : primitive_targets) {
       if (primitive_target->host.defined() &&
           !StructuralEqual()(primitive_target->host, host_target)) {
-        DLOG(WARNING) << "The primitive target " << primitive_target->ToDebugString()
-                      << " already has a host which disagrees with the desired host target. It "
-                      << "will be ignored.";
+        VLOG(1) << "The primitive target " << primitive_target->ToDebugString()
+                << " already has a host which disagrees with the desired host target. It "
+                << "will be ignored.";
       }
     }
   } else if (primitive_targets.size() == 1 && primitive_targets.front()->host.defined()) {
     host_target = primitive_targets.front()->GetHost().value();
     CHECK(!host_target->host.defined()) << "Host targets are not expected to have hosts";
     host_device_type = static_cast<DLDeviceType>(host_target->kind->device_type);
-    DLOG(INFO) << "Using the host of the unique primitive target, namely "
-               << host_target->ToDebugString() << " of device type " << host_device_type
-               << " for the host target";
+    VLOG(1) << "Using the host of the unique primitive target, namely "
+            << host_target->ToDebugString() << " of device type " << host_device_type
+            << " for the host target";
   } else if (primitive_targets.size() == 1 &&
              primitive_targets.front()->kind->device_type == kDLCPU) {
     // In the homogenous case without an explicit host target just use the given target so long as
     // it's a CPU.
     host_device_type = kDLCPU;
     host_target = primitive_targets.front();
-    DLOG(INFO) << "Using the unique primitive target " << host_target->ToDebugString()
-               << " of device type " << host_device_type << " for the host target";
+    VLOG(1) << "Using the unique primitive target " << host_target->ToDebugString()
+            << " of device type " << host_device_type << " for the host target";
   } else {
     // Fallback.
     host_device_type = kDLCPU;
@@ -94,15 +94,15 @@ void CompilationConfigNode::EstablishDefaultVirtualDevices(const transform::Pass
     // in the hetrogeneous case since its options may not be appropriate for host code
     // (eg shape functions). Instead, create a fresh default Target.
     host_target = MakeDefaultTarget(host_device_type);
-    DLOG(WARNING) << "Using the default target " << host_target->ToDebugString()
-                  << " of device type " << host_device_type << " for the host target";
+    VLOG(1) << "Using the default target " << host_target->ToDebugString() << " of device type "
+            << host_device_type << " for the host target";
   }
   ICHECK(host_target.defined());
   ICHECK(!host_target->host.defined());
 
   if (host_device_type != kDLCPU) {
     // I think we're on thin ice here until we've audited the code base for assumed kDLCPU.
-    LOG(WARNING) << "The host target is not a CPU.";
+    VLOG(1) << "The host target is not a CPU.";
   }
 
   //
@@ -132,22 +132,22 @@ void CompilationConfigNode::EstablishDefaultVirtualDevices(const transform::Pass
     CHECK_GT(v, 0)
         << "The 'relay.fallback_device_type' pass attribute is set to an invalid device type " << v;
     default_primitive_device_type = static_cast<DLDeviceType>(v);
-    DLOG(INFO) << "Using the 'relay.fallback_device_type' pass attribute "
-               << default_primitive_device_type
-               << " as the default device type for all primitive operations";
+    VLOG(1) << "Using the 'relay.fallback_device_type' pass attribute "
+            << default_primitive_device_type
+            << " as the default device type for all primitive operations";
   } else if (primitive_targets.size() == 1) {
     // In the homogeneous case there's no free choice.
     default_primitive_device_type =
         static_cast<DLDeviceType>(primitive_targets.front()->kind->device_type);
-    DLOG(INFO) << "Using the device type " << default_primitive_device_type
-               << " of the unique primitive target as the default device type for all primitive "
-               << "operations";
+    VLOG(1) << "Using the device type " << default_primitive_device_type
+            << " of the unique primitive target as the default device type for all primitive "
+            << "operations";
   } else {
     // Fallback. Note that we'll require a primitive Target of kDLCPU device_type to be given
     // and won't manufacture one out of thin air.
     default_primitive_device_type = kDLCPU;
-    DLOG(WARNING) << "Using " << default_primitive_device_type
-                  << " as the default device type for all primitive operations";
+    VLOG(1) << "Using " << default_primitive_device_type
+            << " as the default device type for all primitive operations";
   }
 
   //
@@ -227,11 +227,11 @@ CompilationConfig::CompilationConfig(const transform::PassContext& pass_ctx,
       node->legacy_target_map.size() == 1 ? (*node->legacy_target_map.begin()).second : Target();
 
   for (const auto& target : node->primitive_targets) {
-    DLOG(INFO) << "Target " << target->ToDebugString() << " of device type "
-               << target->kind->device_type << " is available for primitives";
+    VLOG(1) << "Target " << target->ToDebugString() << " of device type "
+            << target->kind->device_type << " is available for primitives";
   }
-  DLOG(INFO) << "Using default primitive virtual device " << node->default_primitive_virtual_device;
-  DLOG(INFO) << "Using host virtual device " << node->host_virtual_device;
+  VLOG(1) << "Using default primitive virtual device " << node->default_primitive_virtual_device;
+  VLOG(1) << "Using host virtual device " << node->host_virtual_device;
 
   data_ = std::move(node);
 }
diff --git a/tests/python/relay/dyn/test_dynamic_op_level3.py b/tests/python/relay/dyn/test_dynamic_op_level3.py
index 7669d02cd536..0456401e8ad2 100644
--- a/tests/python/relay/dyn/test_dynamic_op_level3.py
+++ b/tests/python/relay/dyn/test_dynamic_op_level3.py
@@ -21,7 +21,6 @@
 import tvm
 import tvm.testing
 from tvm import relay, te
-from tvm.relay import create_executor, transform
 from tvm.relay.testing import check_grad, run_infer_type
 
 
@@ -44,6 +43,15 @@ def verify_func(func, data, ref_res, target_device=tvm.testing.enabled_targets()
             relay.backend.te_compiler.get().clear()
 
 
+def check_on_vm(target, dev, args, expected_result, mod):
+    """
+    Evaluate `mod` with `args` on the Relay VM built for `target` and `dev`, and check that
+    the result is close to `expected_result`.
+    """
+    rts_result = relay.create_executor("vm", device=dev, target=target, mod=mod).evaluate()(*args)
+    tvm.testing.assert_allclose(expected_result, rts_result.numpy())
+
+
 @tvm.testing.uses_gpu
 def test_dyn_reshape():
     def verify_reshape(shape, newshape, oshape):
@@ -410,5 +418,59 @@ def verify_sparse_fill_empty_rows(
     )
 
 
+def test_dyn_copy():
+    target = tvm.target.Target("llvm")
+    dev = tvm.cpu()
+    mod = tvm.parser.fromtext(
+        """
+        #[version = "0.0.5"]
+        def @main(%x: Tensor[(?, 3), int64]) -> Tensor[(?, 3), int64] {
+          copy(%x)
+        }
+        """
+    )
+    x_data = np.random.rand(15, 3).astype("int64")
+    expected = x_data
+    check_on_vm(target, dev, [x_data], expected, mod)
+
+
+def test_dyn_copy_scalar():
+    target = tvm.target.Target("llvm")
+    dev = tvm.cpu()
+    mod = tvm.parser.fromtext(
+        """
+        #[version = "0.0.5"]
+        def @main(%x: int32, %y: Tensor[(?), int32]) -> Tensor[(?), int32] {
+          %0 = copy(%x);
+          %1 = expand_dims(%0, axis=0);
+          %2 = (%y, %1);
+          concatenate(%2)
+        }
+        """
+    )
+    x_data = 3
+    y_data = np.random.rand(7).astype("int32")
+    expected = np.concatenate((y_data, np.expand_dims(x_data, axis=0)))
+    check_on_vm(target, dev, [x_data, y_data], expected, mod)
+
+
+def test_dyn_cast():
+    target = tvm.target.Target("llvm")
+    dev = tvm.cpu()
+    mod = tvm.parser.fromtext(
+        """
+        #[version = "0.0.5"]
+        def @main(%x: Tensor[(?, 3), int64]) -> Tensor[(?, 3), int32] {
+          cast(%x, dtype="int32")
+        }
+        """
+    )
+    x_data = np.random.rand(15, 3).astype("int64")
+    expected = x_data.astype("int32")
+    check_on_vm(target, dev, [x_data], expected, mod)
+
+
 if __name__ == "__main__":
-    pytest.main([__file__])
+    import sys
+
+    sys.exit(pytest.main([__file__] + sys.argv[1:]))
diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py
index 1c60702982cc..7f0f8041b1a2 100644
--- a/tests/python/relay/test_vm.py
+++ b/tests/python/relay/test_vm.py
@@ -36,7 +36,7 @@
 from tvm.relay.backend.vm import VMCompiler
 
 
-def check_result(target, dev, args, expected_result, mod=None):
+def check_result(target, dev, args, expected_result, mod):
     """
     Check that evaluating `expr` applied to the arguments produces
     `result` on Relay VM.
@@ -111,7 +111,7 @@ def test_id(target, dev):
     x_data = np.random.rand(10, 10).astype("float64")
     mod = tvm.IRModule()
     mod["main"] = f
-    check_result(target, dev, [x_data], x_data, mod=mod)
+    check_result(target, dev, [x_data], x_data, mod)
 
 
 def test_op(target, dev):
@@ -120,7 +120,7 @@ def test_op(target, dev):
     x_data = np.random.rand(10, 10).astype("float32")
     mod = tvm.IRModule()
     mod["main"] = f
-    check_result(target, dev, [x_data], 2 * x_data, mod=mod)
+    check_result(target, dev, [x_data], 2 * x_data, mod)
 
 
 def any(x):
@@ -140,10 +140,10 @@ def test_cond(target, dev):
     mod = tvm.IRModule()
     mod["main"] = f
     # same
-    check_result(target, dev, [x_data, x_data], True, mod=mod)
+    check_result(target, dev, [x_data, x_data], True, mod)
 
     # diff
-    check_result(target, dev, [x_data, y_data], False, mod=mod)
+    check_result(target, dev, [x_data, y_data], False, mod)
 
 
 @tvm.testing.known_failing_targets("vulkan")
@@ -157,10 +157,10 @@ def test_simple_if(target, dev):
     mod = tvm.IRModule()
     mod["main"] = f
     # same
-    check_result(target, dev, [x_data, x_data], x_data, mod=mod)
+    check_result(target, dev, [x_data, x_data], x_data, mod)
 
     # diff
-    check_result(target, dev, [x_data, y_data], y_data, mod=mod)
+    check_result(target, dev, [x_data, y_data], y_data, mod)
 
 
 @tvm.testing.parametrize_targets("llvm")
@@ -204,7 +204,7 @@ def test_unused_function(target, dev):
     x_data = np.random.rand(2, 2).astype("float32")
     y_data = x_data * 2
 
-    check_result(target, dev, [x_data], y_data, mod=mod)
+    check_result(target, dev, [x_data], y_data, mod)
 
 
 def test_simple_call(target, dev):
@@ -218,7 +218,7 @@ def test_simple_call(target, dev):
     i_data = np.array(0, dtype="int32")
     iarg = relay.var("iarg", shape=[], dtype="int32")
     mod["main"] = relay.Function([iarg], sum_up(iarg))
-    check_result(target, dev, [i_data], i_data, mod=mod)
+    check_result(target, dev, [i_data], i_data, mod)
 
 
 def test_count_loop(target, dev):
@@ -239,7 +239,7 @@ def test_count_loop(target, dev):
     mod["main"] = relay.Function([iarg], sum_up(iarg))
     result = veval(mod, i_data, device=dev, target=target)
     tvm.testing.assert_allclose(result.numpy(), i_data)
-    check_result(target, dev, [i_data], i_data, mod=mod)
+    check_result(target, dev, [i_data], i_data, mod)
 
 
 def test_sum_loop(target, dev):
@@ -263,7 +263,7 @@ def test_sum_loop(target, dev):
     iarg = relay.var("i", shape=[], dtype="int32")
     aarg = relay.var("accum", shape=[], dtype="int32")
     mod["main"] = relay.Function([iarg, aarg], sum_up(iarg, aarg))
-    check_result(target, dev, [i_data, accum_data], sum(range(1, loop_bound + 1)), mod=mod)
+    check_result(target, dev, [i_data, accum_data], sum(range(1, loop_bound + 1)), mod)
 
 
 def test_tuple_fst(target, dev):
@@ -274,7 +274,7 @@ def test_tuple_fst(target, dev):
     j_data = np.random.rand(10).astype("float32")
     mod = tvm.IRModule()
     mod["main"] = f
-    check_result(target, dev, [(i_data, j_data)], i_data, mod=mod)
+    check_result(target, dev, [(i_data, j_data)], i_data, mod)
 
 
 def test_tuple_second(target, dev):
@@ -285,7 +285,7 @@ def test_tuple_second(target, dev):
     j_data = np.random.rand(10).astype("float32")
     mod = tvm.IRModule()
     mod["main"] = f
-    check_result(target, dev, [(i_data, j_data)], j_data, mod=mod)
+    check_result(target, dev, [(i_data, j_data)], j_data, mod)
 
 
 def test_list_constructor(target, dev):
@@ -325,7 +325,7 @@ def test_let_tensor(target, dev):
     x_data = np.random.rand(*shape).astype("float32")
     mod = tvm.IRModule()
     mod["main"] = f
-    check_result(target, dev, [x_data], x_data + 42.0, mod=mod)
+    check_result(target, dev, [x_data], x_data + 42.0, mod)
 
 
 def test_let_scalar(target, dev):
@@ -342,7 +342,7 @@ def test_let_scalar(target, dev):
     x_data = np.array(np.random.rand()).astype("float32")
     mod = tvm.IRModule()
     mod["main"] = f
-    check_result(target, dev, [x_data], x_data + 42.0, mod=mod)
+    check_result(target, dev, [x_data], x_data + 42.0, mod)
 
 
 def test_compose(target, dev):
@@ -616,7 +616,7 @@ def test_add_op_scalar(target, dev):
     ]
     for (x_data, y_data) in x_y_data:
         mod["main"] = func
-        check_result(target, dev, [x_data, y_data], x_data + y_data, mod=mod)
+        check_result(target, dev, [x_data, y_data], x_data + y_data, mod)
 
 
 def test_add_op_scalar_int(target, dev):
@@ -637,7 +637,7 @@ def test_add_op_scalar_int(target, dev):
     ]
     for (x_data, y_data) in x_y_data:
         mod["main"] = func
-        check_result(target, dev, [x_data, y_data], x_data + y_data, mod=mod)
+        check_result(target, dev, [x_data, y_data], x_data + y_data, mod)
 
 
 def test_add_op_tensor(target, dev):
@@ -654,7 +654,7 @@ def test_add_op_tensor(target, dev):
     x_data = np.random.rand(10, 5).astype("float32")
     y_data = np.random.rand(10, 5).astype("float32")
     mod["main"] = func
-    check_result(target, dev, [x_data, y_data], x_data + y_data, mod=mod)
+    check_result(target, dev, [x_data, y_data], x_data + y_data, mod)
 
 
 def test_add_op_broadcast(target, dev):
@@ -671,7 +671,7 @@ def test_add_op_broadcast(target, dev):
     x_data = np.random.rand(10, 5).astype("float32")
     y_data = np.random.rand(1, 5).astype("float32")
     mod["main"] = func
-    check_result(target, dev, [x_data, y_data], x_data + y_data, mod=mod)
+    check_result(target, dev, [x_data, y_data], x_data + y_data, mod)
 
 
 def test_vm_optimize_dynamic():
@@ -717,7 +717,7 @@ def body_with_free_var(i, acc):
         ret = relay.TupleGetItem(tup, 1)
         mod = tvm.IRModule()
         mod["main"] = relay.Function(relay.analysis.free_vars(ret), ret)
-        check_result(target, dev, args, expected, mod=mod)
+        check_result(target, dev, args, expected, mod)
 
 
 def test_vm_reshape_tensor(target, dev):
@@ -1040,8 +1040,8 @@ def @main(%a: Tensor[(5, 7), float32],
     # - The offset of the tensor within the storage (second arg) to alloc_tensor
     # Both should be on the CPU
     assert "VirtualDevice[0]: device type 1" in exe.virtual_devices
-    assert "Const[0]: has shape int64[] on device index 0" in exe.constants
-    assert "Const[1]: has shape int64[] on device index 0" in exe.constants
+    assert "VM Const[0]: NDArray[(),int64,(1,0)]=[140] on device index 0" in exe.constants
+    assert "VM Const[1]: NDArray[(),int64,(1,0)]=[0] on device index 0" in exe.constants
 
 
 @tvm.testing.requires_cuda
@@ -1073,7 +1073,7 @@ def @main(%x: Tensor[(2, 8), float32],
 
     # The newshape annotation should have been turned into a constant on the CPU.
     assert "VirtualDevice[0]: device type 1" in exe.virtual_devices
-    assert "Const[0]: has shape int64[3] on device index 0" in exe.constants
+    assert "VM Const[0]: NDArray[(3),int64,(1,0)]=[2,4,2] on device index 0" in exe.constants
 
 
 @tvm.testing.requires_cuda