diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index af426e5c71cb..6ab64b25aeb8 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -316,6 +316,12 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslatorattrs_["device_index"] = device_types; } + // storage scope + std::vector storage_scope; + for (const auto& virtual_device : storage_info->virtual_devices) { + storage_scope.push_back(std::string(virtual_device->memory_scope)); + } + node->attrs_["storage_scope"] = std::move(storage_scope); auto node_id = nodes_.size(); nodes_.push_back(node); // Tuple return value, flatten as tuple @@ -432,7 +438,6 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslatorattrs.defined()) { // Call is an extern function - std::cout << "call_node: \n" << PrettyPrint(call) << std::endl; const auto* func = call_node->op.as(); ICHECK(func) << "Expected the operator to be a global var, but got " << call_node->op->GetTypeKey(); // getting a relay fn here, not sure why. @@ -529,12 +534,15 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator storage_ids; + std::vector storage_scopes; std::vector device_types; std::vector dltypes; std::vector node_row_ptr{0}; for (auto node : nodes_) { const auto& shape_vec = dmlc::get(node->attrs_["shape"]); const auto& storage_id = dmlc::get>(node->attrs_["storage_id"]); + const auto& storage_scope = + dmlc::get>(node->attrs_["storage_scope"]); const auto& dtype_vec = dmlc::get>(node->attrs_["dtype"]); ICHECK_EQ(node->num_outputs_, shape_vec.size()); @@ -543,12 +551,25 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslatorattrs_.count("device_index")) { const auto& dev_types = dmlc::get>(node->attrs_["device_index"]); device_types.insert(device_types.end(), dev_types.begin(), dev_types.end()); } node_row_ptr.push_back(num_entry); } + + // verification if storage_scope contains any non global memory scope + // in other case it's better not to write scopes to the JSON at all + bool global_only_scope = true; + for (const auto& ss : storage_scopes) { + if (!(ss.empty() || ss == "global")) { + global_only_scope = false; + } + } + if (global_only_scope) { + storage_scopes.clear(); + } writer->BeginObject(); writer->WriteObjectKeyValue("nodes", nodes_); writer->WriteObjectKeyValue("arg_nodes", arg_nodes); @@ -562,6 +583,10 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslatorWriteObjectKeyValue("attrs", attrs); diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc index 8ae98d930f13..78e65f6f2319 100644 --- a/src/runtime/graph_executor/graph_executor.cc +++ b/src/runtime/graph_executor/graph_executor.cc @@ -42,6 +42,7 @@ #include #include "../file_utils.h" +#include "../texture.h" namespace tvm { namespace runtime { @@ -51,6 +52,7 @@ inline size_t GetDataAlignment(const DLTensor& arr) { if (align < kAllocAlignment) return kAllocAlignment; return align; } +constexpr auto Is2DStorage = IsTextureStorage; } // namespace details /*! @@ -361,24 +363,16 @@ void GraphExecutor::SetupStorage() { // Find the maximum space size. for (size_t i = 0; i < attrs_.shape.size(); ++i) { int storage_id = attrs_.storage_id[i]; + std::string storage_scope = attrs_.storage_scope.empty() ? "" : attrs_.storage_scope[i]; // Use the fallback device if no device index is available. int device_type = static_cast(devices_[0].device_type); if (!attrs_.device_index.empty()) { device_type = attrs_.device_index[i]; } - size_t size = 1; - for (int64_t sz : attrs_.shape[i]) { - size *= static_cast(sz); - } - ICHECK_GE(storage_id, 0) << "Do not support runtime shape op"; - DLDataType t = vtype[i]; - size_t bits = t.bits * t.lanes; - ICHECK(bits % 8U == 0U || bits == 1U || bits == 4U); - size_t bytes = ((bits + 7U) / 8U) * size; uint32_t sid = static_cast(storage_id); if (sid >= pool_entry.size()) { - pool_entry.resize(sid + 1, {0, -1}); + pool_entry.resize(sid + 1, {-1, {0}, {}}); } else { ICHECK(pool_entry[sid].device_type == -1 || pool_entry[sid].device_type == device_type) << "The same pool entry cannot be assigned to multiple devices"; @@ -395,8 +389,38 @@ void GraphExecutor::SetupStorage() { pool_entry[sid].linked_param = lookup_rv; } pool_entry[sid].param_data_entry = i; - pool_entry[sid].size = std::max(pool_entry[sid].size, bytes); pool_entry[sid].device_type = device_type; + pool_entry[sid].scope = storage_scope; + + DLDataType t = vtype[i]; + if (!details::Is2DStorage(storage_scope)) { + size_t size = 1; + for (int64_t sz : attrs_.shape[i]) { + size *= static_cast(sz); + } + size_t bits = t.bits * t.lanes; + ICHECK(bits % 8U == 0U || bits == 1U || bits == 4U); + int64_t bytes = ((bits + 7U) / 8U) * size; + pool_entry[sid].shape[0] = std::max(pool_entry[sid].shape[0], bytes); + pool_entry[sid].dtype = DLDataType{kDLFloat, 32, 1}; + } else { + if (pool_entry[sid].shape.size() == 1) { + pool_entry[sid].shape.resize(3, 0); + } + size_t axis = runtime::DefaultTextureLayoutSeparator(attrs_.shape[i].size(), storage_scope); + auto shape = ApplyTexture2DFlattening(attrs_.shape[i], attrs_.shape[i].size(), axis); + pool_entry[sid].shape[0] = std::max(pool_entry[sid].shape[0], shape.height); + pool_entry[sid].shape[1] = std::max(pool_entry[sid].shape[1], shape.width); + CHECK(pool_entry[sid].shape[2] == 0 || pool_entry[sid].shape[2] == shape.channel) + << pool_entry[sid].shape[2] << " != " << shape.channel + << ", texture channel length must be consistent within a storage pool"; + pool_entry[sid].shape[2] = shape.channel; + CHECK(pool_entry[sid].dtype.bits == 0 || TypeEqual(pool_entry[sid].dtype, t)) + << DLDataType2String(pool_entry[sid].dtype) << " != " << DLDataType2String(t) + << ", pool entry for 2d texure allocations must be of the same type;" + << " downstream error from memory planner likely"; + pool_entry[sid].dtype = t; + } } // Allocate the space. @@ -410,9 +434,15 @@ void GraphExecutor::SetupStorage() { if (pit.linked_param.defined()) { storage_pool_.push_back(pit.linked_param); } else { - std::vector shape; - shape.push_back(static_cast(pit.size + 3) / 4); - storage_pool_.push_back(NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, dev)); + std::vector shape = pit.shape; + if (shape.size() == 1) { + shape[0] = (shape[0] + 3) / 4; + } + Optional mem_scope; + if (!pit.scope.empty()) { + mem_scope = String(pit.scope); + } + storage_pool_.push_back(NDArray::Empty(shape, pit.dtype, dev, mem_scope)); } } diff --git a/src/runtime/graph_executor/graph_executor.h b/src/runtime/graph_executor/graph_executor.h index 2564f5b0d924..bbe94636b3a1 100644 --- a/src/runtime/graph_executor/graph_executor.h +++ b/src/runtime/graph_executor/graph_executor.h @@ -204,10 +204,12 @@ class TVM_DLL GraphExecutor : public ModuleNode { protected: // Memory pool entry. struct PoolEntry { - size_t size; int device_type; + std::vector shape; + DLDataType dtype; int param_data_entry; NDArray linked_param; + std::string scope; // PoolEntry(int s, int dev_type, void* pre_linked_param) : // size(s), device_type(dev_type), pre_linked_param(std::move(pre_linked_param)) {} }; @@ -303,6 +305,7 @@ class TVM_DLL GraphExecutor : public ModuleNode { std::vector storage_id; std::vector device_index; std::vector dltype; + std::vector storage_scope; std::vector> shape; // The graph attribute fields. void Load(dmlc::JSONReader* reader) { @@ -328,6 +331,15 @@ class TVM_DLL GraphExecutor : public ModuleNode { reader->Read(&storage_id); ICHECK(!reader->NextArrayItem()); bitmask |= 2; + } else if (key == "storage_scope") { + reader->BeginArray(); + ICHECK(reader->NextArrayItem()); + reader->Read(&type); + ICHECK_EQ(type, "list_str"); + ICHECK(reader->NextArrayItem()); + reader->Read(&storage_scope); + ICHECK(!reader->NextArrayItem()); + bitmask |= 1; } else if (key == "shape") { reader->BeginArray(); ICHECK(reader->NextArrayItem()); diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 5d04d00339fc..e8d47b720bf6 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -98,7 +98,7 @@ std::string CodeGenOpenCL::Finish() { "#pragma OPENCL EXTENSION cl_amd_fp16 : enable\n" "#else\n" "#error \"Half precision floating point not supported" - "by OpenCL implementation on your device.\" \n" + " by OpenCL implementation on your device.\" \n" "#endif\n\n"; } @@ -109,7 +109,7 @@ std::string CodeGenOpenCL::Finish() { "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n" "#else\n" "#error \"Double precision floating point not supported" - "by OpenCL implementation on your device.\" \n" + " by OpenCL implementation on your device.\" \n" "#endif\n\n"; }