Skip to content

Commit

Permalink
[Texture support][Part 0] Device API and runtime support (#7711)
Browse files Browse the repository at this point in the history
* Add TVMBackendAllocTexture and support in OpenCL device API.

* Add runtime optimized caching allocator.
This should be replaced with AOT memory planning
when the relay/tir/compile engine refactor lands.

* Few bug fixes for runtime texture allocator.

* Add OpenCL device api support for image2d<float16> textures.

* Update OpenCL DeviceAPI to support Image2D data space
allocations and copying to/from host/image2d directly.
Allocation employs a lowering convention to 2d images
for activations and weights.

* Fix to follow OpenCL spec. for indexing.

* Rename texture_pool.h -> texture.h

* Move Nd to 2d lowering convention code into runtime texture
utilities that can be shared by codegen and the runtime.

* Update texture lowering utilities

* Add TODO comment about pitch support

* Remove FreeTexture

* Fix ICHECK comment

* Partial cherry pick from @ZihengJiang
git@github.com:ZihengJiang/tvm.git:52822c5bd
[RUNTIME] OpenCL texture memory.

* Remove runtime and device texture APIs.

* Add OpenCL packed functions for texture workspace (de)allocations.

* Add OpenCLBuffer structure to track
memory layout through OpenCL Device API.

* Rebase: TVMContext -> Device

* Implement DLTensor* overload of CopyDataToFrom in OpenCL DeviceAPI.

* Implement OpenCL CopyDataFromTo(DLTensor*...)
overload and tensor shapes to calculate image extent
when copying data directly to or from texture cache.

* Update format (cpp-lint)

* Update format (clang)

* Buffer descriptor name change and formatting.

* Add texture pool documentation.

* Update runtime to use new global.texture scope.

* Move texture_pool.cc into opencl impl.

* Add test coverage for copying in and out
of storage allocs of texture scope.

* Documented APIs and structures, renamed buffer descriptor layout tags.

Co-authored-by: ZihengJiang <[email protected]>
  • Loading branch information
csullivan and ZihengJiang authored Jun 5, 2021
1 parent f4ec5fd commit 010d11b
Show file tree
Hide file tree
Showing 6 changed files with 703 additions and 31 deletions.
64 changes: 58 additions & 6 deletions src/runtime/opencl/opencl_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
#include "../file_utils.h"
#include "../meta_data.h"
#include "../pack_args.h"
#include "../texture.h"
#include "../thread_storage_scope.h"
#include "../workspace_pool.h"

Expand Down Expand Up @@ -174,6 +175,29 @@ inline const char* CLGetErrorString(cl_int error) {
}
}

/*!
 * \brief Translate a DLPack data type into the matching OpenCL image channel type.
 * \param data_type The element type to translate.
 * \return The cl_channel_type for 16/32-bit floats and for 8/16/32-bit signed or
 *  unsigned integers. Any other type is reported through LOG(FATAL).
 */
inline cl_channel_type DTypeToOpenCLChannelType(DLDataType data_type) {
  DataType requested(data_type);

  // Floating point channel formats.
  if (requested == DataType::Float(32)) return CL_FLOAT;
  if (requested == DataType::Float(16)) return CL_HALF_FLOAT;

  // Signed integer channel formats.
  if (requested == DataType::Int(8)) return CL_SIGNED_INT8;
  if (requested == DataType::Int(16)) return CL_SIGNED_INT16;
  if (requested == DataType::Int(32)) return CL_SIGNED_INT32;

  // Unsigned integer channel formats.
  if (requested == DataType::UInt(8)) return CL_UNSIGNED_INT8;
  if (requested == DataType::UInt(16)) return CL_UNSIGNED_INT16;
  if (requested == DataType::UInt(32)) return CL_UNSIGNED_INT32;

  LOG(FATAL) << "data type is not supported in OpenCL runtime yet: " << requested;
  // Placeholder value for the path that follows LOG(FATAL).
  return CL_FLOAT;
}

/*!
* \brief Protected OpenCL call
* \param func Expression to call.
Expand Down Expand Up @@ -243,11 +267,18 @@ class OpenCLWorkspace : public DeviceAPI {
void SetDevice(Device dev) final;
void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final;
void* AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) final;
void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
Optional<String> mem_scope = NullOpt) final;
void FreeDataSpace(Device dev, void* ptr) final;
void StreamSync(Device dev, TVMStreamHandle stream) final;
void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final;
void FreeWorkspace(Device dev, void* data) final;

// Texture (image2d_t) alloca APIs
cl_mem AllocTexture(Device dev, size_t width, size_t height, DLDataType type_hint);
void* AllocTextureWorkspace(Device dev, size_t width, size_t height, DLDataType type_hint);
void FreeTextureWorkspace(Device dev, void* data);

/*!
* \brief Get the thread local ThreadEntry
*/
Expand All @@ -256,10 +287,7 @@ class OpenCLWorkspace : public DeviceAPI {
// get the global workspace
static OpenCLWorkspace* Global();

protected:
void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
Device dev_from, Device dev_to, DLDataType type_hint,
TVMStreamHandle stream) final;
void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final;
};

/*! \brief Thread local workspace */
Expand All @@ -278,9 +306,11 @@ class OpenCLThreadEntry {
std::vector<KTEntry> kernel_table;
/*! \brief workspace pool */
WorkspacePool pool;
/*! \brief texture pool */
TexturePool texture_pool;
// constructor
OpenCLThreadEntry(DLDeviceType device_type, DeviceAPI* device_api)
: pool(device_type, device_api) {
: pool(device_type, device_api), texture_pool(device_type, device_api) {
device.device_id = 0;
device.device_type = device_type;
}
Expand All @@ -289,6 +319,29 @@ class OpenCLThreadEntry {
// get the global workspace
static OpenCLThreadEntry* ThreadLocal();
};

/*!
 * \brief OpenCL runtime buffer structure with tracked memory layout.
 *
 * Pairs a cl_mem handle with a tag describing how that memory is laid out,
 * so the layout can be tracked through the OpenCL Device API.
 */
struct BufferDescriptor {
  /*! \brief Layout of the memory object referenced by the descriptor. */
  enum class MemoryLayout {
    /*! \brief One dimensional buffer in row-major layout */
    kBuffer1D,
    /*! \brief Two dimensional texture w/ width = axis[-1]
     *          e.g. image2d[height=NCH, width=W]
     */
    kImage2DActivation,
    /*! \brief Two dimensional texture w/ height = axis[0]
     *          e.g. image2d[height=O, width=IHW]
     */
    kImage2DWeight,
  };
  /*! \brief Default descriptor: null buffer handle tagged as kBuffer1D. */
  BufferDescriptor() = default;
  /*!
   * \brief Create a descriptor whose layout is derived from a memory scope.
   *  The buffer handle is left null and must be assigned by the caller.
   * \param scope Optional memory scope, translated by MemoryLayoutFromScope.
   */
  explicit BufferDescriptor(Optional<String> scope) : layout(MemoryLayoutFromScope(scope)) {}
  /*!
   * \brief Translate a memory scope into the corresponding memory layout.
   * \param mem_scope Optional memory scope string.
   * \return The layout associated with the scope.
   */
  static MemoryLayout MemoryLayoutFromScope(Optional<String> mem_scope);
  /*!
   * \brief Translate a memory layout back into its memory scope string.
   * \param layout The memory layout to translate.
   * \return The scope string associated with the layout.
   */
  static String ScopeFromMemoryLayout(MemoryLayout layout);

  /*! \brief Handle of the underlying OpenCL memory object. */
  cl_mem buffer{nullptr};
  /*! \brief Layout tag describing the memory referenced by buffer. */
  MemoryLayout layout{MemoryLayout::kBuffer1D};
};
} // namespace cl

// Module to support thread-safe multi-device execution.
Expand Down Expand Up @@ -349,7 +402,6 @@ class OpenCLModuleNode : public ModuleNode {
// parsed kernel data
std::unordered_map<std::string, std::string> parsed_kernels_;
};

} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_OPENCL_OPENCL_COMMON_H_
215 changes: 191 additions & 24 deletions src/runtime/opencl/opencl_device_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,63 @@ namespace cl {
std::string GetPlatformInfo(cl_platform_id pid, cl_platform_info param_name);
std::string GetDeviceInfo(cl_device_id pid, cl_device_info param_name);

/*!
 * \brief Origin, region and pitch values describing an image transfer.
 *  Every field defaults to zero.
 */
struct ImageInfo {
  /*! \brief Starting offset of the transfer, one entry per dimension. */
  size_t origin[3] = {0, 0, 0};
  /*! \brief Extent of the transfer, one entry per dimension. */
  size_t region[3] = {0, 0, 0};
  /*! \brief Row pitch passed along with the transfer. */
  size_t row_pitch{0};
  /*! \brief Slice pitch passed along with the transfer. */
  size_t slice_pitch{0};
};

/*!
 * \brief Utility to apply a memory layout specific lowering convention
 * to infer the physical shape from the provided DLTensor's logical shape.
 * \param desc Descriptor which contains the buffer and layout tag.
 * \param tensor The DLTensor used to infer the tensor's physical shape.
 * \return ImageInfo whose region is {width, height, 1} of the flattened 2d
 *  texture shape; origin and both pitches are zero.
 */
ImageInfo GetImageInfo(const cl::BufferDescriptor* desc, const DLTensor* tensor) {
  ICHECK(tensor->dtype.lanes == 1) << "Image dtype has lanes: " << tensor->dtype.lanes;

  // Value-initialization already zeroes origin, row_pitch and slice_pitch,
  // so only the region needs to be filled in.
  ImageInfo info{};

  // The layout tag decides at which axis the logical shape is split into the
  // two texture dimensions.
  size_t axis = DefaultTextureLayoutSeparator(
      tensor->ndim, cl::BufferDescriptor::ScopeFromMemoryLayout(desc->layout));
  auto texture_shape = ApplyTexture2DFlattening<int64_t>(tensor->shape, tensor->ndim, axis);
  info.region[0] = texture_shape.width;
  info.region[1] = texture_shape.height;
  info.region[2] = 1;
  return info;
}

/*!
 * \brief Translate a memory scope into the memory layout tracked by the runtime.
 * \param mem_scope Optional memory scope string.
 * \return kBuffer1D for an undefined scope or "global", kImage2DActivation for
 *  "global.texture" and kImage2DWeight for "global.texture-weight". Any other
 *  scope is reported through LOG(FATAL).
 */
cl::BufferDescriptor::MemoryLayout cl::BufferDescriptor::MemoryLayoutFromScope(
    Optional<String> mem_scope) {
  // "global" is the scope ScopeFromMemoryLayout reports for kBuffer1D, so it is
  // accepted here as well to keep the two translations round-trippable.
  if (!mem_scope.defined() || mem_scope.value() == "global") {
    return cl::BufferDescriptor::MemoryLayout::kBuffer1D;
  } else if (mem_scope.value() == "global.texture") {
    return cl::BufferDescriptor::MemoryLayout::kImage2DActivation;
  } else if (mem_scope.value() == "global.texture-weight") {
    return cl::BufferDescriptor::MemoryLayout::kImage2DWeight;
  }
  LOG(FATAL) << "No memory layout defined for memory of scope: " << mem_scope.value();
  // Placeholder value for the path that follows LOG(FATAL).
  return cl::BufferDescriptor::MemoryLayout::kBuffer1D;
}

/*!
 * \brief Translate a memory layout into its memory scope string.
 * \param layout The memory layout to translate.
 * \return "global" for kBuffer1D, "global.texture" for kImage2DActivation and
 *  "global.texture-weight" for kImage2DWeight. Any other value is reported
 *  through LOG(FATAL).
 */
String cl::BufferDescriptor::ScopeFromMemoryLayout(cl::BufferDescriptor::MemoryLayout layout) {
  using Layout = cl::BufferDescriptor::MemoryLayout;

  if (layout == Layout::kBuffer1D) {
    return "global";
  }
  if (layout == Layout::kImage2DActivation) {
    return "global.texture";
  }
  if (layout == Layout::kImage2DWeight) {
    return "global.texture-weight";
  }

  LOG(FATAL) << "No scope corresponding to the provided memory layout: "
             << static_cast<int>(layout);
  // Placeholder value for the path that follows LOG(FATAL).
  return "";
}

/*! \brief Return the entry provided by OpenCLThreadEntry::ThreadLocal(). */
OpenCLThreadEntry* OpenCLWorkspace::GetThreadEntry() {
  OpenCLThreadEntry* entry = OpenCLThreadEntry::ThreadLocal();
  return entry;
}

OpenCLWorkspace* OpenCLWorkspace::Global() {
Expand Down Expand Up @@ -138,41 +195,118 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment,
this->Init();
ICHECK(context != nullptr) << "No OpenCL device";
cl_int err_code;
cl_mem mptr = clCreateBuffer(this->context, CL_MEM_READ_WRITE, size, nullptr, &err_code);
cl::BufferDescriptor* desc = new cl::BufferDescriptor;
desc->buffer = clCreateBuffer(this->context, CL_MEM_READ_WRITE, size, nullptr, &err_code);
desc->layout = cl::BufferDescriptor::MemoryLayout::kBuffer1D;
OPENCL_CHECK_ERROR(err_code);
return mptr;
return desc;
}

/*!
 * \brief Allocate a data space for a tensor of the given shape and memory scope.
 *
 * An undefined scope or "global" is forwarded to DeviceAPI::AllocDataSpace.
 * Texture scopes are lowered to a 2d image and returned wrapped in a
 * heap-allocated cl::BufferDescriptor.
 *
 * \param dev The device to allocate on.
 * \param ndim Rank of the logical shape; must be greater than 2 for textures.
 * \param shape Logical shape of the tensor.
 * \param dtype Element type, used to select the image channel type.
 * \param mem_scope Optional memory scope of the allocation.
 * \return Opaque handle to the allocation.
 */
void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
                                      Optional<String> mem_scope) {
  if (!mem_scope.defined() || mem_scope.value() == "global") {
    return DeviceAPI::AllocDataSpace(dev, ndim, shape, dtype, mem_scope);
  }
  ICHECK(IsTextureStorage(std::string(mem_scope.value())))
      << "Device does not support allocate data space with "
      << "specified memory scope: " << mem_scope.value();

  ICHECK(ndim > 2) << "Shape for texture allocation must be at least rank 3; "
                   << "provided shape is rank " << ndim;

  // Run every step that can fail before the descriptor is heap-allocated, so a
  // failure in layout resolution, shape lowering or image creation cannot leak it.
  // NOTE(review): assumes ICHECK / LOG(FATAL) / OPENCL_CHECK_ERROR raise rather
  // than abort in this build -- confirm.
  const auto layout = cl::BufferDescriptor::MemoryLayoutFromScope(mem_scope);
  size_t axis = DefaultTextureLayoutSeparator(ndim, mem_scope.value());
  auto texture = ApplyTexture2DFlattening<int64_t>(shape, ndim, axis);
  cl_mem image = AllocTexture(dev, texture.width, texture.height, dtype);

  cl::BufferDescriptor* desc = new cl::BufferDescriptor;
  desc->buffer = image;
  desc->layout = layout;
  return desc;
}

/*!
 * \brief Release a data space previously returned by AllocDataSpace.
 * \param dev The device that owns the allocation.
 * \param ptr Pointer to the cl::BufferDescriptor wrapping the memory object.
 */
void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) {
  // We have to make sure that the memory object is not in the command queue
  // for some OpenCL platforms.
  OPENCL_CALL(clFinish(this->GetQueue(dev)));

  // AllocDataSpace hands out a cl::BufferDescriptor*, not a raw cl_mem, so the
  // handle must be released through the descriptor and the descriptor itself
  // deleted afterwards.
  cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(ptr);
  OPENCL_CALL(clReleaseMemObject(desc->buffer));
  delete desc;
}

void OpenCLWorkspace::CopyDataFromTo(const void* from, size_t from_offset, void* to,
size_t to_offset, size_t size, Device dev_from, Device dev_to,
DLDataType type_hint, TVMStreamHandle stream) {
cl_mem OpenCLWorkspace::AllocTexture(Device dev, size_t width, size_t height,
DLDataType type_hint) {
this->Init();
ICHECK(stream == nullptr);
if (IsOpenCLDevice(dev_from) && IsOpenCLDevice(dev_to)) {
OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(dev_to),
static_cast<cl_mem>((void*)from), // NOLINT(*)
static_cast<cl_mem>(to), from_offset, to_offset, size, 0,
nullptr, nullptr));
} else if (IsOpenCLDevice(dev_from) && dev_to.device_type == kDLCPU) {
OPENCL_CALL(clEnqueueReadBuffer(this->GetQueue(dev_from),
static_cast<cl_mem>((void*)from), // NOLINT(*)
CL_FALSE, from_offset, size, static_cast<char*>(to) + to_offset,
0, nullptr, nullptr));
OPENCL_CALL(clFinish(this->GetQueue(dev_from)));
} else if (dev_from.device_type == kDLCPU && IsOpenCLDevice(dev_to)) {
OPENCL_CALL(clEnqueueWriteBuffer(this->GetQueue(dev_to), static_cast<cl_mem>(to), CL_FALSE,
to_offset, size, static_cast<const char*>(from) + from_offset,
0, nullptr, nullptr));
OPENCL_CALL(clFinish(this->GetQueue(dev_to)));
ICHECK(context != nullptr) << "No OpenCL device";
cl_int err_code;
cl_channel_type cl_type = DTypeToOpenCLChannelType(type_hint);
cl_image_format format = {CL_RGBA, cl_type};
cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D, width, height, 0, 0, 0, 0, 0, 0};
cl_mem mptr =
clCreateImage(this->context, CL_MEM_READ_WRITE, &format, &descriptor, nullptr, &err_code);
OPENCL_CHECK_ERROR(err_code);
return mptr;
}

/*!
 * \brief Request a texture from the calling thread's texture pool.
 * \param dev The device to allocate on.
 * \param width Requested texture width.
 * \param height Requested texture height.
 * \param type_hint Element type of the texture.
 * \return The handle returned by the pool's AllocTexture.
 */
void* OpenCLWorkspace::AllocTextureWorkspace(Device dev, size_t width, size_t height,
                                             DLDataType type_hint) {
  auto& pool = GetThreadEntry()->texture_pool;
  return pool.AllocTexture(dev, width, height, type_hint);
}

/*!
 * \brief Hand a texture back to the calling thread's texture pool.
 * \param dev The device the texture belongs to.
 * \param ptr Handle to pass to the pool's FreeTexture.
 */
void OpenCLWorkspace::FreeTextureWorkspace(Device dev, void* ptr) {
  auto& pool = GetThreadEntry()->texture_pool;
  pool.FreeTexture(dev, ptr);
}

void OpenCLWorkspace::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
size_t nbytes = GetDataSize(*from);
ICHECK_EQ(nbytes, GetDataSize(*to));
ICHECK(IsContiguous(*from) && IsContiguous(*to))
<< "CopyDataFromTo only support contiguous array for now";

if (IsOpenCLDevice(from->device) && IsOpenCLDevice(to->device)) {
const auto* from_desc = static_cast<const cl::BufferDescriptor*>(from->data);
ICHECK(from_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D)
<< "Device to device copying is currently only implemented for OpenCL buffer storage";
auto* to_desc = static_cast<cl::BufferDescriptor*>(to->data);
OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(to->device), from_desc->buffer, to_desc->buffer,
from->byte_offset, to->byte_offset, nbytes, 0, nullptr,
nullptr));
} else if (IsOpenCLDevice(from->device) && to->device.device_type == kDLCPU) {
const auto* from_desc = static_cast<const cl::BufferDescriptor*>(from->data);
switch (from_desc->layout) {
case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
OPENCL_CALL(clEnqueueReadBuffer(
this->GetQueue(from->device), from_desc->buffer, CL_FALSE, from->byte_offset, nbytes,
static_cast<char*>(to->data) + to->byte_offset, 0, nullptr, nullptr));
break;
case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
auto image_info = GetImageInfo(from_desc, from);
// TODO(csullivan): Support calculating row_pitch correctly in the case of reuse.
// Note that when utilizing texture pools for memory reuse, the allocated image
// size can be larger than the size to be read.
OPENCL_CALL(clEnqueueReadImage(
this->GetQueue(from->device), from_desc->buffer, CL_FALSE, image_info.origin,
image_info.region, image_info.row_pitch, image_info.slice_pitch,
static_cast<char*>(to->data) + to->byte_offset, 0, nullptr, nullptr));
break;
}
OPENCL_CALL(clFinish(this->GetQueue(from->device)));
} else if (from->device.device_type == kDLCPU && IsOpenCLDevice(to->device)) {
auto* to_desc = static_cast<cl::BufferDescriptor*>(to->data);
switch (to_desc->layout) {
case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
OPENCL_CALL(clEnqueueWriteBuffer(
this->GetQueue(to->device), to_desc->buffer, CL_FALSE, to->byte_offset, nbytes,
static_cast<const char*>(from->data) + from->byte_offset, 0, nullptr, nullptr));
break;
case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
auto image_info = GetImageInfo(to_desc, to);
OPENCL_CALL(clEnqueueWriteImage(
this->GetQueue(to->device), to_desc->buffer, CL_FALSE, image_info.origin,
image_info.region, image_info.row_pitch, image_info.slice_pitch,
static_cast<const char*>(from->data) + from->byte_offset, 0, nullptr, nullptr));
break;
}
OPENCL_CALL(clFinish(this->GetQueue(to->device)));
} else {
LOG(FATAL) << "Expect copy from/to OpenCL or between OpenCL";
}
Expand Down Expand Up @@ -291,6 +425,39 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic
initialized_ = true;
}

/*!
 * \brief Packed function allocating a texture workspace.
 *
 * Arguments: device type, device id, width, height, dtype code, dtype bits.
 * The result is the handle returned by OpenCLWorkspace::AllocTextureWorkspace.
 */
TVM_REGISTER_GLOBAL("device_api.opencl.AllocTexture").set_body([](TVMArgs args, TVMRetValue* rv) {
  int device_type = args[0];
  int device_id = args[1];
  int width = args[2];
  int height = args[3];
  int dtype_code_hint = args[4];
  int dtype_bits_hint = args[5];

  // The extents are converted to size_t below; reject negative values here so
  // they do not silently wrap around to huge unsigned sizes.
  ICHECK(width >= 0 && height >= 0)
      << "Texture extent must be non-negative; got width=" << width << ", height=" << height;

  Device dev;
  dev.device_type = static_cast<DLDeviceType>(device_type);
  dev.device_id = device_id;

  DLDataType type_hint;
  type_hint.code = static_cast<decltype(type_hint.code)>(dtype_code_hint);
  type_hint.bits = static_cast<decltype(type_hint.bits)>(dtype_bits_hint);
  type_hint.lanes = 1;

  OpenCLWorkspace* ptr = OpenCLWorkspace::Global();
  *rv = ptr->AllocTextureWorkspace(dev, static_cast<size_t>(width), static_cast<size_t>(height),
                                   type_hint);
});

/*!
 * \brief Packed function releasing a texture workspace.
 *
 * Arguments: device type, device id, texture handle. The handle is passed to
 * OpenCLWorkspace::FreeTextureWorkspace and the result is always 0.
 */
TVM_REGISTER_GLOBAL("device_api.opencl.FreeTexture").set_body([](TVMArgs args, TVMRetValue* rv) {
  // Unpack the positional arguments.
  int type_code = args[0];
  int ordinal = args[1];
  void* texture = args[2];

  // Rebuild the device the texture belongs to.
  Device target;
  target.device_id = ordinal;
  target.device_type = static_cast<DLDeviceType>(type_code);

  OpenCLWorkspace* workspace = OpenCLWorkspace::Global();
  workspace->FreeTextureWorkspace(target, texture);
  *rv = static_cast<int32_t>(0);
});

TVM_REGISTER_GLOBAL("device_api.opencl").set_body([](TVMArgs args, TVMRetValue* rv) {
DeviceAPI* ptr = OpenCLWorkspace::Global();
*rv = static_cast<void*>(ptr);
Expand Down
3 changes: 2 additions & 1 deletion src/runtime/opencl/opencl_module.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ class OpenCLWrappedFunc {
}
// setup arguments.
for (cl_uint i = 0; i < arg_size_.size(); ++i) {
OPENCL_CALL(clSetKernelArg(kernel, i, arg_size_[i], void_args[i]));
auto* arg = static_cast<cl::BufferDescriptor*>(void_args[i]);
OPENCL_CALL(clSetKernelArg(kernel, i, arg_size_[i], arg->buffer));
}
cl_command_queue queue = w_->GetQueue(t->device);
ThreadWorkLoad wl = thread_axis_cfg_.Extract(args);
Expand Down
Loading

0 comments on commit 010d11b

Please sign in to comment.