From dc09fd90ce602f8b1e4006bd5a9ce75ff6bf12ac Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Fri, 24 Mar 2023 17:26:01 +0000 Subject: [PATCH 1/5] [SYCL] fix global_work_size kernel query descriptor --- sycl/plugins/cuda/pi_cuda.cpp | 162 +++++++++++++--------- sycl/plugins/level_zero/pi_level_zero.cpp | 11 +- 2 files changed, 105 insertions(+), 68 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 9e493502645a9..895fcabb2a577 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -2912,72 +2912,106 @@ pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, size_t param_value_size, void *param_value, size_t *param_value_size_ret) { + PI_ASSERT(kernel, PI_ERROR_INVALID_KERNEL); + PI_ASSERT(device, PI_ERROR_INVALID_DEVICE); + // Here we want to query about a kernel's cuda blocks! - if (kernel != nullptr) { + switch (param_name) { + case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { + size_t global_work_size[3] = {0, 0, 0}; - switch (param_name) { - case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { - int max_threads = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&max_threads, - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t(max_threads)); - } - case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { - size_t group_size[3] = {0, 0, 0}; - const auto &reqd_wg_size_md_map = - kernel->program_->kernelReqdWorkGroupSizeMD_; - const auto reqd_wg_size_md = reqd_wg_size_md_map.find(kernel->name_); - if (reqd_wg_size_md != reqd_wg_size_md_map.end()) { - const auto reqd_wg_size = reqd_wg_size_md->second; - group_size[0] = std::get<0>(reqd_wg_size); - group_size[1] = std::get<1>(reqd_wg_size); - group_size[2] = std::get<2>(reqd_wg_size); - } - return getInfoArray(3, param_value_size, param_value, - param_value_size_ret, group_size); - } - case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { - // OpenCL LOCAL == CUDA SHARED - int bytes = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(bytes)); - } - case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { - // Work groups should be multiples of the warp size - int warpSize = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - device->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(warpSize)); - } - case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { - // OpenCL PRIVATE == CUDA LOCAL - int bytes = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(bytes)); - } - case PI_KERNEL_GROUP_INFO_NUM_REGS: { - int numRegs = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint32(numRegs)); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } + size_t max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0}; + sycl::detail::pi::assertion( + cuDeviceGetAttribute(&max_block_dimX, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + device->get()) == CUDA_SUCCESS); + sycl::detail::pi::assertion( + cuDeviceGetAttribute(&max_block_dimY, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + device->get()) == CUDA_SUCCESS); + sycl::detail::pi::assertion( + cuDeviceGetAttribute(&max_block_dimZ, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + device->get()) == CUDA_SUCCESS); + + size_t max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0}; + sycl::detail::pi::assertion( + cuDeviceGetAttribute(&max_grid_dimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + device->get()) == CUDA_SUCCESS); + sycl::detail::pi::assertion( + cuDeviceGetAttribute(&max_grid_dimY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, + device->get()) == CUDA_SUCCESS); + sycl::detail::pi::assertion( + cuDeviceGetAttribute(&max_grid_dimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, + device->get()) == CUDA_SUCCESS); + + global_work_size[0] = max_block_dimX * max_grid_dimX; + global_work_size[1] = max_block_dimY * max_grid_dimY; + global_work_size[2] = max_block_dimZ * max_grid_dimZ; + return getInfoArray(3, param_value_size, param_value, param_value_size_ret, + global_work_size); + } + case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { + int max_threads = 0; + sycl::detail::pi::assertion( + cuFuncGetAttribute(&max_threads, + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + kernel->get()) == CUDA_SUCCESS); + return getInfo(param_value_size, param_value, param_value_size_ret, + size_t(max_threads)); + } + case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { + size_t group_size[3] = {0, 0, 0}; + const auto &reqd_wg_size_md_map = + kernel->program_->kernelReqdWorkGroupSizeMD_; + const auto reqd_wg_size_md = reqd_wg_size_md_map.find(kernel->name_); + if (reqd_wg_size_md != reqd_wg_size_md_map.end()) { + const auto reqd_wg_size = reqd_wg_size_md->second; + group_size[0] = std::get<0>(reqd_wg_size); + group_size[1] = std::get<1>(reqd_wg_size); + group_size[2] = std::get<2>(reqd_wg_size); + } + return getInfoArray(3, param_value_size, param_value, param_value_size_ret, + group_size); + } + case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { + // OpenCL LOCAL == CUDA SHARED + int bytes = 0; + sycl::detail::pi::assertion( + cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + kernel->get()) == CUDA_SUCCESS); + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint64(bytes)); + } + case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { + // Work groups should be multiples of the warp size + int warpSize = 0; + sycl::detail::pi::assertion( + cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + device->get()) == CUDA_SUCCESS); + return getInfo(param_value_size, param_value, param_value_size_ret, + static_cast(warpSize)); + } + case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { + // OpenCL PRIVATE == CUDA LOCAL + int bytes = 0; + sycl::detail::pi::assertion( + cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, + kernel->get()) == CUDA_SUCCESS); + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint64(bytes)); + } + case PI_KERNEL_GROUP_INFO_NUM_REGS: { + int numRegs = 0; + sycl::detail::pi::assertion( + cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, + kernel->get()) == CUDA_SUCCESS); + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint32(numRegs)); + } + default: + __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); } return PI_ERROR_INVALID_KERNEL; @@ -5552,7 +5586,7 @@ pi_result cuda_piextEnqueueDeviceGlobalVariableRead( // Windows: dynamically loaded plugins might have been unloaded already // when this is called. Sycl RT holds onto the PI plugin so it can be // called safely. But this is not transitive. If the PI plugin in turn -// dynamically loaded a different DLL, that may have been unloaded. +// dynamically loaded a different DLL, that may have been unloaded. // TODO: add a global variable lifetime management code here (see // pi_level_zero.cpp for reference) Currently this is just a NOOP. pi_result cuda_piTearDown(void *) { diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 7fbffe6d804d4..9069c1b608643 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -4233,10 +4233,13 @@ pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device, // TODO: To revisit after level_zero/issues/262 is resolved struct { size_t Arr[3]; - } WorkSize = {{Device->ZeDeviceComputeProperties->maxGroupSizeX, - Device->ZeDeviceComputeProperties->maxGroupSizeY, - Device->ZeDeviceComputeProperties->maxGroupSizeZ}}; - return ReturnValue(WorkSize); + } GlobalWorkSize = {{(Device->ZeDeviceComputeProperties->maxGroupSizeX * + Device->ZeDeviceComputeProperties->maxGroupCountX), + (Device->ZeDeviceComputeProperties->maxGroupSizeY * + Device->ZeDeviceComputeProperties->maxGroupCountY), + (Device->ZeDeviceComputeProperties->maxGroupSizeZ * + Device->ZeDeviceComputeProperties->maxGroupCountZ)}}; + return ReturnValue(GlobalWorkSize); } case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { // As of right now, L0 is missing API to query kernel and device specific From 4498c9a1be1312042cb7409a8f64b1d612e5950a Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Fri, 24 Mar 2023 18:03:57 +0000 Subject: [PATCH 2/5] [HIP] add support for HIP plugin --- sycl/plugins/cuda/pi_cuda.cpp | 4 +- sycl/plugins/hip/pi_hip.cpp | 147 ++++++++++++++++++++-------------- 2 files changed, 91 insertions(+), 60 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 895fcabb2a577..b57b6325de1a3 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -2921,7 +2921,7 @@ pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { size_t global_work_size[3] = {0, 0, 0}; - size_t max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0}; + int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0}; sycl::detail::pi::assertion( cuDeviceGetAttribute(&max_block_dimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, @@ -2935,7 +2935,7 @@ pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device->get()) == CUDA_SUCCESS); - size_t max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0}; + int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0}; sycl::detail::pi::assertion( cuDeviceGetAttribute(&max_grid_dimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device->get()) == CUDA_SUCCESS); diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index b3b2276fdfa5a..afbed7bfa86e9 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -3536,67 +3536,98 @@ pi_result hip_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, size_t param_value_size, void *param_value, size_t *param_value_size_ret) { + PI_ASSERT(kernel, PI_ERROR_INVALID_KERNEL); + PI_ASSERT(device, PI_ERROR_INVALID_DEVICE); + // here we want to query about a kernel's hip blocks! - if (kernel != nullptr) { + switch (param_name) { + case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { + size_t global_work_size[3] = {0, 0, 0}; - switch (param_name) { - case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { - int max_threads = 0; - sycl::detail::pi::assertion( - hipFuncGetAttribute(&max_threads, - HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - kernel->get()) == hipSuccess); - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t(max_threads)); - } - case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { - // Returns the work-group size specified in the kernel source or IL. - // If the work-group size is not specified in the kernel source or IL, - // (0, 0, 0) is returned. - // https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html - - // TODO: can we extract the work group size from the PTX? - size_t group_size[3] = {0, 0, 0}; - return getInfoArray(3, param_value_size, param_value, - param_value_size_ret, group_size); - } - case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { - // OpenCL LOCAL == HIP SHARED - int bytes = 0; - sycl::detail::pi::assertion( - hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, - kernel->get()) == hipSuccess); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(bytes)); - } - case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { - // Work groups should be multiples of the warp size - int warpSize = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, - device->get()) == hipSuccess); - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(warpSize)); - } - case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { - // OpenCL PRIVATE == HIP LOCAL - int bytes = 0; - sycl::detail::pi::assertion( - hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, - kernel->get()) == hipSuccess); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(bytes)); - } - case PI_KERNEL_GROUP_INFO_NUM_REGS: { - sycl::detail::pi::die("PI_KERNEL_GROUP_INFO_NUM_REGS in " - "piKernelGetGroupInfo not implemented\n"); - return {}; - } + int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0}; + sycl::detail::pi::assertion( + hipDeviceGetAttribute(&max_block_dimX, hipDeviceAttributeMaxBlockDimX, + device->get()) == hipSuccess); + sycl::detail::pi::assertion( + hipDeviceGetAttribute(&max_block_dimY, hipDeviceAttributeMaxBlockDimY, + device->get()) == hipSuccess); + sycl::detail::pi::assertion( + hipDeviceGetAttribute(&max_block_dimZ, hipDeviceAttributeMaxBlockDimZ, + device->get()) == hipSuccess); - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } + int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0}; + sycl::detail::pi::assertion( + hipDeviceGetAttribute(&max_grid_dimX, hipDeviceAttributeMaxGridDimX, + device->get()) == hipSuccess); + sycl::detail::pi::assertion( + hipDeviceGetAttribute(&max_grid_dimY, hipDeviceAttributeMaxGridDimY, + device->get()) == hipSuccess); + sycl::detail::pi::assertion( + hipDeviceGetAttribute(&max_grid_dimZ, hipDeviceAttributeMaxGridDimZ, + device->get()) == hipSuccess); + + global_work_size[0] = max_block_dimX * max_grid_dimX; + global_work_size[1] = max_block_dimY * max_grid_dimY; + global_work_size[2] = max_block_dimZ * max_grid_dimZ; + return getInfoArray(3, param_value_size, param_value, param_value_size_ret, + global_work_size); + } + case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { + int max_threads = 0; + sycl::detail::pi::assertion( + hipFuncGetAttribute(&max_threads, + HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + kernel->get()) == hipSuccess); + return getInfo(param_value_size, param_value, param_value_size_ret, + size_t(max_threads)); + } + case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { + // Returns the work-group size specified in the kernel source or IL. + // If the work-group size is not specified in the kernel source or IL, + // (0, 0, 0) is returned. + // https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html + + // TODO: can we extract the work group size from the PTX? + size_t group_size[3] = {0, 0, 0}; + return getInfoArray(3, param_value_size, param_value, param_value_size_ret, + group_size); + } + case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { + // OpenCL LOCAL == HIP SHARED + int bytes = 0; + sycl::detail::pi::assertion( + hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + kernel->get()) == hipSuccess); + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint64(bytes)); + } + case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { + // Work groups should be multiples of the warp size + int warpSize = 0; + sycl::detail::pi::assertion( + hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, + device->get()) == hipSuccess); + return getInfo(param_value_size, param_value, param_value_size_ret, + static_cast(warpSize)); + } + case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { + // OpenCL PRIVATE == HIP LOCAL + int bytes = 0; + sycl::detail::pi::assertion( + hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, + kernel->get()) == hipSuccess); + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint64(bytes)); + } + case PI_KERNEL_GROUP_INFO_NUM_REGS: { + sycl::detail::pi::die("PI_KERNEL_GROUP_INFO_NUM_REGS in " + "piKernelGetGroupInfo not implemented\n"); + return {}; + } + + default: + __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); } return PI_ERROR_INVALID_KERNEL; @@ -5348,7 +5379,7 @@ pi_result hip_piextEnqueueDeviceGlobalVariableRead( // Windows: dynamically loaded plugins might have been unloaded already // when this is called. Sycl RT holds onto the PI plugin so it can be // called safely. But this is not transitive. If the PI plugin in turn -// dynamically loaded a different DLL, that may have been unloaded. +// dynamically loaded a different DLL, that may have been unloaded. // TODO: add a global variable lifetime management code here (see // pi_level_zero.cpp for reference) Currently this is just a NOOP. pi_result hip_piTearDown(void *PluginParameter) { From 64da35a38a6f3324b29c3afdbb3d45f346972651 Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Fri, 24 Mar 2023 18:57:34 +0000 Subject: [PATCH 3/5] added a convience assert similar to UR --- sycl/plugins/cuda/pi_cuda.hpp | 5 +++++ sycl/plugins/hip/pi_hip.hpp | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index f6baeab0a4445..d35bf5fd1a3bc 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -42,6 +42,11 @@ #include #include +// Helper for one-liner validation +#define PI_ASSERT(condition, error) \ + if (!(condition)) \ + return error; + extern "C" { /// \cond IGNORE_BLOCK_IN_DOXYGEN diff --git a/sycl/plugins/hip/pi_hip.hpp b/sycl/plugins/hip/pi_hip.hpp index 7778f1d07be96..2001c7b22a425 100644 --- a/sycl/plugins/hip/pi_hip.hpp +++ b/sycl/plugins/hip/pi_hip.hpp @@ -39,6 +39,11 @@ #include #include +// Helper for one-liner validation +#define PI_ASSERT(condition, error) \ + if (!(condition)) \ + return error; + extern "C" { /// \cond INGORE_BLOCK_IN_DOXYGEN From 28393aba9b11bc4ace839adc1a88f5de2e56a03f Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Tue, 28 Mar 2023 18:18:09 +0000 Subject: [PATCH 4/5] fix the failing unit test to query for device-built-in kernels, custom device-types appropriately --- sycl/plugins/level_zero/pi_level_zero.cpp | 1 - sycl/test-e2e/Basic/kernel_info.cpp | 11 +++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index aff95f4e29001..2096fafe0c4ad 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -4228,7 +4228,6 @@ pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device, std::shared_lock Guard(Kernel->Mutex); switch (ParamName) { case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { - // TODO: To revisit after level_zero/issues/262 is resolved struct { size_t Arr[3]; } GlobalWorkSize = {{(Device->ZeDeviceComputeProperties->maxGroupSizeX * diff --git a/sycl/test-e2e/Basic/kernel_info.cpp b/sycl/test-e2e/Basic/kernel_info.cpp index 0703ec2d97213..e359ba1eb17ab 100644 --- a/sycl/test-e2e/Basic/kernel_info.cpp +++ b/sycl/test-e2e/Basic/kernel_info.cpp @@ -6,8 +6,6 @@ // Fail is flaky for level_zero, enable when fixed. // UNSUPPORTED: level_zero // -// CUDA and HIP do not currently implement global_work_size -// UNSUPPORTED: cuda, hip //==--- kernel_info.cpp - SYCL kernel info test ----------------------------==// // @@ -56,9 +54,14 @@ int main() { assert(prefWGSizeMult > 0); try { + // To check (a) first if the kernel is device built-in, (b) then check if + // the device type is custom + if (!sycl::is_compatible({KernelID}, q.get_device())) { + assert(dev.get_info() == + sycl::info::device_type::custom); + } + krn.get_info(dev); - assert(dev.get_info() == - sycl::info::device_type::custom); } catch (sycl::exception &e) { assert(e.code() == sycl::errc::invalid); } From fb405f51f5d09c9485e67a808c13af89da23966d Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Tue, 4 Apr 2023 13:19:09 +0000 Subject: [PATCH 5/5] address PR comments to just reflect the changes related to global_work_sizes --- sycl/plugins/cuda/pi_cuda.cpp | 193 +++++++++++++++++----------------- sycl/plugins/cuda/pi_cuda.hpp | 5 - sycl/plugins/hip/pi_hip.cpp | 172 +++++++++++++++--------------- sycl/plugins/hip/pi_hip.hpp | 5 - 4 files changed, 184 insertions(+), 191 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 1156990855686..59a13381e2eb9 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -2935,106 +2935,109 @@ pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, size_t param_value_size, void *param_value, size_t *param_value_size_ret) { - PI_ASSERT(kernel, PI_ERROR_INVALID_KERNEL); - PI_ASSERT(device, PI_ERROR_INVALID_DEVICE); - // Here we want to query about a kernel's cuda blocks! - switch (param_name) { - case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { - size_t global_work_size[3] = {0, 0, 0}; + if (kernel != nullptr) { - int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0}; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_block_dimX, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_block_dimY, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_block_dimZ, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - device->get()) == CUDA_SUCCESS); + switch (param_name) { + case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { + size_t global_work_size[3] = {0, 0, 0}; - int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0}; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_grid_dimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_grid_dimY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_grid_dimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, - device->get()) == CUDA_SUCCESS); + int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0}; + sycl::detail::pi::assertion( + cuDeviceGetAttribute(&max_block_dimX, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + device->get()) == CUDA_SUCCESS); + sycl::detail::pi::assertion( + cuDeviceGetAttribute(&max_block_dimY, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + device->get()) == CUDA_SUCCESS); + sycl::detail::pi::assertion( + cuDeviceGetAttribute(&max_block_dimZ, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + device->get()) == CUDA_SUCCESS); - global_work_size[0] = max_block_dimX * max_grid_dimX; - global_work_size[1] = max_block_dimY * max_grid_dimY; - global_work_size[2] = max_block_dimZ * max_grid_dimZ; - return getInfoArray(3, param_value_size, param_value, param_value_size_ret, - global_work_size); - } - case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { - int max_threads = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&max_threads, - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t(max_threads)); - } - case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { - size_t group_size[3] = {0, 0, 0}; - const auto &reqd_wg_size_md_map = - kernel->program_->kernelReqdWorkGroupSizeMD_; - const auto reqd_wg_size_md = reqd_wg_size_md_map.find(kernel->name_); - if (reqd_wg_size_md != reqd_wg_size_md_map.end()) { - const auto reqd_wg_size = reqd_wg_size_md->second; - group_size[0] = std::get<0>(reqd_wg_size); - group_size[1] = std::get<1>(reqd_wg_size); - group_size[2] = std::get<2>(reqd_wg_size); - } - return getInfoArray(3, param_value_size, param_value, param_value_size_ret, - group_size); - } - case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { - // OpenCL LOCAL == CUDA SHARED - int bytes = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(bytes)); - } - case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { - // Work groups should be multiples of the warp size - int warpSize = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - device->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(warpSize)); - } - case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { - // OpenCL PRIVATE == CUDA LOCAL - int bytes = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(bytes)); - } - case PI_KERNEL_GROUP_INFO_NUM_REGS: { - int numRegs = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint32(numRegs)); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); + int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0}; + sycl::detail::pi::assertion( + cuDeviceGetAttribute(&max_grid_dimX, + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + device->get()) == CUDA_SUCCESS); + sycl::detail::pi::assertion( + cuDeviceGetAttribute(&max_grid_dimY, + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, + device->get()) == CUDA_SUCCESS); + sycl::detail::pi::assertion( + cuDeviceGetAttribute(&max_grid_dimZ, + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, + device->get()) == CUDA_SUCCESS); + + global_work_size[0] = max_block_dimX * max_grid_dimX; + global_work_size[1] = max_block_dimY * max_grid_dimY; + global_work_size[2] = max_block_dimZ * max_grid_dimZ; + return getInfoArray(3, param_value_size, param_value, + param_value_size_ret, global_work_size); + } + case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { + int max_threads = 0; + sycl::detail::pi::assertion( + cuFuncGetAttribute(&max_threads, + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + kernel->get()) == CUDA_SUCCESS); + return getInfo(param_value_size, param_value, param_value_size_ret, + size_t(max_threads)); + } + case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { + size_t group_size[3] = {0, 0, 0}; + const auto &reqd_wg_size_md_map = + kernel->program_->kernelReqdWorkGroupSizeMD_; + const auto reqd_wg_size_md = reqd_wg_size_md_map.find(kernel->name_); + if (reqd_wg_size_md != reqd_wg_size_md_map.end()) { + const auto reqd_wg_size = reqd_wg_size_md->second; + group_size[0] = std::get<0>(reqd_wg_size); + group_size[1] = std::get<1>(reqd_wg_size); + group_size[2] = std::get<2>(reqd_wg_size); + } + return getInfoArray(3, param_value_size, param_value, + param_value_size_ret, group_size); + } + case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { + // OpenCL LOCAL == CUDA SHARED + int bytes = 0; + sycl::detail::pi::assertion( + cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + kernel->get()) == CUDA_SUCCESS); + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint64(bytes)); + } + case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { + // Work groups should be multiples of the warp size + int warpSize = 0; + sycl::detail::pi::assertion( + cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + device->get()) == CUDA_SUCCESS); + return getInfo(param_value_size, param_value, param_value_size_ret, + static_cast(warpSize)); + } + case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { + // OpenCL PRIVATE == CUDA LOCAL + int bytes = 0; + sycl::detail::pi::assertion( + cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, + kernel->get()) == CUDA_SUCCESS); + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint64(bytes)); + } + case PI_KERNEL_GROUP_INFO_NUM_REGS: { + int numRegs = 0; + sycl::detail::pi::assertion( + cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, + kernel->get()) == CUDA_SUCCESS); + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint32(numRegs)); + } + default: + __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); + } } return PI_ERROR_INVALID_KERNEL; diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index d35bf5fd1a3bc..f6baeab0a4445 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -42,11 +42,6 @@ #include #include -// Helper for one-liner validation -#define PI_ASSERT(condition, error) \ - if (!(condition)) \ - return error; - extern "C" { /// \cond IGNORE_BLOCK_IN_DOXYGEN diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index bf07feb565f7e..acac19eef627a 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -3581,98 +3581,98 @@ pi_result hip_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, size_t param_value_size, void *param_value, size_t *param_value_size_ret) { - PI_ASSERT(kernel, PI_ERROR_INVALID_KERNEL); - PI_ASSERT(device, PI_ERROR_INVALID_DEVICE); - // here we want to query about a kernel's hip blocks! - switch (param_name) { - case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { - size_t global_work_size[3] = {0, 0, 0}; + if (kernel != nullptr) { - int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0}; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_block_dimX, hipDeviceAttributeMaxBlockDimX, - device->get()) == hipSuccess); - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_block_dimY, hipDeviceAttributeMaxBlockDimY, - device->get()) == hipSuccess); - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_block_dimZ, hipDeviceAttributeMaxBlockDimZ, - device->get()) == hipSuccess); + switch (param_name) { + case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { + size_t global_work_size[3] = {0, 0, 0}; - int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0}; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_grid_dimX, hipDeviceAttributeMaxGridDimX, - device->get()) == hipSuccess); - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_grid_dimY, hipDeviceAttributeMaxGridDimY, - device->get()) == hipSuccess); - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&max_grid_dimZ, hipDeviceAttributeMaxGridDimZ, - device->get()) == hipSuccess); + int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0}; + sycl::detail::pi::assertion( + hipDeviceGetAttribute(&max_block_dimX, hipDeviceAttributeMaxBlockDimX, + device->get()) == hipSuccess); + sycl::detail::pi::assertion( + hipDeviceGetAttribute(&max_block_dimY, hipDeviceAttributeMaxBlockDimY, + device->get()) == hipSuccess); + sycl::detail::pi::assertion( + hipDeviceGetAttribute(&max_block_dimZ, hipDeviceAttributeMaxBlockDimZ, + device->get()) == hipSuccess); - global_work_size[0] = max_block_dimX * max_grid_dimX; - global_work_size[1] = max_block_dimY * max_grid_dimY; - global_work_size[2] = max_block_dimZ * max_grid_dimZ; - return getInfoArray(3, param_value_size, param_value, param_value_size_ret, - global_work_size); - } - case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { - int max_threads = 0; - sycl::detail::pi::assertion( - hipFuncGetAttribute(&max_threads, - HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - kernel->get()) == hipSuccess); - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t(max_threads)); - } - case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { - // Returns the work-group size specified in the kernel source or IL. - // If the work-group size is not specified in the kernel source or IL, - // (0, 0, 0) is returned. - // https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html - - // TODO: can we extract the work group size from the PTX? - size_t group_size[3] = {0, 0, 0}; - return getInfoArray(3, param_value_size, param_value, param_value_size_ret, - group_size); - } - case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { - // OpenCL LOCAL == HIP SHARED - int bytes = 0; - sycl::detail::pi::assertion( - hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, - kernel->get()) == hipSuccess); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(bytes)); - } - case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { - // Work groups should be multiples of the warp size - int warpSize = 0; - sycl::detail::pi::assertion( - hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, - device->get()) == hipSuccess); - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(warpSize)); - } - case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { - // OpenCL PRIVATE == HIP LOCAL - int bytes = 0; - sycl::detail::pi::assertion( - hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, - kernel->get()) == hipSuccess); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(bytes)); - } - case PI_KERNEL_GROUP_INFO_NUM_REGS: { - sycl::detail::pi::die("PI_KERNEL_GROUP_INFO_NUM_REGS in " - "piKernelGetGroupInfo not implemented\n"); - return {}; - } + int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0}; + sycl::detail::pi::assertion( + hipDeviceGetAttribute(&max_grid_dimX, hipDeviceAttributeMaxGridDimX, + device->get()) == hipSuccess); + sycl::detail::pi::assertion( + hipDeviceGetAttribute(&max_grid_dimY, hipDeviceAttributeMaxGridDimY, + device->get()) == hipSuccess); + sycl::detail::pi::assertion( + hipDeviceGetAttribute(&max_grid_dimZ, hipDeviceAttributeMaxGridDimZ, + device->get()) == hipSuccess); - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); + global_work_size[0] = max_block_dimX * max_grid_dimX; + global_work_size[1] = max_block_dimY * max_grid_dimY; + global_work_size[2] = max_block_dimZ * max_grid_dimZ; + return getInfoArray(3, param_value_size, param_value, + param_value_size_ret, global_work_size); + } + case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { + int max_threads = 0; + sycl::detail::pi::assertion( + hipFuncGetAttribute(&max_threads, + HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + kernel->get()) == hipSuccess); + return getInfo(param_value_size, param_value, param_value_size_ret, + size_t(max_threads)); + } + case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { + // Returns the work-group size specified in the kernel source or IL. + // If the work-group size is not specified in the kernel source or IL, + // (0, 0, 0) is returned. + // https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html + + // TODO: can we extract the work group size from the PTX? + size_t group_size[3] = {0, 0, 0}; + return getInfoArray(3, param_value_size, param_value, + param_value_size_ret, group_size); + } + case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { + // OpenCL LOCAL == HIP SHARED + int bytes = 0; + sycl::detail::pi::assertion( + hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + kernel->get()) == hipSuccess); + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint64(bytes)); + } + case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { + // Work groups should be multiples of the warp size + int warpSize = 0; + sycl::detail::pi::assertion( + hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, + device->get()) == hipSuccess); + return getInfo(param_value_size, param_value, param_value_size_ret, + static_cast(warpSize)); + } + case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { + // OpenCL PRIVATE == HIP LOCAL + int bytes = 0; + sycl::detail::pi::assertion( + hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, + kernel->get()) == hipSuccess); + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint64(bytes)); + } + case PI_KERNEL_GROUP_INFO_NUM_REGS: { + sycl::detail::pi::die("PI_KERNEL_GROUP_INFO_NUM_REGS in " + "piKernelGetGroupInfo not implemented\n"); + return {}; + } + + default: + __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); + } } return PI_ERROR_INVALID_KERNEL; diff --git a/sycl/plugins/hip/pi_hip.hpp b/sycl/plugins/hip/pi_hip.hpp index 2001c7b22a425..7778f1d07be96 100644 --- a/sycl/plugins/hip/pi_hip.hpp +++ b/sycl/plugins/hip/pi_hip.hpp @@ -39,11 +39,6 @@ #include #include -// Helper for one-liner validation -#define PI_ASSERT(condition, error) \ - if (!(condition)) \ - return error; - extern "C" { /// \cond INGORE_BLOCK_IN_DOXYGEN