From dc09fd90ce602f8b1e4006bd5a9ce75ff6bf12ac Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Fri, 24 Mar 2023 17:26:01 +0000
Subject: [PATCH 1/5] [SYCL] fix global_work_size kernel query descriptor

---
 sycl/plugins/cuda/pi_cuda.cpp             | 162 +++++++++++++---------
 sycl/plugins/level_zero/pi_level_zero.cpp |  11 +-
 2 files changed, 105 insertions(+), 68 deletions(-)

diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 9e493502645a9..895fcabb2a577 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -2912,72 +2912,106 @@ pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device,
                                     size_t param_value_size, void *param_value,
                                     size_t *param_value_size_ret) {
 
+  PI_ASSERT(kernel, PI_ERROR_INVALID_KERNEL);
+  PI_ASSERT(device, PI_ERROR_INVALID_DEVICE);
+
   // Here we want to query about a kernel's cuda blocks!
 
-  if (kernel != nullptr) {
+  switch (param_name) {
+  case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: {
+    size_t global_work_size[3] = {0, 0, 0};
 
-    switch (param_name) {
-    case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
-      int max_threads = 0;
-      sycl::detail::pi::assertion(
-          cuFuncGetAttribute(&max_threads,
-                             CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
-                             kernel->get()) == CUDA_SUCCESS);
-      return getInfo(param_value_size, param_value, param_value_size_ret,
-                     size_t(max_threads));
-    }
-    case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: {
-      size_t group_size[3] = {0, 0, 0};
-      const auto &reqd_wg_size_md_map =
-          kernel->program_->kernelReqdWorkGroupSizeMD_;
-      const auto reqd_wg_size_md = reqd_wg_size_md_map.find(kernel->name_);
-      if (reqd_wg_size_md != reqd_wg_size_md_map.end()) {
-        const auto reqd_wg_size = reqd_wg_size_md->second;
-        group_size[0] = std::get<0>(reqd_wg_size);
-        group_size[1] = std::get<1>(reqd_wg_size);
-        group_size[2] = std::get<2>(reqd_wg_size);
-      }
-      return getInfoArray(3, param_value_size, param_value,
-                          param_value_size_ret, group_size);
-    }
-    case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: {
-      // OpenCL LOCAL == CUDA SHARED
-      int bytes = 0;
-      sycl::detail::pi::assertion(
-          cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
-                             kernel->get()) == CUDA_SUCCESS);
-      return getInfo(param_value_size, param_value, param_value_size_ret,
-                     pi_uint64(bytes));
-    }
-    case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: {
-      // Work groups should be multiples of the warp size
-      int warpSize = 0;
-      sycl::detail::pi::assertion(
-          cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
-                               device->get()) == CUDA_SUCCESS);
-      return getInfo(param_value_size, param_value, param_value_size_ret,
-                     static_cast<size_t>(warpSize));
-    }
-    case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: {
-      // OpenCL PRIVATE == CUDA LOCAL
-      int bytes = 0;
-      sycl::detail::pi::assertion(
-          cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
-                             kernel->get()) == CUDA_SUCCESS);
-      return getInfo(param_value_size, param_value, param_value_size_ret,
-                     pi_uint64(bytes));
-    }
-    case PI_KERNEL_GROUP_INFO_NUM_REGS: {
-      int numRegs = 0;
-      sycl::detail::pi::assertion(
-          cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS,
-                             kernel->get()) == CUDA_SUCCESS);
-      return getInfo(param_value_size, param_value, param_value_size_ret,
-                     pi_uint32(numRegs));
-    }
-    default:
-      __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
-    }
+    size_t max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0};
+    sycl::detail::pi::assertion(
+        cuDeviceGetAttribute(&max_block_dimX,
+                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
+                             device->get()) == CUDA_SUCCESS);
+    sycl::detail::pi::assertion(
+        cuDeviceGetAttribute(&max_block_dimY,
+                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
+                             device->get()) == CUDA_SUCCESS);
+    sycl::detail::pi::assertion(
+        cuDeviceGetAttribute(&max_block_dimZ,
+                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
+                             device->get()) == CUDA_SUCCESS);
+
+    size_t max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0};
+    sycl::detail::pi::assertion(
+        cuDeviceGetAttribute(&max_grid_dimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
+                             device->get()) == CUDA_SUCCESS);
+    sycl::detail::pi::assertion(
+        cuDeviceGetAttribute(&max_grid_dimY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
+                             device->get()) == CUDA_SUCCESS);
+    sycl::detail::pi::assertion(
+        cuDeviceGetAttribute(&max_grid_dimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
+                             device->get()) == CUDA_SUCCESS);
+
+    global_work_size[0] = max_block_dimX * max_grid_dimX;
+    global_work_size[1] = max_block_dimY * max_grid_dimY;
+    global_work_size[2] = max_block_dimZ * max_grid_dimZ;
+    return getInfoArray(3, param_value_size, param_value, param_value_size_ret,
+                        global_work_size);
+  }
+  case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
+    int max_threads = 0;
+    sycl::detail::pi::assertion(
+        cuFuncGetAttribute(&max_threads,
+                           CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+                           kernel->get()) == CUDA_SUCCESS);
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   size_t(max_threads));
+  }
+  case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: {
+    size_t group_size[3] = {0, 0, 0};
+    const auto &reqd_wg_size_md_map =
+        kernel->program_->kernelReqdWorkGroupSizeMD_;
+    const auto reqd_wg_size_md = reqd_wg_size_md_map.find(kernel->name_);
+    if (reqd_wg_size_md != reqd_wg_size_md_map.end()) {
+      const auto reqd_wg_size = reqd_wg_size_md->second;
+      group_size[0] = std::get<0>(reqd_wg_size);
+      group_size[1] = std::get<1>(reqd_wg_size);
+      group_size[2] = std::get<2>(reqd_wg_size);
+    }
+    return getInfoArray(3, param_value_size, param_value, param_value_size_ret,
+                        group_size);
+  }
+  case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: {
+    // OpenCL LOCAL == CUDA SHARED
+    int bytes = 0;
+    sycl::detail::pi::assertion(
+        cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
+                           kernel->get()) == CUDA_SUCCESS);
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   pi_uint64(bytes));
+  }
+  case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: {
+    // Work groups should be multiples of the warp size
+    int warpSize = 0;
+    sycl::detail::pi::assertion(
+        cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
+                             device->get()) == CUDA_SUCCESS);
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   static_cast<size_t>(warpSize));
+  }
+  case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: {
+    // OpenCL PRIVATE == CUDA LOCAL
+    int bytes = 0;
+    sycl::detail::pi::assertion(
+        cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
+                           kernel->get()) == CUDA_SUCCESS);
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   pi_uint64(bytes));
+  }
+  case PI_KERNEL_GROUP_INFO_NUM_REGS: {
+    int numRegs = 0;
+    sycl::detail::pi::assertion(
+        cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS,
+                           kernel->get()) == CUDA_SUCCESS);
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   pi_uint32(numRegs));
+  }
+  default:
+    __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
   }
 
   return PI_ERROR_INVALID_KERNEL;
@@ -5552,7 +5586,7 @@ pi_result cuda_piextEnqueueDeviceGlobalVariableRead(
 // Windows: dynamically loaded plugins might have been unloaded already
 // when this is called. Sycl RT holds onto the PI plugin so it can be
 // called safely. But this is not transitive. If the PI plugin in turn
-// dynamically loaded a different DLL, that may have been unloaded. 
+// dynamically loaded a different DLL, that may have been unloaded.
 // TODO: add a global variable lifetime management code here (see
 // pi_level_zero.cpp for reference) Currently this is just a NOOP.
 pi_result cuda_piTearDown(void *) {
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index 7fbffe6d804d4..9069c1b608643 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -4233,10 +4233,13 @@ pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device,
     // TODO: To revisit after level_zero/issues/262 is resolved
     struct {
       size_t Arr[3];
-    } WorkSize = {{Device->ZeDeviceComputeProperties->maxGroupSizeX,
-                   Device->ZeDeviceComputeProperties->maxGroupSizeY,
-                   Device->ZeDeviceComputeProperties->maxGroupSizeZ}};
-    return ReturnValue(WorkSize);
+    } GlobalWorkSize = {{(Device->ZeDeviceComputeProperties->maxGroupSizeX *
+                          Device->ZeDeviceComputeProperties->maxGroupCountX),
+                         (Device->ZeDeviceComputeProperties->maxGroupSizeY *
+                          Device->ZeDeviceComputeProperties->maxGroupCountY),
+                         (Device->ZeDeviceComputeProperties->maxGroupSizeZ *
+                          Device->ZeDeviceComputeProperties->maxGroupCountZ)}};
+    return ReturnValue(GlobalWorkSize);
   }
   case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
     // As of right now, L0 is missing API to query kernel and device specific

From 4498c9a1be1312042cb7409a8f64b1d612e5950a Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Fri, 24 Mar 2023 18:03:57 +0000
Subject: [PATCH 2/5] [HIP] add support for HIP plugin

---
 sycl/plugins/cuda/pi_cuda.cpp |   4 +-
 sycl/plugins/hip/pi_hip.cpp   | 147 ++++++++++++++++++++--------------
 2 files changed, 91 insertions(+), 60 deletions(-)

diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 895fcabb2a577..b57b6325de1a3 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -2921,7 +2921,7 @@ pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device,
   case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: {
     size_t global_work_size[3] = {0, 0, 0};
 
-    size_t max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0};
+    int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0};
     sycl::detail::pi::assertion(
         cuDeviceGetAttribute(&max_block_dimX,
                              CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
@@ -2935,7 +2935,7 @@ pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device,
                              CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
                              device->get()) == CUDA_SUCCESS);
 
-    size_t max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0};
+    int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0};
     sycl::detail::pi::assertion(
         cuDeviceGetAttribute(&max_grid_dimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                              device->get()) == CUDA_SUCCESS);
diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp
index b3b2276fdfa5a..afbed7bfa86e9 100644
--- a/sycl/plugins/hip/pi_hip.cpp
+++ b/sycl/plugins/hip/pi_hip.cpp
@@ -3536,67 +3536,98 @@ pi_result hip_piKernelGetGroupInfo(pi_kernel kernel, pi_device device,
                                    size_t param_value_size, void *param_value,
                                    size_t *param_value_size_ret) {
 
+  PI_ASSERT(kernel, PI_ERROR_INVALID_KERNEL);
+  PI_ASSERT(device, PI_ERROR_INVALID_DEVICE);
+
   // here we want to query about a kernel's hip blocks!
 
-  if (kernel != nullptr) {
+  switch (param_name) {
+  case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: {
+    size_t global_work_size[3] = {0, 0, 0};
 
-    switch (param_name) {
-    case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
-      int max_threads = 0;
-      sycl::detail::pi::assertion(
-          hipFuncGetAttribute(&max_threads,
-                              HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
-                              kernel->get()) == hipSuccess);
-      return getInfo(param_value_size, param_value, param_value_size_ret,
-                     size_t(max_threads));
-    }
-    case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: {
-      // Returns the work-group size specified in the kernel source or IL.
-      // If the work-group size is not specified in the kernel source or IL,
-      // (0, 0, 0) is returned.
-      // https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html
-
-      // TODO: can we extract the work group size from the PTX?
-      size_t group_size[3] = {0, 0, 0};
-      return getInfoArray(3, param_value_size, param_value,
-                          param_value_size_ret, group_size);
-    }
-    case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: {
-      // OpenCL LOCAL == HIP SHARED
-      int bytes = 0;
-      sycl::detail::pi::assertion(
-          hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
-                              kernel->get()) == hipSuccess);
-      return getInfo(param_value_size, param_value, param_value_size_ret,
-                     pi_uint64(bytes));
-    }
-    case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: {
-      // Work groups should be multiples of the warp size
-      int warpSize = 0;
-      sycl::detail::pi::assertion(
-          hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize,
-                                device->get()) == hipSuccess);
-      return getInfo(param_value_size, param_value, param_value_size_ret,
-                     static_cast<size_t>(warpSize));
-    }
-    case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: {
-      // OpenCL PRIVATE == HIP LOCAL
-      int bytes = 0;
-      sycl::detail::pi::assertion(
-          hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
-                              kernel->get()) == hipSuccess);
-      return getInfo(param_value_size, param_value, param_value_size_ret,
-                     pi_uint64(bytes));
-    }
-    case PI_KERNEL_GROUP_INFO_NUM_REGS: {
-      sycl::detail::pi::die("PI_KERNEL_GROUP_INFO_NUM_REGS in "
-                            "piKernelGetGroupInfo not implemented\n");
-      return {};
-    }
+    int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0};
+    sycl::detail::pi::assertion(
+        hipDeviceGetAttribute(&max_block_dimX, hipDeviceAttributeMaxBlockDimX,
+                              device->get()) == hipSuccess);
+    sycl::detail::pi::assertion(
+        hipDeviceGetAttribute(&max_block_dimY, hipDeviceAttributeMaxBlockDimY,
+                              device->get()) == hipSuccess);
+    sycl::detail::pi::assertion(
+        hipDeviceGetAttribute(&max_block_dimZ, hipDeviceAttributeMaxBlockDimZ,
+                              device->get()) == hipSuccess);
 
-    default:
-      __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
-    }
+    int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0};
+    sycl::detail::pi::assertion(
+        hipDeviceGetAttribute(&max_grid_dimX, hipDeviceAttributeMaxGridDimX,
+                              device->get()) == hipSuccess);
+    sycl::detail::pi::assertion(
+        hipDeviceGetAttribute(&max_grid_dimY, hipDeviceAttributeMaxGridDimY,
+                              device->get()) == hipSuccess);
+    sycl::detail::pi::assertion(
+        hipDeviceGetAttribute(&max_grid_dimZ, hipDeviceAttributeMaxGridDimZ,
+                              device->get()) == hipSuccess);
+
+    global_work_size[0] = max_block_dimX * max_grid_dimX;
+    global_work_size[1] = max_block_dimY * max_grid_dimY;
+    global_work_size[2] = max_block_dimZ * max_grid_dimZ;
+    return getInfoArray(3, param_value_size, param_value, param_value_size_ret,
+                        global_work_size);
+  }
+  case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
+    int max_threads = 0;
+    sycl::detail::pi::assertion(
+        hipFuncGetAttribute(&max_threads,
+                            HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+                            kernel->get()) == hipSuccess);
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   size_t(max_threads));
+  }
+  case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: {
+    // Returns the work-group size specified in the kernel source or IL.
+    // If the work-group size is not specified in the kernel source or IL,
+    // (0, 0, 0) is returned.
+    // https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html
+
+    // TODO: can we extract the work group size from the PTX?
+    size_t group_size[3] = {0, 0, 0};
+    return getInfoArray(3, param_value_size, param_value, param_value_size_ret,
+                        group_size);
+  }
+  case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: {
+    // OpenCL LOCAL == HIP SHARED
+    int bytes = 0;
+    sycl::detail::pi::assertion(
+        hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
+                            kernel->get()) == hipSuccess);
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   pi_uint64(bytes));
+  }
+  case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: {
+    // Work groups should be multiples of the warp size
+    int warpSize = 0;
+    sycl::detail::pi::assertion(
+        hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize,
+                              device->get()) == hipSuccess);
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   static_cast<size_t>(warpSize));
+  }
+  case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: {
+    // OpenCL PRIVATE == HIP LOCAL
+    int bytes = 0;
+    sycl::detail::pi::assertion(
+        hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
+                            kernel->get()) == hipSuccess);
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   pi_uint64(bytes));
+  }
+  case PI_KERNEL_GROUP_INFO_NUM_REGS: {
+    sycl::detail::pi::die("PI_KERNEL_GROUP_INFO_NUM_REGS in "
+                          "piKernelGetGroupInfo not implemented\n");
+    return {};
+  }
+
+  default:
+    __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
   }
 
   return PI_ERROR_INVALID_KERNEL;
@@ -5348,7 +5379,7 @@ pi_result hip_piextEnqueueDeviceGlobalVariableRead(
 // Windows: dynamically loaded plugins might have been unloaded already
 // when this is called. Sycl RT holds onto the PI plugin so it can be
 // called safely. But this is not transitive. If the PI plugin in turn
-// dynamically loaded a different DLL, that may have been unloaded. 
+// dynamically loaded a different DLL, that may have been unloaded.
 // TODO: add a global variable lifetime management code here (see
 // pi_level_zero.cpp for reference) Currently this is just a NOOP.
 pi_result hip_piTearDown(void *PluginParameter) {

From 64da35a38a6f3324b29c3afdbb3d45f346972651 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Fri, 24 Mar 2023 18:57:34 +0000
Subject: [PATCH 3/5] added a convience assert similar to UR

---
 sycl/plugins/cuda/pi_cuda.hpp | 5 +++++
 sycl/plugins/hip/pi_hip.hpp   | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp
index f6baeab0a4445..d35bf5fd1a3bc 100644
--- a/sycl/plugins/cuda/pi_cuda.hpp
+++ b/sycl/plugins/cuda/pi_cuda.hpp
@@ -42,6 +42,11 @@
 #include <unordered_map>
 #include <vector>
 
+// Helper for one-liner validation
+#define PI_ASSERT(condition, error)                                            \
+  if (!(condition))                                                            \
+    return error;
+
 extern "C" {
 
 /// \cond IGNORE_BLOCK_IN_DOXYGEN
diff --git a/sycl/plugins/hip/pi_hip.hpp b/sycl/plugins/hip/pi_hip.hpp
index 7778f1d07be96..2001c7b22a425 100644
--- a/sycl/plugins/hip/pi_hip.hpp
+++ b/sycl/plugins/hip/pi_hip.hpp
@@ -39,6 +39,11 @@
 #include <string>
 #include <vector>
 
+// Helper for one-liner validation
+#define PI_ASSERT(condition, error)                                            \
+  if (!(condition))                                                            \
+    return error;
+
 extern "C" {
 
 /// \cond INGORE_BLOCK_IN_DOXYGEN

From 28393aba9b11bc4ace839adc1a88f5de2e56a03f Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Tue, 28 Mar 2023 18:18:09 +0000
Subject: [PATCH 4/5] fix the failing unit test to query for device-built-in
 kernels, custom device-types appropriately

---
 sycl/plugins/level_zero/pi_level_zero.cpp |  1 -
 sycl/test-e2e/Basic/kernel_info.cpp       | 11 +++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index aff95f4e29001..2096fafe0c4ad 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -4228,7 +4228,6 @@ pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device,
   std::shared_lock<pi_shared_mutex> Guard(Kernel->Mutex);
   switch (ParamName) {
   case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: {
-    // TODO: To revisit after level_zero/issues/262 is resolved
     struct {
       size_t Arr[3];
     } GlobalWorkSize = {{(Device->ZeDeviceComputeProperties->maxGroupSizeX *
diff --git a/sycl/test-e2e/Basic/kernel_info.cpp b/sycl/test-e2e/Basic/kernel_info.cpp
index 0703ec2d97213..e359ba1eb17ab 100644
--- a/sycl/test-e2e/Basic/kernel_info.cpp
+++ b/sycl/test-e2e/Basic/kernel_info.cpp
@@ -6,8 +6,6 @@
 // Fail is flaky for level_zero, enable when fixed.
 // UNSUPPORTED: level_zero
 //
-// CUDA and HIP do not currently implement global_work_size
-// UNSUPPORTED: cuda, hip
 
 //==--- kernel_info.cpp - SYCL kernel info test ----------------------------==//
 //
@@ -56,9 +54,14 @@ int main() {
   assert(prefWGSizeMult > 0);
 
   try {
+    // To check (a) first if the kernel is device built-in, (b) then check if
+    // the device type is custom
+    if (!sycl::is_compatible({KernelID}, q.get_device())) {
+      assert(dev.get_info<sycl::info::device::device_type>() ==
+             sycl::info::device_type::custom);
+    }
+
     krn.get_info<sycl::info::kernel_device_specific::global_work_size>(dev);
-    assert(dev.get_info<sycl::info::device::device_type>() ==
-           sycl::info::device_type::custom);
   } catch (sycl::exception &e) {
     assert(e.code() == sycl::errc::invalid);
   }

From fb405f51f5d09c9485e67a808c13af89da23966d Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Tue, 4 Apr 2023 13:19:09 +0000
Subject: [PATCH 5/5] address PR comments to just reflect the changes related
 to global_work_sizes

---
 sycl/plugins/cuda/pi_cuda.cpp | 193 +++++++++++++++++-----------------
 sycl/plugins/cuda/pi_cuda.hpp |   5 -
 sycl/plugins/hip/pi_hip.cpp   | 172 +++++++++++++++---------------
 sycl/plugins/hip/pi_hip.hpp   |   5 -
 4 files changed, 184 insertions(+), 191 deletions(-)

diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 1156990855686..59a13381e2eb9 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -2935,106 +2935,109 @@ pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device,
                                     size_t param_value_size, void *param_value,
                                     size_t *param_value_size_ret) {
 
-  PI_ASSERT(kernel, PI_ERROR_INVALID_KERNEL);
-  PI_ASSERT(device, PI_ERROR_INVALID_DEVICE);
-
   // Here we want to query about a kernel's cuda blocks!
 
-  switch (param_name) {
-  case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: {
-    size_t global_work_size[3] = {0, 0, 0};
+  if (kernel != nullptr) {
 
-    int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0};
-    sycl::detail::pi::assertion(
-        cuDeviceGetAttribute(&max_block_dimX,
-                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
-                             device->get()) == CUDA_SUCCESS);
-    sycl::detail::pi::assertion(
-        cuDeviceGetAttribute(&max_block_dimY,
-                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
-                             device->get()) == CUDA_SUCCESS);
-    sycl::detail::pi::assertion(
-        cuDeviceGetAttribute(&max_block_dimZ,
-                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
-                             device->get()) == CUDA_SUCCESS);
+    switch (param_name) {
+    case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: {
+      size_t global_work_size[3] = {0, 0, 0};
 
-    int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0};
-    sycl::detail::pi::assertion(
-        cuDeviceGetAttribute(&max_grid_dimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
-                             device->get()) == CUDA_SUCCESS);
-    sycl::detail::pi::assertion(
-        cuDeviceGetAttribute(&max_grid_dimY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
-                             device->get()) == CUDA_SUCCESS);
-    sycl::detail::pi::assertion(
-        cuDeviceGetAttribute(&max_grid_dimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
-                             device->get()) == CUDA_SUCCESS);
+      int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0};
+      sycl::detail::pi::assertion(
+          cuDeviceGetAttribute(&max_block_dimX,
+                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
+                               device->get()) == CUDA_SUCCESS);
+      sycl::detail::pi::assertion(
+          cuDeviceGetAttribute(&max_block_dimY,
+                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
+                               device->get()) == CUDA_SUCCESS);
+      sycl::detail::pi::assertion(
+          cuDeviceGetAttribute(&max_block_dimZ,
+                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
+                               device->get()) == CUDA_SUCCESS);
 
-    global_work_size[0] = max_block_dimX * max_grid_dimX;
-    global_work_size[1] = max_block_dimY * max_grid_dimY;
-    global_work_size[2] = max_block_dimZ * max_grid_dimZ;
-    return getInfoArray(3, param_value_size, param_value, param_value_size_ret,
-                        global_work_size);
-  }
-  case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
-    int max_threads = 0;
-    sycl::detail::pi::assertion(
-        cuFuncGetAttribute(&max_threads,
-                           CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
-                           kernel->get()) == CUDA_SUCCESS);
-    return getInfo(param_value_size, param_value, param_value_size_ret,
-                   size_t(max_threads));
-  }
-  case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: {
-    size_t group_size[3] = {0, 0, 0};
-    const auto &reqd_wg_size_md_map =
-        kernel->program_->kernelReqdWorkGroupSizeMD_;
-    const auto reqd_wg_size_md = reqd_wg_size_md_map.find(kernel->name_);
-    if (reqd_wg_size_md != reqd_wg_size_md_map.end()) {
-      const auto reqd_wg_size = reqd_wg_size_md->second;
-      group_size[0] = std::get<0>(reqd_wg_size);
-      group_size[1] = std::get<1>(reqd_wg_size);
-      group_size[2] = std::get<2>(reqd_wg_size);
-    }
-    return getInfoArray(3, param_value_size, param_value, param_value_size_ret,
-                        group_size);
-  }
-  case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: {
-    // OpenCL LOCAL == CUDA SHARED
-    int bytes = 0;
-    sycl::detail::pi::assertion(
-        cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
-                           kernel->get()) == CUDA_SUCCESS);
-    return getInfo(param_value_size, param_value, param_value_size_ret,
-                   pi_uint64(bytes));
-  }
-  case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: {
-    // Work groups should be multiples of the warp size
-    int warpSize = 0;
-    sycl::detail::pi::assertion(
-        cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
-                             device->get()) == CUDA_SUCCESS);
-    return getInfo(param_value_size, param_value, param_value_size_ret,
-                   static_cast<size_t>(warpSize));
-  }
-  case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: {
-    // OpenCL PRIVATE == CUDA LOCAL
-    int bytes = 0;
-    sycl::detail::pi::assertion(
-        cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
-                           kernel->get()) == CUDA_SUCCESS);
-    return getInfo(param_value_size, param_value, param_value_size_ret,
-                   pi_uint64(bytes));
-  }
-  case PI_KERNEL_GROUP_INFO_NUM_REGS: {
-    int numRegs = 0;
-    sycl::detail::pi::assertion(
-        cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS,
-                           kernel->get()) == CUDA_SUCCESS);
-    return getInfo(param_value_size, param_value, param_value_size_ret,
-                   pi_uint32(numRegs));
-  }
-  default:
-    __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
+      int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0};
+      sycl::detail::pi::assertion(
+          cuDeviceGetAttribute(&max_grid_dimX,
+                               CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
+                               device->get()) == CUDA_SUCCESS);
+      sycl::detail::pi::assertion(
+          cuDeviceGetAttribute(&max_grid_dimY,
+                               CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
+                               device->get()) == CUDA_SUCCESS);
+      sycl::detail::pi::assertion(
+          cuDeviceGetAttribute(&max_grid_dimZ,
+                               CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
+                               device->get()) == CUDA_SUCCESS);
+
+      global_work_size[0] = max_block_dimX * max_grid_dimX;
+      global_work_size[1] = max_block_dimY * max_grid_dimY;
+      global_work_size[2] = max_block_dimZ * max_grid_dimZ;
+      return getInfoArray(3, param_value_size, param_value,
+                          param_value_size_ret, global_work_size);
+    }
+    case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
+      int max_threads = 0;
+      sycl::detail::pi::assertion(
+          cuFuncGetAttribute(&max_threads,
+                             CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+                             kernel->get()) == CUDA_SUCCESS);
+      return getInfo(param_value_size, param_value, param_value_size_ret,
+                     size_t(max_threads));
+    }
+    case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: {
+      size_t group_size[3] = {0, 0, 0};
+      const auto &reqd_wg_size_md_map =
+          kernel->program_->kernelReqdWorkGroupSizeMD_;
+      const auto reqd_wg_size_md = reqd_wg_size_md_map.find(kernel->name_);
+      if (reqd_wg_size_md != reqd_wg_size_md_map.end()) {
+        const auto reqd_wg_size = reqd_wg_size_md->second;
+        group_size[0] = std::get<0>(reqd_wg_size);
+        group_size[1] = std::get<1>(reqd_wg_size);
+        group_size[2] = std::get<2>(reqd_wg_size);
+      }
+      return getInfoArray(3, param_value_size, param_value,
+                          param_value_size_ret, group_size);
+    }
+    case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: {
+      // OpenCL LOCAL == CUDA SHARED
+      int bytes = 0;
+      sycl::detail::pi::assertion(
+          cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
+                             kernel->get()) == CUDA_SUCCESS);
+      return getInfo(param_value_size, param_value, param_value_size_ret,
+                     pi_uint64(bytes));
+    }
+    case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: {
+      // Work groups should be multiples of the warp size
+      int warpSize = 0;
+      sycl::detail::pi::assertion(
+          cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
+                               device->get()) == CUDA_SUCCESS);
+      return getInfo(param_value_size, param_value, param_value_size_ret,
+                     static_cast<size_t>(warpSize));
+    }
+    case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: {
+      // OpenCL PRIVATE == CUDA LOCAL
+      int bytes = 0;
+      sycl::detail::pi::assertion(
+          cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
+                             kernel->get()) == CUDA_SUCCESS);
+      return getInfo(param_value_size, param_value, param_value_size_ret,
+                     pi_uint64(bytes));
+    }
+    case PI_KERNEL_GROUP_INFO_NUM_REGS: {
+      int numRegs = 0;
+      sycl::detail::pi::assertion(
+          cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS,
+                             kernel->get()) == CUDA_SUCCESS);
+      return getInfo(param_value_size, param_value, param_value_size_ret,
+                     pi_uint32(numRegs));
+    }
+    default:
+      __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
+    }
   }
 
   return PI_ERROR_INVALID_KERNEL;
diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp
index d35bf5fd1a3bc..f6baeab0a4445 100644
--- a/sycl/plugins/cuda/pi_cuda.hpp
+++ b/sycl/plugins/cuda/pi_cuda.hpp
@@ -42,11 +42,6 @@
 #include <unordered_map>
 #include <vector>
 
-// Helper for one-liner validation
-#define PI_ASSERT(condition, error)                                            \
-  if (!(condition))                                                            \
-    return error;
-
 extern "C" {
 
 /// \cond IGNORE_BLOCK_IN_DOXYGEN
diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp
index bf07feb565f7e..acac19eef627a 100644
--- a/sycl/plugins/hip/pi_hip.cpp
+++ b/sycl/plugins/hip/pi_hip.cpp
@@ -3581,98 +3581,98 @@ pi_result hip_piKernelGetGroupInfo(pi_kernel kernel, pi_device device,
                                    size_t param_value_size, void *param_value,
                                    size_t *param_value_size_ret) {
 
-  PI_ASSERT(kernel, PI_ERROR_INVALID_KERNEL);
-  PI_ASSERT(device, PI_ERROR_INVALID_DEVICE);
-
   // here we want to query about a kernel's hip blocks!
 
-  switch (param_name) {
-  case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: {
-    size_t global_work_size[3] = {0, 0, 0};
+  if (kernel != nullptr) {
 
-    int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0};
-    sycl::detail::pi::assertion(
-        hipDeviceGetAttribute(&max_block_dimX, hipDeviceAttributeMaxBlockDimX,
-                              device->get()) == hipSuccess);
-    sycl::detail::pi::assertion(
-        hipDeviceGetAttribute(&max_block_dimY, hipDeviceAttributeMaxBlockDimY,
-                              device->get()) == hipSuccess);
-    sycl::detail::pi::assertion(
-        hipDeviceGetAttribute(&max_block_dimZ, hipDeviceAttributeMaxBlockDimZ,
-                              device->get()) == hipSuccess);
+    switch (param_name) {
+    case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: {
+      size_t global_work_size[3] = {0, 0, 0};
 
-    int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0};
-    sycl::detail::pi::assertion(
-        hipDeviceGetAttribute(&max_grid_dimX, hipDeviceAttributeMaxGridDimX,
-                              device->get()) == hipSuccess);
-    sycl::detail::pi::assertion(
-        hipDeviceGetAttribute(&max_grid_dimY, hipDeviceAttributeMaxGridDimY,
-                              device->get()) == hipSuccess);
-    sycl::detail::pi::assertion(
-        hipDeviceGetAttribute(&max_grid_dimZ, hipDeviceAttributeMaxGridDimZ,
-                              device->get()) == hipSuccess);
+      int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0};
+      sycl::detail::pi::assertion(
+          hipDeviceGetAttribute(&max_block_dimX, hipDeviceAttributeMaxBlockDimX,
+                                device->get()) == hipSuccess);
+      sycl::detail::pi::assertion(
+          hipDeviceGetAttribute(&max_block_dimY, hipDeviceAttributeMaxBlockDimY,
+                                device->get()) == hipSuccess);
+      sycl::detail::pi::assertion(
+          hipDeviceGetAttribute(&max_block_dimZ, hipDeviceAttributeMaxBlockDimZ,
+                                device->get()) == hipSuccess);
 
-    global_work_size[0] = max_block_dimX * max_grid_dimX;
-    global_work_size[1] = max_block_dimY * max_grid_dimY;
-    global_work_size[2] = max_block_dimZ * max_grid_dimZ;
-    return getInfoArray(3, param_value_size, param_value, param_value_size_ret,
-                        global_work_size);
-  }
-  case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
-    int max_threads = 0;
-    sycl::detail::pi::assertion(
-        hipFuncGetAttribute(&max_threads,
-                            HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
-                            kernel->get()) == hipSuccess);
-    return getInfo(param_value_size, param_value, param_value_size_ret,
-                   size_t(max_threads));
-  }
-  case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: {
-    // Returns the work-group size specified in the kernel source or IL.
-    // If the work-group size is not specified in the kernel source or IL,
-    // (0, 0, 0) is returned.
-    // https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html
-
-    // TODO: can we extract the work group size from the PTX?
-    size_t group_size[3] = {0, 0, 0};
-    return getInfoArray(3, param_value_size, param_value, param_value_size_ret,
-                        group_size);
-  }
-  case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: {
-    // OpenCL LOCAL == HIP SHARED
-    int bytes = 0;
-    sycl::detail::pi::assertion(
-        hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
-                            kernel->get()) == hipSuccess);
-    return getInfo(param_value_size, param_value, param_value_size_ret,
-                   pi_uint64(bytes));
-  }
-  case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: {
-    // Work groups should be multiples of the warp size
-    int warpSize = 0;
-    sycl::detail::pi::assertion(
-        hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize,
-                              device->get()) == hipSuccess);
-    return getInfo(param_value_size, param_value, param_value_size_ret,
-                   static_cast<size_t>(warpSize));
-  }
-  case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: {
-    // OpenCL PRIVATE == HIP LOCAL
-    int bytes = 0;
-    sycl::detail::pi::assertion(
-        hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
-                            kernel->get()) == hipSuccess);
-    return getInfo(param_value_size, param_value, param_value_size_ret,
-                   pi_uint64(bytes));
-  }
-  case PI_KERNEL_GROUP_INFO_NUM_REGS: {
-    sycl::detail::pi::die("PI_KERNEL_GROUP_INFO_NUM_REGS in "
-                          "piKernelGetGroupInfo not implemented\n");
-    return {};
-  }
+      int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0};
+      sycl::detail::pi::assertion(
+          hipDeviceGetAttribute(&max_grid_dimX, hipDeviceAttributeMaxGridDimX,
+                                device->get()) == hipSuccess);
+      sycl::detail::pi::assertion(
+          hipDeviceGetAttribute(&max_grid_dimY, hipDeviceAttributeMaxGridDimY,
+                                device->get()) == hipSuccess);
+      sycl::detail::pi::assertion(
+          hipDeviceGetAttribute(&max_grid_dimZ, hipDeviceAttributeMaxGridDimZ,
+                                device->get()) == hipSuccess);
 
-  default:
-    __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
+      global_work_size[0] = max_block_dimX * max_grid_dimX;
+      global_work_size[1] = max_block_dimY * max_grid_dimY;
+      global_work_size[2] = max_block_dimZ * max_grid_dimZ;
+      return getInfoArray(3, param_value_size, param_value,
+                          param_value_size_ret, global_work_size);
+    }
+    case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
+      int max_threads = 0;
+      sycl::detail::pi::assertion(
+          hipFuncGetAttribute(&max_threads,
+                              HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+                              kernel->get()) == hipSuccess);
+      return getInfo(param_value_size, param_value, param_value_size_ret,
+                     size_t(max_threads));
+    }
+    case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: {
+      // Returns the work-group size specified in the kernel source or IL.
+      // If the work-group size is not specified in the kernel source or IL,
+      // (0, 0, 0) is returned.
+      // https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html
+
+      // TODO: can we extract the work group size from the PTX?
+      size_t group_size[3] = {0, 0, 0};
+      return getInfoArray(3, param_value_size, param_value,
+                          param_value_size_ret, group_size);
+    }
+    case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: {
+      // OpenCL LOCAL == HIP SHARED
+      int bytes = 0;
+      sycl::detail::pi::assertion(
+          hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
+                              kernel->get()) == hipSuccess);
+      return getInfo(param_value_size, param_value, param_value_size_ret,
+                     pi_uint64(bytes));
+    }
+    case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: {
+      // Work groups should be multiples of the warp size
+      int warpSize = 0;
+      sycl::detail::pi::assertion(
+          hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize,
+                                device->get()) == hipSuccess);
+      return getInfo(param_value_size, param_value, param_value_size_ret,
+                     static_cast<size_t>(warpSize));
+    }
+    case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: {
+      // OpenCL PRIVATE == HIP LOCAL
+      int bytes = 0;
+      sycl::detail::pi::assertion(
+          hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
+                              kernel->get()) == hipSuccess);
+      return getInfo(param_value_size, param_value, param_value_size_ret,
+                     pi_uint64(bytes));
+    }
+    case PI_KERNEL_GROUP_INFO_NUM_REGS: {
+      sycl::detail::pi::die("PI_KERNEL_GROUP_INFO_NUM_REGS in "
+                            "piKernelGetGroupInfo not implemented\n");
+      return {};
+    }
+
+    default:
+      __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
+    }
   }
 
   return PI_ERROR_INVALID_KERNEL;
diff --git a/sycl/plugins/hip/pi_hip.hpp b/sycl/plugins/hip/pi_hip.hpp
index 2001c7b22a425..7778f1d07be96 100644
--- a/sycl/plugins/hip/pi_hip.hpp
+++ b/sycl/plugins/hip/pi_hip.hpp
@@ -39,11 +39,6 @@
 #include <string>
 #include <vector>
 
-// Helper for one-liner validation
-#define PI_ASSERT(condition, error)                                            \
-  if (!(condition))                                                            \
-    return error;
-
 extern "C" {
 
 /// \cond INGORE_BLOCK_IN_DOXYGEN