From 12bafdcf33280aa911e5645462b35d1bdc17abf2 Mon Sep 17 00:00:00 2001 From: Qun Gao Date: Fri, 21 Jun 2024 22:02:29 +0000 Subject: [PATCH] Enable runtime gpu_arch auto-select based on devices where kernels are executing for gemm_int4 tests; enable device-specific compilation using USE_XETLA (xe_lpg, xe_hpg, xe_hpc). Signed-off-by: Qun Gao --- CMakeLists.txt | 24 ++++ tests/integration/CMakeLists.txt | 38 +++++-- tests/integration/gemm/CMakeLists.txt | 26 +++-- .../gemm/int4_dequantization/main.cpp | 104 ++++++++++++++++-- .../int4_dequantization_bias/CMakeLists.txt | 11 +- tests/integration/vector_add/CMakeLists.txt | 12 +- tests/unit/CMakeLists.txt | 67 +++++++---- 7 files changed, 222 insertions(+), 60 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8717a0212..8d81e881c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,6 +54,30 @@ set(XETLA_OFFLINE_OPTIONS "${XETLA_OFFLINE_OPTIONS} -Xfinalizer -enableBCR") # Optimization to reduce the tokens used for DPAS instruction. set(XETLA_OFFLINE_OPTIONS "${XETLA_OFFLINE_OPTIONS} -Xfinalizer -DPASTokenReduction") +# USE_XETLA - Align to IPEX logic +if(USE_XETLA) # A quoted string always evaluates to false unless: The string's value is one of the true constants + string(REPLACE "," ";" USE_XETLA ${USE_XETLA}) + message("The used archs are: ${USE_XETLA}") +elseif(NOT USE_XETLA) # if(): True if given a variable that is defined to a value that is not a false constant + message("No archs specified. Stopping CMake execution here.") + set(USE_XETLA "") +endif() + +set(XETLA_AVAILABLE_ARCHS xe_hpc xe_hpg xe_lpg) +set(USE_XETLA_XE_LPG OFF) +set(USE_XETLA_XE_HPG OFF) +set(USE_XETLA_XE_HPC OFF) + +foreach(used_arch IN LISTS USE_XETLA) + if (used_arch IN_LIST XETLA_AVAILABLE_ARCHS) + string(TOUPPER "${used_arch}" arch_upper) + set(USE_XETLA_${arch_upper} ON) + message(STATUS "XeTLA: Found arch from list: ${arch_upper}") + else() + message(FATAL_ERROR "Unexpected XeTLA architecture: ${used_arch}") + endif() +endforeach() + # AOT device set(USE_AOT_DEVLIST "" CACHE STRING "Set device list for AOT build") if (USE_AOT_DEVLIST) diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt index fcb4a9fc2..4d4445d3e 100644 --- a/tests/integration/CMakeLists.txt +++ b/tests/integration/CMakeLists.txt @@ -19,15 +19,29 @@ function(add_integration_test target "host_cpp") # target_link_libraries(${TARGET} PUBLIC MKL::MKL_SYCL) endfunction() -# add_subdirectory(vector_add) -add_subdirectory(gemm) -add_subdirectory(gemv) -add_subdirectory(row_reduction) -add_subdirectory(layer_norm) -add_subdirectory(data_transformer) -add_subdirectory(default_config) -add_subdirectory(sg_dropout_op) -add_subdirectory(limitation) -add_subdirectory(softmax) -add_subdirectory(fmha) -add_subdirectory(col_major_shuf) +if (USE_XETLA_XE_LPG) + add_subdirectory(vector_add) + add_subdirectory(gemm) + # add_subdirectory(row_reduction) + # add_subdirectory(layer_norm) + # add_subdirectory(data_transformer) + # add_subdirectory(default_config) + # add_subdirectory(sg_dropout_op) + add_subdirectory(limitation) + # add_subdirectory(softmax) + add_subdirectory(fmha) + add_subdirectory(col_major_shuf) +else() + # add_subdirectory(vector_add) + add_subdirectory(gemm) + add_subdirectory(gemv) + add_subdirectory(row_reduction) + add_subdirectory(layer_norm) + add_subdirectory(data_transformer) + add_subdirectory(default_config) + add_subdirectory(sg_dropout_op) + add_subdirectory(limitation) + add_subdirectory(softmax) + add_subdirectory(fmha) + add_subdirectory(col_major_shuf) +endif() diff --git a/tests/integration/gemm/CMakeLists.txt b/tests/integration/gemm/CMakeLists.txt index cb9c535c5..f406550f0 100644 --- a/tests/integration/gemm/CMakeLists.txt +++ b/tests/integration/gemm/CMakeLists.txt @@ -1,12 +1,16 @@ include_directories(${CMAKE_SOURCE_DIR}/tests/integration/gemm) - -add_subdirectory(bf16) -add_subdirectory(stream_k) -add_subdirectory(fp16) -add_subdirectory(fp32) -add_subdirectory(int8_quantization) -add_subdirectory(int8) -add_subdirectory(tf32) -add_subdirectory(int4_dequantization) -add_subdirectory(int4_dequantization_bias) -add_subdirectory(unaligned_bf16) +if (USE_XETLA_XE_LPG) + add_subdirectory(int4_dequantization) + add_subdirectory(int4_dequantization_bias) +else() + add_subdirectory(bf16) + add_subdirectory(stream_k) + add_subdirectory(fp16) + add_subdirectory(fp32) + add_subdirectory(int8_quantization) + add_subdirectory(int8) + add_subdirectory(tf32) + add_subdirectory(int4_dequantization) + add_subdirectory(int4_dequantization_bias) + add_subdirectory(unaligned_bf16) +endif() \ No newline at end of file diff --git a/tests/integration/gemm/int4_dequantization/main.cpp b/tests/integration/gemm/int4_dequantization/main.cpp index 18e40ded5..55df77c06 100644 --- a/tests/integration/gemm/int4_dequantization/main.cpp +++ b/tests/integration/gemm/int4_dequantization/main.cpp @@ -157,7 +157,11 @@ class last { using data_type_c = fp16; }; -template +template +class KernalName { + +}; +template void dequantize_gemm_run(uint32_t iter) { using namespace gpu; // Accept incoming parameters @@ -238,16 +242,16 @@ void dequantize_gemm_run(uint32_t iter) { data_type_scale, data_type_zero_pt, quant_info, - mma_engine::xmx, - gpu_arch::XeHpg>; + y, + x>; using gemm_t = xetla::group:: gemm_t; using epilogue_t = xetla::group::epilogue_t< - xetla::group::epilogue_policy_default, + xetla::group::epilogue_policy_default, tile_shape, mem_desc_c_t>; - using group_swizzle = xetla::kernel::group_swizzle_default; + using group_swizzle = xetla::kernel::group_swizzle_default; using gemm_op_t = xetla::kernel::gemm_universal_t< gpu::xetla::kernel::dispatch_policy_int4_dequantize_kslicing< group_swizzle, @@ -366,7 +370,7 @@ void dequantize_gemm_run(uint32_t iter) { for (uint32_t i = 0; i < iter; i++) { prof.cpu_start(); auto e_esimd = queue.submit([&](handler& cgh) { - cgh.parallel_for(nd_range, [=](nd_item<3> item) KERNEL_MAIN { + cgh.parallel_for>(nd_range, [=](nd_item<3> item) KERNEL_MAIN { // allocate slm and nbarrier resource slm_barrier_init(); gemm_op_t gemm_op; @@ -433,8 +437,94 @@ template class dequantize_gemm_test : public ::testing::Test {}; TYPED_TEST_SUITE_P(dequantize_gemm_test); +template class F, class G> +class dispatch_arch_test +{ + using T_RET = std::invoke_result_t::exec)>; + + public: + template + static T_RET exec(Args&&... args) { + // save default formatting + std::ios fmt_bak(nullptr); + fmt_bak.copyfmt(std::cout); + + sycl::device device; + if (!device.has(aspect::ext_intel_device_id)) + throw std::runtime_error("Can not get device ID"); + auto deviceID = device.get_info(); + std::cout << "deviceID: 0x" << std::hex // + << std::right << std::setfill('0') << deviceID << "\n"; + + // restore default formatting + std::cout.copyfmt(fmt_bak); +#if defined(SYCL_EXT_ONEAPI_DEVICE_ARCHITECTURE) && \ + SYCL_EXT_ONEAPI_DEVICE_ARCHITECTURE + // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc#feature-test-macro + try { + namespace ENS = sycl::ext::oneapi::experimental; + auto deviceArch = device.get_info(); + switch (deviceArch) { + case ENS::architecture::intel_gpu_pvc: + return F::exec(std::forward(args)...); + return; + case ENS::architecture::intel_gpu_dg2_g10: + case ENS::architecture::intel_gpu_dg2_g11: + case ENS::architecture::intel_gpu_dg2_g12: + return F::exec(std::forward(args)...); + default: + break; + } + } + catch (...) { + std::cout << "Execption occurred! Please check one api versions."; + } +#endif + std::cout << "No matching architecture, checking device ID ...\n"; + switch (deviceID) { + // MTL devices + case 0x7d55: // Intel® Arc ™ Graphics + std::cout << "MTL devices identified!" << std::endl; + return F::exec(std::forward(args)...); + // DG2 devices + case 0x56a0: // Intel® Arc ™ A770 Graphics + case 0x56a1: // Intel® Arc ™ A750 Graphics + case 0x56a2: // Intel® Arc ™ A580 Graphics + case 0x5690: // Intel® Arc ™ A770M Graphics + case 0x5691: // Intel® Arc ™ A730M Graphics + case 0x5692: // Intel® Arc ™ A550M Graphics + return F::exec(std::forward(args)...); + // PVC devices + case 0x0bda: // + return F::exec(std::forward(args)...); + default: + std::cout << "Unknown device ID \n"; + break; + } + + if (device.has(aspect::ext_intel_gpu_eu_simd_width)) + throw std::runtime_error("Can not get eu_simd_width"); + auto eu_simd_width = + device.get_info(); + if (eu_simd_width == 8) { + return F::exec(std::forward(args)...); + } else if (eu_simd_width == 16) { + return F::exec(std::forward(args)...); + } else { + throw std::runtime_error("Can not get device ID"); + } + } +}; + +template +struct main_wrapper { + static constexpr auto exec = []() { + dequantize_gemm_run(ITER); + }; +}; + TYPED_TEST_P(dequantize_gemm_test, esimd) { - dequantize_gemm_run(ITER); + dispatch_arch_test::exec(); } REGISTER_TYPED_TEST_SUITE_P(dequantize_gemm_test, esimd); diff --git a/tests/integration/gemm/int4_dequantization_bias/CMakeLists.txt b/tests/integration/gemm/int4_dequantization_bias/CMakeLists.txt index 4bdf0a42b..f0b53892e 100644 --- a/tests/integration/gemm/int4_dequantization_bias/CMakeLists.txt +++ b/tests/integration/gemm/int4_dequantization_bias/CMakeLists.txt @@ -5,7 +5,10 @@ set(ProjectIdXe ${ProjectId}) string(PREPEND ProjectIdClient "gemm_client_") string(PREPEND ProjectIdXe "gemm_xe_") -FILE(GLOB src_client main_client.cpp) -add_integration_test(${ProjectIdClient} ${src_client}) -FILE(GLOB src_xe main_xe.cpp) -add_integration_test(${ProjectIdXe} ${src_xe}) +if (USE_XETLA_XE_LPG) + FILE(GLOB src_client main_client.cpp) + add_integration_test(${ProjectIdClient} ${src_client}) +else() + FILE(GLOB src_xe main_xe.cpp) + add_integration_test(${ProjectIdXe} ${src_xe}) +endif() diff --git a/tests/integration/vector_add/CMakeLists.txt b/tests/integration/vector_add/CMakeLists.txt index 7b320376e..a45724975 100644 --- a/tests/integration/vector_add/CMakeLists.txt +++ b/tests/integration/vector_add/CMakeLists.txt @@ -1,4 +1,8 @@ -add_subdirectory(tf32_1d) -add_subdirectory(bf16_2d) -add_subdirectory(int32_1d) -add_subdirectory(int32_2d) +if (USE_XETLA_XE_LPG) + add_subdirectory(int32_1d) +else() + add_subdirectory(tf32_1d) + add_subdirectory(bf16_2d) + add_subdirectory(int32_1d) + add_subdirectory(int32_2d) +endif() diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index a492e5869..8b5858cc0 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -19,25 +19,48 @@ function(add_unit_test target kernel_func_file test_host) set_tests_properties(${TARGET} PROPERTIES LABELS "unit" TIMEOUT ${UNIT_TIMEOUT}) endfunction() -add_subdirectory(global_load_store) -add_subdirectory(global_atomic) -add_subdirectory(block_load_store) -add_subdirectory(tile_load_store) -add_subdirectory(tile_load_store_local) -add_subdirectory(internal_type_load_store_cvt) -add_subdirectory(local_load_store) -add_subdirectory(raw_send) -add_subdirectory(buff_compare) -add_subdirectory(tile_mma) -add_subdirectory(named_barrier) -add_subdirectory(tile_row_reduction) -add_subdirectory(add_c) -add_subdirectory(imul) -add_subdirectory(philox_rng) -add_subdirectory(exp_inv_sqrt_tanh) -add_subdirectory(reg_layout_conversion) -add_subdirectory(reg_reduce) -add_subdirectory(math_general) -add_subdirectory(epilogue_tile_op) -add_subdirectory(bit_mask_manipulation) - +if (USE_XETLA_XE_LPG) + add_subdirectory(global_load_store) + add_subdirectory(global_atomic) + # add_subdirectory(block_load_store) + # add_subdirectory(tile_load_store) + # add_subdirectory(tile_load_store_local) + # add_subdirectory(internal_type_load_store_cvt) + add_subdirectory(local_load_store) + # add_subdirectory(raw_send) + add_subdirectory(buff_compare) + # add_subdirectory(tile_mma) + # add_subdirectory(named_barrier) + # add_subdirectory(tile_row_reduction) + add_subdirectory(add_c) + add_subdirectory(imul) + add_subdirectory(philox_rng) + # add_subdirectory(exp_inv_sqrt_tanh) + # add_subdirectory(reg_layout_conversion) + add_subdirectory(reg_reduce) + add_subdirectory(math_general) + # add_subdirectory(epilogue_tile_op) + # add_subdirectory(bit_mask_manipulation) +else() + add_subdirectory(global_load_store) + add_subdirectory(global_atomic) + add_subdirectory(block_load_store) + add_subdirectory(tile_load_store) + add_subdirectory(tile_load_store_local) + add_subdirectory(internal_type_load_store_cvt) + add_subdirectory(local_load_store) + add_subdirectory(raw_send) + add_subdirectory(buff_compare) + add_subdirectory(tile_mma) + add_subdirectory(named_barrier) + add_subdirectory(tile_row_reduction) + add_subdirectory(add_c) + add_subdirectory(imul) + add_subdirectory(philox_rng) + add_subdirectory(exp_inv_sqrt_tanh) + add_subdirectory(reg_layout_conversion) + add_subdirectory(reg_reduce) + add_subdirectory(math_general) + add_subdirectory(epilogue_tile_op) + add_subdirectory(bit_mask_manipulation) +endif()