From fd346a05805ea1d3da67bf59eb74f6cde31d59a0 Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Mon, 11 Aug 2025 21:51:55 +0330 Subject: [PATCH 1/8] bitonic sort sample added Signed-off-by: CrabeExtra --- 13_BitonicSort/CMakeLists.txt | 24 ++ .../bitonic_sort_shader.comp.hlsl | 112 ++++++ 13_BitonicSort/app_resources/common.hlsl | 17 + 13_BitonicSort/config.json.template | 28 ++ 13_BitonicSort/main.cpp | 323 ++++++++++++++++++ 13_BitonicSort/pipeline.groovy | 50 +++ CMakeLists.txt | 23 +- 7 files changed, 561 insertions(+), 16 deletions(-) create mode 100644 13_BitonicSort/CMakeLists.txt create mode 100644 13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl create mode 100644 13_BitonicSort/app_resources/common.hlsl create mode 100644 13_BitonicSort/config.json.template create mode 100644 13_BitonicSort/main.cpp create mode 100644 13_BitonicSort/pipeline.groovy diff --git a/13_BitonicSort/CMakeLists.txt b/13_BitonicSort/CMakeLists.txt new file mode 100644 index 000000000..b7cad41da --- /dev/null +++ b/13_BitonicSort/CMakeLists.txt @@ -0,0 +1,24 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() diff --git a/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl b/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl new file mode 100644 index 000000000..058b14eea --- /dev/null +++ b/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl @@ -0,0 +1,112 @@ +#include "nbl/builtin/hlsl/bda/bda_accessor.hlsl" + +struct BitonicPushData +{ + uint64_t inputKeyAddress; + uint64_t inputValueAddress; + uint64_t outputKeyAddress; + uint64_t outputValueAddress; + uint32_t dataElementCount; +}; + +using namespace nbl::hlsl; + +[[vk::push_constant]] BitonicPushData pushData; + +using DataPtr = bda::__ptr; +using DataAccessor = BdaAccessor; + +groupshared uint32_t sharedKeys[ElementCount]; +groupshared uint32_t sharedValues[ElementCount]; + +[numthreads(WorkgroupSize, 1, 1)] +[shader("compute")] +void main(uint32_t3 dispatchId : SV_DispatchThreadID, uint32_t3 localId : SV_GroupThreadID) +{ + const uint32_t threadId = localId.x; + const uint32_t dataSize = pushData.dataElementCount; + + DataAccessor inputKeys = DataAccessor::create(DataPtr::create(pushData.inputKeyAddress)); + DataAccessor inputValues = DataAccessor::create(DataPtr::create(pushData.inputValueAddress)); + + for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize) + { + inputKeys.get(i, sharedKeys[i]); + inputValues.get(i, sharedValues[i]); + } + + // Synchronize all threads after loading + GroupMemoryBarrierWithGroupSync(); + + + for (uint32_t stage = 0; stage < Log2ElementCount; stage++) + { + for (uint32_t pass = 0; pass <= stage; pass++) + { + const uint32_t compareDistance = 1 << (stage - pass); + + for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize) + { + const uint32_t partnerId = i ^ compareDistance; + + if (partnerId >= dataSize) + continue; + + const uint32_t waveSize = WaveGetLaneCount(); + const uint32_t myWaveId = i / waveSize; + const uint32_t partnerWaveId = partnerId / waveSize; + const bool sameWave = (myWaveId == partnerWaveId); + + uint32_t myKey, myValue, partnerKey, partnerValue; + [branch] + if (sameWave && compareDistance < waveSize) + { + // WAVE INTRINSIC + myKey = sharedKeys[i]; + myValue = sharedValues[i]; + + const uint32_t partnerLane = partnerId % waveSize; + partnerKey = WaveReadLaneAt(myKey, partnerLane); + partnerValue = WaveReadLaneAt(myValue, partnerLane); + } + else + { + // SHARED MEM + myKey = sharedKeys[i]; + myValue = sharedValues[i]; + partnerKey = sharedKeys[partnerId]; + partnerValue = sharedValues[partnerId]; + } + + const uint32_t sequenceSize = 1 << (stage + 1); + const uint32_t sequenceIndex = i / sequenceSize; + const bool sequenceAscending = (sequenceIndex % 2) == 0; + const bool ascending = true; + const bool finalDirection = sequenceAscending == ascending; + + const bool swap = (myKey > partnerKey) == finalDirection; + + // WORKGROUP COORDINATION: Only lower-indexed element writes both + if (i < partnerId && swap) + { + sharedKeys[i] = partnerKey; + sharedKeys[partnerId] = myKey; + sharedValues[i] = partnerValue; + sharedValues[partnerId] = myValue; + } + } + + GroupMemoryBarrierWithGroupSync(); + } + } + + + DataAccessor outputKeys = DataAccessor::create(DataPtr::create(pushData.outputKeyAddress)); + DataAccessor outputValues = DataAccessor::create(DataPtr::create(pushData.outputValueAddress)); + + for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize) + { + outputKeys.set(i, sharedKeys[i]); + outputValues.set(i, sharedValues[i]); + } +} \ No newline at end of file diff --git a/13_BitonicSort/app_resources/common.hlsl b/13_BitonicSort/app_resources/common.hlsl new file mode 100644 index 000000000..5f15d0af1 --- /dev/null +++ b/13_BitonicSort/app_resources/common.hlsl @@ -0,0 +1,17 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _BITONIC_SORT_COMMON_INCLUDED_ +#define _BITONIC_SORT_COMMON_INCLUDED_ + +struct BitonicPushData +{ + + uint64_t inputKeyAddress; + uint64_t inputValueAddress; + uint64_t outputKeyAddress; + uint64_t outputValueAddress; + uint32_t dataElementCount; +}; + +#endif \ No newline at end of file diff --git a/13_BitonicSort/config.json.template b/13_BitonicSort/config.json.template new file mode 100644 index 000000000..12215d0bb --- /dev/null +++ b/13_BitonicSort/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", // should be none + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", // we also need to run in Debug nad RWDI because foundational example + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} diff --git a/13_BitonicSort/main.cpp b/13_BitonicSort/main.cpp new file mode 100644 index 000000000..7e44c5b11 --- /dev/null +++ b/13_BitonicSort/main.cpp @@ -0,0 +1,323 @@ +#include "nbl/examples/examples.hpp" +#include +#include + +using namespace nbl; +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; +using namespace nbl::examples; + +#include "app_resources/common.hlsl" +#include "nbl/builtin/hlsl/bit.hlsl" + +class BitonicSortApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication +{ + using device_base_t = application_templates::MonoDeviceApplication; + using asset_base_t = BuiltinResourcesApplication; + +public: + BitonicSortApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + auto limits = m_physicalDevice->getLimits(); + const uint32_t max_shared_memory_size = limits.maxComputeSharedMemorySize; + const uint32_t max_workgroup_size = limits.maxComputeWorkGroupInvocations; // Get actual GPU limit + const uint32_t bytes_per_elements = sizeof(uint32_t) * 2; // 2 uint32_t per element (key and value) + const uint32_t max_element_in_shared_memory = max_shared_memory_size / bytes_per_elements; + + // For bitonic sort: element count MUST be power of 2 + uint32_t element_count = core::roundDownToPoT(max_element_in_shared_memory); + + const uint32_t log2_element_count = static_cast(log2(element_count)); + + m_logger->log("GPU Limits:", ILogger::ELL_INFO); + m_logger->log(" Max Workgroup Size: " + std::to_string(max_workgroup_size), ILogger::ELL_INFO); + m_logger->log(" Max Shared Memory: " + std::to_string(max_shared_memory_size) + " bytes", ILogger::ELL_INFO); + m_logger->log(" Max elements in shared memory: " + std::to_string(max_element_in_shared_memory), ILogger::ELL_INFO); + m_logger->log(" Using element count (power of 2): " + std::to_string(element_count), ILogger::ELL_INFO); + + auto prepShader = [&](const core::string& path) -> smart_refctd_ptr + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = ""; + auto assetBundle = m_assetMgr->getAsset(path, lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + { + logFail("Could not load shader!"); + return nullptr; + } + + auto source = IAsset::castDown(assets[0]); + assert(source); + + auto overrideSource = CHLSLCompiler::createOverridenCopy( + source.get(), "#define ElementCount %d\n#define Log2ElementCount %d\n#define WorkgroupSize %d\n", + element_count, log2_element_count, max_workgroup_size + ); + + auto shader = m_device->compileShader({ overrideSource.get() }); + if (!shader) + { + logFail("Creation of Bitonic Sort Shader from CPU Shader source failed!"); + return nullptr; + } + return shader; + }; + + auto bitonicSortShader = prepShader("app_resources/bitonic_sort_shader.comp.hlsl"); + + if (!bitonicSortShader) + return logFail("Failed to compile bitonic sort shader!"); + + + const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(BitonicPushData) }; + + smart_refctd_ptr layout; + smart_refctd_ptr bitonicSortPipeline; + { + layout = m_device->createPipelineLayout({ &pcRange,1 }); + IGPUComputePipeline::SCreationParams params = {}; + params.layout = layout.get(); + params.shader.shader = bitonicSortShader.get(); + params.shader.entryPoint = "main"; + params.shader.entries = nullptr; + if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &bitonicSortPipeline)) + return logFail("Failed to create compute pipeline!\n"); + } + + nbl::video::IDeviceMemoryAllocator::SAllocation allocation[4] = {}; + smart_refctd_ptr buffers[4]; + + auto build_buffer = [this]( + smart_refctd_ptr m_device, + nbl::video::IDeviceMemoryAllocator::SAllocation* allocation, + smart_refctd_ptr& buffer, + size_t buffer_size, + const char* label + ) -> void { + IGPUBuffer::SCreationParams params; + params.size = buffer_size; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + buffer = m_device->createBuffer(std::move(params)); + if (!buffer) + logFail("Failed to create GPU buffer of size %d!\n", buffer_size); + + buffer->setObjectDebugName(label); + + auto reqs = buffer->getMemoryReqs(); + reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); + + *allocation = m_device->allocate(reqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + if (!allocation->isValid()) + logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); + + assert(allocation->memory.get() == buffer->getBoundMemory().memory); + }; + + build_buffer(m_device, allocation, buffers[0], sizeof(uint32_t) * element_count, "Input Key Buffer"); + build_buffer(m_device, allocation + 1, buffers[1], sizeof(uint32_t) * element_count, "Input Value Buffer"); + build_buffer(m_device, allocation + 2, buffers[2], sizeof(uint32_t) * element_count, "Output Key Buffer"); + build_buffer(m_device, allocation + 3, buffers[3], sizeof(uint32_t) * element_count, "Output Value Buffer"); + + uint64_t buffer_device_address[] = { + buffers[0]->getDeviceAddress(), + buffers[1]->getDeviceAddress(), + buffers[2]->getDeviceAddress(), + buffers[3]->getDeviceAddress() + }; + + + void* mapped_memory[] = { + allocation[0].memory->map({0ull,allocation[0].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), + allocation[1].memory->map({0ull,allocation[1].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), + allocation[2].memory->map({0ull,allocation[2].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), + allocation[3].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ) + }; + if (!mapped_memory[0] || !mapped_memory[1] || !mapped_memory[2] || !mapped_memory[3]) + return logFail("Failed to map the Device Memory!\n"); + + // Generate random data + unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::mt19937 g(seed); + + auto bufferData = new uint32_t * [2]; + for (int i = 0; i < 2; ++i) { + bufferData[i] = new uint32_t[element_count]; + } + for (uint32_t i = 0; i < element_count; i++) { + bufferData[0][i] = g() % 10000; + } + + memcpy(mapped_memory[0], bufferData[0], sizeof(uint32_t) * element_count); + + for (uint32_t i = 0; i < element_count; i++) { + bufferData[1][i] = i; // Values are indices for verification + } + + memcpy(mapped_memory[1], bufferData[1], sizeof(uint32_t) * element_count); + + std::string outBuffer; + + outBuffer.append("ALL ELEMENTS: "); + for (auto i = 0; i < element_count; i++) { + outBuffer.append("{"); + outBuffer.append(std::to_string(bufferData[0][i])); + outBuffer.append(", "); + outBuffer.append(std::to_string(bufferData[1][i])); + outBuffer.append("} "); + + // Add newline every 20 elements for readability + if ((i + 1) % 20 == 0) { + outBuffer.append("\n"); + } + } + outBuffer.append("\n"); + outBuffer.append("Count: "); + outBuffer.append(std::to_string(element_count)); + outBuffer.append("\n"); + m_logger->log("Your input array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE); + + + smart_refctd_ptr cmdBuf; + { + smart_refctd_ptr cmdpool = m_device->createCommandPool(getComputeQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdBuf)) + return logFail("Failed to create Command Buffers!\n"); + } + + constexpr uint64_t started_value = 0; + uint64_t timeline = started_value; + smart_refctd_ptr progress = m_device->createSemaphore(started_value); + + auto pc = BitonicPushData{ + .inputKeyAddress = buffer_device_address[0], + .inputValueAddress = buffer_device_address[1], + .outputKeyAddress = buffer_device_address[2], + .outputValueAddress = buffer_device_address[3], + .dataElementCount = element_count + }; + + cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmdBuf->beginDebugMarker("Bitonic Sort Single Dispatch", core::vectorSIMDf(0, 1, 0, 1)); + cmdBuf->bindComputePipeline(bitonicSortPipeline.get()); + cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); + cmdBuf->dispatch(1, 1, 1); + cmdBuf->endDebugMarker(); + cmdBuf->end(); + + { + auto queue = getComputeQueue(); + + IQueue::SSubmitInfo submit_infos[1]; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = { + { + .cmdbuf = cmdBuf.get() + } + }; + submit_infos[0].commandBuffers = cmdBufs; + IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { + { + .semaphore = progress.get(), + .value = ++timeline, + .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + } + }; + submit_infos[0].signalSemaphores = signals; + + m_api->startCapture(); + queue->submit(submit_infos); + m_api->endCapture(); + } + + const ISemaphore::SWaitInfo wait_infos[] = { { + .semaphore = progress.get(), + .value = timeline + } }; + m_device->blockForSemaphores(wait_infos); + + const ILogicalDevice::MappedMemoryRange memory_range[] = { + ILogicalDevice::MappedMemoryRange(allocation[0].memory.get(), 0ull, allocation[0].memory->getAllocationSize()), + ILogicalDevice::MappedMemoryRange(allocation[1].memory.get(), 0ull, allocation[1].memory->getAllocationSize()), + ILogicalDevice::MappedMemoryRange(allocation[2].memory.get(), 0ull, allocation[2].memory->getAllocationSize()), + ILogicalDevice::MappedMemoryRange(allocation[3].memory.get(), 0ull, allocation[3].memory->getAllocationSize()) + }; + + if (!allocation[0].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memory_range[0]); + if (!allocation[1].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memory_range[1]); + if (!allocation[2].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memory_range[2]); + if (!allocation[3].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memory_range[3]); + + const uint32_t* sortedKeys = reinterpret_cast(allocation[2].memory->getMappedPointer()); + const uint32_t* sortedValues = reinterpret_cast(allocation[3].memory->getMappedPointer()); + + assert(allocation[2].offset == 0); + assert(allocation[3].offset == 0); + + outBuffer.clear(); + + outBuffer.append("ALL SORTED ELEMENTS: "); + for (auto i = 0; i < element_count; i++) { + outBuffer.append("{"); + outBuffer.append(std::to_string(sortedKeys[i])); + outBuffer.append(", "); + outBuffer.append(std::to_string(sortedValues[i])); + outBuffer.append("} "); + + if ((i + 1) % 20 == 0) { + outBuffer.append("\n"); + } + } + outBuffer.append("\n"); + outBuffer.append("Count: "); + outBuffer.append(std::to_string(element_count)); + outBuffer.append("\n"); + m_logger->log("Your sorted array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE); + + bool is_sorted = true; + for (uint32_t i = 1; i < element_count; i++) { + if (sortedKeys[i] < sortedKeys[i - 1]) { + is_sorted = false; + break; + } + } + m_logger->log(is_sorted ? "Array is correctly sorted!" : "Array is NOT sorted correctly!", + is_sorted ? ILogger::ELL_PERFORMANCE : ILogger::ELL_ERROR); + + allocation[0].memory->unmap(); + allocation[1].memory->unmap(); + allocation[2].memory->unmap(); + allocation[3].memory->unmap(); + + m_device->waitIdle(); + + for (int i = 0; i < 2; ++i) { + delete[] bufferData[i]; + } + delete[] bufferData; + + return true; + } + + bool keepRunning() override { return false; } + void workLoopBody() override {} + bool onAppTerminated() override { return true; } +}; + +NBL_MAIN_FUNC(BitonicSortApp) \ No newline at end of file diff --git a/13_BitonicSort/pipeline.groovy b/13_BitonicSort/pipeline.groovy new file mode 100644 index 000000000..0af4402e6 --- /dev/null +++ b/13_BitonicSort/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CCountingSortBuilder extends IBuilder +{ + public CCountingSortBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CStreamingAndBufferDeviceAddressBuilder(_agent, _info) +} + +return this diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e02eadc1..fc68aef51 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,9 +42,10 @@ if(NBL_BUILD_EXAMPLES) # showcase use of FFT for post-FX Bloom effect add_subdirectory(11_FFT) # - add_subdirectory(12_MeshLoaders) - # - #add_subdirectory(13_MaterialCompiler EXCLUDE_FROM_ALL) + add_subdirectory(12_MeshLoaders EXCLUDE_FROM_ALL) + + # bitonic + add_subdirectory(13_BitonicSort) # Waiting for a refactor #add_subdirectory(27_PLYSTLDemo) @@ -96,19 +97,9 @@ if(NBL_BUILD_EXAMPLES) # we link common example api library and force examples to reuse its PCH foreach(T IN LISTS TARGETS) - get_target_property(TYPE ${T} TYPE) - if(NOT ${TYPE} MATCHES INTERFACE) - target_link_libraries(${T} PUBLIC ${NBL_EXAMPLES_API_TARGET}) - target_include_directories(${T} PUBLIC $) - set_target_properties(${T} PROPERTIES DISABLE_PRECOMPILE_HEADERS OFF) - target_precompile_headers(${T} REUSE_FROM "${NBL_EXAMPLES_API_TARGET}") - - if(NBL_EMBED_BUILTIN_RESOURCES) - LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsSource) - LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsInclude) - LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsBuild) - endif() - endif() + target_link_libraries(${T} PUBLIC ${NBL_EXAMPLES_API_TARGET}) + target_include_directories(${T} PUBLIC $) + target_precompile_headers(${T} REUSE_FROM "${NBL_EXAMPLES_API_TARGET}") endforeach() NBL_ADJUST_FOLDERS(examples) From 547e518969423632fb562f13d913437cbd3dd9f6 Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Wed, 22 Oct 2025 17:25:38 +0330 Subject: [PATCH 2/8] Update bitonic_sort_shader.comp.hlsl --- .../bitonic_sort_shader.comp.hlsl | 179 ++++++++---------- 1 file changed, 77 insertions(+), 102 deletions(-) diff --git a/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl b/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl index 058b14eea..f8f11bcaa 100644 --- a/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl +++ b/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl @@ -1,112 +1,87 @@ -#include "nbl/builtin/hlsl/bda/bda_accessor.hlsl" +#include "common.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" +#include "nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl" -struct BitonicPushData +[[vk::push_constant]] PushConstantData pushConstants; + +using namespace nbl::hlsl; + +using BitonicSortConfig = workgroup::bitonic_sort::bitonic_sort_config >; + +NBL_CONSTEXPR uint32_t WorkgroupSize = BitonicSortConfig::WorkgroupSize; + +groupshared uint32_t sharedmem[2 * WorkgroupSize]; + +uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(BitonicSortConfig::WorkgroupSize), 1, 1); } + +struct SharedMemoryAccessor { - uint64_t inputKeyAddress; - uint64_t inputValueAddress; - uint64_t outputKeyAddress; - uint64_t outputValueAddress; - uint32_t dataElementCount; + template + void set(IndexType idx, AccessType value) + { + sharedmem[idx] = value; + } + + template + void get(IndexType idx, NBL_REF_ARG(AccessType) value) + { + value = sharedmem[idx]; + } + + void workgroupExecutionAndMemoryBarrier() + { + glsl::barrier(); + } + }; -using namespace nbl::hlsl; +struct Accessor +{ + static Accessor create(const uint64_t address) + { + Accessor accessor; + accessor.address = address; + return accessor; + } -[[vk::push_constant]] BitonicPushData pushData; + template + void get(const IndexType index, NBL_REF_ARG(AccessType) value) + { + value = vk::RawBufferLoad(address + index * sizeof(AccessType)); + } -using DataPtr = bda::__ptr; -using DataAccessor = BdaAccessor; + template + void set(const IndexType index, const AccessType value) + { + vk::RawBufferStore(address + index * sizeof(AccessType), value); + } -groupshared uint32_t sharedKeys[ElementCount]; -groupshared uint32_t sharedValues[ElementCount]; + uint64_t address; +}; -[numthreads(WorkgroupSize, 1, 1)] +[numthreads(BitonicSortConfig::WorkgroupSize, 1, 1)] [shader("compute")] -void main(uint32_t3 dispatchId : SV_DispatchThreadID, uint32_t3 localId : SV_GroupThreadID) +void main() { - const uint32_t threadId = localId.x; - const uint32_t dataSize = pushData.dataElementCount; - - DataAccessor inputKeys = DataAccessor::create(DataPtr::create(pushData.inputKeyAddress)); - DataAccessor inputValues = DataAccessor::create(DataPtr::create(pushData.inputValueAddress)); - - for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize) - { - inputKeys.get(i, sharedKeys[i]); - inputValues.get(i, sharedValues[i]); - } - - // Synchronize all threads after loading - GroupMemoryBarrierWithGroupSync(); - - - for (uint32_t stage = 0; stage < Log2ElementCount; stage++) - { - for (uint32_t pass = 0; pass <= stage; pass++) - { - const uint32_t compareDistance = 1 << (stage - pass); - - for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize) - { - const uint32_t partnerId = i ^ compareDistance; - - if (partnerId >= dataSize) - continue; - - const uint32_t waveSize = WaveGetLaneCount(); - const uint32_t myWaveId = i / waveSize; - const uint32_t partnerWaveId = partnerId / waveSize; - const bool sameWave = (myWaveId == partnerWaveId); - - uint32_t myKey, myValue, partnerKey, partnerValue; - [branch] - if (sameWave && compareDistance < waveSize) - { - // WAVE INTRINSIC - myKey = sharedKeys[i]; - myValue = sharedValues[i]; - - const uint32_t partnerLane = partnerId % waveSize; - partnerKey = WaveReadLaneAt(myKey, partnerLane); - partnerValue = WaveReadLaneAt(myValue, partnerLane); - } - else - { - // SHARED MEM - myKey = sharedKeys[i]; - myValue = sharedValues[i]; - partnerKey = sharedKeys[partnerId]; - partnerValue = sharedValues[partnerId]; - } - - const uint32_t sequenceSize = 1 << (stage + 1); - const uint32_t sequenceIndex = i / sequenceSize; - const bool sequenceAscending = (sequenceIndex % 2) == 0; - const bool ascending = true; - const bool finalDirection = sequenceAscending == ascending; - - const bool swap = (myKey > partnerKey) == finalDirection; - - // WORKGROUP COORDINATION: Only lower-indexed element writes both - if (i < partnerId && swap) - { - sharedKeys[i] = partnerKey; - sharedKeys[partnerId] = myKey; - sharedValues[i] = partnerValue; - sharedValues[partnerId] = myValue; - } - } - - GroupMemoryBarrierWithGroupSync(); - } - } - - - DataAccessor outputKeys = DataAccessor::create(DataPtr::create(pushData.outputKeyAddress)); - DataAccessor outputValues = DataAccessor::create(DataPtr::create(pushData.outputValueAddress)); - - for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize) - { - outputKeys.set(i, sharedKeys[i]); - outputValues.set(i, sharedValues[i]); - } -} \ No newline at end of file + Accessor accessor = Accessor::create(pushConstants.deviceBufferAddress); + SharedMemoryAccessor sharedmemAccessor; + + const uint32_t threadID = glsl::gl_LocalInvocationID().x; + + // Each thread handles 2 ADJACENT elements: lo and hi + // Following bitonic sort pattern: thread i handles elements [2*i] and [2*i + 1] + const uint32_t loIdx = threadID * 2; + const uint32_t hiIdx = threadID * 2 + 1; + + uint32_t loKey, hiKey; + accessor.get(loIdx, loKey); + accessor.get(hiIdx, hiKey); + + uint32_t loVal = loIdx; + uint32_t hiVal = hiIdx; + + workgroup::BitonicSort::template __call(accessor, sharedmemAccessor, loKey, hiKey, loVal, hiVal); + + accessor.set(loIdx, loKey); + accessor.set(hiIdx, hiKey); +} From 6544e047a4b9386d26cf171218aa3c30e76b44de Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Wed, 22 Oct 2025 17:26:25 +0330 Subject: [PATCH 3/8] Update common.hlsl --- 13_BitonicSort/app_resources/common.hlsl | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/13_BitonicSort/app_resources/common.hlsl b/13_BitonicSort/app_resources/common.hlsl index 5f15d0af1..e6ea02580 100644 --- a/13_BitonicSort/app_resources/common.hlsl +++ b/13_BitonicSort/app_resources/common.hlsl @@ -1,17 +1,13 @@ -// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h #ifndef _BITONIC_SORT_COMMON_INCLUDED_ #define _BITONIC_SORT_COMMON_INCLUDED_ +#include "nbl/builtin/hlsl/cpp_compat.hlsl" -struct BitonicPushData +struct PushConstantData { - - uint64_t inputKeyAddress; - uint64_t inputValueAddress; - uint64_t outputKeyAddress; - uint64_t outputValueAddress; - uint32_t dataElementCount; + uint64_t deviceBufferAddress; }; -#endif \ No newline at end of file +NBL_CONSTEXPR uint32_t WorkgroupSizeLog2 = 7; // 128 threads +NBL_CONSTEXPR uint32_t ElementsPerThreadLog2 = 1; // 2 elements per thread +NBL_CONSTEXPR uint32_t elementCount = uint32_t(1) << (WorkgroupSizeLog2 + ElementsPerThreadLog2); +#endif From 8de0c0f71cea71ab5492417ef4a73d31e77bf063 Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Wed, 22 Oct 2025 17:27:10 +0330 Subject: [PATCH 4/8] Update main.cpp --- 13_BitonicSort/main.cpp | 300 +++++++++++++++++----------------------- 1 file changed, 130 insertions(+), 170 deletions(-) diff --git a/13_BitonicSort/main.cpp b/13_BitonicSort/main.cpp index 7e44c5b11..88908ea15 100644 --- a/13_BitonicSort/main.cpp +++ b/13_BitonicSort/main.cpp @@ -1,6 +1,4 @@ #include "nbl/examples/examples.hpp" -#include -#include using namespace nbl; using namespace nbl::core; @@ -14,12 +12,15 @@ using namespace nbl::examples; #include "app_resources/common.hlsl" #include "nbl/builtin/hlsl/bit.hlsl" + +// Simple showcase of how to run Bitonic Sort on a 1D array using workgroup operations class BitonicSortApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication { using device_base_t = application_templates::MonoDeviceApplication; using asset_base_t = BuiltinResourcesApplication; public: + BitonicSortApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} @@ -30,60 +31,41 @@ class BitonicSortApp final : public application_templates::MonoDeviceApplication if (!asset_base_t::onAppInitialized(std::move(system))) return false; - auto limits = m_physicalDevice->getLimits(); - const uint32_t max_shared_memory_size = limits.maxComputeSharedMemorySize; - const uint32_t max_workgroup_size = limits.maxComputeWorkGroupInvocations; // Get actual GPU limit - const uint32_t bytes_per_elements = sizeof(uint32_t) * 2; // 2 uint32_t per element (key and value) - const uint32_t max_element_in_shared_memory = max_shared_memory_size / bytes_per_elements; - - // For bitonic sort: element count MUST be power of 2 - uint32_t element_count = core::roundDownToPoT(max_element_in_shared_memory); - - const uint32_t log2_element_count = static_cast(log2(element_count)); - - m_logger->log("GPU Limits:", ILogger::ELL_INFO); - m_logger->log(" Max Workgroup Size: " + std::to_string(max_workgroup_size), ILogger::ELL_INFO); - m_logger->log(" Max Shared Memory: " + std::to_string(max_shared_memory_size) + " bytes", ILogger::ELL_INFO); - m_logger->log(" Max elements in shared memory: " + std::to_string(max_element_in_shared_memory), ILogger::ELL_INFO); - m_logger->log(" Using element count (power of 2): " + std::to_string(element_count), ILogger::ELL_INFO); - + // Load shader auto prepShader = [&](const core::string& path) -> smart_refctd_ptr + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = ""; + auto assetBundle = m_assetMgr->getAsset(path, lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) { - IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = ""; - auto assetBundle = m_assetMgr->getAsset(path, lp); - const auto assets = assetBundle.getContents(); - if (assets.empty()) - { - logFail("Could not load shader!"); - return nullptr; - } - - auto source = IAsset::castDown(assets[0]); - assert(source); + logFail("Could not load shader!"); + return nullptr; + } - auto overrideSource = CHLSLCompiler::createOverridenCopy( - source.get(), "#define ElementCount %d\n#define Log2ElementCount %d\n#define WorkgroupSize %d\n", - element_count, log2_element_count, max_workgroup_size - ); + auto source = IAsset::castDown(assets[0]); + assert(source); - auto shader = m_device->compileShader({ overrideSource.get() }); - if (!shader) - { - logFail("Creation of Bitonic Sort Shader from CPU Shader source failed!"); - return nullptr; - } - return shader; - }; + auto shader = m_device->compileShader({ source.get() }); + if (!shader) + { + logFail("Creation of Bitonic Sort Shader failed!"); + return nullptr; + } + return shader; + }; auto bitonicSortShader = prepShader("app_resources/bitonic_sort_shader.comp.hlsl"); - if (!bitonicSortShader) return logFail("Failed to compile bitonic sort shader!"); - - const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(BitonicPushData) }; + const nbl::asset::SPushConstantRange pcRange = { + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, + .offset = 0, + .size = sizeof(PushConstantData) + }; smart_refctd_ptr layout; smart_refctd_ptr bitonicSortPipeline; @@ -93,107 +75,95 @@ class BitonicSortApp final : public application_templates::MonoDeviceApplication params.layout = layout.get(); params.shader.shader = bitonicSortShader.get(); params.shader.entryPoint = "main"; - params.shader.entries = nullptr; + params.shader.requiredSubgroupSize = static_cast(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize)); + params.cached.requireFullSubgroups = true; if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &bitonicSortPipeline)) return logFail("Failed to create compute pipeline!\n"); } - nbl::video::IDeviceMemoryAllocator::SAllocation allocation[4] = {}; - smart_refctd_ptr buffers[4]; + // Allocate buffers - use HOST_VISIBLE memory so CPU can directly read/write + nbl::video::IDeviceMemoryAllocator::SAllocation allocation = {}; + smart_refctd_ptr buffer; auto build_buffer = [this]( - smart_refctd_ptr m_device, nbl::video::IDeviceMemoryAllocator::SAllocation* allocation, smart_refctd_ptr& buffer, size_t buffer_size, const char* label - ) -> void { - IGPUBuffer::SCreationParams params; - params.size = buffer_size; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - buffer = m_device->createBuffer(std::move(params)); - if (!buffer) - logFail("Failed to create GPU buffer of size %d!\n", buffer_size); + ) -> void { + IGPUBuffer::SCreationParams params; + params.size = buffer_size; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + buffer = m_device->createBuffer(std::move(params)); + if (!buffer) + logFail("Failed to create GPU buffer of size %d!\n", buffer_size); - buffer->setObjectDebugName(label); + buffer->setObjectDebugName(label); - auto reqs = buffer->getMemoryReqs(); - reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); + auto reqs = buffer->getMemoryReqs(); + reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); - *allocation = m_device->allocate(reqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - if (!allocation->isValid()) - logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); + *allocation = m_device->allocate(reqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + if (!allocation->isValid()) + logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); - assert(allocation->memory.get() == buffer->getBoundMemory().memory); - }; + assert(allocation->memory.get() == buffer->getBoundMemory().memory); + }; - build_buffer(m_device, allocation, buffers[0], sizeof(uint32_t) * element_count, "Input Key Buffer"); - build_buffer(m_device, allocation + 1, buffers[1], sizeof(uint32_t) * element_count, "Input Value Buffer"); - build_buffer(m_device, allocation + 2, buffers[2], sizeof(uint32_t) * element_count, "Output Key Buffer"); - build_buffer(m_device, allocation + 3, buffers[3], sizeof(uint32_t) * element_count, "Output Value Buffer"); + build_buffer(&allocation, buffer, sizeof(uint32_t) * elementCount, "Bitonic Sort Buffer"); - uint64_t buffer_device_address[] = { - buffers[0]->getDeviceAddress(), - buffers[1]->getDeviceAddress(), - buffers[2]->getDeviceAddress(), - buffers[3]->getDeviceAddress() - }; + uint64_t buffer_device_address = buffer->getDeviceAddress(); - - void* mapped_memory[] = { - allocation[0].memory->map({0ull,allocation[0].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), - allocation[1].memory->map({0ull,allocation[1].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), - allocation[2].memory->map({0ull,allocation[2].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), - allocation[3].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ) - }; - if (!mapped_memory[0] || !mapped_memory[1] || !mapped_memory[2] || !mapped_memory[3]) + // Map buffer memory to CPU + void* mapped_memory = allocation.memory->map( + {0ull, allocation.memory->getAllocationSize()}, + IDeviceMemoryAllocation::EMCAF_READ + ); + if (!mapped_memory) return logFail("Failed to map the Device Memory!\n"); - // Generate random data + // Generate random input data unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); std::mt19937 g(seed); - auto bufferData = new uint32_t * [2]; - for (int i = 0; i < 2; ++i) { - bufferData[i] = new uint32_t[element_count]; - } - for (uint32_t i = 0; i < element_count; i++) { - bufferData[0][i] = g() % 10000; - } - - memcpy(mapped_memory[0], bufferData[0], sizeof(uint32_t) * element_count); - - for (uint32_t i = 0; i < element_count; i++) { - bufferData[1][i] = i; // Values are indices for verification + uint32_t* bufferData = new uint32_t[elementCount]; + for (uint32_t i = 0; i < elementCount; i++) { + bufferData[i] = g() % 10000; } - memcpy(mapped_memory[1], bufferData[1], sizeof(uint32_t) * element_count); + // Copy to GPU buffer + memcpy(mapped_memory, bufferData, sizeof(uint32_t) * elementCount); + // Print input std::string outBuffer; - - outBuffer.append("ALL ELEMENTS: "); - for (auto i = 0; i < element_count; i++) { - outBuffer.append("{"); - outBuffer.append(std::to_string(bufferData[0][i])); + outBuffer.append("Input array: "); + for (auto i = 0; i < elementCount; i++) { + outBuffer.append(std::to_string(bufferData[i])); outBuffer.append(", "); - outBuffer.append(std::to_string(bufferData[1][i])); - outBuffer.append("} "); - - // Add newline every 20 elements for readability - if ((i + 1) % 20 == 0) { + if ((i + 1) % 40 == 0) { outBuffer.append("\n"); } } - outBuffer.append("\n"); - outBuffer.append("Count: "); - outBuffer.append(std::to_string(element_count)); - outBuffer.append("\n"); - m_logger->log("Your input array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE); + outBuffer.append("\nElement count: " + std::to_string(elementCount) + "\n"); + m_logger->log(outBuffer, ILogger::ELL_PERFORMANCE); + + // Memory range for flush/invalidate + const ILogicalDevice::MappedMemoryRange memory_range( + allocation.memory.get(), + 0ull, + allocation.memory->getAllocationSize() + ); + if (!allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->flushMappedMemoryRanges(1, &memory_range); + // Create command buffer smart_refctd_ptr cmdBuf; { - smart_refctd_ptr cmdpool = m_device->createCommandPool(getComputeQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + smart_refctd_ptr cmdpool = m_device->createCommandPool( + getComputeQueue()->getFamilyIndex(), + IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT + ); if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdBuf)) return logFail("Failed to create Command Buffers!\n"); } @@ -202,22 +172,23 @@ class BitonicSortApp final : public application_templates::MonoDeviceApplication uint64_t timeline = started_value; smart_refctd_ptr progress = m_device->createSemaphore(started_value); - auto pc = BitonicPushData{ - .inputKeyAddress = buffer_device_address[0], - .inputValueAddress = buffer_device_address[1], - .outputKeyAddress = buffer_device_address[2], - .outputValueAddress = buffer_device_address[3], - .dataElementCount = element_count + // Setup push constants + auto pc = PushConstantData{ + .deviceBufferAddress = buffer_device_address }; + // Record commands cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - cmdBuf->beginDebugMarker("Bitonic Sort Single Dispatch", core::vectorSIMDf(0, 1, 0, 1)); + cmdBuf->beginDebugMarker("Workgroup Bitonic Sort", core::vectorSIMDf(0, 1, 0, 1)); cmdBuf->bindComputePipeline(bitonicSortPipeline.get()); cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); + + // Dispatch 1 workgroup (our sort works on a single workgroup) cmdBuf->dispatch(1, 1, 1); cmdBuf->endDebugMarker(); cmdBuf->end(); + // Submit to queue { auto queue = getComputeQueue(); @@ -242,82 +213,71 @@ class BitonicSortApp final : public application_templates::MonoDeviceApplication m_api->endCapture(); } + // Wait for GPU to finish const ISemaphore::SWaitInfo wait_infos[] = { { .semaphore = progress.get(), .value = timeline } }; m_device->blockForSemaphores(wait_infos); - const ILogicalDevice::MappedMemoryRange memory_range[] = { - ILogicalDevice::MappedMemoryRange(allocation[0].memory.get(), 0ull, allocation[0].memory->getAllocationSize()), - ILogicalDevice::MappedMemoryRange(allocation[1].memory.get(), 0ull, allocation[1].memory->getAllocationSize()), - ILogicalDevice::MappedMemoryRange(allocation[2].memory.get(), 0ull, allocation[2].memory->getAllocationSize()), - ILogicalDevice::MappedMemoryRange(allocation[3].memory.get(), 0ull, allocation[3].memory->getAllocationSize()) - }; - - if (!allocation[0].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - m_device->invalidateMappedMemoryRanges(1, &memory_range[0]); - if (!allocation[1].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - m_device->invalidateMappedMemoryRanges(1, &memory_range[1]); - if (!allocation[2].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - m_device->invalidateMappedMemoryRanges(1, &memory_range[2]); - if (!allocation[3].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - m_device->invalidateMappedMemoryRanges(1, &memory_range[3]); + if (!allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memory_range); - const uint32_t* sortedKeys = reinterpret_cast(allocation[2].memory->getMappedPointer()); - const uint32_t* sortedValues = reinterpret_cast(allocation[3].memory->getMappedPointer()); - - assert(allocation[2].offset == 0); - assert(allocation[3].offset == 0); + // Read sorted results + const uint32_t* sortedData = reinterpret_cast(allocation.memory->getMappedPointer()); + assert(allocation.offset == 0); + // Print sorted output outBuffer.clear(); - - outBuffer.append("ALL SORTED ELEMENTS: "); - for (auto i = 0; i < element_count; i++) { - outBuffer.append("{"); - outBuffer.append(std::to_string(sortedKeys[i])); + outBuffer.append("Sorted array: "); + for (auto i = 0; i < elementCount; i++) { + outBuffer.append(std::to_string(sortedData[i])); outBuffer.append(", "); - outBuffer.append(std::to_string(sortedValues[i])); - outBuffer.append("} "); - - if ((i + 1) % 20 == 0) { + if ((i + 1) % 40 == 0) { outBuffer.append("\n"); } } - outBuffer.append("\n"); - outBuffer.append("Count: "); - outBuffer.append(std::to_string(element_count)); - outBuffer.append("\n"); - m_logger->log("Your sorted array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE); + outBuffer.append("\nElement count: " + std::to_string(elementCount) + "\n"); + m_logger->log(outBuffer, ILogger::ELL_PERFORMANCE); + // Verify correctness bool is_sorted = true; - for (uint32_t i = 1; i < element_count; i++) { - if (sortedKeys[i] < sortedKeys[i - 1]) { + int32_t error_index = -1; + for (uint32_t i = 1; i < elementCount; i++) { + if (sortedData[i] < sortedData[i - 1]) { is_sorted = false; + error_index = i; break; } } - m_logger->log(is_sorted ? "Array is correctly sorted!" : "Array is NOT sorted correctly!", - is_sorted ? ILogger::ELL_PERFORMANCE : ILogger::ELL_ERROR); - allocation[0].memory->unmap(); - allocation[1].memory->unmap(); - allocation[2].memory->unmap(); - allocation[3].memory->unmap(); - - m_device->waitIdle(); - - for (int i = 0; i < 2; ++i) { - delete[] bufferData[i]; + if (is_sorted) { + m_logger->log("Array is correctly sorted!", ILogger::ELL_PERFORMANCE); + } + else { + std::string errorMsg = "Array is NOT sorted correctly!\n"; + errorMsg += "Error at index " + std::to_string(error_index) + ":\n"; + errorMsg += " Previous element [" + std::to_string(error_index - 1) + "] = " + std::to_string(sortedData[error_index - 1]) + "\n"; + errorMsg += " Current element [" + std::to_string(error_index) + "] = " + std::to_string(sortedData[error_index]) + "\n"; + errorMsg += " (" + std::to_string(sortedData[error_index]) + " < " + std::to_string(sortedData[error_index - 1]) + " is WRONG!)\n"; + m_logger->log(errorMsg, ILogger::ELL_ERROR); } + + // Cleanup + allocation.memory->unmap(); delete[] bufferData; return true; } bool keepRunning() override { return false; } + void workLoopBody() override {} - bool onAppTerminated() override { return true; } + + bool onAppTerminated() override + { + return device_base_t::onAppTerminated(); + } }; -NBL_MAIN_FUNC(BitonicSortApp) \ No newline at end of file +NBL_MAIN_FUNC(BitonicSortApp) From 99232940248fb5a4dd3bc7d6868dac09dfc816a3 Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Sun, 2 Nov 2025 18:59:34 +0330 Subject: [PATCH 5/8] Update main.cpp --- 13_BitonicSort/main.cpp | 368 ++++++++++++++++++++++------------------ 1 file changed, 202 insertions(+), 166 deletions(-) diff --git a/13_BitonicSort/main.cpp b/13_BitonicSort/main.cpp index 88908ea15..c1b1515c2 100644 --- a/13_BitonicSort/main.cpp +++ b/13_BitonicSort/main.cpp @@ -1,3 +1,8 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + + #include "nbl/examples/examples.hpp" using namespace nbl; @@ -19,6 +24,25 @@ class BitonicSortApp final : public application_templates::MonoDeviceApplication using device_base_t = application_templates::MonoDeviceApplication; using asset_base_t = BuiltinResourcesApplication; + smart_refctd_ptr m_pipeline; + smart_refctd_ptr m_layout; + + smart_refctd_ptr m_utils; + + nbl::video::StreamingTransientDataBufferMT<>* m_upStreamingBuffer; + StreamingTransientDataBufferMT<>* m_downStreamingBuffer; + smart_refctd_ptr m_deviceLocalBuffer; + + // These are Buffer Device Addresses + uint64_t m_upStreamingBufferAddress; + uint64_t m_downStreamingBufferAddress; + uint64_t m_deviceLocalBufferAddress; + + uint32_t m_alignment; + + smart_refctd_ptr m_timeline; + uint64_t semaphoreValue = 0; + public: BitonicSortApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : @@ -61,211 +85,222 @@ class BitonicSortApp final : public application_templates::MonoDeviceApplication if (!bitonicSortShader) return logFail("Failed to compile bitonic sort shader!"); + m_utils = video::IUtilities::create(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger)); + if (!m_utils) + return logFail("Failed to create Utilities!"); + m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer(); + m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer(); + m_upStreamingBufferAddress = m_upStreamingBuffer->getBuffer()->getDeviceAddress(); + m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress(); + + // Create device-local buffer + { + IGPUBuffer::SCreationParams deviceLocalBufferParams = {}; + + IQueue* const queue = getComputeQueue(); + uint32_t queueFamilyIndex = queue->getFamilyIndex(); + + deviceLocalBufferParams.queueFamilyIndexCount = 1; + deviceLocalBufferParams.queueFamilyIndices = &queueFamilyIndex; + deviceLocalBufferParams.size = sizeof(uint32_t) * elementCount * 2; // *2 because we store (key, value) pairs + deviceLocalBufferParams.usage = nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; + + m_deviceLocalBuffer = m_device->createBuffer(std::move(deviceLocalBufferParams)); + auto mreqs = m_deviceLocalBuffer->getMemoryReqs(); + mreqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); + auto gpubufMem = m_device->allocate(mreqs, m_deviceLocalBuffer.get(), IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT); + + m_deviceLocalBufferAddress = m_deviceLocalBuffer.get()->getDeviceAddress(); + } + const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0, .size = sizeof(PushConstantData) }; - smart_refctd_ptr layout; - smart_refctd_ptr bitonicSortPipeline; { - layout = m_device->createPipelineLayout({ &pcRange,1 }); + m_layout = m_device->createPipelineLayout({ &pcRange,1 }); IGPUComputePipeline::SCreationParams params = {}; - params.layout = layout.get(); + params.layout = m_layout.get(); params.shader.shader = bitonicSortShader.get(); params.shader.entryPoint = "main"; params.shader.requiredSubgroupSize = static_cast(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize)); params.cached.requireFullSubgroups = true; - if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &bitonicSortPipeline)) + if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &m_pipeline)) return logFail("Failed to create compute pipeline!\n"); } - // Allocate buffers - use HOST_VISIBLE memory so CPU can directly read/write - nbl::video::IDeviceMemoryAllocator::SAllocation allocation = {}; - smart_refctd_ptr buffer; - - auto build_buffer = [this]( - nbl::video::IDeviceMemoryAllocator::SAllocation* allocation, - smart_refctd_ptr& buffer, - size_t buffer_size, - const char* label - ) -> void { - IGPUBuffer::SCreationParams params; - params.size = buffer_size; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - buffer = m_device->createBuffer(std::move(params)); - if (!buffer) - logFail("Failed to create GPU buffer of size %d!\n", buffer_size); - - buffer->setObjectDebugName(label); - - auto reqs = buffer->getMemoryReqs(); - reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); - - *allocation = m_device->allocate(reqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - if (!allocation->isValid()) - logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); - - assert(allocation->memory.get() == buffer->getBoundMemory().memory); - }; + const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits(); + m_alignment = core::max(deviceLimits.nonCoherentAtomSize, alignof(uint32_t)); - build_buffer(&allocation, buffer, sizeof(uint32_t) * elementCount, "Bitonic Sort Buffer"); + m_timeline = m_device->createSemaphore(semaphoreValue); - uint64_t buffer_device_address = buffer->getDeviceAddress(); + IQueue* const queue = getComputeQueue(); - // Map buffer memory to CPU - void* mapped_memory = allocation.memory->map( - {0ull, allocation.memory->getAllocationSize()}, - IDeviceMemoryAllocation::EMCAF_READ - ); - if (!mapped_memory) - return logFail("Failed to map the Device Memory!\n"); + const uint32_t inputSize = sizeof(uint32_t) * elementCount * 2; // *2 because we store (key, value) pairs - // Generate random input data - unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); - std::mt19937 g(seed); + const uint32_t AllocationCount = 1; - uint32_t* bufferData = new uint32_t[elementCount]; - for (uint32_t i = 0; i < elementCount; i++) { - bufferData[i] = g() % 10000; - } + auto inputOffset = m_upStreamingBuffer->invalid_value; + + std::chrono::steady_clock::time_point waitTill(std::chrono::years(45)); + m_upStreamingBuffer->multi_allocate(waitTill, AllocationCount, &inputOffset, &inputSize, &m_alignment); - // Copy to GPU buffer - memcpy(mapped_memory, bufferData, sizeof(uint32_t) * elementCount); - - // Print input - std::string outBuffer; - outBuffer.append("Input array: "); - for (auto i = 0; i < elementCount; i++) { - outBuffer.append(std::to_string(bufferData[i])); - outBuffer.append(", "); - if ((i + 1) % 40 == 0) { - outBuffer.append("\n"); + { + auto* const inputPtr = reinterpret_cast(reinterpret_cast(m_upStreamingBuffer->getBufferPointer()) + inputOffset); + + // Generate random input data + unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::mt19937 g(seed); + + std::cout << "Input array: "; + for (uint32_t i = 0; i < elementCount; i++) { + uint32_t key = g() % 10000; + uint32_t value = i; // Use index as value for stable sorting + inputPtr[i * 2] = key; + inputPtr[i * 2 + 1] = value; + std::cout << "(" << key << "," << value << "), "; + if ((i + 1) % 20 == 0) { + std::cout << "\n"; + } + } + std::cout << "\nElement count: " << elementCount << "\n"; + + // Always remember to flush! + if (m_upStreamingBuffer->needsManualFlushOrInvalidate()) + { + const auto bound = m_upStreamingBuffer->getBuffer()->getBoundMemory(); + const ILogicalDevice::MappedMemoryRange range(bound.memory, bound.offset + inputOffset, inputSize); + m_device->flushMappedMemoryRanges(1, &range); } } - outBuffer.append("\nElement count: " + std::to_string(elementCount) + "\n"); - m_logger->log(outBuffer, ILogger::ELL_PERFORMANCE); - - // Memory range for flush/invalidate - const ILogicalDevice::MappedMemoryRange memory_range( - allocation.memory.get(), - 0ull, - allocation.memory->getAllocationSize() - ); - if (!allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - m_device->flushMappedMemoryRanges(1, &memory_range); + const uint32_t outputSize = inputSize; + + auto outputOffset = m_downStreamingBuffer->invalid_value; + m_downStreamingBuffer->multi_allocate(waitTill, AllocationCount, &outputOffset, &outputSize, &m_alignment); - // Create command buffer - smart_refctd_ptr cmdBuf; + smart_refctd_ptr cmdbuf; { - smart_refctd_ptr cmdpool = m_device->createCommandPool( - getComputeQueue()->getFamilyIndex(), - IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT - ); - if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdBuf)) + smart_refctd_ptr cmdpool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf)) { return logFail("Failed to create Command Buffers!\n"); + } + cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf,1 }, core::smart_refctd_ptr(m_logger)); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmdbuf->bindComputePipeline(m_pipeline.get()); + + const PushConstantData pc = {.deviceBufferAddress = m_deviceLocalBufferAddress}; + + IGPUCommandBuffer::SBufferCopy copyInfo = {}; + copyInfo.srcOffset = inputOffset; + copyInfo.dstOffset = 0; + copyInfo.size = m_deviceLocalBuffer->getSize(); + cmdbuf->copyBuffer(m_upStreamingBuffer->getBuffer(), m_deviceLocalBuffer.get(), 1, ©Info); + + IGPUCommandBuffer::SPipelineBarrierDependencyInfo pipelineBarrierInfo1 = {}; + decltype(pipelineBarrierInfo1)::buffer_barrier_t barrier1 = {}; + pipelineBarrierInfo1.bufBarriers = { &barrier1, 1u }; + barrier1.range.buffer = m_deviceLocalBuffer; + barrier1.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; + barrier1.barrier.dep.srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS; + barrier1.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + barrier1.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS; + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo1); + + cmdbuf->pushConstants(m_pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); + + cmdbuf->dispatch(1, 1, 1); + + IGPUCommandBuffer::SPipelineBarrierDependencyInfo pipelineBarrierInfo2 = {}; + decltype(pipelineBarrierInfo2)::buffer_barrier_t barrier2 = {}; + pipelineBarrierInfo2.bufBarriers = { &barrier2, 1u }; + barrier2.range.buffer = m_deviceLocalBuffer; + barrier2.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + barrier2.barrier.dep.srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS; + barrier2.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; + barrier2.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS; + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo2); + + copyInfo.srcOffset = 0; + copyInfo.dstOffset = outputOffset; + cmdbuf->copyBuffer(m_deviceLocalBuffer.get(), m_downStreamingBuffer->getBuffer(), 1, ©Info); + cmdbuf->end(); } - constexpr uint64_t started_value = 0; - uint64_t timeline = started_value; - smart_refctd_ptr progress = m_device->createSemaphore(started_value); - - // Setup push constants - auto pc = PushConstantData{ - .deviceBufferAddress = buffer_device_address - }; - - // Record commands - cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - cmdBuf->beginDebugMarker("Workgroup Bitonic Sort", core::vectorSIMDf(0, 1, 0, 1)); - cmdBuf->bindComputePipeline(bitonicSortPipeline.get()); - cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); - - // Dispatch 1 workgroup (our sort works on a single workgroup) - cmdBuf->dispatch(1, 1, 1); - cmdBuf->endDebugMarker(); - cmdBuf->end(); - - // Submit to queue + semaphoreValue++; { - auto queue = getComputeQueue(); - - IQueue::SSubmitInfo submit_infos[1]; - IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = { - { - .cmdbuf = cmdBuf.get() - } + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = + { + .cmdbuf = cmdbuf.get() }; - submit_infos[0].commandBuffers = cmdBufs; - IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { - { - .semaphore = progress.get(), - .value = ++timeline, - .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT - } + const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = + { + .semaphore = m_timeline.get(), + .value = semaphoreValue, + .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + }; + + const IQueue::SSubmitInfo submitInfo = { + .waitSemaphores = {}, + .commandBuffers = {&cmdbufInfo,1}, + .signalSemaphores = {&signalInfo,1} }; - submit_infos[0].signalSemaphores = signals; m_api->startCapture(); - queue->submit(submit_infos); + queue->submit({ &submitInfo,1 }); m_api->endCapture(); } - // Wait for GPU to finish - const ISemaphore::SWaitInfo wait_infos[] = { { - .semaphore = progress.get(), - .value = timeline - } }; - m_device->blockForSemaphores(wait_infos); - - if (!allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - m_device->invalidateMappedMemoryRanges(1, &memory_range); - - // Read sorted results - const uint32_t* sortedData = reinterpret_cast(allocation.memory->getMappedPointer()); - assert(allocation.offset == 0); - - // Print sorted output - outBuffer.clear(); - outBuffer.append("Sorted array: "); - for (auto i = 0; i < elementCount; i++) { - outBuffer.append(std::to_string(sortedData[i])); - outBuffer.append(", "); - if ((i + 1) % 40 == 0) { - outBuffer.append("\n"); - } - } - outBuffer.append("\nElement count: " + std::to_string(elementCount) + "\n"); - m_logger->log(outBuffer, ILogger::ELL_PERFORMANCE); - - // Verify correctness - bool is_sorted = true; - int32_t error_index = -1; - for (uint32_t i = 1; i < elementCount; i++) { - if (sortedData[i] < sortedData[i - 1]) { - is_sorted = false; - error_index = i; - break; - } - } + const ISemaphore::SWaitInfo futureWait = { m_timeline.get(),semaphoreValue }; - if (is_sorted) { - m_logger->log("Array is correctly sorted!", ILogger::ELL_PERFORMANCE); - } - else { - std::string errorMsg = "Array is NOT sorted correctly!\n"; - errorMsg += "Error at index " + std::to_string(error_index) + ":\n"; - errorMsg += " Previous element [" + std::to_string(error_index - 1) + "] = " + std::to_string(sortedData[error_index - 1]) + "\n"; - errorMsg += " Current element [" + std::to_string(error_index) + "] = " + std::to_string(sortedData[error_index]) + "\n"; - errorMsg += " (" + std::to_string(sortedData[error_index]) + " < " + std::to_string(sortedData[error_index - 1]) + " is WRONG!)\n"; - m_logger->log(errorMsg, ILogger::ELL_ERROR); - } + m_upStreamingBuffer->multi_deallocate(AllocationCount, &inputOffset, &inputSize, futureWait); - // Cleanup - allocation.memory->unmap(); - delete[] bufferData; + auto latchedConsumer = make_smart_refctd_ptr( + IDeviceMemoryAllocation::MemoryRange(outputOffset, outputSize), + [=](const size_t dstOffset, const void* bufSrc, const size_t size)->void + { + assert(dstOffset == 0 && size == outputSize); + + std::cout << "Sorted array: "; + const uint32_t* const data = reinterpret_cast(bufSrc); + for (auto i = 0u; i < elementCount; i++) { + uint32_t key = data[i * 2]; + uint32_t value = data[i * 2 + 1]; + std::cout << "(" << key << "," << value << "), "; + if ((i + 1) % 20 == 0) { + std::cout << "\n"; + } + } + std::cout << "\nElement count: " << elementCount << "\n"; + + bool is_sorted = true; + int32_t error_index = -1; + for (uint32_t i = 1; i < elementCount; i++) { + uint32_t prevKey = data[(i - 1) * 2]; + uint32_t currKey = data[i * 2]; + if (currKey < prevKey) { + is_sorted = false; + error_index = i; + break; + } + } + + if (is_sorted) { + std::cout << "Array is correctly sorted!\n"; + } + else { + std::cout << "Array is NOT sorted correctly!\n"; + std::cout << "Error at index " << error_index << ":\n"; + std::cout << " Previous key [" << (error_index - 1) << "] = " << data[(error_index - 1) * 2] << "\n"; + std::cout << " Current key [" << error_index << "] = " << data[error_index * 2] << "\n"; + std::cout << " (" << data[error_index * 2] << " < " << data[(error_index - 1) * 2] << " is WRONG!)\n"; + } + }, + std::move(cmdbuf), m_downStreamingBuffer + ); + m_downStreamingBuffer->multi_deallocate(AllocationCount, &outputOffset, &outputSize, futureWait, &latchedConsumer.get()); return true; } @@ -276,6 +311,7 @@ class BitonicSortApp final : public application_templates::MonoDeviceApplication bool onAppTerminated() override { + while (m_downStreamingBuffer->cull_frees()) {} return device_base_t::onAppTerminated(); } }; From 800802b13e95f6e2db8cd5b1d0a1f1a9a3f81d9d Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Sun, 2 Nov 2025 19:00:16 +0330 Subject: [PATCH 6/8] Update common.hlsl --- 13_BitonicSort/app_resources/common.hlsl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/13_BitonicSort/app_resources/common.hlsl b/13_BitonicSort/app_resources/common.hlsl index e6ea02580..9f8f4dd3b 100644 --- a/13_BitonicSort/app_resources/common.hlsl +++ b/13_BitonicSort/app_resources/common.hlsl @@ -7,7 +7,7 @@ struct PushConstantData uint64_t deviceBufferAddress; }; -NBL_CONSTEXPR uint32_t WorkgroupSizeLog2 = 7; // 128 threads -NBL_CONSTEXPR uint32_t ElementsPerThreadLog2 = 1; // 2 elements per thread -NBL_CONSTEXPR uint32_t elementCount = uint32_t(1) << (WorkgroupSizeLog2 + ElementsPerThreadLog2); +NBL_CONSTEXPR uint32_t WorkgroupSizeLog2 = 10; // 1024 threads (2^10) +NBL_CONSTEXPR uint32_t ElementsPerThreadLog2 = 2; // 4 elements per thread (2^2) - VIRTUAL THREADING! +NBL_CONSTEXPR uint32_t elementCount = uint32_t(1) << (WorkgroupSizeLog2 + ElementsPerThreadLog2); // 4096 elements (2^12) #endif From 446d4876f55d5a71f2e8bf411226b484c64f1e58 Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Sun, 2 Nov 2025 19:00:51 +0330 Subject: [PATCH 7/8] Update bitonic_sort_shader.comp.hlsl --- .../bitonic_sort_shader.comp.hlsl | 37 +++++++------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl b/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl index f8f11bcaa..d3b2dcdf6 100644 --- a/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl +++ b/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl @@ -10,7 +10,7 @@ using BitonicSortConfig = workgroup::bitonic_sort::bitonic_sort_config void set(IndexType idx, AccessType value) { - sharedmem[idx] = value; + sharedmem[idx * 2] = value.first; + sharedmem[idx * 2 + 1] = value.second; } template void get(IndexType idx, NBL_REF_ARG(AccessType) value) { - value = sharedmem[idx]; + value.first = sharedmem[idx * 2]; + value.second = sharedmem[idx * 2 + 1]; } void workgroupExecutionAndMemoryBarrier() { glsl::barrier(); } - }; struct Accessor @@ -47,13 +48,17 @@ struct Accessor template void get(const IndexType index, NBL_REF_ARG(AccessType) value) { - value = vk::RawBufferLoad(address + index * sizeof(AccessType)); + const uint64_t offset = address + index * sizeof(AccessType); + value.first = vk::RawBufferLoad(offset); + value.second = vk::RawBufferLoad(offset + sizeof(uint32_t)); } template void set(const IndexType index, const AccessType value) { - vk::RawBufferStore(address + index * sizeof(AccessType), value); + const uint64_t offset = address + index * sizeof(AccessType); + vk::RawBufferStore(offset, value.first); + vk::RawBufferStore(offset + sizeof(uint32_t), value.second); } uint64_t address; @@ -66,22 +71,6 @@ void main() Accessor accessor = Accessor::create(pushConstants.deviceBufferAddress); SharedMemoryAccessor sharedmemAccessor; - const uint32_t threadID = glsl::gl_LocalInvocationID().x; - - // Each thread handles 2 ADJACENT elements: lo and hi - // Following bitonic sort pattern: thread i handles elements [2*i] and [2*i + 1] - const uint32_t loIdx = threadID * 2; - const uint32_t hiIdx = threadID * 2 + 1; - - uint32_t loKey, hiKey; - accessor.get(loIdx, loKey); - accessor.get(hiIdx, hiKey); - - uint32_t loVal = loIdx; - uint32_t hiVal = hiIdx; - - workgroup::BitonicSort::template __call(accessor, sharedmemAccessor, loKey, hiKey, loVal, hiVal); - - accessor.set(loIdx, loKey); - accessor.set(hiIdx, hiKey); + // The sort handles load/store internally + workgroup::BitonicSort::template __call(accessor, sharedmemAccessor); } From 2f3126fff69a588e53096cdfaef8d2a2b8676ed6 Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:26:10 +0330 Subject: [PATCH 8/8] Refactor shared memory access in bitonic sort shader --- .../app_resources/bitonic_sort_shader.comp.hlsl | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl b/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl index d3b2dcdf6..3ea39262f 100644 --- a/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl +++ b/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl @@ -19,15 +19,13 @@ struct SharedMemoryAccessor template void set(IndexType idx, AccessType value) { - sharedmem[idx * 2] = value.first; - sharedmem[idx * 2 + 1] = value.second; + sharedmem[idx] = value; } template void get(IndexType idx, NBL_REF_ARG(AccessType) value) { - value.first = sharedmem[idx * 2]; - value.second = sharedmem[idx * 2 + 1]; + value = sharedmem[idx]; } void workgroupExecutionAndMemoryBarrier() @@ -48,17 +46,13 @@ struct Accessor template void get(const IndexType index, NBL_REF_ARG(AccessType) value) { - const uint64_t offset = address + index * sizeof(AccessType); - value.first = vk::RawBufferLoad(offset); - value.second = vk::RawBufferLoad(offset + sizeof(uint32_t)); + value = vk::RawBufferLoad(address + index * sizeof(AccessType)); } template void set(const IndexType index, const AccessType value) { - const uint64_t offset = address + index * sizeof(AccessType); - vk::RawBufferStore(offset, value.first); - vk::RawBufferStore(offset + sizeof(uint32_t), value.second); + vk::RawBufferStore(address + index * sizeof(AccessType), value); } uint64_t address;