From fd346a05805ea1d3da67bf59eb74f6cde31d59a0 Mon Sep 17 00:00:00 2001
From: CrabeExtra <abbasgaroosi7@gmail.com>
Date: Mon, 11 Aug 2025 21:51:55 +0330
Subject: [PATCH 1/8] bitonic sort sample added

Signed-off-by: CrabeExtra <abbasgaroosi7@gmail.com>
---
 13_BitonicSort/CMakeLists.txt                 |  24 ++
 .../bitonic_sort_shader.comp.hlsl             | 112 ++++++
 13_BitonicSort/app_resources/common.hlsl      |  17 +
 13_BitonicSort/config.json.template           |  28 ++
 13_BitonicSort/main.cpp                       | 323 ++++++++++++++++++
 13_BitonicSort/pipeline.groovy                |  50 +++
 CMakeLists.txt                                |  23 +-
 7 files changed, 561 insertions(+), 16 deletions(-)
 create mode 100644 13_BitonicSort/CMakeLists.txt
 create mode 100644 13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl
 create mode 100644 13_BitonicSort/app_resources/common.hlsl
 create mode 100644 13_BitonicSort/config.json.template
 create mode 100644 13_BitonicSort/main.cpp
 create mode 100644 13_BitonicSort/pipeline.groovy

diff --git a/13_BitonicSort/CMakeLists.txt b/13_BitonicSort/CMakeLists.txt
new file mode 100644
index 000000000..b7cad41da
--- /dev/null
+++ b/13_BitonicSort/CMakeLists.txt
@@ -0,0 +1,24 @@
+include(common RESULT_VARIABLE RES)
+if(NOT RES)
+	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+
+nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+
+if(NBL_EMBED_BUILTIN_RESOURCES)
+	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+	set(RESOURCE_DIR "app_resources")
+
+	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+
+    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+      LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+    endforeach()
+
+	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+
+	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+endif()
diff --git a/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl b/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl
new file mode 100644
index 000000000..058b14eea
--- /dev/null
+++ b/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl
@@ -0,0 +1,112 @@
+#include "nbl/builtin/hlsl/bda/bda_accessor.hlsl"
+
+struct BitonicPushData
+{
+    uint64_t inputKeyAddress;
+    uint64_t inputValueAddress;
+    uint64_t outputKeyAddress;
+    uint64_t outputValueAddress;
+    uint32_t dataElementCount;
+};
+
+using namespace nbl::hlsl;
+
+[[vk::push_constant]] BitonicPushData pushData;
+
+using DataPtr = bda::__ptr<uint32_t>;
+using DataAccessor = BdaAccessor<uint32_t>;
+
+groupshared uint32_t sharedKeys[ElementCount];
+groupshared uint32_t sharedValues[ElementCount];
+
+[numthreads(WorkgroupSize, 1, 1)]
+[shader("compute")]
+void main(uint32_t3 dispatchId : SV_DispatchThreadID, uint32_t3 localId : SV_GroupThreadID)
+{
+    const uint32_t threadId = localId.x;
+    const uint32_t dataSize = pushData.dataElementCount;
+    
+    DataAccessor inputKeys = DataAccessor::create(DataPtr::create(pushData.inputKeyAddress));
+    DataAccessor inputValues = DataAccessor::create(DataPtr::create(pushData.inputValueAddress));
+    
+    for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize)
+    {
+        inputKeys.get(i, sharedKeys[i]);
+        inputValues.get(i, sharedValues[i]);
+    }
+    
+    // Synchronize all threads after loading
+    GroupMemoryBarrierWithGroupSync();
+    
+
+    for (uint32_t stage = 0; stage < Log2ElementCount; stage++)
+    {
+        for (uint32_t pass = 0; pass <= stage; pass++)
+        {
+            const uint32_t compareDistance = 1 << (stage - pass);
+            
+            for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize)
+            {
+                const uint32_t partnerId = i ^ compareDistance;
+                
+                if (partnerId >= dataSize)
+                    continue;
+               
+                const uint32_t waveSize = WaveGetLaneCount();
+                const uint32_t myWaveId = i / waveSize;
+                const uint32_t partnerWaveId = partnerId / waveSize;
+                const bool sameWave = (myWaveId == partnerWaveId);
+
+                uint32_t myKey, myValue, partnerKey, partnerValue;
+                [branch]
+                if (sameWave && compareDistance < waveSize)
+                {
+                    // WAVE INTRINSIC
+                    myKey = sharedKeys[i];
+                    myValue = sharedValues[i];
+
+                    const uint32_t partnerLane = partnerId % waveSize;
+                    partnerKey = WaveReadLaneAt(myKey, partnerLane);
+                    partnerValue = WaveReadLaneAt(myValue, partnerLane);
+                }
+                else
+                {
+                    // SHARED MEM
+                    myKey = sharedKeys[i];
+                    myValue = sharedValues[i];
+                    partnerKey = sharedKeys[partnerId];
+                    partnerValue = sharedValues[partnerId];
+                }
+
+                const uint32_t sequenceSize = 1 << (stage + 1);
+                const uint32_t sequenceIndex = i / sequenceSize;
+                const bool sequenceAscending = (sequenceIndex % 2) == 0;
+                const bool ascending = true;
+                const bool finalDirection = sequenceAscending == ascending;
+                
+                const bool swap = (myKey > partnerKey) == finalDirection;
+                
+                // WORKGROUP COORDINATION: Only lower-indexed element writes both
+                if (i < partnerId && swap)
+                {
+                    sharedKeys[i] = partnerKey;
+                    sharedKeys[partnerId] = myKey;
+                    sharedValues[i] = partnerValue;
+                    sharedValues[partnerId] = myValue;
+                }
+            }
+            
+            GroupMemoryBarrierWithGroupSync();
+        }
+    }
+    
+
+    DataAccessor outputKeys = DataAccessor::create(DataPtr::create(pushData.outputKeyAddress));
+    DataAccessor outputValues = DataAccessor::create(DataPtr::create(pushData.outputValueAddress));
+    
+    for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize)
+    {
+        outputKeys.set(i, sharedKeys[i]);
+        outputValues.set(i, sharedValues[i]);
+    }
+}
\ No newline at end of file
diff --git a/13_BitonicSort/app_resources/common.hlsl b/13_BitonicSort/app_resources/common.hlsl
new file mode 100644
index 000000000..5f15d0af1
--- /dev/null
+++ b/13_BitonicSort/app_resources/common.hlsl
@@ -0,0 +1,17 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _BITONIC_SORT_COMMON_INCLUDED_
+#define _BITONIC_SORT_COMMON_INCLUDED_
+
+struct BitonicPushData
+{
+
+	uint64_t inputKeyAddress;
+	uint64_t inputValueAddress;
+	uint64_t outputKeyAddress;
+	uint64_t outputValueAddress;
+	uint32_t dataElementCount;
+};
+
+#endif
\ No newline at end of file
diff --git a/13_BitonicSort/config.json.template b/13_BitonicSort/config.json.template
new file mode 100644
index 000000000..12215d0bb
--- /dev/null
+++ b/13_BitonicSort/config.json.template
@@ -0,0 +1,28 @@
+{
+  "enableParallelBuild": true,
+  "threadsPerBuildProcess" : 2,
+  "isExecuted": false,
+  "scriptPath": "",
+  "cmake": {
+    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
+    "buildModes": [],
+    "requiredOptions": []
+  }, 
+  "profiles": [
+    {
+      "backend": "vulkan", // should be none
+      "platform": "windows",
+      "buildModes": [],
+      "runConfiguration": "Release", // we also need to run in Debug nad RWDI because foundational example
+      "gpuArchitectures": []
+    }
+  ],
+  "dependencies": [],
+  "data": [
+    {
+      "dependencies": [],
+      "command": [""],
+      "outputs": []
+    }
+  ]
+}
diff --git a/13_BitonicSort/main.cpp b/13_BitonicSort/main.cpp
new file mode 100644
index 000000000..7e44c5b11
--- /dev/null
+++ b/13_BitonicSort/main.cpp
@@ -0,0 +1,323 @@
+#include "nbl/examples/examples.hpp"
+#include <cmath>
+#include <chrono>
+
+using namespace nbl;
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
+using namespace nbl::examples;
+
+#include "app_resources/common.hlsl"
+#include "nbl/builtin/hlsl/bit.hlsl"
+
+class BitonicSortApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication
+{
+	using device_base_t = application_templates::MonoDeviceApplication;
+	using asset_base_t = BuiltinResourcesApplication;
+
+public:
+	BitonicSortApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+		system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+	{
+		if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+			return false;
+		if (!asset_base_t::onAppInitialized(std::move(system)))
+			return false;
+
+		auto limits = m_physicalDevice->getLimits();
+		const uint32_t max_shared_memory_size = limits.maxComputeSharedMemorySize;
+		const uint32_t max_workgroup_size = limits.maxComputeWorkGroupInvocations; // Get actual GPU limit
+		const uint32_t bytes_per_elements = sizeof(uint32_t) * 2; // 2 uint32_t per element (key and value)
+		const uint32_t max_element_in_shared_memory = max_shared_memory_size / bytes_per_elements;
+
+		// For bitonic sort: element count MUST be power of 2
+		uint32_t element_count = core::roundDownToPoT(max_element_in_shared_memory);
+		
+		const uint32_t log2_element_count = static_cast<uint32_t>(log2(element_count));
+
+		m_logger->log("GPU Limits:", ILogger::ELL_INFO);
+		m_logger->log("  Max Workgroup Size: " + std::to_string(max_workgroup_size), ILogger::ELL_INFO);
+		m_logger->log("  Max Shared Memory: " + std::to_string(max_shared_memory_size) + " bytes", ILogger::ELL_INFO);
+		m_logger->log("  Max elements in shared memory: " + std::to_string(max_element_in_shared_memory), ILogger::ELL_INFO);
+		m_logger->log("  Using element count (power of 2): " + std::to_string(element_count), ILogger::ELL_INFO);
+
+		auto prepShader = [&](const core::string& path) -> smart_refctd_ptr<IShader>
+			{
+				IAssetLoader::SAssetLoadParams lp = {};
+				lp.logger = m_logger.get();
+				lp.workingDirectory = "";
+				auto assetBundle = m_assetMgr->getAsset(path, lp);
+				const auto assets = assetBundle.getContents();
+				if (assets.empty())
+				{
+					logFail("Could not load shader!");
+					return nullptr;
+				}
+
+				auto source = IAsset::castDown<IShader>(assets[0]);
+				assert(source);
+
+				auto overrideSource = CHLSLCompiler::createOverridenCopy(
+					source.get(), "#define ElementCount %d\n#define Log2ElementCount %d\n#define WorkgroupSize %d\n",
+					element_count, log2_element_count, max_workgroup_size
+				);
+
+				auto shader = m_device->compileShader({ overrideSource.get() });
+				if (!shader)
+				{
+					logFail("Creation of Bitonic Sort Shader from CPU Shader source failed!");
+					return nullptr;
+				}
+				return shader;
+			};
+
+		auto bitonicSortShader = prepShader("app_resources/bitonic_sort_shader.comp.hlsl");
+
+		if (!bitonicSortShader)
+			return logFail("Failed to compile bitonic sort shader!");
+
+
+		const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(BitonicPushData) };
+
+		smart_refctd_ptr<IGPUPipelineLayout> layout;
+		smart_refctd_ptr<IGPUComputePipeline> bitonicSortPipeline;
+		{
+			layout = m_device->createPipelineLayout({ &pcRange,1 });
+			IGPUComputePipeline::SCreationParams params = {};
+			params.layout = layout.get();
+			params.shader.shader = bitonicSortShader.get();
+			params.shader.entryPoint = "main";
+			params.shader.entries = nullptr;
+			if (!m_device->createComputePipelines(nullptr, { &params,1 }, &bitonicSortPipeline))
+				return logFail("Failed to create compute pipeline!\n");
+		}
+
+		nbl::video::IDeviceMemoryAllocator::SAllocation allocation[4] = {};
+		smart_refctd_ptr<IGPUBuffer> buffers[4];
+
+		auto build_buffer = [this](
+			smart_refctd_ptr<ILogicalDevice> m_device,
+			nbl::video::IDeviceMemoryAllocator::SAllocation* allocation,
+			smart_refctd_ptr<IGPUBuffer>& buffer,
+			size_t buffer_size,
+			const char* label
+			) -> void {
+				IGPUBuffer::SCreationParams params;
+				params.size = buffer_size;
+				params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+				buffer = m_device->createBuffer(std::move(params));
+				if (!buffer)
+					logFail("Failed to create GPU buffer of size %d!\n", buffer_size);
+
+				buffer->setObjectDebugName(label);
+
+				auto reqs = buffer->getMemoryReqs();
+				reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits();
+
+				*allocation = m_device->allocate(reqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+				if (!allocation->isValid())
+					logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n");
+
+				assert(allocation->memory.get() == buffer->getBoundMemory().memory);
+			};
+
+		build_buffer(m_device, allocation, buffers[0], sizeof(uint32_t) * element_count, "Input Key Buffer");
+		build_buffer(m_device, allocation + 1, buffers[1], sizeof(uint32_t) * element_count, "Input Value Buffer");
+		build_buffer(m_device, allocation + 2, buffers[2], sizeof(uint32_t) * element_count, "Output Key Buffer");
+		build_buffer(m_device, allocation + 3, buffers[3], sizeof(uint32_t) * element_count, "Output Value Buffer");
+
+		uint64_t buffer_device_address[] = {
+			buffers[0]->getDeviceAddress(),
+			buffers[1]->getDeviceAddress(),
+			buffers[2]->getDeviceAddress(),
+			buffers[3]->getDeviceAddress()
+		};
+
+	
+		void* mapped_memory[] = {
+			allocation[0].memory->map({0ull,allocation[0].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
+			allocation[1].memory->map({0ull,allocation[1].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
+			allocation[2].memory->map({0ull,allocation[2].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
+			allocation[3].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ)
+		};
+		if (!mapped_memory[0] || !mapped_memory[1] || !mapped_memory[2] || !mapped_memory[3])
+			return logFail("Failed to map the Device Memory!\n");
+
+		// Generate random data
+		unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
+		std::mt19937 g(seed);
+
+		auto bufferData = new uint32_t * [2];
+		for (int i = 0; i < 2; ++i) {
+			bufferData[i] = new uint32_t[element_count];
+		}
+		for (uint32_t i = 0; i < element_count; i++) {
+			bufferData[0][i] = g() % 10000; 
+		}
+
+		memcpy(mapped_memory[0], bufferData[0], sizeof(uint32_t) * element_count);
+
+		for (uint32_t i = 0; i < element_count; i++) {
+			bufferData[1][i] = i; // Values are indices for verification
+		}
+
+		memcpy(mapped_memory[1], bufferData[1], sizeof(uint32_t) * element_count);
+
+		std::string outBuffer;
+
+		outBuffer.append("ALL ELEMENTS: ");
+		for (auto i = 0; i < element_count; i++) {
+			outBuffer.append("{");
+			outBuffer.append(std::to_string(bufferData[0][i]));
+			outBuffer.append(", ");
+			outBuffer.append(std::to_string(bufferData[1][i]));
+			outBuffer.append("} ");
+
+			// Add newline every 20 elements for readability
+			if ((i + 1) % 20 == 0) {
+				outBuffer.append("\n");
+			}
+		}
+		outBuffer.append("\n");
+		outBuffer.append("Count: ");
+		outBuffer.append(std::to_string(element_count));
+		outBuffer.append("\n");
+		m_logger->log("Your input array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE);
+
+
+		smart_refctd_ptr<nbl::video::IGPUCommandBuffer> cmdBuf;
+		{
+			smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(getComputeQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+			if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdBuf))
+				return logFail("Failed to create Command Buffers!\n");
+		}
+
+		constexpr uint64_t started_value = 0;
+		uint64_t timeline = started_value;
+		smart_refctd_ptr<ISemaphore> progress = m_device->createSemaphore(started_value);
+
+		auto pc = BitonicPushData{
+			.inputKeyAddress = buffer_device_address[0],
+			.inputValueAddress = buffer_device_address[1],
+			.outputKeyAddress = buffer_device_address[2],
+			.outputValueAddress = buffer_device_address[3],
+			.dataElementCount = element_count
+		};
+
+		cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+		cmdBuf->beginDebugMarker("Bitonic Sort Single Dispatch", core::vectorSIMDf(0, 1, 0, 1));
+		cmdBuf->bindComputePipeline(bitonicSortPipeline.get());
+		cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
+		cmdBuf->dispatch(1, 1, 1);
+		cmdBuf->endDebugMarker();
+		cmdBuf->end();
+
+		{
+			auto queue = getComputeQueue();
+
+			IQueue::SSubmitInfo submit_infos[1];
+			IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = {
+				{
+					.cmdbuf = cmdBuf.get()
+				}
+			};
+			submit_infos[0].commandBuffers = cmdBufs;
+			IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {
+				{
+					.semaphore = progress.get(),
+					.value = ++timeline,
+					.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
+				}
+			};
+			submit_infos[0].signalSemaphores = signals;
+
+			m_api->startCapture();
+			queue->submit(submit_infos);
+			m_api->endCapture();
+		}
+
+		const ISemaphore::SWaitInfo wait_infos[] = { {
+				.semaphore = progress.get(),
+				.value = timeline
+			} };
+		m_device->blockForSemaphores(wait_infos);
+
+		const ILogicalDevice::MappedMemoryRange memory_range[] = {
+			ILogicalDevice::MappedMemoryRange(allocation[0].memory.get(), 0ull, allocation[0].memory->getAllocationSize()),
+			ILogicalDevice::MappedMemoryRange(allocation[1].memory.get(), 0ull, allocation[1].memory->getAllocationSize()),
+			ILogicalDevice::MappedMemoryRange(allocation[2].memory.get(), 0ull, allocation[2].memory->getAllocationSize()),
+			ILogicalDevice::MappedMemoryRange(allocation[3].memory.get(), 0ull, allocation[3].memory->getAllocationSize())
+		};
+
+		if (!allocation[0].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+			m_device->invalidateMappedMemoryRanges(1, &memory_range[0]);
+		if (!allocation[1].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+			m_device->invalidateMappedMemoryRanges(1, &memory_range[1]);
+		if (!allocation[2].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+			m_device->invalidateMappedMemoryRanges(1, &memory_range[2]);
+		if (!allocation[3].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+			m_device->invalidateMappedMemoryRanges(1, &memory_range[3]);
+
+		const uint32_t* sortedKeys = reinterpret_cast<const uint32_t*>(allocation[2].memory->getMappedPointer());
+		const uint32_t* sortedValues = reinterpret_cast<const uint32_t*>(allocation[3].memory->getMappedPointer());
+
+		assert(allocation[2].offset == 0);
+		assert(allocation[3].offset == 0);
+
+		outBuffer.clear();
+
+		outBuffer.append("ALL SORTED ELEMENTS: ");
+		for (auto i = 0; i < element_count; i++) {
+			outBuffer.append("{");
+			outBuffer.append(std::to_string(sortedKeys[i]));
+			outBuffer.append(", ");
+			outBuffer.append(std::to_string(sortedValues[i]));
+			outBuffer.append("} ");
+
+			if ((i + 1) % 20 == 0) {
+				outBuffer.append("\n");
+			}
+		}
+		outBuffer.append("\n");
+		outBuffer.append("Count: ");
+		outBuffer.append(std::to_string(element_count));
+		outBuffer.append("\n");
+		m_logger->log("Your sorted array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE);
+
+		bool is_sorted = true;
+		for (uint32_t i = 1; i < element_count; i++) {
+			if (sortedKeys[i] < sortedKeys[i - 1]) {
+				is_sorted = false;
+				break;
+			}
+		}
+		m_logger->log(is_sorted ? "Array is correctly sorted!" : "Array is NOT sorted correctly!",
+			is_sorted ? ILogger::ELL_PERFORMANCE : ILogger::ELL_ERROR);
+
+		allocation[0].memory->unmap();
+		allocation[1].memory->unmap();
+		allocation[2].memory->unmap();
+		allocation[3].memory->unmap();
+
+		m_device->waitIdle();
+
+		for (int i = 0; i < 2; ++i) {
+			delete[] bufferData[i];
+		}
+		delete[] bufferData;
+
+		return true;
+	}
+
+	bool keepRunning() override { return false; }
+	void workLoopBody() override {}
+	bool onAppTerminated() override { return true; }
+};
+
+NBL_MAIN_FUNC(BitonicSortApp)
\ No newline at end of file
diff --git a/13_BitonicSort/pipeline.groovy b/13_BitonicSort/pipeline.groovy
new file mode 100644
index 000000000..0af4402e6
--- /dev/null
+++ b/13_BitonicSort/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CCountingSortBuilder extends IBuilder
+{
+	public CCountingSortBuilder(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+	
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+  	public boolean build(Map axisMapping)
+	{
+		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
+		
+		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
+		def nameOfConfig = getNameOfConfig(config)
+		
+		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
+		
+		return true
+	}
+	
+	@Override
+  	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info)
+{
+	return new CStreamingAndBufferDeviceAddressBuilder(_agent, _info)
+}
+
+return this
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5e02eadc1..fc68aef51 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,9 +42,10 @@ if(NBL_BUILD_EXAMPLES)
 	# showcase use of FFT for post-FX Bloom  effect
 	add_subdirectory(11_FFT)
 	#
-	add_subdirectory(12_MeshLoaders)
-	#
-	#add_subdirectory(13_MaterialCompiler EXCLUDE_FROM_ALL)
+	add_subdirectory(12_MeshLoaders EXCLUDE_FROM_ALL)
+
+	# bitonic 
+	add_subdirectory(13_BitonicSort)
 
 	# Waiting for a refactor
 	#add_subdirectory(27_PLYSTLDemo)
@@ -96,19 +97,9 @@ if(NBL_BUILD_EXAMPLES)
 
 	# we link common example api library and force examples to reuse its PCH
 	foreach(T IN LISTS TARGETS)
-		get_target_property(TYPE ${T} TYPE)
-		if(NOT ${TYPE} MATCHES INTERFACE)
-			target_link_libraries(${T} PUBLIC ${NBL_EXAMPLES_API_TARGET})
-			target_include_directories(${T} PUBLIC $<TARGET_PROPERTY:${NBL_EXAMPLES_API_TARGET},INCLUDE_DIRECTORIES>)
-			set_target_properties(${T} PROPERTIES DISABLE_PRECOMPILE_HEADERS OFF)
-			target_precompile_headers(${T} REUSE_FROM "${NBL_EXAMPLES_API_TARGET}")
-
-			if(NBL_EMBED_BUILTIN_RESOURCES)
-				LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsSource)
-				LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsInclude)
-				LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsBuild)
-			endif()
-		endif()
+        target_link_libraries(${T} PUBLIC ${NBL_EXAMPLES_API_TARGET})
+		target_include_directories(${T} PUBLIC $<TARGET_PROPERTY:${NBL_EXAMPLES_API_TARGET},INCLUDE_DIRECTORIES>)
+		target_precompile_headers(${T} REUSE_FROM "${NBL_EXAMPLES_API_TARGET}")
     endforeach()
 
 	NBL_ADJUST_FOLDERS(examples)

From 547e518969423632fb562f13d913437cbd3dd9f6 Mon Sep 17 00:00:00 2001
From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com>
Date: Wed, 22 Oct 2025 17:25:38 +0330
Subject: [PATCH 2/8] Update bitonic_sort_shader.comp.hlsl

---
 .../bitonic_sort_shader.comp.hlsl             | 179 ++++++++----------
 1 file changed, 77 insertions(+), 102 deletions(-)

diff --git a/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl b/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl
index 058b14eea..f8f11bcaa 100644
--- a/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl
+++ b/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl
@@ -1,112 +1,87 @@
-#include "nbl/builtin/hlsl/bda/bda_accessor.hlsl"
+#include "common.hlsl"
+#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
+#include "nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl"
 
-struct BitonicPushData
+[[vk::push_constant]] PushConstantData pushConstants;
+
+using namespace nbl::hlsl;
+
+using BitonicSortConfig = workgroup::bitonic_sort::bitonic_sort_config<ElementsPerThreadLog2, WorkgroupSizeLog2, uint32_t, uint32_t, less<uint32_t> >;
+
+NBL_CONSTEXPR uint32_t WorkgroupSize = BitonicSortConfig::WorkgroupSize;
+
+groupshared uint32_t sharedmem[2 * WorkgroupSize];
+
+uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(BitonicSortConfig::WorkgroupSize), 1, 1); }
+
+struct SharedMemoryAccessor
 {
-    uint64_t inputKeyAddress;
-    uint64_t inputValueAddress;
-    uint64_t outputKeyAddress;
-    uint64_t outputValueAddress;
-    uint32_t dataElementCount;
+	template <typename AccessType, typename IndexType>
+	void set(IndexType idx, AccessType value)
+	{
+		sharedmem[idx] = value;
+	}
+
+	template <typename AccessType, typename IndexType>
+	void get(IndexType idx, NBL_REF_ARG(AccessType) value)
+	{
+		value = sharedmem[idx];
+	}
+
+	void workgroupExecutionAndMemoryBarrier()
+	{
+		glsl::barrier();
+	}
+
 };
 
-using namespace nbl::hlsl;
+struct Accessor
+{
+	static Accessor create(const uint64_t address)
+	{
+		Accessor accessor;
+		accessor.address = address;
+		return accessor;
+	}
 
-[[vk::push_constant]] BitonicPushData pushData;
+	template <typename AccessType, typename IndexType>
+	void get(const IndexType index, NBL_REF_ARG(AccessType) value)
+	{
+		value = vk::RawBufferLoad<AccessType>(address + index * sizeof(AccessType));
+	}
 
-using DataPtr = bda::__ptr<uint32_t>;
-using DataAccessor = BdaAccessor<uint32_t>;
+	template <typename AccessType, typename IndexType>
+	void set(const IndexType index, const AccessType value)
+	{
+		vk::RawBufferStore<AccessType>(address + index * sizeof(AccessType), value);
+	}
 
-groupshared uint32_t sharedKeys[ElementCount];
-groupshared uint32_t sharedValues[ElementCount];
+	uint64_t address;
+};
 
-[numthreads(WorkgroupSize, 1, 1)]
+[numthreads(BitonicSortConfig::WorkgroupSize, 1, 1)]
 [shader("compute")]
-void main(uint32_t3 dispatchId : SV_DispatchThreadID, uint32_t3 localId : SV_GroupThreadID)
+void main()
 {
-    const uint32_t threadId = localId.x;
-    const uint32_t dataSize = pushData.dataElementCount;
-    
-    DataAccessor inputKeys = DataAccessor::create(DataPtr::create(pushData.inputKeyAddress));
-    DataAccessor inputValues = DataAccessor::create(DataPtr::create(pushData.inputValueAddress));
-    
-    for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize)
-    {
-        inputKeys.get(i, sharedKeys[i]);
-        inputValues.get(i, sharedValues[i]);
-    }
-    
-    // Synchronize all threads after loading
-    GroupMemoryBarrierWithGroupSync();
-    
-
-    for (uint32_t stage = 0; stage < Log2ElementCount; stage++)
-    {
-        for (uint32_t pass = 0; pass <= stage; pass++)
-        {
-            const uint32_t compareDistance = 1 << (stage - pass);
-            
-            for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize)
-            {
-                const uint32_t partnerId = i ^ compareDistance;
-                
-                if (partnerId >= dataSize)
-                    continue;
-               
-                const uint32_t waveSize = WaveGetLaneCount();
-                const uint32_t myWaveId = i / waveSize;
-                const uint32_t partnerWaveId = partnerId / waveSize;
-                const bool sameWave = (myWaveId == partnerWaveId);
-
-                uint32_t myKey, myValue, partnerKey, partnerValue;
-                [branch]
-                if (sameWave && compareDistance < waveSize)
-                {
-                    // WAVE INTRINSIC
-                    myKey = sharedKeys[i];
-                    myValue = sharedValues[i];
-
-                    const uint32_t partnerLane = partnerId % waveSize;
-                    partnerKey = WaveReadLaneAt(myKey, partnerLane);
-                    partnerValue = WaveReadLaneAt(myValue, partnerLane);
-                }
-                else
-                {
-                    // SHARED MEM
-                    myKey = sharedKeys[i];
-                    myValue = sharedValues[i];
-                    partnerKey = sharedKeys[partnerId];
-                    partnerValue = sharedValues[partnerId];
-                }
-
-                const uint32_t sequenceSize = 1 << (stage + 1);
-                const uint32_t sequenceIndex = i / sequenceSize;
-                const bool sequenceAscending = (sequenceIndex % 2) == 0;
-                const bool ascending = true;
-                const bool finalDirection = sequenceAscending == ascending;
-                
-                const bool swap = (myKey > partnerKey) == finalDirection;
-                
-                // WORKGROUP COORDINATION: Only lower-indexed element writes both
-                if (i < partnerId && swap)
-                {
-                    sharedKeys[i] = partnerKey;
-                    sharedKeys[partnerId] = myKey;
-                    sharedValues[i] = partnerValue;
-                    sharedValues[partnerId] = myValue;
-                }
-            }
-            
-            GroupMemoryBarrierWithGroupSync();
-        }
-    }
-    
-
-    DataAccessor outputKeys = DataAccessor::create(DataPtr::create(pushData.outputKeyAddress));
-    DataAccessor outputValues = DataAccessor::create(DataPtr::create(pushData.outputValueAddress));
-    
-    for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize)
-    {
-        outputKeys.set(i, sharedKeys[i]);
-        outputValues.set(i, sharedValues[i]);
-    }
-}
\ No newline at end of file
+	Accessor accessor = Accessor::create(pushConstants.deviceBufferAddress);
+	SharedMemoryAccessor sharedmemAccessor;
+
+	const uint32_t threadID = glsl::gl_LocalInvocationID().x;
+
+	// Each thread handles 2 ADJACENT elements: lo and hi
+	// Following bitonic sort pattern: thread i handles elements [2*i] and [2*i + 1]
+	const uint32_t loIdx = threadID * 2;
+	const uint32_t hiIdx = threadID * 2 + 1;
+
+	uint32_t loKey, hiKey;
+	accessor.get(loIdx, loKey);
+	accessor.get(hiIdx, hiKey);
+
+	uint32_t loVal = loIdx;
+	uint32_t hiVal = hiIdx;
+
+	workgroup::BitonicSort<BitonicSortConfig>::template __call<Accessor, SharedMemoryAccessor>(accessor, sharedmemAccessor, loKey, hiKey, loVal, hiVal);
+
+	accessor.set(loIdx, loKey);
+	accessor.set(hiIdx, hiKey);
+}

From 6544e047a4b9386d26cf171218aa3c30e76b44de Mon Sep 17 00:00:00 2001
From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com>
Date: Wed, 22 Oct 2025 17:26:25 +0330
Subject: [PATCH 3/8] Update common.hlsl

---
 13_BitonicSort/app_resources/common.hlsl | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/13_BitonicSort/app_resources/common.hlsl b/13_BitonicSort/app_resources/common.hlsl
index 5f15d0af1..e6ea02580 100644
--- a/13_BitonicSort/app_resources/common.hlsl
+++ b/13_BitonicSort/app_resources/common.hlsl
@@ -1,17 +1,13 @@
-// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
 #ifndef _BITONIC_SORT_COMMON_INCLUDED_
 #define _BITONIC_SORT_COMMON_INCLUDED_
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
 
-struct BitonicPushData
+struct PushConstantData
 {
-
-	uint64_t inputKeyAddress;
-	uint64_t inputValueAddress;
-	uint64_t outputKeyAddress;
-	uint64_t outputValueAddress;
-	uint32_t dataElementCount;
+	uint64_t deviceBufferAddress;
 };
 
-#endif
\ No newline at end of file
+NBL_CONSTEXPR uint32_t WorkgroupSizeLog2 = 7;  // 128 threads
+NBL_CONSTEXPR uint32_t ElementsPerThreadLog2 = 1;  // 2 elements per thread
+NBL_CONSTEXPR uint32_t elementCount = uint32_t(1) << (WorkgroupSizeLog2 + ElementsPerThreadLog2);
+#endif

From 8de0c0f71cea71ab5492417ef4a73d31e77bf063 Mon Sep 17 00:00:00 2001
From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com>
Date: Wed, 22 Oct 2025 17:27:10 +0330
Subject: [PATCH 4/8] Update main.cpp

---
 13_BitonicSort/main.cpp | 300 +++++++++++++++++-----------------------
 1 file changed, 130 insertions(+), 170 deletions(-)

diff --git a/13_BitonicSort/main.cpp b/13_BitonicSort/main.cpp
index 7e44c5b11..88908ea15 100644
--- a/13_BitonicSort/main.cpp
+++ b/13_BitonicSort/main.cpp
@@ -1,6 +1,4 @@
 #include "nbl/examples/examples.hpp"
-#include <cmath>
-#include <chrono>
 
 using namespace nbl;
 using namespace nbl::core;
@@ -14,12 +12,15 @@ using namespace nbl::examples;
 #include "app_resources/common.hlsl"
 #include "nbl/builtin/hlsl/bit.hlsl"
 
+
+// Simple showcase of how to run Bitonic Sort on a 1D array using workgroup operations
 class BitonicSortApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication
 {
 	using device_base_t = application_templates::MonoDeviceApplication;
 	using asset_base_t = BuiltinResourcesApplication;
 
 public:
+
 	BitonicSortApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
 		system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
 
@@ -30,60 +31,41 @@ class BitonicSortApp final : public application_templates::MonoDeviceApplication
 		if (!asset_base_t::onAppInitialized(std::move(system)))
 			return false;
 
-		auto limits = m_physicalDevice->getLimits();
-		const uint32_t max_shared_memory_size = limits.maxComputeSharedMemorySize;
-		const uint32_t max_workgroup_size = limits.maxComputeWorkGroupInvocations; // Get actual GPU limit
-		const uint32_t bytes_per_elements = sizeof(uint32_t) * 2; // 2 uint32_t per element (key and value)
-		const uint32_t max_element_in_shared_memory = max_shared_memory_size / bytes_per_elements;
-
-		// For bitonic sort: element count MUST be power of 2
-		uint32_t element_count = core::roundDownToPoT(max_element_in_shared_memory);
-		
-		const uint32_t log2_element_count = static_cast<uint32_t>(log2(element_count));
-
-		m_logger->log("GPU Limits:", ILogger::ELL_INFO);
-		m_logger->log("  Max Workgroup Size: " + std::to_string(max_workgroup_size), ILogger::ELL_INFO);
-		m_logger->log("  Max Shared Memory: " + std::to_string(max_shared_memory_size) + " bytes", ILogger::ELL_INFO);
-		m_logger->log("  Max elements in shared memory: " + std::to_string(max_element_in_shared_memory), ILogger::ELL_INFO);
-		m_logger->log("  Using element count (power of 2): " + std::to_string(element_count), ILogger::ELL_INFO);
-
+		// Load shader
 		auto prepShader = [&](const core::string& path) -> smart_refctd_ptr<IShader>
+		{
+			IAssetLoader::SAssetLoadParams lp = {};
+			lp.logger = m_logger.get();
+			lp.workingDirectory = "";
+			auto assetBundle = m_assetMgr->getAsset(path, lp);
+			const auto assets = assetBundle.getContents();
+			if (assets.empty())
 			{
-				IAssetLoader::SAssetLoadParams lp = {};
-				lp.logger = m_logger.get();
-				lp.workingDirectory = "";
-				auto assetBundle = m_assetMgr->getAsset(path, lp);
-				const auto assets = assetBundle.getContents();
-				if (assets.empty())
-				{
-					logFail("Could not load shader!");
-					return nullptr;
-				}
-
-				auto source = IAsset::castDown<IShader>(assets[0]);
-				assert(source);
+				logFail("Could not load shader!");
+				return nullptr;
+			}
 
-				auto overrideSource = CHLSLCompiler::createOverridenCopy(
-					source.get(), "#define ElementCount %d\n#define Log2ElementCount %d\n#define WorkgroupSize %d\n",
-					element_count, log2_element_count, max_workgroup_size
-				);
+			auto source = IAsset::castDown<IShader>(assets[0]);
+			assert(source);
 
-				auto shader = m_device->compileShader({ overrideSource.get() });
-				if (!shader)
-				{
-					logFail("Creation of Bitonic Sort Shader from CPU Shader source failed!");
-					return nullptr;
-				}
-				return shader;
-			};
+			auto shader = m_device->compileShader({ source.get() });
+			if (!shader)
+			{
+				logFail("Creation of Bitonic Sort Shader failed!");
+				return nullptr;
+			}
+			return shader;
+		};
 
 		auto bitonicSortShader = prepShader("app_resources/bitonic_sort_shader.comp.hlsl");
-
 		if (!bitonicSortShader)
 			return logFail("Failed to compile bitonic sort shader!");
 
-
-		const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(BitonicPushData) };
+		const nbl::asset::SPushConstantRange pcRange = {
+			.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
+			.offset = 0,
+			.size = sizeof(PushConstantData)
+		};
 
 		smart_refctd_ptr<IGPUPipelineLayout> layout;
 		smart_refctd_ptr<IGPUComputePipeline> bitonicSortPipeline;
@@ -93,107 +75,95 @@ class BitonicSortApp final : public application_templates::MonoDeviceApplication
 			params.layout = layout.get();
 			params.shader.shader = bitonicSortShader.get();
 			params.shader.entryPoint = "main";
-			params.shader.entries = nullptr;
+			params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SUBGROUP_SIZE>(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize));
+			params.cached.requireFullSubgroups = true;
 			if (!m_device->createComputePipelines(nullptr, { &params,1 }, &bitonicSortPipeline))
 				return logFail("Failed to create compute pipeline!\n");
 		}
 
-		nbl::video::IDeviceMemoryAllocator::SAllocation allocation[4] = {};
-		smart_refctd_ptr<IGPUBuffer> buffers[4];
+		// Allocate buffers - use HOST_VISIBLE memory so CPU can directly read/write
+		nbl::video::IDeviceMemoryAllocator::SAllocation allocation = {};
+		smart_refctd_ptr<IGPUBuffer> buffer;
 
 		auto build_buffer = [this](
-			smart_refctd_ptr<ILogicalDevice> m_device,
 			nbl::video::IDeviceMemoryAllocator::SAllocation* allocation,
 			smart_refctd_ptr<IGPUBuffer>& buffer,
 			size_t buffer_size,
 			const char* label
-			) -> void {
-				IGPUBuffer::SCreationParams params;
-				params.size = buffer_size;
-				params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-				buffer = m_device->createBuffer(std::move(params));
-				if (!buffer)
-					logFail("Failed to create GPU buffer of size %d!\n", buffer_size);
+		) -> void {
+			IGPUBuffer::SCreationParams params;
+			params.size = buffer_size;
+			params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+			buffer = m_device->createBuffer(std::move(params));
+			if (!buffer)
+				logFail("Failed to create GPU buffer of size %d!\n", buffer_size);
 
-				buffer->setObjectDebugName(label);
+			buffer->setObjectDebugName(label);
 
-				auto reqs = buffer->getMemoryReqs();
-				reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits();
+			auto reqs = buffer->getMemoryReqs();
+			reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits();
 
-				*allocation = m_device->allocate(reqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-				if (!allocation->isValid())
-					logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n");
+			*allocation = m_device->allocate(reqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+			if (!allocation->isValid())
+				logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n");
 
-				assert(allocation->memory.get() == buffer->getBoundMemory().memory);
-			};
+			assert(allocation->memory.get() == buffer->getBoundMemory().memory);
+		};
 
-		build_buffer(m_device, allocation, buffers[0], sizeof(uint32_t) * element_count, "Input Key Buffer");
-		build_buffer(m_device, allocation + 1, buffers[1], sizeof(uint32_t) * element_count, "Input Value Buffer");
-		build_buffer(m_device, allocation + 2, buffers[2], sizeof(uint32_t) * element_count, "Output Key Buffer");
-		build_buffer(m_device, allocation + 3, buffers[3], sizeof(uint32_t) * element_count, "Output Value Buffer");
+		build_buffer(&allocation, buffer, sizeof(uint32_t) * elementCount, "Bitonic Sort Buffer");
 
-		uint64_t buffer_device_address[] = {
-			buffers[0]->getDeviceAddress(),
-			buffers[1]->getDeviceAddress(),
-			buffers[2]->getDeviceAddress(),
-			buffers[3]->getDeviceAddress()
-		};
+		uint64_t buffer_device_address = buffer->getDeviceAddress();
 
-	
-		void* mapped_memory[] = {
-			allocation[0].memory->map({0ull,allocation[0].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
-			allocation[1].memory->map({0ull,allocation[1].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
-			allocation[2].memory->map({0ull,allocation[2].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
-			allocation[3].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ)
-		};
-		if (!mapped_memory[0] || !mapped_memory[1] || !mapped_memory[2] || !mapped_memory[3])
+		// Map buffer memory to CPU
+		void* mapped_memory = allocation.memory->map(
+			{0ull, allocation.memory->getAllocationSize()},
+			IDeviceMemoryAllocation::EMCAF_READ
+		);
+		if (!mapped_memory)
 			return logFail("Failed to map the Device Memory!\n");
 
-		// Generate random data
+		// Generate random input data
 		unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
 		std::mt19937 g(seed);
 
-		auto bufferData = new uint32_t * [2];
-		for (int i = 0; i < 2; ++i) {
-			bufferData[i] = new uint32_t[element_count];
-		}
-		for (uint32_t i = 0; i < element_count; i++) {
-			bufferData[0][i] = g() % 10000; 
-		}
-
-		memcpy(mapped_memory[0], bufferData[0], sizeof(uint32_t) * element_count);
-
-		for (uint32_t i = 0; i < element_count; i++) {
-			bufferData[1][i] = i; // Values are indices for verification
+		uint32_t* bufferData = new uint32_t[elementCount];
+		for (uint32_t i = 0; i < elementCount; i++) {
+			bufferData[i] = g() % 10000;
 		}
 
-		memcpy(mapped_memory[1], bufferData[1], sizeof(uint32_t) * element_count);
+		// Copy to GPU buffer
+		memcpy(mapped_memory, bufferData, sizeof(uint32_t) * elementCount);
 
+		// Print input
 		std::string outBuffer;
-
-		outBuffer.append("ALL ELEMENTS: ");
-		for (auto i = 0; i < element_count; i++) {
-			outBuffer.append("{");
-			outBuffer.append(std::to_string(bufferData[0][i]));
+		outBuffer.append("Input array: ");
+		for (auto i = 0; i < elementCount; i++) {
+			outBuffer.append(std::to_string(bufferData[i]));
 			outBuffer.append(", ");
-			outBuffer.append(std::to_string(bufferData[1][i]));
-			outBuffer.append("} ");
-
-			// Add newline every 20 elements for readability
-			if ((i + 1) % 20 == 0) {
+			if ((i + 1) % 40 == 0) {
 				outBuffer.append("\n");
 			}
 		}
-		outBuffer.append("\n");
-		outBuffer.append("Count: ");
-		outBuffer.append(std::to_string(element_count));
-		outBuffer.append("\n");
-		m_logger->log("Your input array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE);
+		outBuffer.append("\nElement count: " + std::to_string(elementCount) + "\n");
+		m_logger->log(outBuffer, ILogger::ELL_PERFORMANCE);
+
+		// Memory range for flush/invalidate
+		const ILogicalDevice::MappedMemoryRange memory_range(
+			allocation.memory.get(),
+			0ull,
+			allocation.memory->getAllocationSize()
+		);
 
+		if (!allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+			m_device->flushMappedMemoryRanges(1, &memory_range);
 
+		// Create command buffer
 		smart_refctd_ptr<nbl::video::IGPUCommandBuffer> cmdBuf;
 		{
-			smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(getComputeQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+			smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(
+				getComputeQueue()->getFamilyIndex(),
+				IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT
+			);
 			if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdBuf))
 				return logFail("Failed to create Command Buffers!\n");
 		}
@@ -202,22 +172,23 @@ class BitonicSortApp final : public application_templates::MonoDeviceApplication
 		uint64_t timeline = started_value;
 		smart_refctd_ptr<ISemaphore> progress = m_device->createSemaphore(started_value);
 
-		auto pc = BitonicPushData{
-			.inputKeyAddress = buffer_device_address[0],
-			.inputValueAddress = buffer_device_address[1],
-			.outputKeyAddress = buffer_device_address[2],
-			.outputValueAddress = buffer_device_address[3],
-			.dataElementCount = element_count
+		// Setup push constants
+		auto pc = PushConstantData{
+			.deviceBufferAddress = buffer_device_address
 		};
 
+		// Record commands
 		cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-		cmdBuf->beginDebugMarker("Bitonic Sort Single Dispatch", core::vectorSIMDf(0, 1, 0, 1));
+		cmdBuf->beginDebugMarker("Workgroup Bitonic Sort", core::vectorSIMDf(0, 1, 0, 1));
 		cmdBuf->bindComputePipeline(bitonicSortPipeline.get());
 		cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
+
+		// Dispatch 1 workgroup (our sort works on a single workgroup)
 		cmdBuf->dispatch(1, 1, 1);
 		cmdBuf->endDebugMarker();
 		cmdBuf->end();
 
+		// Submit to queue
 		{
 			auto queue = getComputeQueue();
 
@@ -242,82 +213,71 @@ class BitonicSortApp final : public application_templates::MonoDeviceApplication
 			m_api->endCapture();
 		}
 
+		// Wait for GPU to finish
 		const ISemaphore::SWaitInfo wait_infos[] = { {
 				.semaphore = progress.get(),
 				.value = timeline
 			} };
 		m_device->blockForSemaphores(wait_infos);
 
-		const ILogicalDevice::MappedMemoryRange memory_range[] = {
-			ILogicalDevice::MappedMemoryRange(allocation[0].memory.get(), 0ull, allocation[0].memory->getAllocationSize()),
-			ILogicalDevice::MappedMemoryRange(allocation[1].memory.get(), 0ull, allocation[1].memory->getAllocationSize()),
-			ILogicalDevice::MappedMemoryRange(allocation[2].memory.get(), 0ull, allocation[2].memory->getAllocationSize()),
-			ILogicalDevice::MappedMemoryRange(allocation[3].memory.get(), 0ull, allocation[3].memory->getAllocationSize())
-		};
-
-		if (!allocation[0].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
-			m_device->invalidateMappedMemoryRanges(1, &memory_range[0]);
-		if (!allocation[1].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
-			m_device->invalidateMappedMemoryRanges(1, &memory_range[1]);
-		if (!allocation[2].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
-			m_device->invalidateMappedMemoryRanges(1, &memory_range[2]);
-		if (!allocation[3].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
-			m_device->invalidateMappedMemoryRanges(1, &memory_range[3]);
+		if (!allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+			m_device->invalidateMappedMemoryRanges(1, &memory_range);
 
-		const uint32_t* sortedKeys = reinterpret_cast<const uint32_t*>(allocation[2].memory->getMappedPointer());
-		const uint32_t* sortedValues = reinterpret_cast<const uint32_t*>(allocation[3].memory->getMappedPointer());
-
-		assert(allocation[2].offset == 0);
-		assert(allocation[3].offset == 0);
+		// Read sorted results
+		const uint32_t* sortedData = reinterpret_cast<const uint32_t*>(allocation.memory->getMappedPointer());
+		assert(allocation.offset == 0);
 
+		// Print sorted output
 		outBuffer.clear();
-
-		outBuffer.append("ALL SORTED ELEMENTS: ");
-		for (auto i = 0; i < element_count; i++) {
-			outBuffer.append("{");
-			outBuffer.append(std::to_string(sortedKeys[i]));
+		outBuffer.append("Sorted array: ");
+		for (auto i = 0; i < elementCount; i++) {
+			outBuffer.append(std::to_string(sortedData[i]));
 			outBuffer.append(", ");
-			outBuffer.append(std::to_string(sortedValues[i]));
-			outBuffer.append("} ");
-
-			if ((i + 1) % 20 == 0) {
+			if ((i + 1) % 40 == 0) {
 				outBuffer.append("\n");
 			}
 		}
-		outBuffer.append("\n");
-		outBuffer.append("Count: ");
-		outBuffer.append(std::to_string(element_count));
-		outBuffer.append("\n");
-		m_logger->log("Your sorted array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE);
+		outBuffer.append("\nElement count: " + std::to_string(elementCount) + "\n");
+		m_logger->log(outBuffer, ILogger::ELL_PERFORMANCE);
 
+		// Verify correctness
 		bool is_sorted = true;
-		for (uint32_t i = 1; i < element_count; i++) {
-			if (sortedKeys[i] < sortedKeys[i - 1]) {
+		int32_t error_index = -1;
+		for (uint32_t i = 1; i < elementCount; i++) {
+			if (sortedData[i] < sortedData[i - 1]) {
 				is_sorted = false;
+				error_index = i;
 				break;
 			}
 		}
-		m_logger->log(is_sorted ? "Array is correctly sorted!" : "Array is NOT sorted correctly!",
-			is_sorted ? ILogger::ELL_PERFORMANCE : ILogger::ELL_ERROR);
 
-		allocation[0].memory->unmap();
-		allocation[1].memory->unmap();
-		allocation[2].memory->unmap();
-		allocation[3].memory->unmap();
-
-		m_device->waitIdle();
-
-		for (int i = 0; i < 2; ++i) {
-			delete[] bufferData[i];
+		if (is_sorted) {
+			m_logger->log("Array is correctly sorted!", ILogger::ELL_PERFORMANCE);
+		}
+		else {
+			std::string errorMsg = "Array is NOT sorted correctly!\n";
+			errorMsg += "Error at index " + std::to_string(error_index) + ":\n";
+			errorMsg += "  Previous element [" + std::to_string(error_index - 1) + "] = " + std::to_string(sortedData[error_index - 1]) + "\n";
+			errorMsg += "  Current element  [" + std::to_string(error_index) + "] = " + std::to_string(sortedData[error_index]) + "\n";
+			errorMsg += "  (" + std::to_string(sortedData[error_index]) + " < " + std::to_string(sortedData[error_index - 1]) + " is WRONG!)\n";
+			m_logger->log(errorMsg, ILogger::ELL_ERROR);
 		}
+
+		// Cleanup
+		allocation.memory->unmap();
 		delete[] bufferData;
 
 		return true;
 	}
 
 	bool keepRunning() override { return false; }
+
 	void workLoopBody() override {}
-	bool onAppTerminated() override { return true; }
+
+	bool onAppTerminated() override
+	{
+		return device_base_t::onAppTerminated();
+	}
 };
 
-NBL_MAIN_FUNC(BitonicSortApp)
\ No newline at end of file
+NBL_MAIN_FUNC(BitonicSortApp)

From 99232940248fb5a4dd3bc7d6868dac09dfc816a3 Mon Sep 17 00:00:00 2001
From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com>
Date: Sun, 2 Nov 2025 18:59:34 +0330
Subject: [PATCH 5/8] Update main.cpp

---
 13_BitonicSort/main.cpp | 368 ++++++++++++++++++++++------------------
 1 file changed, 202 insertions(+), 166 deletions(-)

diff --git a/13_BitonicSort/main.cpp b/13_BitonicSort/main.cpp
index 88908ea15..c1b1515c2 100644
--- a/13_BitonicSort/main.cpp
+++ b/13_BitonicSort/main.cpp
@@ -1,3 +1,8 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+
 #include "nbl/examples/examples.hpp"
 
 using namespace nbl;
@@ -19,6 +24,25 @@ class BitonicSortApp final : public application_templates::MonoDeviceApplication
 	using device_base_t = application_templates::MonoDeviceApplication;
 	using asset_base_t = BuiltinResourcesApplication;
 
+	smart_refctd_ptr<IGPUComputePipeline> m_pipeline;
+	smart_refctd_ptr<IGPUPipelineLayout> m_layout;
+
+	smart_refctd_ptr<nbl::video::IUtilities> m_utils;
+
+	nbl::video::StreamingTransientDataBufferMT<>* m_upStreamingBuffer;
+	StreamingTransientDataBufferMT<>* m_downStreamingBuffer;
+	smart_refctd_ptr<nbl::video::IGPUBuffer> m_deviceLocalBuffer;
+
+	// These are Buffer Device Addresses
+	uint64_t m_upStreamingBufferAddress;
+	uint64_t m_downStreamingBufferAddress;
+	uint64_t m_deviceLocalBufferAddress;
+
+	uint32_t m_alignment;
+
+	smart_refctd_ptr<ISemaphore> m_timeline;
+	uint64_t semaphoreValue = 0;
+
 public:
 
 	BitonicSortApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
@@ -61,211 +85,222 @@ class BitonicSortApp final : public application_templates::MonoDeviceApplication
 		if (!bitonicSortShader)
 			return logFail("Failed to compile bitonic sort shader!");
 
+		m_utils = video::IUtilities::create(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger));
+		if (!m_utils)
+			return logFail("Failed to create Utilities!");
+		m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer();
+		m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer();
+		m_upStreamingBufferAddress = m_upStreamingBuffer->getBuffer()->getDeviceAddress();
+		m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress();
+
+		// Create device-local buffer
+		{
+			IGPUBuffer::SCreationParams deviceLocalBufferParams = {};
+
+			IQueue* const queue = getComputeQueue();
+			uint32_t queueFamilyIndex = queue->getFamilyIndex();
+
+			deviceLocalBufferParams.queueFamilyIndexCount = 1;
+			deviceLocalBufferParams.queueFamilyIndices = &queueFamilyIndex;
+			deviceLocalBufferParams.size = sizeof(uint32_t) * elementCount * 2;  // *2 because we store (key, value) pairs
+			deviceLocalBufferParams.usage = nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT;
+
+			m_deviceLocalBuffer = m_device->createBuffer(std::move(deviceLocalBufferParams));
+			auto mreqs = m_deviceLocalBuffer->getMemoryReqs();
+			mreqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
+			auto gpubufMem = m_device->allocate(mreqs, m_deviceLocalBuffer.get(), IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT);
+
+			m_deviceLocalBufferAddress = m_deviceLocalBuffer.get()->getDeviceAddress();
+		}
+
 		const nbl::asset::SPushConstantRange pcRange = {
 			.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
 			.offset = 0,
 			.size = sizeof(PushConstantData)
 		};
 
-		smart_refctd_ptr<IGPUPipelineLayout> layout;
-		smart_refctd_ptr<IGPUComputePipeline> bitonicSortPipeline;
 		{
-			layout = m_device->createPipelineLayout({ &pcRange,1 });
+			m_layout = m_device->createPipelineLayout({ &pcRange,1 });
 			IGPUComputePipeline::SCreationParams params = {};
-			params.layout = layout.get();
+			params.layout = m_layout.get();
 			params.shader.shader = bitonicSortShader.get();
 			params.shader.entryPoint = "main";
 			params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SUBGROUP_SIZE>(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize));
 			params.cached.requireFullSubgroups = true;
-			if (!m_device->createComputePipelines(nullptr, { &params,1 }, &bitonicSortPipeline))
+			if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline))
 				return logFail("Failed to create compute pipeline!\n");
 		}
 
-		// Allocate buffers - use HOST_VISIBLE memory so CPU can directly read/write
-		nbl::video::IDeviceMemoryAllocator::SAllocation allocation = {};
-		smart_refctd_ptr<IGPUBuffer> buffer;
-
-		auto build_buffer = [this](
-			nbl::video::IDeviceMemoryAllocator::SAllocation* allocation,
-			smart_refctd_ptr<IGPUBuffer>& buffer,
-			size_t buffer_size,
-			const char* label
-		) -> void {
-			IGPUBuffer::SCreationParams params;
-			params.size = buffer_size;
-			params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-			buffer = m_device->createBuffer(std::move(params));
-			if (!buffer)
-				logFail("Failed to create GPU buffer of size %d!\n", buffer_size);
-
-			buffer->setObjectDebugName(label);
-
-			auto reqs = buffer->getMemoryReqs();
-			reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits();
-
-			*allocation = m_device->allocate(reqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-			if (!allocation->isValid())
-				logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n");
-
-			assert(allocation->memory.get() == buffer->getBoundMemory().memory);
-		};
+		const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits();
+		m_alignment = core::max(deviceLimits.nonCoherentAtomSize, alignof(uint32_t));
 
-		build_buffer(&allocation, buffer, sizeof(uint32_t) * elementCount, "Bitonic Sort Buffer");
+		m_timeline = m_device->createSemaphore(semaphoreValue);
 
-		uint64_t buffer_device_address = buffer->getDeviceAddress();
+		IQueue* const queue = getComputeQueue();
 
-		// Map buffer memory to CPU
-		void* mapped_memory = allocation.memory->map(
-			{0ull, allocation.memory->getAllocationSize()},
-			IDeviceMemoryAllocation::EMCAF_READ
-		);
-		if (!mapped_memory)
-			return logFail("Failed to map the Device Memory!\n");
+		const uint32_t inputSize = sizeof(uint32_t) * elementCount * 2;  // *2 because we store (key, value) pairs
 
-		// Generate random input data
-		unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
-		std::mt19937 g(seed);
+		const uint32_t AllocationCount = 1;
 
-		uint32_t* bufferData = new uint32_t[elementCount];
-		for (uint32_t i = 0; i < elementCount; i++) {
-			bufferData[i] = g() % 10000;
-		}
+		auto inputOffset = m_upStreamingBuffer->invalid_value;
+
+		std::chrono::steady_clock::time_point waitTill(std::chrono::years(45));
+		m_upStreamingBuffer->multi_allocate(waitTill, AllocationCount, &inputOffset, &inputSize, &m_alignment);
 
-		// Copy to GPU buffer
-		memcpy(mapped_memory, bufferData, sizeof(uint32_t) * elementCount);
-
-		// Print input
-		std::string outBuffer;
-		outBuffer.append("Input array: ");
-		for (auto i = 0; i < elementCount; i++) {
-			outBuffer.append(std::to_string(bufferData[i]));
-			outBuffer.append(", ");
-			if ((i + 1) % 40 == 0) {
-				outBuffer.append("\n");
+		{
+			auto* const inputPtr = reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(m_upStreamingBuffer->getBufferPointer()) + inputOffset);
+
+			// Generate random input data
+			unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
+			std::mt19937 g(seed);
+
+			std::cout << "Input array: ";
+			for (uint32_t i = 0; i < elementCount; i++) {
+				uint32_t key = g() % 10000;
+				uint32_t value = i; // Use index as value for stable sorting
+				inputPtr[i * 2] = key;
+				inputPtr[i * 2 + 1] = value;
+				std::cout << "(" << key << "," << value << "), ";
+				if ((i + 1) % 20 == 0) {
+					std::cout << "\n";
+				}
+			}
+			std::cout << "\nElement count: " << elementCount << "\n";
+
+			// Always remember to flush!
+			if (m_upStreamingBuffer->needsManualFlushOrInvalidate())
+			{
+				const auto bound = m_upStreamingBuffer->getBuffer()->getBoundMemory();
+				const ILogicalDevice::MappedMemoryRange range(bound.memory, bound.offset + inputOffset, inputSize);
+				m_device->flushMappedMemoryRanges(1, &range);
 			}
 		}
-		outBuffer.append("\nElement count: " + std::to_string(elementCount) + "\n");
-		m_logger->log(outBuffer, ILogger::ELL_PERFORMANCE);
-
-		// Memory range for flush/invalidate
-		const ILogicalDevice::MappedMemoryRange memory_range(
-			allocation.memory.get(),
-			0ull,
-			allocation.memory->getAllocationSize()
-		);
 
-		if (!allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
-			m_device->flushMappedMemoryRanges(1, &memory_range);
+		const uint32_t outputSize = inputSize;
+
+		auto outputOffset = m_downStreamingBuffer->invalid_value;
+		m_downStreamingBuffer->multi_allocate(waitTill, AllocationCount, &outputOffset, &outputSize, &m_alignment);
 
-		// Create command buffer
-		smart_refctd_ptr<nbl::video::IGPUCommandBuffer> cmdBuf;
+		smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
 		{
-			smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(
-				getComputeQueue()->getFamilyIndex(),
-				IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT
-			);
-			if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdBuf))
+			smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+			if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf)) {
 				return logFail("Failed to create Command Buffers!\n");
+			}
+			cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf,1 }, core::smart_refctd_ptr(m_logger));
+			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			cmdbuf->bindComputePipeline(m_pipeline.get());
+
+			const PushConstantData pc = {.deviceBufferAddress = m_deviceLocalBufferAddress};
+
+			IGPUCommandBuffer::SBufferCopy copyInfo = {};
+			copyInfo.srcOffset = inputOffset;
+			copyInfo.dstOffset = 0;
+			copyInfo.size = m_deviceLocalBuffer->getSize();
+			cmdbuf->copyBuffer(m_upStreamingBuffer->getBuffer(), m_deviceLocalBuffer.get(), 1, &copyInfo);
+
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo pipelineBarrierInfo1 = {};
+			decltype(pipelineBarrierInfo1)::buffer_barrier_t barrier1 = {};
+			pipelineBarrierInfo1.bufBarriers = { &barrier1, 1u };
+			barrier1.range.buffer = m_deviceLocalBuffer;
+			barrier1.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT;
+			barrier1.barrier.dep.srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS;
+			barrier1.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			barrier1.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS;
+			cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo1);
+
+			cmdbuf->pushConstants(m_pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
+
+			cmdbuf->dispatch(1, 1, 1);
+
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo pipelineBarrierInfo2 = {};
+			decltype(pipelineBarrierInfo2)::buffer_barrier_t barrier2 = {};
+			pipelineBarrierInfo2.bufBarriers = { &barrier2, 1u };
+			barrier2.range.buffer = m_deviceLocalBuffer;
+			barrier2.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			barrier2.barrier.dep.srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS;
+			barrier2.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT;
+			barrier2.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS;
+			cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo2);
+
+			copyInfo.srcOffset = 0;
+			copyInfo.dstOffset = outputOffset;
+			cmdbuf->copyBuffer(m_deviceLocalBuffer.get(), m_downStreamingBuffer->getBuffer(), 1, &copyInfo);
+			cmdbuf->end();
 		}
 
-		constexpr uint64_t started_value = 0;
-		uint64_t timeline = started_value;
-		smart_refctd_ptr<ISemaphore> progress = m_device->createSemaphore(started_value);
-
-		// Setup push constants
-		auto pc = PushConstantData{
-			.deviceBufferAddress = buffer_device_address
-		};
-
-		// Record commands
-		cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-		cmdBuf->beginDebugMarker("Workgroup Bitonic Sort", core::vectorSIMDf(0, 1, 0, 1));
-		cmdBuf->bindComputePipeline(bitonicSortPipeline.get());
-		cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
-
-		// Dispatch 1 workgroup (our sort works on a single workgroup)
-		cmdBuf->dispatch(1, 1, 1);
-		cmdBuf->endDebugMarker();
-		cmdBuf->end();
-
-		// Submit to queue
+		semaphoreValue++;
 		{
-			auto queue = getComputeQueue();
-
-			IQueue::SSubmitInfo submit_infos[1];
-			IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = {
-				{
-					.cmdbuf = cmdBuf.get()
-				}
+			const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo =
+			{
+				.cmdbuf = cmdbuf.get()
 			};
-			submit_infos[0].commandBuffers = cmdBufs;
-			IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {
-				{
-					.semaphore = progress.get(),
-					.value = ++timeline,
-					.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
-				}
+			const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo =
+			{
+				.semaphore = m_timeline.get(),
+				.value = semaphoreValue,
+				.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
+			};
+
+			const IQueue::SSubmitInfo submitInfo = {
+				.waitSemaphores = {},
+				.commandBuffers = {&cmdbufInfo,1},
+				.signalSemaphores = {&signalInfo,1}
 			};
-			submit_infos[0].signalSemaphores = signals;
 
 			m_api->startCapture();
-			queue->submit(submit_infos);
+			queue->submit({ &submitInfo,1 });
 			m_api->endCapture();
 		}
 
-		// Wait for GPU to finish
-		const ISemaphore::SWaitInfo wait_infos[] = { {
-				.semaphore = progress.get(),
-				.value = timeline
-			} };
-		m_device->blockForSemaphores(wait_infos);
-
-		if (!allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
-			m_device->invalidateMappedMemoryRanges(1, &memory_range);
-
-		// Read sorted results
-		const uint32_t* sortedData = reinterpret_cast<const uint32_t*>(allocation.memory->getMappedPointer());
-		assert(allocation.offset == 0);
-
-		// Print sorted output
-		outBuffer.clear();
-		outBuffer.append("Sorted array: ");
-		for (auto i = 0; i < elementCount; i++) {
-			outBuffer.append(std::to_string(sortedData[i]));
-			outBuffer.append(", ");
-			if ((i + 1) % 40 == 0) {
-				outBuffer.append("\n");
-			}
-		}
-		outBuffer.append("\nElement count: " + std::to_string(elementCount) + "\n");
-		m_logger->log(outBuffer, ILogger::ELL_PERFORMANCE);
-
-		// Verify correctness
-		bool is_sorted = true;
-		int32_t error_index = -1;
-		for (uint32_t i = 1; i < elementCount; i++) {
-			if (sortedData[i] < sortedData[i - 1]) {
-				is_sorted = false;
-				error_index = i;
-				break;
-			}
-		}
+		const ISemaphore::SWaitInfo futureWait = { m_timeline.get(),semaphoreValue };
 
-		if (is_sorted) {
-			m_logger->log("Array is correctly sorted!", ILogger::ELL_PERFORMANCE);
-		}
-		else {
-			std::string errorMsg = "Array is NOT sorted correctly!\n";
-			errorMsg += "Error at index " + std::to_string(error_index) + ":\n";
-			errorMsg += "  Previous element [" + std::to_string(error_index - 1) + "] = " + std::to_string(sortedData[error_index - 1]) + "\n";
-			errorMsg += "  Current element  [" + std::to_string(error_index) + "] = " + std::to_string(sortedData[error_index]) + "\n";
-			errorMsg += "  (" + std::to_string(sortedData[error_index]) + " < " + std::to_string(sortedData[error_index - 1]) + " is WRONG!)\n";
-			m_logger->log(errorMsg, ILogger::ELL_ERROR);
-		}
+		m_upStreamingBuffer->multi_deallocate(AllocationCount, &inputOffset, &inputSize, futureWait);
 
-		// Cleanup
-		allocation.memory->unmap();
-		delete[] bufferData;
+		auto latchedConsumer = make_smart_refctd_ptr<IUtilities::CDownstreamingDataConsumer>(
+			IDeviceMemoryAllocation::MemoryRange(outputOffset, outputSize),
+			[=](const size_t dstOffset, const void* bufSrc, const size_t size)->void
+			{
+				assert(dstOffset == 0 && size == outputSize);
+
+				std::cout << "Sorted array: ";
+				const uint32_t* const data = reinterpret_cast<const uint32_t*>(bufSrc);
+				for (auto i = 0u; i < elementCount; i++) {
+					uint32_t key = data[i * 2];
+					uint32_t value = data[i * 2 + 1];
+					std::cout << "(" << key << "," << value << "), ";
+					if ((i + 1) % 20 == 0) {
+						std::cout << "\n";
+					}
+				}
+				std::cout << "\nElement count: " << elementCount << "\n";
+
+				bool is_sorted = true;
+				int32_t error_index = -1;
+				for (uint32_t i = 1; i < elementCount; i++) {
+					uint32_t prevKey = data[(i - 1) * 2];
+					uint32_t currKey = data[i * 2];
+					if (currKey < prevKey) {
+						is_sorted = false;
+						error_index = i;
+						break;
+					}
+				}
+
+				if (is_sorted) {
+					std::cout << "Array is correctly sorted!\n";
+				}
+				else {
+					std::cout << "Array is NOT sorted correctly!\n";
+					std::cout << "Error at index " << error_index << ":\n";
+					std::cout << "  Previous key [" << (error_index - 1) << "] = " << data[(error_index - 1) * 2] << "\n";
+					std::cout << "  Current key  [" << error_index << "] = " << data[error_index * 2] << "\n";
+					std::cout << "  (" << data[error_index * 2] << " < " << data[(error_index - 1) * 2] << " is WRONG!)\n";
+				}
+			},
+			std::move(cmdbuf), m_downStreamingBuffer
+		);
+		m_downStreamingBuffer->multi_deallocate(AllocationCount, &outputOffset, &outputSize, futureWait, &latchedConsumer.get());
 
 		return true;
 	}
@@ -276,6 +311,7 @@ class BitonicSortApp final : public application_templates::MonoDeviceApplication
 
 	bool onAppTerminated() override
 	{
+		while (m_downStreamingBuffer->cull_frees()) {}
 		return device_base_t::onAppTerminated();
 	}
 };

From 800802b13e95f6e2db8cd5b1d0a1f1a9a3f81d9d Mon Sep 17 00:00:00 2001
From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com>
Date: Sun, 2 Nov 2025 19:00:16 +0330
Subject: [PATCH 6/8] Update common.hlsl

---
 13_BitonicSort/app_resources/common.hlsl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/13_BitonicSort/app_resources/common.hlsl b/13_BitonicSort/app_resources/common.hlsl
index e6ea02580..9f8f4dd3b 100644
--- a/13_BitonicSort/app_resources/common.hlsl
+++ b/13_BitonicSort/app_resources/common.hlsl
@@ -7,7 +7,7 @@ struct PushConstantData
 	uint64_t deviceBufferAddress;
 };
 
-NBL_CONSTEXPR uint32_t WorkgroupSizeLog2 = 7;  // 128 threads
-NBL_CONSTEXPR uint32_t ElementsPerThreadLog2 = 1;  // 2 elements per thread
-NBL_CONSTEXPR uint32_t elementCount = uint32_t(1) << (WorkgroupSizeLog2 + ElementsPerThreadLog2);
+NBL_CONSTEXPR uint32_t WorkgroupSizeLog2 = 10;  // 1024 threads (2^10)
+NBL_CONSTEXPR uint32_t ElementsPerThreadLog2 = 2;  // 4 elements per thread (2^2) - VIRTUAL THREADING!
+NBL_CONSTEXPR uint32_t elementCount = uint32_t(1) << (WorkgroupSizeLog2 + ElementsPerThreadLog2);  // 4096 elements (2^12)
 #endif

From 446d4876f55d5a71f2e8bf411226b484c64f1e58 Mon Sep 17 00:00:00 2001
From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com>
Date: Sun, 2 Nov 2025 19:00:51 +0330
Subject: [PATCH 7/8] Update bitonic_sort_shader.comp.hlsl

---
 .../bitonic_sort_shader.comp.hlsl             | 37 +++++++------------
 1 file changed, 13 insertions(+), 24 deletions(-)

diff --git a/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl b/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl
index f8f11bcaa..d3b2dcdf6 100644
--- a/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl
+++ b/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl
@@ -10,7 +10,7 @@ using BitonicSortConfig = workgroup::bitonic_sort::bitonic_sort_config<ElementsP
 
 NBL_CONSTEXPR uint32_t WorkgroupSize = BitonicSortConfig::WorkgroupSize;
 
-groupshared uint32_t sharedmem[2 * WorkgroupSize];
+groupshared uint32_t sharedmem[BitonicSortConfig::SharedmemDWORDs];
 
 uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(BitonicSortConfig::WorkgroupSize), 1, 1); }
 
@@ -19,20 +19,21 @@ struct SharedMemoryAccessor
 	template <typename AccessType, typename IndexType>
 	void set(IndexType idx, AccessType value)
 	{
-		sharedmem[idx] = value;
+		sharedmem[idx * 2] = value.first;
+		sharedmem[idx * 2 + 1] = value.second;
 	}
 
 	template <typename AccessType, typename IndexType>
 	void get(IndexType idx, NBL_REF_ARG(AccessType) value)
 	{
-		value = sharedmem[idx];
+		value.first = sharedmem[idx * 2];
+		value.second = sharedmem[idx * 2 + 1];
 	}
 
 	void workgroupExecutionAndMemoryBarrier()
 	{
 		glsl::barrier();
 	}
-
 };
 
 struct Accessor
@@ -47,13 +48,17 @@ struct Accessor
 	template <typename AccessType, typename IndexType>
 	void get(const IndexType index, NBL_REF_ARG(AccessType) value)
 	{
-		value = vk::RawBufferLoad<AccessType>(address + index * sizeof(AccessType));
+		const uint64_t offset = address + index * sizeof(AccessType);
+		value.first = vk::RawBufferLoad<uint32_t>(offset);
+		value.second = vk::RawBufferLoad<uint32_t>(offset + sizeof(uint32_t));
 	}
 
 	template <typename AccessType, typename IndexType>
 	void set(const IndexType index, const AccessType value)
 	{
-		vk::RawBufferStore<AccessType>(address + index * sizeof(AccessType), value);
+		const uint64_t offset = address + index * sizeof(AccessType);
+		vk::RawBufferStore<uint32_t>(offset, value.first);
+		vk::RawBufferStore<uint32_t>(offset + sizeof(uint32_t), value.second);
 	}
 
 	uint64_t address;
@@ -66,22 +71,6 @@ void main()
 	Accessor accessor = Accessor::create(pushConstants.deviceBufferAddress);
 	SharedMemoryAccessor sharedmemAccessor;
 
-	const uint32_t threadID = glsl::gl_LocalInvocationID().x;
-
-	// Each thread handles 2 ADJACENT elements: lo and hi
-	// Following bitonic sort pattern: thread i handles elements [2*i] and [2*i + 1]
-	const uint32_t loIdx = threadID * 2;
-	const uint32_t hiIdx = threadID * 2 + 1;
-
-	uint32_t loKey, hiKey;
-	accessor.get(loIdx, loKey);
-	accessor.get(hiIdx, hiKey);
-
-	uint32_t loVal = loIdx;
-	uint32_t hiVal = hiIdx;
-
-	workgroup::BitonicSort<BitonicSortConfig>::template __call<Accessor, SharedMemoryAccessor>(accessor, sharedmemAccessor, loKey, hiKey, loVal, hiVal);
-
-	accessor.set(loIdx, loKey);
-	accessor.set(hiIdx, hiKey);
+	// The sort handles load/store internally
+	workgroup::BitonicSort<BitonicSortConfig>::template __call<Accessor, SharedMemoryAccessor>(accessor, sharedmemAccessor);
 }

From 2f3126fff69a588e53096cdfaef8d2a2b8676ed6 Mon Sep 17 00:00:00 2001
From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com>
Date: Wed, 5 Nov 2025 19:26:10 +0330
Subject: [PATCH 8/8] Refactor shared memory access in bitonic sort shader

---
 .../app_resources/bitonic_sort_shader.comp.hlsl    | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl b/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl
index d3b2dcdf6..3ea39262f 100644
--- a/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl
+++ b/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl
@@ -19,15 +19,13 @@ struct SharedMemoryAccessor
 	template <typename AccessType, typename IndexType>
 	void set(IndexType idx, AccessType value)
 	{
-		sharedmem[idx * 2] = value.first;
-		sharedmem[idx * 2 + 1] = value.second;
+		sharedmem[idx] = value;
 	}
 
 	template <typename AccessType, typename IndexType>
 	void get(IndexType idx, NBL_REF_ARG(AccessType) value)
 	{
-		value.first = sharedmem[idx * 2];
-		value.second = sharedmem[idx * 2 + 1];
+		value = sharedmem[idx];
 	}
 
 	void workgroupExecutionAndMemoryBarrier()
@@ -48,17 +46,13 @@ struct Accessor
 	template <typename AccessType, typename IndexType>
 	void get(const IndexType index, NBL_REF_ARG(AccessType) value)
 	{
-		const uint64_t offset = address + index * sizeof(AccessType);
-		value.first = vk::RawBufferLoad<uint32_t>(offset);
-		value.second = vk::RawBufferLoad<uint32_t>(offset + sizeof(uint32_t));
+		value = vk::RawBufferLoad<AccessType>(address + index * sizeof(AccessType));
 	}
 
 	template <typename AccessType, typename IndexType>
 	void set(const IndexType index, const AccessType value)
 	{
-		const uint64_t offset = address + index * sizeof(AccessType);
-		vk::RawBufferStore<uint32_t>(offset, value.first);
-		vk::RawBufferStore<uint32_t>(offset + sizeof(uint32_t), value.second);
+		vk::RawBufferStore<AccessType>(address + index * sizeof(AccessType), value);
 	}
 
 	uint64_t address;