Summary
I am encountering the error below when running alltoallv on Aurora with oneCCL. This use of alltoallv intentionally sets non-zero send counts only for each rank's two nearest neighbors, with all other counts set to zero, so every rank communicates only with its neighbors. The error appears when CCL_ALLTOALLV=topo is set and the non-zero buffers are larger than 8 KB; it does not appear with the other alltoallv algorithms or with smaller buffer sizes.
terminate called after throwing an instance of 'ccl::v1::exception'
what(): oneCCL: ze_call.cpp:43 do_call: EXCEPTION: ze error at zeMemGetAddressRange, code: ZE_RESULT_ERROR_UNKNOWN
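For illustration, here is a condensed sketch of my own (the helper name is hypothetical and not part of the reproducer below) showing the send-count layout that triggers the failure, assuming the job script's settings of BUF_SIZE=16384, OTHER_BUF_SIZE=0, and more than two ranks:
#include <vector>
// Sketch only: non-zero counts for the two ring neighbors, zero for everyone else.
std::vector<unsigned long> neighbor_only_counts(int rank, int size) {
    std::vector<unsigned long> counts(size, 0);       // 0 elements to non-neighbors
    const unsigned long n = 16384 / sizeof(float);    // 4096 floats = 16 KB, above the 8 KB threshold
    counts[(rank - 1 + size) % size] = n;             // left neighbor
    counts[(rank + 1) % size] = n;                    // right neighbor
    return counts;                                    // used as the per-rank counts for ccl::alltoallv
}
With these counts, ccl::alltoallv aborts as shown above under CCL_ALLTOALLV=topo, while per-neighbor sizes of 8 KB or below run to completion.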
Version and environment
oneCCL version 2021.15
oneAPI version 2025.0.5
MPICH from pmodels/mpich@6037a7a
SUSE Linux Enterprise Server 15 SP4 v15.4
Aurora configuration for Intel Data Center Max 1550 GPU
Reproducer
alltoallv_ccl.cpp
#include <sycl/sycl.hpp>
#include <mpi.h>
#include <vector>
#include <iostream>
#include <cstdlib>
#include <algorithm>
#include "oneapi/ccl.hpp"
// Get the nearest neighbors
std::vector<int> get_nearest_neighbors(int rank, int size)
{
std::vector<int> neighbors;
int num_neighbors = 1;
if (size == 2) {
int other_rank = (rank - 1 + size) % size;
neighbors.push_back(other_rank);
} else {
for (int i = 0; i < num_neighbors; i++) {
int left_rank = (rank - (1 + i) + size) % size;
int right_rank = (rank + (1 + i)) % size;
neighbors.push_back(left_rank);
neighbors.push_back(right_rank);
}
}
if (size < 100) {
std::cout << "Rank " << rank << " neighbor list: ";
for (int n : neighbors) {
std::cout << n << " ";
}
std::cout << std::endl;
}
return neighbors;
}
bool contains(const std::vector<int>& vec, int value) {
return std::find(vec.begin(), vec.end(), value) != vec.end();
}
int main(int argc, char** argv)
{
int rank, size;
double t3, t4;
ccl::init();
MPI_Init(&argc, &argv);
MPI_Barrier( MPI_COMM_WORLD );
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
/* create sycl queue */
std::vector<sycl::device> gpu_devices;
for (const auto& dev : sycl::device::get_devices()) {
if (dev.is_gpu()) {
gpu_devices.push_back(dev);
}
}
if (gpu_devices.empty()) {
std::cerr << "No GPU devices found!\n";
MPI_Abort(MPI_COMM_WORLD, 1);
}
sycl::device selected_device = gpu_devices[rank % gpu_devices.size()];
sycl::queue Q(selected_device);
//sycl::queue Q(sycl::gpu_selector_v);
/* create kvs */
ccl::shared_ptr_class<ccl::kvs> kvs;
ccl::kvs::address_type main_addr;
if (rank == 0) {
kvs = ccl::create_main_kvs();
main_addr = kvs->get_address();
MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
}
else {
MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
kvs = ccl::create_kvs(main_addr);
}
/* create communicator */
auto dev = ccl::create_device(Q.get_device());
auto ctx = ccl::create_context(Q.get_context());
auto comm = ccl::create_communicator(size, rank, dev, ctx, kvs);
/* create stream */
auto stream = ccl::create_stream(Q);
if (size < 100) {
std::cout << "Rank " << rank << " running on " << Q.get_device().get_info<sycl::info::device::name>() << std::endl;
}
int elements_per_proc;
int elements_per_proc_other;
if (argc == 3)
{
elements_per_proc = atoi(argv[1])/4;
elements_per_proc_other = atoi(argv[2])/4;
}
else
{
elements_per_proc = 1048576;
elements_per_proc_other = 2;
}
// Get the neighboring ranks
std::vector<int> neighbors;
neighbors = get_nearest_neighbors(rank, size);
MPI_Barrier( MPI_COMM_WORLD );
// Initialize arrays
std::vector<float> send_buff;
std::vector<unsigned long> send_counts(size,0);
std::vector<unsigned long> send_displs(size,0);
std::vector<unsigned long> rcv_counts(size,0);
std::vector<unsigned long> rcv_displs(size,0);
// Fill in the send counts, displacements and buffers
int global_send_elements = 0;
for (int i=0; i<size; i++)
{
if (contains(neighbors,i)) {
send_counts[i] = elements_per_proc;
//global_send_elements += elements_per_proc;
send_displs[i] = global_send_elements;
global_send_elements += elements_per_proc;
for (int n = 0; n < elements_per_proc; n++) {
send_buff.push_back(rank);
}
} else {
send_counts[i] = elements_per_proc_other;
//global_send_elements += elements_per_proc;
send_displs[i] = global_send_elements;
global_send_elements += elements_per_proc_other;
for (int n = 0; n < elements_per_proc_other; n++) {
send_buff.push_back(-1.0);
}
}
}
if (size < 100) {
std::cout << "Rank " << rank << " sending " << global_send_elements << " elements" << std::endl;
std::cout.flush();
}
MPI_Barrier( MPI_COMM_WORLD );
// Get the received data
int global_rcv_elements = 0;
MPI_Alltoall(send_counts.data(), 1, MPI_UNSIGNED_LONG,
rcv_counts.data(), 1, MPI_UNSIGNED_LONG,
MPI_COMM_WORLD);
for (int i = 0; i < size; i++) {
if (rcv_counts[i] != 0 and size < 100) {
std::cout << "Rank " << rank << " receives " << rcv_counts[i] <<
" elements from rank " << i << std::endl;
}
//global_rcv_elements += rcv_counts[i];
rcv_displs[i] = global_rcv_elements;
global_rcv_elements += rcv_counts[i];
}
std::vector<float> rcv_buff(global_rcv_elements, -99.0);
// Move the send and receive buffers to the GPU
float *dsend_buff = sycl::malloc_device<float>(global_send_elements,Q);
float *drcv_buff = sycl::malloc_device<float>(global_rcv_elements,Q);
Q.memcpy((void *) dsend_buff, (void *) send_buff.data(), global_send_elements*sizeof(float));
Q.memcpy((void *) drcv_buff, (void *) rcv_buff.data(), global_rcv_elements*sizeof(float));
Q.wait();
MPI_Barrier( MPI_COMM_WORLD );
int iters = 10;
std::vector<double> elapsed(iters);
for (int i = 0; i < iters; i++)
{
t3 = MPI_Wtime();
//MPI_Alltoallv(dsend_buff, send_counts.data(), send_displs.data(), MPI_FLOAT,
// drcv_buff, rcv_counts.data(), rcv_displs.data(), MPI_FLOAT,
// MPI_COMM_WORLD);
ccl::alltoallv(dsend_buff, send_counts,
drcv_buff, rcv_counts,
comm, stream).wait();
MPI_Barrier( MPI_COMM_WORLD );
t4 = MPI_Wtime();
if ( rank == 0 ) elapsed[i]=( t4 - t3 ) * 1e3;
}
double avg = 0.0;
int skip = 4;
if ( rank == 0 )
{
for (int i = skip; i < iters; i++)
{
avg = avg + elapsed[i];
//std::cout<<elapsed[i]<<std::endl;
}
avg = avg / (iters - skip);
std::cout << "Average all2all time: " << avg << " ms" << std::endl;
}
/*
Q.memcpy(rcv_buff.data(), drcv_buff, global_rcv_elements*sizeof(float)).wait();
if (rank == 0) {
std::cout << "Rank 0 received: " << std::endl;
for (int i=0; i<rcv_buff.size(); i++) {
std::cout << i << " " << rcv_buff[i] << std::endl;
}
}
*/
sycl::free(dsend_buff, Q);
sycl::free(drcv_buff, Q);
MPI_Finalize();
return 0;
}
On Aurora, build with
module load frameworks
mpicxx -o all2allv_ccl alltoallv_ccl.cpp -fsycl -lmpi \
-I/opt/aurora/24.347.0/oneapi/ccl/2021.14/include \
-L/opt/aurora/24.347.0/oneapi/ccl/2021.14/lib -lccl
and run with
#!/bin/bash
module load frameworks
# Use the latest oneCCL
unset CCL_ROOT
export CCL_CONFIGURATION_PATH=""
export CCL_CONFIGURATION=cpu_gpu_dpcpp
export CCL_ROOT="/lus/flare/projects/Aurora_deployment/datascience/software/ccl_2021.15/oneCCL/build_2021p15/"
export LD_LIBRARY_PATH=${CCL_ROOT}/lib:$LD_LIBRARY_PATH
export CPATH=${CCL_ROOT}/include:$CPATH
export LIBRARY_PATH=${CCL_ROOT}/lib:$LIBRARY_PATH
export CCL_PROCESS_LAUNCHER=pmix
export CCL_ATL_TRANSPORT=mpi
export CCL_KVS_MODE=mpi
export CCL_ENABLE_SYCL_KERNELS=1
export CCL_ALLTOALLV=topo
#export CCL_LOG_LEVEL=debug
EXE=/flare/datascience/balin/oneCCL/dl_scaling/aurora_frameworks_scaling/c++-sycl-oneccl-all2allv/all2allv_ccl
NNODES=`wc -l < $PBS_NODEFILE`
RANKS_PER_NODE=12
NRANKS=$(( NNODES * RANKS_PER_NODE ))
CPU_BINDING=list:1-4:8-11:16-19:24-27:32-35:40-43:53-56:60-63:68-71:76-79:84-87:92-95
EXT_ENV="--env FI_CXI_DEFAULT_CQ_SIZE=1048576 --env CCL_ALLTOALLV_MONOLITHIC_KERNEL=0"
OTHER_BUF_SIZE=0
echo Using MPI from:
ldd $EXE | grep libmpi
echo
echo Using oneCCL from:
ldd $EXE | grep ccl
echo
BUF_SIZE=16384
mpiexec ${EXT_ENV} --np ${NRANKS} -ppn ${RANKS_PER_NODE} --cpu-bind $CPU_BINDING \
$EXE ${BUF_SIZE} ${OTHER_BUF_SIZE}
Logs
The full log can be provided on request; it is very long.
Expected behavior
The alltoallv call completes without raising the error.
Observed behavior
The run terminates with the error reported above.
Existing workarounds
The workaround is to select a different alltoallv algorithm via CCL_ALLTOALLV (for example, the scatter algorithm), but at the cost of lower performance.
Affected projects
This bug limits the scalability of ML applications that rely on alltoallv on Aurora.