zeMemGetAddressRange error with alltoallv and zero-sized buffers #174

Summary

I am encountering the error below when running alltoallv on Aurora with oneCCL. The alltoallv call intentionally uses only two non-zero buffers per rank, with all other counts set to zero, so each rank effectively exchanges data only with its neighboring ranks. The error appears with CCL_ALLTOALLV=topo when the non-zero buffers are larger than 8 KB; it does not appear with the other alltoallv algorithms or with smaller buffer sizes.

terminate called after throwing an instance of 'ccl::v1::exception'
  what():  oneCCL: ze_call.cpp:43 do_call: EXCEPTION: ze error at zeMemGetAddressRange, code: ZE_RESULT_ERROR_UNKNOWN
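
For reference, with the run configuration shown below (BUF_SIZE=16384 and OTHER_BUF_SIZE=0, i.e. 4096 floats per neighbor and zero elements for every other destination), the per-destination send counts on a rank look like this minimal sketch (hypothetical size = 4 and rank = 1; the full reproducer follows):

// Count layout that triggers the error: only the two neighbor ranks get non-zero counts.
// 4096 floats = 16 KB per neighbor; the failure appears once these buffers exceed 8 KB.
std::vector<unsigned long> send_counts = {
    4096, // rank 0: left neighbor
    0,    // rank 1: self, zero-sized buffer
    4096, // rank 2: right neighbor
    0     // rank 3: not a neighbor, zero-sized buffer
};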

Version and environment

oneCCL version 2021.15
oneAPI version 2025.0.5
MPICH from pmodels/mpich@6037a7a
SUSE Linux Enterprise Server 15 SP4
Aurora configuration for Intel Data Center Max 1550 GPU

Reproducer

all2allv_ccl.cpp

#include <sycl/sycl.hpp>
#include <mpi.h>
#include <cmath>
#include <chrono>
#include <cstdio>     // fflush
#include <iostream>   // std::cout, std::cerr
#include <vector>     // std::vector
#include <algorithm>  // std::find
#include "oneapi/ccl.hpp"

// Get the nearest neighbors
std::vector<int> get_nearest_neighbors(int rank, int size)
{
    std::vector<int> neighbors;
    int num_neighbors = 1;

    if (size == 2) {
        int other_rank = (rank - 1 + size) % size;
        neighbors.push_back(other_rank);
    } else {
        for (int i = 0; i < num_neighbors; i++) {
            int left_rank = (rank - (1 + i) + size) % size;
            int right_rank = (rank + (1 + i)) % size;
            neighbors.push_back(left_rank);
            neighbors.push_back(right_rank);
        }
    }

    if (size < 100) {
        std::cout << "Rank " << rank << " neighbor list: ";
        for (int n : neighbors) {
            std::cout << n << " ";
        }
        std::cout << std::endl;
    }
    return neighbors;
}

bool contains(const std::vector<int>& vec, int value) {
    return std::find(vec.begin(), vec.end(), value) != vec.end();
}

int main(int argc, char** argv)
{
    int rank, size;
    double t1, t2, t3, t4, init_timer;

    ccl::init();

    std::chrono::time_point<std::chrono::high_resolution_clock> start = std::chrono::high_resolution_clock::now();
    MPI_Init(&argc, &argv);
    std::chrono::time_point<std::chrono::high_resolution_clock> end = std::chrono::high_resolution_clock::now();
    MPI_Barrier( MPI_COMM_WORLD );

    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* create sycl queue */
    std::vector<sycl::device> gpu_devices;
    for (const auto& dev : sycl::device::get_devices()) {
        if (dev.is_gpu()) {
            gpu_devices.push_back(dev);
        }
    }
    if (gpu_devices.empty()) {
        std::cerr << "No GPU devices found!\n";
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    sycl::device selected_device = gpu_devices[rank % gpu_devices.size()];
    sycl::queue Q(selected_device);
    //sycl::queue Q(sycl::gpu_selector_v);

    /* create kvs */
    ccl::shared_ptr_class<ccl::kvs> kvs;
    ccl::kvs::address_type main_addr;
    if (rank == 0) {
        kvs = ccl::create_main_kvs();
        main_addr = kvs->get_address();
        MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
    }
    else {
        MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
        kvs = ccl::create_kvs(main_addr);
    }

    /* create communicator */
    auto dev = ccl::create_device(Q.get_device());
    auto ctx = ccl::create_context(Q.get_context());
    auto comm = ccl::create_communicator(size, rank, dev, ctx, kvs);

    /* create stream */
    auto stream = ccl::create_stream(Q);

    if (size < 100) {
        std::cout << "Rank " << rank << " running on " << Q.get_device().get_info<sycl::info::device::name>()  << std::endl;
    }
    int  elements_per_proc;
    int elements_per_proc_other;
    if (argc == 3)
    {
       elements_per_proc = atoi(argv[1])/4;
       elements_per_proc_other = atoi(argv[2])/4;
    }
    else
    {
        elements_per_proc = 1048576;
        elements_per_proc_other = 2;
    }

    // Get the neighboring ranks
    std::vector<int> neighbors;
    neighbors = get_nearest_neighbors(rank, size);
    MPI_Barrier( MPI_COMM_WORLD );

    // Initialize arrays
    std::vector<float> send_buff;
    std::vector<unsigned long> send_counts(size,0);
    std::vector<unsigned long> send_displs(size,0);
    std::vector<unsigned long> rcv_counts(size,0);
    std::vector<unsigned long> rcv_displs(size,0);

    // Fill in the send counts, displacements and buffers
    int global_send_elements = 0;
    for (int i=0; i<size; i++)
    {
        if (contains(neighbors,i)) {
            send_counts[i] = elements_per_proc;
            //global_send_elements += elements_per_proc;
            send_displs[i] = global_send_elements;
            global_send_elements += elements_per_proc;
            for (int n = 0; n < elements_per_proc; n++) {
                send_buff.push_back(rank);
            }
        } else {
            send_counts[i] = elements_per_proc_other;
            //global_send_elements += elements_per_proc;
            send_displs[i] = global_send_elements;
            global_send_elements += elements_per_proc_other;
            for (int n = 0; n < elements_per_proc_other; n++) {
                send_buff.push_back(-1.0);
            }
        }
    }
    if (size < 100) {
        std::cout << "Rank " << rank << " sending " << global_send_elements << " elements" << std::endl;
        fflush(stdout);
    }
    MPI_Barrier( MPI_COMM_WORLD );

    // Get the received data
    int global_rcv_elements = 0;
    MPI_Alltoall(send_counts.data(), 1, MPI_UNSIGNED_LONG,
                 rcv_counts.data(), 1, MPI_UNSIGNED_LONG,
                 MPI_COMM_WORLD);
    for (int i = 0; i < size; i++) {
        if (rcv_counts[i] != 0 and size < 100) {
                std::cout << "Rank " << rank << " receives " << rcv_counts[i] <<
                          " elements from rank " << i << std::endl;
        }
        //global_rcv_elements += rcv_counts[i];
        rcv_displs[i] = global_rcv_elements;
        global_rcv_elements += rcv_counts[i];
    }
    std::vector<float> rcv_buff(global_rcv_elements, -99.0);

    // Move the send and receive buffers to the GPU
    float *dsend_buff  = sycl::malloc_device<float>(global_send_elements,Q);
    float *drcv_buff  = sycl::malloc_device<float>(global_rcv_elements,Q);
    Q.memcpy((void *) dsend_buff, (void *) send_buff.data(), global_send_elements*sizeof(float));
    Q.memcpy((void *) drcv_buff, (void *) rcv_buff.data(), global_rcv_elements*sizeof(float));
    Q.wait();
    MPI_Barrier( MPI_COMM_WORLD );

    int iters = 10;
    std::vector<double> elapsed(iters);
    for (int i = 0; i < iters; i++)
    {
        t3 = MPI_Wtime();
        //MPI_Alltoallv(dsend_buff, send_counts.data(), send_displs.data(), MPI_FLOAT,
        //             drcv_buff, rcv_counts.data(), rcv_displs.data(), MPI_FLOAT,
        //             MPI_COMM_WORLD);
        ccl::alltoallv(dsend_buff, send_counts,
                       drcv_buff, rcv_counts,
                       comm, stream).wait();
        MPI_Barrier( MPI_COMM_WORLD );
        t4 = MPI_Wtime();
        if ( rank == 0 )    elapsed[i]=( t4 - t3 ) * 1e3;
    }

    double avg = 0.0;
    int skip = 4;
    if ( rank == 0 )
    {
        for (int i = skip; i < iters; i++)
        {
            avg = avg + elapsed[i];
            //std::cout<<elapsed[i]<<std::endl;
        }
        avg = avg / (iters - skip);
        std::cout << "Average all2all time: " << avg << " ms" << std::endl;
    }

    /*
    Q.memcpy(rcv_buff.data(), drcv_buff, global_rcv_elements*sizeof(float)).wait();
    if (rank == 0) {
        std::cout << "Rank 0 received: " << std::endl;
        for (int i=0; i<rcv_buff.size(); i++) {
            std::cout << i << " " << rcv_buff[i] << std::endl;
        }
    }
    */

    MPI_Finalize();
    return 0;
}

On Aurora, build with

module load frameworks
mpicxx -o all2allv_ccl all2allv_ccl.cpp -fsycl -lmpi \
    -I/opt/aurora/24.347.0/oneapi/ccl/2021.14/include \
    -L/opt/aurora/24.347.0/oneapi/ccl/2021.14/lib -lccl

and run with

#!/bin/bash

module load frameworks

# Use the latest oneCCL
unset CCL_ROOT
export CCL_CONFIGURATION_PATH=""
export CCL_CONFIGURATION=cpu_gpu_dpcpp
export CCL_ROOT="/lus/flare/projects/Aurora_deployment/datascience/software/ccl_2021.15/oneCCL/build_2021p15/"
export LD_LIBRARY_PATH=${CCL_ROOT}/lib:$LD_LIBRARY_PATH
export CPATH=${CCL_ROOT}/include:$CPATH
export LIBRARY_PATH=${CCL_ROOT}/lib:$LIBRARY_PATH

export CCL_PROCESS_LAUNCHER=pmix
export CCL_ATL_TRANSPORT=mpi
export CCL_KVS_MODE=mpi
export CCL_ENABLE_SYCL_KERNELS=1
export CCL_ALLTOALLV=topo
#export CCL_LOG_LEVEL=debug

EXE=/flare/datascience/balin/oneCCL/dl_scaling/aurora_frameworks_scaling/c++-sycl-oneccl-all2allv/all2allv_ccl
NNODES=`wc -l < $PBS_NODEFILE`
RANKS_PER_NODE=12
NRANKS=$(( NNODES * RANKS_PER_NODE ))
CPU_BINDING=list:1-4:8-11:16-19:24-27:32-35:40-43:53-56:60-63:68-71:76-79:84-87:92-95
EXT_ENV="--env FI_CXI_DEFAULT_CQ_SIZE=1048576 --env CCL_ALLTOALLV_MONOLITHIC_KERNEL=0"
OTHER_BUF_SIZE=0

echo Using MPI from:
ldd $EXE | grep libmpi
echo
echo Using oneCCL from:
ldd $EXE | grep ccl
echo


BUF_SIZE=16384
mpiexec ${EXT_ENV} --np ${NRANKS} -ppn ${RANKS_PER_NODE}  --cpu-bind  $CPU_BINDING \
        $EXE ${BUF_SIZE} ${OTHER_BUF_SIZE}

Logs

Logs can be provided on request; the full log file is very long.

Expected behavior

The alltoallv call completes successfully and the error does not appear.

Observed behavior

The run terminates with the zeMemGetAddressRange error reported above.

Existing workarounds

The workaround is to switch the alltoallv algorithm away from topo to one of the other options (via CCL_ALLTOALLV); this avoids the error but comes with a drop in performance. An example is shown below.
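
For example, changing the algorithm selection in the run script (a sketch only; the exact set of supported algorithm names depends on the oneCCL version, see the oneCCL environment-variable documentation):

# Workaround sketch: select a non-topo alltoallv algorithm instead of CCL_ALLTOALLV=topo.
# "scatter" and "naive" are alternatives listed in the oneCCL docs (assumption: available in this build).
export CCL_ALLTOALLV=scatter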

Affected projects

This bug limits the scalability of ML applications that use alltoallv on Aurora.
