Summary
I am encountering the error below when running alltoallv on Aurora with oneCCL. This use of alltoallv intentionally sets non-zero send counts only for each rank's two nearest neighbors, with all other counts set to zero, so every rank communicates only with its neighbors. The error appears when CCL_ALLTOALLV=topo is set and the non-zero buffers are larger than 8 KB; it does not appear with the other alltoallv algorithms or with smaller buffer sizes.
terminate called after throwing an instance of 'ccl::v1::exception'
what(): oneCCL: ze_call.cpp:43 do_call: EXCEPTION: ze error at zeMemGetAddressRange, code: ZE_RESULT_ERROR_UNKNOWN
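For illustration, here is a condensed sketch of my own (the helper name is hypothetical and not part of the reproducer below) showing the send-count layout that triggers the failure, assuming the job script's settings of BUF_SIZE=16384, OTHER_BUF_SIZE=0, and more than two ranks:
#include <vector>
// Sketch only: non-zero counts for the two ring neighbors, zero for everyone else.
std::vector<unsigned long> neighbor_only_counts(int rank, int size) {
    std::vector<unsigned long> counts(size, 0);       // 0 elements to non-neighbors
    const unsigned long n = 16384 / sizeof(float);    // 4096 floats = 16 KB, above the 8 KB threshold
    counts[(rank - 1 + size) % size] = n;             // left neighbor
    counts[(rank + 1) % size] = n;                    // right neighbor
    return counts;                                    // used as the per-rank counts for ccl::alltoallv
}
With these counts, ccl::alltoallv aborts as shown above under CCL_ALLTOALLV=topo, while per-neighbor sizes of 8 KB or below run to completion.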
Version and environment
oneCCL version 2021.15
oneAPI version 2025.0.5
MPICH from pmodels/mpich@6037a7a
SUSE Linux Enterprise Server 15 SP4 v15.4
Aurora configuration for Intel Data Center Max 1550 GPU
Reproducer
alltoallv_ccl.cpp
#include <sycl/sycl.hpp>
#include <mpi.h>
#include <vector>
#include <iostream>
#include <cstdlib>
#include <algorithm>
#include "oneapi/ccl.hpp"
// Get the nearest neighbors
std::vector<int> get_nearest_neighbors(int rank, int size)
{
std::vector<int> neighbors;
int num_neighbors = 1;
if (size == 2) {
int other_rank = (rank - 1 + size) % size;
neighbors.push_back(other_rank);
} else {
for (int i = 0; i < num_neighbors; i++) {
int left_rank = (rank - (1 + i) + size) % size;
int right_rank = (rank + (1 + i)) % size;
neighbors.push_back(left_rank);
neighbors.push_back(right_rank);
}
}
if (size < 100) {
std::cout << "Rank " << rank << " neighbor list: ";
for (int n : neighbors) {
std::cout << n << " ";
}
std::cout << std::endl;
}
return neighbors;
}
bool contains(const std::vector<int>& vec, int value) {
return std::find(vec.begin(), vec.end(), value) != vec.end();
}
int main(int argc, char** argv)
{
int rank, size;
double t3, t4;
ccl::init();
MPI_Init(&argc, &argv);
MPI_Barrier( MPI_COMM_WORLD );
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
/* create sycl queue */
std::vector<sycl::device> gpu_devices;
for (const auto& dev : sycl::device::get_devices()) {
if (dev.is_gpu()) {
gpu_devices.push_back(dev);
}
}
if (gpu_devices.empty()) {
std::cerr << "No GPU devices found!\n";
MPI_Abort(MPI_COMM_WORLD, 1);
}
sycl::device selected_device = gpu_devices[rank % gpu_devices.size()];
sycl::queue Q(selected_device);
//sycl::queue Q(sycl::gpu_selector_v);
/* create kvs */
ccl::shared_ptr_class<ccl::kvs> kvs;
ccl::kvs::address_type main_addr;
if (rank == 0) {
kvs = ccl::create_main_kvs();
main_addr = kvs->get_address();
MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
}
else {
MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
kvs = ccl::create_kvs(main_addr);
}
/* create communicator */
auto dev = ccl::create_device(Q.get_device());
auto ctx = ccl::create_context(Q.get_context());
auto comm = ccl::create_communicator(size, rank, dev, ctx, kvs);
/* create stream */
auto stream = ccl::create_stream(Q);
if (size < 100) {
std::cout << "Rank " << rank << " running on " << Q.get_device().get_info<sycl::info::device::name>() << std::endl;
}
int elements_per_proc;
int elements_per_proc_other;
if (argc == 3)
{
elements_per_proc = atoi(argv[1])/4;
elements_per_proc_other = atoi(argv[2])/4;
}
else
{
elements_per_proc = 1048576;
elements_per_proc_other = 2;
}
// Get the neighboring ranks
std::vector<int> neighbors;
neighbors = get_nearest_neighbors(rank, size);
MPI_Barrier( MPI_COMM_WORLD );
// Initialize arrays
std::vector<float> send_buff;
std::vector<unsigned long> send_counts(size,0);
std::vector<unsigned long> send_displs(size,0);
std::vector<unsigned long> rcv_counts(size,0);
std::vector<unsigned long> rcv_displs(size,0);
// Fill in the send counts, displacements and buffers
int global_send_elements = 0;
for (int i=0; i<size; i++)
{
if (contains(neighbors,i)) {
send_counts[i] = elements_per_proc;
//global_send_elements += elements_per_proc;
send_displs[i] = global_send_elements;
global_send_elements += elements_per_proc;
for (int n = 0; n < elements_per_proc; n++) {
send_buff.push_back(rank);
}
} else {
send_counts[i] = elements_per_proc_other;
//global_send_elements += elements_per_proc;
send_displs[i] = global_send_elements;
global_send_elements += elements_per_proc_other;
for (int n = 0; n < elements_per_proc_other; n++) {
send_buff.push_back(-1.0);
}
}
}
if (size < 100) {
std::cout << "Rank " << rank << " sending " << global_send_elements << " elements" << std::endl;
std::cout.flush();
}
MPI_Barrier( MPI_COMM_WORLD );
// Get the received data
int global_rcv_elements = 0;
MPI_Alltoall(send_counts.data(), 1, MPI_UNSIGNED_LONG,
rcv_counts.data(), 1, MPI_UNSIGNED_LONG,
MPI_COMM_WORLD);
for (int i = 0; i < size; i++) {
if (rcv_counts[i] != 0 and size < 100) {
std::cout << "Rank " << rank << " receives " << rcv_counts[i] <<
" elements from rank " << i << std::endl;
}
//global_rcv_elements += rcv_counts[i];
rcv_displs[i] = global_rcv_elements;
global_rcv_elements += rcv_counts[i];
}
std::vector<float> rcv_buff(global_rcv_elements, -99.0);
// Move the send and receive buffers to the GPU
float *dsend_buff = sycl::malloc_device<float>(global_send_elements,Q);
float *drcv_buff = sycl::malloc_device<float>(global_rcv_elements,Q);
Q.memcpy((void *) dsend_buff, (void *) send_buff.data(), global_send_elements*sizeof(float));
Q.memcpy((void *) drcv_buff, (void *) rcv_buff.data(), global_rcv_elements*sizeof(float));
Q.wait();
MPI_Barrier( MPI_COMM_WORLD );
int iters = 10;
std::vector<double> elapsed(iters);
for (int i = 0; i < iters; i++)
{
t3 = MPI_Wtime();
//MPI_Alltoallv(dsend_buff, send_counts.data(), send_displs.data(), MPI_FLOAT,
// drcv_buff, rcv_counts.data(), rcv_displs.data(), MPI_FLOAT,
// MPI_COMM_WORLD);
ccl::alltoallv(dsend_buff, send_counts,
drcv_buff, rcv_counts,
comm, stream).wait();
MPI_Barrier( MPI_COMM_WORLD );
t4 = MPI_Wtime();
if ( rank == 0 ) elapsed[i]=( t4 - t3 ) * 1e3;
}
double avg = 0.0;
int skip = 4;
if ( rank == 0 )
{
for (int i = skip; i < iters; i++)
{
avg = avg + elapsed[i];
//std::cout<<elapsed[i]<<std::endl;
}
avg = avg / (iters - skip);
std::cout << "Average all2all time: " << avg << " ms" << std::endl;
}
/*
Q.memcpy(rcv_buff.data(), drcv_buff, global_rcv_elements*sizeof(float)).wait();
if (rank == 0) {
std::cout << "Rank 0 received: " << std::endl;
for (int i=0; i<rcv_buff.size(); i++) {
std::cout << i << " " << rcv_buff[i] << std::endl;
}
}
*/
sycl::free(dsend_buff, Q);
sycl::free(drcv_buff, Q);
MPI_Finalize();
return 0;
}
On Aurora, build with
module load frameworks
mpicxx -o all2allv_ccl alltoallv_ccl.cpp -fsycl -lmpi \
-I/opt/aurora/24.347.0/oneapi/ccl/2021.14/include \
-L/opt/aurora/24.347.0/oneapi/ccl/2021.14/lib -lccl
and run with
#!/bin/bash
module load frameworks
# Use the latest oneCCL
unset CCL_ROOT
export CCL_CONFIGURATION_PATH=""
export CCL_CONFIGURATION=cpu_gpu_dpcpp
export CCL_ROOT="/lus/flare/projects/Aurora_deployment/datascience/software/ccl_2021.15/oneCCL/build_2021p15/"
export LD_LIBRARY_PATH=${CCL_ROOT}/lib:$LD_LIBRARY_PATH
export CPATH=${CCL_ROOT}/include:$CPATH
export LIBRARY_PATH=${CCL_ROOT}/lib:$LIBRARY_PATH
export CCL_PROCESS_LAUNCHER=pmix
export CCL_ATL_TRANSPORT=mpi
export CCL_KVS_MODE=mpi
export CCL_ENABLE_SYCL_KERNELS=1
export CCL_ALLTOALLV=topo
#export CCL_LOG_LEVEL=debug
EXE=/flare/datascience/balin/oneCCL/dl_scaling/aurora_frameworks_scaling/c++-sycl-oneccl-all2allv/all2allv_ccl
NNODES=`wc -l < $PBS_NODEFILE`
RANKS_PER_NODE=12
NRANKS=$(( NNODES * RANKS_PER_NODE ))
CPU_BINDING=list:1-4:8-11:16-19:24-27:32-35:40-43:53-56:60-63:68-71:76-79:84-87:92-95
EXT_ENV="--env FI_CXI_DEFAULT_CQ_SIZE=1048576 --env CCL_ALLTOALLV_MONOLITHIC_KERNEL=0"
OTHER_BUF_SIZE=0
echo Using MPI from:
ldd $EXE | grep libmpi
echo
echo Using oneCCL from:
ldd $EXE | grep ccl
echo
BUF_SIZE=16384
mpiexec ${EXT_ENV} --np ${NRANKS} -ppn ${RANKS_PER_NODE} --cpu-bind $CPU_BINDING \
$EXE ${BUF_SIZE} ${OTHER_BUF_SIZE}
Logs
The full log can be provided on request; it is very long.
Expected behavior
The alltoallv call completes without raising the error.
Observed behavior
The run terminates with the error reported above.
Existing workarounds
The workaround is to select a different alltoallv algorithm via CCL_ALLTOALLV (for example, the scatter algorithm), but at the cost of lower performance.
Affected projects
This bug limits the scalability of ML applications that rely on alltoallv on Aurora.