Program Listing for File MPIWorker.hpp
↰ Return to documentation for file (pennylane_lightning/core/src/simulators/lightning_gpu/MPIWorker.hpp)
// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <bit>
#include <memory>
#include <string>
#include <vector>
#include "MPIManager.hpp"
#include "MPI_helpers.hpp"
#include "cuError.hpp"
#include "cuStateVecError.hpp"
#include <cuda.h>
#include <cuda_runtime.h>
#include <custatevec.h>
namespace {
namespace cuUtil = Pennylane::LightningGPU::Util;
using namespace Pennylane::LightningGPU;
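// Shared-pointer aliases for the local CUDA stream and the cuStateVec
// distributed index-bit-swap worker; both rely on MPIWorkerDeleter (below)
// to release the underlying CUDA/cuStateVec resources.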
using SharedLocalStream =
std::shared_ptr<std::remove_pointer<cudaStream_t>::type>;
using SharedMPIWorker = std::shared_ptr<
std::remove_pointer<custatevecSVSwapWorkerDescriptor_t>::type>;
inline size_t mebibyteToBytes(const size_t mebibytes) {
return mebibytes * size_t{1024 * 1024};
}
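// e.g. mebibyteToBytes(64) == 67108864 bytes (64 MiB)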
inline double bytesToMebibytes(const size_t bytes) {
return static_cast<double>(bytes) / (1024.0 * 1024.0);
}
} // namespace
namespace Pennylane::LightningGPU::MPI {
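// Deleter shared by SharedLocalStream and SharedMPIWorker: the first overload
// destroys a bare CUDA stream, the second tears down the swap worker together
// with every resource created for it (communicator, workspaces, IPC-mapped
// peer sub state vectors, and events).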
struct MPIWorkerDeleter {
void operator()(cudaStream_t localStream) const {
PL_CUDA_IS_SUCCESS(cudaStreamDestroy(localStream));
}
void operator()(custatevecSVSwapWorkerDescriptor_t svSegSwapWorker,
custatevecHandle_t handle,
custatevecCommunicatorDescriptor_t communicator,
void *d_extraWorkspace, void *d_transferWorkspace,
std::vector<void *> d_subSVsP2P,
std::vector<cudaEvent_t> remoteEvents,
cudaEvent_t localEvent) const {
PL_CUSTATEVEC_IS_SUCCESS(
custatevecSVSwapWorkerDestroy(handle, svSegSwapWorker));
PL_CUSTATEVEC_IS_SUCCESS(
custatevecCommunicatorDestroy(handle, communicator));
PL_CUDA_IS_SUCCESS(cudaFree(d_extraWorkspace));
PL_CUDA_IS_SUCCESS(cudaFree(d_transferWorkspace));
// LCOV_EXCL_START
for (auto *d_subSV : d_subSVsP2P)
PL_CUDA_IS_SUCCESS(cudaIpcCloseMemHandle(d_subSV));
for (auto event : remoteEvents)
PL_CUDA_IS_SUCCESS(cudaEventDestroy(event));
// LCOV_EXCL_STOP
PL_CUDA_IS_SUCCESS(cudaEventDestroy(localEvent));
}
};
inline SharedLocalStream make_shared_local_stream() {
cudaStream_t localStream;
PL_CUDA_IS_SUCCESS(cudaStreamCreate(&localStream));
return {localStream, MPIWorkerDeleter()};
}
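// Create a custatevecSVSwapWorker bound to this rank's sub state vector `sv`,
// allocate and attach its extra/transfer workspaces, and, when intra-node
// GPUDirect P2P is available, register the peer ranks' sub state vectors via
// CUDA IPC. A minimal usage sketch (hypothetical names: `handle`,
// `mpi_manager`, and a device pointer `d_sv` are assumed to already exist):
//
//   auto localStream = make_shared_local_stream();
//   auto worker = make_shared_mpi_worker<cuDoubleComplex>(
//       handle, mpi_manager, /* mpi_buf_size = */ 0, d_sv,
//       /* numLocalQubits = */ 20, localStream.get());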
template <typename CFP_t>
SharedMPIWorker
make_shared_mpi_worker(custatevecHandle_t handle, MPIManager &mpi_manager,
const size_t mpi_buf_size, CFP_t *sv,
const size_t numLocalQubits, cudaStream_t localStream) {
custatevecSVSwapWorkerDescriptor_t svSegSwapWorker = nullptr;
int nDevices_int = 0;
PL_CUDA_IS_SUCCESS(cudaGetDeviceCount(&nDevices_int));
size_t nDevices = static_cast<size_t>(nDevices_int);
// Ensure the number of P2P devices is calculated based on the number of MPI
// processes within the node
nDevices = mpi_manager.getSizeNode() < nDevices ? mpi_manager.getSizeNode()
: nDevices;
size_t nP2PDeviceBits = std::bit_width(nDevices) - 1;
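    // e.g. with 4 usable devices per node, std::bit_width(4) - 1 == 2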
size_t p2pEnabled_local = 1;
// P2P access check
if (nP2PDeviceBits != 0) {
size_t local_device_id = mpi_manager.getRank() % nDevices;
for (size_t devId = 0; devId < nDevices; ++devId) {
if (devId != local_device_id) {
int accessEnabled;
PL_CUDA_IS_SUCCESS(cudaDeviceCanAccessPeer(
&accessEnabled, static_cast<int>(devId),
static_cast<int>(local_device_id)));
if (!accessEnabled) {
p2pEnabled_local = 0;
}
}
}
auto isP2PEnabled =
mpi_manager.allreduce<size_t>(p2pEnabled_local, "min");
        // nP2PDeviceBits is set to 0 for all MPI processes if any pair of
        // devices on any node lacks P2P access
if (!isP2PEnabled) {
nP2PDeviceBits = 0;
}
}
cudaDataType_t svDataType;
if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
std::is_same_v<CFP_t, double2>) {
svDataType = CUDA_C_64F;
} else {
svDataType = CUDA_C_32F;
}
cudaEvent_t localEvent = nullptr;
custatevecCommunicatorDescriptor_t communicator = nullptr;
PL_CUDA_IS_SUCCESS(cudaEventCreateWithFlags(
&localEvent, cudaEventInterprocess | cudaEventDisableTiming));
custatevecCommunicatorType_t communicatorType;
if (mpi_manager.getVendor() == "MPICH") {
communicatorType = CUSTATEVEC_COMMUNICATOR_TYPE_MPICH;
}
if (mpi_manager.getVendor() == "Open MPI") {
communicatorType = CUSTATEVEC_COMMUNICATOR_TYPE_OPENMPI;
}
// LCOV_EXCL_START
// The following lines are for handling the errors caused by python
// layer calls. Won't be covered by cpp unit tests.
auto err = custatevecCommunicatorCreate(handle, &communicator,
communicatorType, nullptr);
if (err != CUSTATEVEC_STATUS_SUCCESS) {
communicator = nullptr;
PL_CUSTATEVEC_IS_SUCCESS(custatevecCommunicatorCreate(
handle, &communicator, communicatorType, "libmpi.so"));
}
// LCOV_EXCL_STOP
mpi_manager.Barrier();
void *d_extraWorkspace = nullptr;
void *d_transferWorkspace = nullptr;
std::vector<void *> d_subSVsP2P;
std::vector<int> subSVIndicesP2P;
std::vector<cudaEvent_t> remoteEvents;
size_t extraWorkspaceSize = 0;
size_t minTransferWorkspaceSize = 0;
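    // Create the swap worker; the call also reports the sizes of the extra
    // and transfer workspaces that must be allocated and attached below.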
PL_CUSTATEVEC_IS_SUCCESS(custatevecSVSwapWorkerCreate(
/* custatevecHandle_t */ handle,
/* custatevecSVSwapWorkerDescriptor_t* */ &svSegSwapWorker,
/* custatevecCommunicatorDescriptor_t */ communicator,
/* void* */ sv,
/* int32_t */ mpi_manager.getRank(),
/* cudaEvent_t */ localEvent,
/* cudaDataType_t */ svDataType,
/* cudaStream_t */ localStream,
/* size_t* */ &extraWorkspaceSize,
/* size_t* */ &minTransferWorkspaceSize));
PL_CUDA_IS_SUCCESS(cudaMalloc(&d_extraWorkspace, extraWorkspaceSize));
PL_CUSTATEVEC_IS_SUCCESS(custatevecSVSwapWorkerSetExtraWorkspace(
/* custatevecHandle_t */ handle,
/* custatevecSVSwapWorkerDescriptor_t */ svSegSwapWorker,
/* void* */ d_extraWorkspace,
/* size_t */ extraWorkspaceSize));
    size_t transferWorkspaceSize; // in bytes; its value should be a
                                  // power of 2
if (mpi_buf_size == 0) {
transferWorkspaceSize = size_t{1} << numLocalQubits;
if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
std::is_same_v<CFP_t, double2>) {
transferWorkspaceSize = transferWorkspaceSize * sizeof(double) * 2;
} else {
transferWorkspaceSize = transferWorkspaceSize * sizeof(float) * 2;
}
        // With the default setting, the transfer workspace is limited to
        // 64 MiB, based on benchmark tests on Perlmutter.
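        // e.g. for numLocalQubits = 28 in double precision, the default size
        // would be (1 << 28) * 16 B = 4 GiB, which is clamped to 64 MiB.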
size_t buffer_limit = 64;
double transferWorkspaceSizeInMiB =
bytesToMebibytes(transferWorkspaceSize);
if (transferWorkspaceSizeInMiB > static_cast<double>(buffer_limit)) {
transferWorkspaceSize = mebibyteToBytes(buffer_limit);
}
} else {
transferWorkspaceSize = mebibyteToBytes(mpi_buf_size);
}
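    // Never allocate less than the minimum size reported by
    // custatevecSVSwapWorkerCreate.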
transferWorkspaceSize =
std::max(minTransferWorkspaceSize, transferWorkspaceSize);
PL_CUDA_IS_SUCCESS(cudaMalloc(&d_transferWorkspace, transferWorkspaceSize));
PL_CUSTATEVEC_IS_SUCCESS(custatevecSVSwapWorkerSetTransferWorkspace(
/* custatevecHandle_t */ handle,
/* custatevecSVSwapWorkerDescriptor_t */ svSegSwapWorker,
/* void* */ d_transferWorkspace,
/* size_t */ transferWorkspaceSize));
// LCOV_EXCL_START
    // Won't be covered by CI checks, which run on 2 NVIDIA GPUs without an
    // NVLink connection.
if (nP2PDeviceBits != 0) {
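        // Exchange CUDA IPC memory and event handles across ranks so peers in
        // the same intra-node P2P group can map each other's sub state
        // vectors and synchronize on each other's events.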
cudaIpcMemHandle_t ipcMemHandle;
PL_CUDA_IS_SUCCESS(cudaIpcGetMemHandle(&ipcMemHandle, sv));
std::vector<cudaIpcMemHandle_t> ipcMemHandles(mpi_manager.getSize());
mpi_manager.Allgather<cudaIpcMemHandle_t>(ipcMemHandle, ipcMemHandles,
sizeof(ipcMemHandle));
cudaIpcEventHandle_t eventHandle;
PL_CUDA_IS_SUCCESS(cudaIpcGetEventHandle(&eventHandle, localEvent));
// distribute event handles
std::vector<cudaIpcEventHandle_t> ipcEventHandles(
mpi_manager.getSize());
mpi_manager.Allgather<cudaIpcEventHandle_t>(
eventHandle, ipcEventHandles, sizeof(eventHandle));
        // get remote device pointers and events
size_t nSubSVsP2P = size_t{1} << nP2PDeviceBits;
size_t p2pSubSVIndexBegin =
(mpi_manager.getRank() / nSubSVsP2P) * nSubSVsP2P;
size_t p2pSubSVIndexEnd = p2pSubSVIndexBegin + nSubSVsP2P;
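        // e.g. with nP2PDeviceBits = 2 (groups of 4 sub-SVs), rank 5 opens
        // the sub state vectors of ranks 4, 6, and 7.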
for (size_t p2pSubSVIndex = p2pSubSVIndexBegin;
p2pSubSVIndex < p2pSubSVIndexEnd; p2pSubSVIndex++) {
if (static_cast<size_t>(mpi_manager.getRank()) == p2pSubSVIndex)
continue; // don't need local sub state vector pointer
void *d_subSVP2P = nullptr;
const auto &dstMemHandle = ipcMemHandles[p2pSubSVIndex];
PL_CUDA_IS_SUCCESS(cudaIpcOpenMemHandle(
&d_subSVP2P, dstMemHandle, cudaIpcMemLazyEnablePeerAccess));
d_subSVsP2P.push_back(d_subSVP2P);
cudaEvent_t eventP2P = nullptr;
PL_CUDA_IS_SUCCESS(cudaIpcOpenEventHandle(
&eventP2P, ipcEventHandles[p2pSubSVIndex]));
remoteEvents.push_back(eventP2P);
            subSVIndicesP2P.push_back(static_cast<int>(p2pSubSVIndex));
}
// set p2p sub state vectors
PL_CUSTATEVEC_IS_SUCCESS(custatevecSVSwapWorkerSetSubSVsP2P(
/* custatevecHandle_t */ handle,
/* custatevecSVSwapWorkerDescriptor_t */ svSegSwapWorker,
/* void** */ d_subSVsP2P.data(),
/* const int32_t* */ subSVIndicesP2P.data(),
/* cudaEvent_t */ remoteEvents.data(),
/* const uint32_t */ static_cast<uint32_t>(d_subSVsP2P.size())));
}
    // LCOV_EXCL_STOP
return {svSegSwapWorker,
std::bind(MPIWorkerDeleter(), std::placeholders::_1, handle,
communicator, d_extraWorkspace, d_transferWorkspace,
d_subSVsP2P, remoteEvents, localEvent)};
}
} // namespace Pennylane::LightningGPU::MPI