Program Listing for File MeasurementsGPUMPI.hpp¶
↰ Return to documentation for file (pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp)
// Copyright 2018-2023 Xanadu Quantum Technologies Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <complex>
#include <cuda.h>
#include <cusparse.h>
#include <custatevec.h> // custatevecApplyMatrix
#include <random>
#include <type_traits>
#include <unordered_map>
#include <vector>
#include "MPILinearAlg.hpp"
#include "MPIManager.hpp"
#include "MPIWorker.hpp"
#include "MeasurementsBase.hpp"
#include "Observables.hpp"
#include "ObservablesGPU.hpp"
#include "ObservablesGPUMPI.hpp"
#include "StateVectorCudaMPI.hpp"
#include "StateVectorCudaManaged.hpp"
#include "cuStateVecError.hpp"
#include "cuda_helpers.hpp"
namespace {
using namespace Pennylane;
using namespace Pennylane::Measures;
using namespace Pennylane::Observables;
using namespace Pennylane::LightningGPU::Observables;
using namespace Pennylane::LightningGPU::MPI;
namespace cuUtil = Pennylane::LightningGPU::Util;
using Pennylane::LightningGPU::StateVectorCudaManaged;
using namespace Pennylane::Util;
} // namespace
namespace Pennylane::LightningGPU::Measures {
template <class StateVectorT>
class MeasurementsMPI final
: public MeasurementsBase<StateVectorT, MeasurementsMPI<StateVectorT>> {
private:
using PrecisionT = typename StateVectorT::PrecisionT;
using ComplexT = typename StateVectorT::ComplexT;
using BaseType =
MeasurementsBase<StateVectorT, MeasurementsMPI<StateVectorT>>;
using CFP_t = decltype(cuUtil::getCudaType(PrecisionT{}));
cudaDataType_t data_type_;
MPIManager mpi_manager_;
GateCache<PrecisionT> gate_cache_;
public:
explicit MeasurementsMPI(StateVectorT &statevector)
: BaseType{statevector}, mpi_manager_(statevector.getMPIManager()),
gate_cache_(true, statevector.getDataBuffer().getDevTag()) {
if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
std::is_same_v<CFP_t, double2>) {
data_type_ = CUDA_C_64F;
} else {
data_type_ = CUDA_C_32F;
}
mpi_manager_.Barrier();
};
auto probs(const std::vector<std::size_t> &wires)
-> std::vector<PrecisionT> {
// Data return type fixed as double in custatevec function call
std::vector<double> subgroup_probabilities;
// this should be built upon by the wires not participating
int maskLen = 0;
int *maskBitString = nullptr;
int *maskOrdering = nullptr;
std::vector<int> wires_int(wires.size());
// Transform indices between PL & cuQuantum ordering
std::transform(wires.begin(), wires.end(), wires_int.begin(),
[&](std::size_t x) {
return static_cast<int>(
this->_statevector.getTotalNumQubits() - 1 - x);
});
// split wires_int to global and local ones
std::vector<int> wires_local;
std::vector<int> wires_global;
for (const auto &wire : wires_int) {
if (wire <
static_cast<int>(this->_statevector.getNumLocalQubits())) {
wires_local.push_back(wire);
} else {
wires_global.push_back(wire);
}
}
std::vector<double> local_probabilities(
Pennylane::Util::exp2(wires_local.size()));
PL_CUSTATEVEC_IS_SUCCESS(custatevecAbs2SumArray(
/* custatevecHandle_t */ this->_statevector.getCusvHandle(),
/* const void* */ this->_statevector.getData(),
/* cudaDataType_t */ data_type_,
/* const uint32_t */ this->_statevector.getNumLocalQubits(),
/* double* */ local_probabilities.data(),
/* const int32_t* */ wires_local.data(),
/* const uint32_t */ wires_local.size(),
/* const int32_t* */ maskBitString,
/* const int32_t* */ maskOrdering,
/* const uint32_t */ maskLen));
// create new MPI communicator groups
std::size_t subCommGroupId = 0;
for (size_t i = 0; i < wires_global.size(); i++) {
std::size_t mask =
1 << (wires_global[i] - this->_statevector.getNumLocalQubits());
std::size_t bitValue = mpi_manager_.getRank() & mask;
subCommGroupId += bitValue
<< (wires_global[i] -
this->_statevector.getNumLocalQubits());
}
auto sub_mpi_manager0 =
mpi_manager_.split(subCommGroupId, mpi_manager_.getRank());
if (sub_mpi_manager0.getSize() == 1) {
if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
std::is_same_v<CFP_t, double2>) {
return local_probabilities;
} else {
std::vector<PrecisionT> local_probs(
Pennylane::Util::exp2(wires_local.size()));
std::transform(
local_probabilities.begin(), local_probabilities.end(),
local_probs.begin(),
[&](double x) { return static_cast<PrecisionT>(x); });
return local_probs;
}
} else {
// LCOV_EXCL_START
// Won't be covered with 2 MPI processes in CI checks
if (sub_mpi_manager0.getRank() == 0) {
subgroup_probabilities.resize(
Pennylane::Util::exp2(wires_local.size()));
}
sub_mpi_manager0.Reduce<double>(local_probabilities,
subgroup_probabilities, 0, "sum");
if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
std::is_same_v<CFP_t, double2>) {
return subgroup_probabilities;
} else {
std::vector<PrecisionT> local_probs(
Pennylane::Util::exp2(wires_local.size()));
std::transform(
subgroup_probabilities.begin(),
subgroup_probabilities.end(), local_probs.begin(),
[&](double x) { return static_cast<PrecisionT>(x); });
return local_probs;
}
// LCOV_EXCL_STOP
}
}
auto probs() -> std::vector<PrecisionT> {
std::vector<std::size_t> wires;
for (size_t i = 0; i < this->_statevector.getNumQubits(); i++) {
wires.push_back(i);
}
return this->probs(wires);
}
std::vector<PrecisionT> probs(const Observable<StateVectorT> &obs,
std::size_t num_shots = 0) {
return BaseType::probs(obs, num_shots);
}
std::vector<PrecisionT> probs(size_t num_shots) {
return BaseType::probs(num_shots);
}
std::vector<PrecisionT> probs(const std::vector<std::size_t> &wires,
std::size_t num_shots) {
return BaseType::probs(wires, num_shots);
}
auto generate_samples(size_t num_samples) -> std::vector<std::size_t> {
double epsilon = 1e-15;
std::size_t nSubSvs = 1UL << (this->_statevector.getNumGlobalQubits());
std::vector<double> rand_nums(num_samples);
std::vector<std::size_t> samples(
num_samples * this->_statevector.getTotalNumQubits(), 0);
std::size_t bitStringLen = this->_statevector.getNumGlobalQubits() +
this->_statevector.getNumLocalQubits();
std::vector<int> bitOrdering(bitStringLen);
for (size_t i = 0; i < bitOrdering.size(); i++) {
bitOrdering[i] = i;
}
std::vector<custatevecIndex_t> localBitStrings(num_samples);
std::vector<custatevecIndex_t> globalBitStrings(num_samples);
if (mpi_manager_.getRank() == 0) {
for (size_t n = 0; n < num_samples; n++) {
rand_nums[n] = (n + 1.0) / (num_samples + 2.0);
}
}
mpi_manager_.Bcast<double>(rand_nums, 0);
custatevecSamplerDescriptor_t sampler;
void *extraWorkspace = nullptr;
std::size_t extraWorkspaceSizeInBytes = 0;
PL_CUSTATEVEC_IS_SUCCESS(custatevecSamplerCreate(
/* custatevecHandle_t */ this->_statevector.getCusvHandle(),
/* const void* */ this->_statevector.getData(),
/* cudaDataType_t */ data_type_,
/* const uint32_t */ this->_statevector.getNumLocalQubits(),
/* custatevecSamplerDescriptor_t * */ &sampler,
/* uint32_t */ num_samples,
/* std::size_t* */ &extraWorkspaceSizeInBytes));
if (extraWorkspaceSizeInBytes > 0)
PL_CUDA_IS_SUCCESS(
cudaMalloc(&extraWorkspace, extraWorkspaceSizeInBytes));
PL_CUSTATEVEC_IS_SUCCESS(custatevecSamplerPreprocess(
/* custatevecHandle_t */ this->_statevector.getCusvHandle(),
/* custatevecSamplerDescriptor_t */ sampler,
/* void* */ extraWorkspace,
/* const std::size_t */ extraWorkspaceSizeInBytes));
double subNorm = 0;
PL_CUSTATEVEC_IS_SUCCESS(custatevecSamplerGetSquaredNorm(
/* custatevecHandle_t */ this->_statevector.getCusvHandle(),
/* custatevecSamplerDescriptor_t */ sampler,
/* double * */ &subNorm));
int source = (mpi_manager_.getRank() - 1 + mpi_manager_.getSize()) %
mpi_manager_.getSize();
int dest = (mpi_manager_.getRank() + 1) % mpi_manager_.getSize();
double cumulative = 0;
mpi_manager_.Scan<double>(subNorm, cumulative, "sum");
double norm = cumulative;
mpi_manager_.Bcast<double>(norm, mpi_manager_.getSize() - 1);
double precumulative;
mpi_manager_.Sendrecv<double>(cumulative, dest, precumulative, source);
if (mpi_manager_.getRank() == 0) {
precumulative = 0;
}
PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
mpi_manager_.Barrier();
// Ensure the 'custatevecSamplerApplySubSVOffset' function can be called
// successfully without reducing accuracy.
// LCOV_EXCL_START
if (precumulative == norm) {
precumulative = norm - epsilon;
}
// LCOV_EXCL_STOP
PL_CUSTATEVEC_IS_SUCCESS(custatevecSamplerApplySubSVOffset(
/* custatevecHandle_t */ this->_statevector.getCusvHandle(),
/* custatevecSamplerDescriptor_t */ sampler,
/* int32_t */ static_cast<int>(mpi_manager_.getRank()),
/* uint32_t */ nSubSvs,
/* double */ precumulative,
/* double */ norm));
PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
auto low = std::lower_bound(rand_nums.begin(), rand_nums.end(),
cumulative / norm);
int shotOffset = std::distance(rand_nums.begin(), low);
if (mpi_manager_.getRank() == (mpi_manager_.getSize() - 1)) {
shotOffset = num_samples;
}
int preshotOffset;
mpi_manager_.Sendrecv<int>(shotOffset, dest, preshotOffset, source);
if (mpi_manager_.getRank() == 0) {
preshotOffset = 0;
}
mpi_manager_.Barrier();
int nSubShots = shotOffset - preshotOffset;
if (nSubShots > 0) {
PL_CUSTATEVEC_IS_SUCCESS(custatevecSamplerSample(
/* custatevecHandle_t */ this->_statevector.getCusvHandle(),
/* custatevecSamplerDescriptor_t */ sampler,
/* custatevecIndex_t* */ &localBitStrings[preshotOffset],
/* const int32_t * */ bitOrdering.data(),
/* const uint32_t */ bitStringLen,
/* const double * */ &rand_nums[preshotOffset],
/* const uint32_t */ nSubShots,
/* enum custatevecSamplerOutput_t */
CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER));
}
PL_CUSTATEVEC_IS_SUCCESS(custatevecSamplerDestroy(sampler));
if (extraWorkspaceSizeInBytes > 0) {
PL_CUDA_IS_SUCCESS(cudaFree(extraWorkspace));
}
PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
mpi_manager_.Barrier();
mpi_manager_.Allreduce<custatevecIndex_t>(localBitStrings,
globalBitStrings, "sum");
for (size_t i = 0; i < num_samples; i++) {
for (size_t j = 0; j < bitStringLen; j++) {
samples[i * bitStringLen + (bitStringLen - 1 - j)] =
(globalBitStrings[i] >> j) & 1U;
}
}
mpi_manager_.Barrier();
return samples;
}
template <class index_type>
auto expval(const index_type *csrOffsets_ptr, const int64_t csrOffsets_size,
const index_type *columns_ptr,
const std::complex<PrecisionT> *values_ptr,
const int64_t numNNZ) -> PrecisionT {
if (mpi_manager_.getRank() == 0) {
PL_ABORT_IF_NOT(
static_cast<std::size_t>(csrOffsets_size - 1) ==
(size_t{1} << this->_statevector.getTotalNumQubits()),
"Incorrect size of CSR Offsets.");
PL_ABORT_IF_NOT(numNNZ > 0, "Empty CSR matrix.");
}
PrecisionT local_expect = 0;
auto device_id =
this->_statevector.getDataBuffer().getDevTag().getDeviceID();
auto stream_id =
this->_statevector.getDataBuffer().getDevTag().getStreamID();
const std::size_t length_local =
std::size_t{1} << this->_statevector.getNumLocalQubits();
DataBuffer<CFP_t, int> d_res_per_rowblock{length_local, device_id,
stream_id, true};
d_res_per_rowblock.zeroInit();
// with new wrapper
cuUtil::SparseMV_cuSparseMPI<index_type, PrecisionT, CFP_t>(
mpi_manager_, length_local, csrOffsets_ptr, csrOffsets_size,
columns_ptr, values_ptr,
const_cast<CFP_t *>(this->_statevector.getData()),
d_res_per_rowblock.getData(), device_id, stream_id,
this->_statevector.getCusparseHandle());
local_expect =
innerProdC_CUDA(d_res_per_rowblock.getData(),
this->_statevector.getData(),
this->_statevector.getLength(), device_id,
stream_id, this->_statevector.getCublasCaller())
.x;
PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
mpi_manager_.Barrier();
auto expect = mpi_manager_.allreduce<PrecisionT>(local_expect, "sum");
return expect;
}
auto expval(const std::string &operation,
const std::vector<std::size_t> &wires) -> PrecisionT {
std::vector<PrecisionT> params = {0.0};
std::vector<CFP_t> gate_matrix = {};
auto expect =
this->_statevector.expval(operation, wires, params, gate_matrix);
return static_cast<PrecisionT>(expect.x);
}
template <typename op_type>
auto expval(const std::vector<op_type> &operations_list,
const std::vector<std::vector<std::size_t>> &wires_list)
-> std::vector<PrecisionT> {
PL_ABORT_IF(
(operations_list.size() != wires_list.size()),
"The lengths of the list of operations and wires do not match.");
std::vector<PrecisionT> expected_value_list;
for (size_t index = 0; index < operations_list.size(); index++) {
expected_value_list.emplace_back(
expval(operations_list[index], wires_list[index]));
PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
mpi_manager_.Barrier();
}
return expected_value_list;
}
auto expval(const Observable<StateVectorT> &ob) -> PrecisionT {
StateVectorT ob_sv(this->_statevector);
ob.applyInPlace(ob_sv);
auto device_id = ob_sv.getDataBuffer().getDevTag().getDeviceID();
auto stream_id = ob_sv.getDataBuffer().getDevTag().getStreamID();
auto local_expect =
innerProdC_CUDA(this->_statevector.getData(), ob_sv.getData(),
this->_statevector.getLength(), device_id,
stream_id, this->_statevector.getCublasCaller())
.x;
PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
mpi_manager_.Barrier();
auto expect = mpi_manager_.allreduce<PrecisionT>(local_expect, "sum");
return static_cast<PrecisionT>(expect);
}
auto expval(const Observable<StateVectorT> &obs,
const std::size_t &num_shots,
const std::vector<std::size_t> &shot_range) -> PrecisionT {
mpi_manager_.Barrier();
PrecisionT result = BaseType::expval(obs, num_shots, shot_range);
mpi_manager_.Barrier();
return result;
}
auto expval(const std::vector<ComplexT> &matrix,
const std::vector<std::size_t> &wires) -> PrecisionT {
auto expect = this->_statevector.expval(wires, matrix);
return static_cast<PrecisionT>(expect);
}
auto expval(const std::vector<std::string> &pauli_words,
const std::vector<std::vector<std::size_t>> &tgts,
const std::complex<PrecisionT> *coeffs) -> PrecisionT {
return this->_statevector.getExpectationValuePauliWords(pauli_words,
tgts, coeffs);
}
auto var(const Observable<StateVectorT> &ob) -> PrecisionT {
StateVectorT ob_sv(this->_statevector);
ob.applyInPlace(ob_sv);
auto device_id = ob_sv.getDataBuffer().getDevTag().getDeviceID();
auto stream_id = ob_sv.getDataBuffer().getDevTag().getStreamID();
PrecisionT mean_square_local =
innerProdC_CUDA(ob_sv.getData(), ob_sv.getData(), ob_sv.getLength(),
device_id, stream_id, ob_sv.getCublasCaller())
.x;
PrecisionT squared_mean_local =
innerProdC_CUDA(this->_statevector.getData(), ob_sv.getData(),
this->_statevector.getLength(), device_id,
stream_id, this->_statevector.getCublasCaller())
.x;
PrecisionT mean_square =
mpi_manager_.allreduce<PrecisionT>(mean_square_local, "sum");
PrecisionT squared_mean =
mpi_manager_.allreduce<PrecisionT>(squared_mean_local, "sum");
return mean_square - squared_mean * squared_mean;
}
auto var(const std::string &operation,
const std::vector<std::size_t> &wires) -> PrecisionT {
StateVectorT ob_sv(this->_statevector);
ob_sv.applyOperation(operation, wires);
auto device_id = ob_sv.getDataBuffer().getDevTag().getDeviceID();
auto stream_id = ob_sv.getDataBuffer().getDevTag().getStreamID();
PrecisionT mean_square_local =
innerProdC_CUDA(ob_sv.getData(), ob_sv.getData(), ob_sv.getLength(),
device_id, stream_id, ob_sv.getCublasCaller())
.x;
PrecisionT squared_mean_local =
innerProdC_CUDA(this->_statevector.getData(), ob_sv.getData(),
this->_statevector.getLength(), device_id,
stream_id, this->_statevector.getCublasCaller())
.x;
PrecisionT mean_square =
mpi_manager_.allreduce<PrecisionT>(mean_square_local, "sum");
PrecisionT squared_mean =
mpi_manager_.allreduce<PrecisionT>(squared_mean_local, "sum");
return mean_square - squared_mean * squared_mean;
};
auto var(const std::vector<ComplexT> &matrix,
const std::vector<std::size_t> &wires) -> PrecisionT {
StateVectorT ob_sv(this->_statevector);
ob_sv.applyMatrix(matrix, wires);
auto device_id = ob_sv.getDataBuffer().getDevTag().getDeviceID();
auto stream_id = ob_sv.getDataBuffer().getDevTag().getStreamID();
PrecisionT mean_square_local =
innerProdC_CUDA(ob_sv.getData(), ob_sv.getData(), ob_sv.getLength(),
device_id, stream_id, ob_sv.getCublasCaller())
.x;
PrecisionT squared_mean_local =
innerProdC_CUDA(this->_statevector.getData(), ob_sv.getData(),
this->_statevector.getLength(), device_id,
stream_id, this->_statevector.getCublasCaller())
.x;
PrecisionT mean_square =
mpi_manager_.allreduce<PrecisionT>(mean_square_local, "sum");
PrecisionT squared_mean =
mpi_manager_.allreduce<PrecisionT>(squared_mean_local, "sum");
return mean_square - squared_mean * squared_mean;
};
template <typename op_type>
auto var(const std::vector<op_type> &operations_list,
const std::vector<std::vector<std::size_t>> &wires_list)
-> std::vector<PrecisionT> {
PL_ABORT_IF(
(operations_list.size() != wires_list.size()),
"The lengths of the list of operations and wires do not match.");
std::vector<PrecisionT> var_list;
for (size_t index = 0; index < operations_list.size(); index++) {
var_list.emplace_back(
var(operations_list[index], wires_list[index]));
}
return var_list;
};
template <class index_type>
PrecisionT var(const index_type *csrOffsets_ptr,
const int64_t csrOffsets_size, const index_type *columns_ptr,
const std::complex<PrecisionT> *values_ptr,
const int64_t numNNZ) {
if (mpi_manager_.getRank() == 0) {
PL_ABORT_IF_NOT(
static_cast<std::size_t>(csrOffsets_size - 1) ==
(size_t{1} << this->_statevector.getTotalNumQubits()),
"Incorrect size of CSR Offsets.");
PL_ABORT_IF_NOT(numNNZ > 0, "Empty CSR matrix.");
}
auto device_id =
this->_statevector.getDataBuffer().getDevTag().getDeviceID();
auto stream_id =
this->_statevector.getDataBuffer().getDevTag().getStreamID();
const std::size_t length_local =
std::size_t{1} << this->_statevector.getNumLocalQubits();
DataBuffer<CFP_t, int> d_res_per_rowblock{length_local, device_id,
stream_id, true};
d_res_per_rowblock.zeroInit();
cuUtil::SparseMV_cuSparseMPI<index_type, PrecisionT, CFP_t>(
mpi_manager_, length_local, csrOffsets_ptr, csrOffsets_size,
columns_ptr, values_ptr,
const_cast<CFP_t *>(this->_statevector.getData()),
d_res_per_rowblock.getData(), device_id, stream_id,
this->_statevector.getCusparseHandle());
PrecisionT mean_square_local =
innerProdC_CUDA(d_res_per_rowblock.getData(),
d_res_per_rowblock.getData(),
d_res_per_rowblock.getLength(), device_id,
stream_id, this->_statevector.getCublasCaller())
.x;
PrecisionT squared_mean_local =
innerProdC_CUDA(this->_statevector.getData(),
d_res_per_rowblock.getData(),
this->_statevector.getLength(), device_id,
stream_id, this->_statevector.getCublasCaller())
.x;
PrecisionT mean_square =
mpi_manager_.allreduce<PrecisionT>(mean_square_local, "sum");
PrecisionT squared_mean =
mpi_manager_.allreduce<PrecisionT>(squared_mean_local, "sum");
return mean_square - squared_mean * squared_mean;
};
auto var(const Observable<StateVectorT> &obs, const std::size_t &num_shots)
-> PrecisionT {
return BaseType::var(obs, num_shots);
}
}; // class Measurements
} // namespace Pennylane::LightningGPU::Measures
api/program_listing_file_pennylane_lightning_core_src_simulators_lightning_gpu_measurements_MeasurementsGPUMPI.hpp
Download Python script
Download Notebook
View on GitHub