Program Listing for File MeasurementsTNCuda.hpp

Source file: pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/MeasurementsTNCuda.hpp

// Copyright 2024 Xanadu Quantum Technologies Inc.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

//     http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <complex>
#include <cuComplex.h>
#include <cutensornet.h>
#include <numeric>
#include <vector>

#include "LinearAlg.hpp"
#include "MPSTNCuda.hpp"
#include "ObservablesTNCuda.hpp"
#include "ObservablesTNCudaOperator.hpp"

#include "tncuda_helpers.hpp"

namespace {
using namespace Pennylane::LightningTensor::TNCuda;
using namespace Pennylane::LightningTensor::TNCuda::Observables;
using namespace Pennylane::LightningTensor::TNCuda::Util;
} // namespace

namespace Pennylane::LightningTensor::TNCuda::Measures {
extern void getProbs_CUDA(cuComplex *state, float *probs, const int data_size,
                          const std::size_t thread_per_block,
                          cudaStream_t stream_id);
extern void getProbs_CUDA(cuDoubleComplex *state, double *probs,
                          const int data_size,
                          const std::size_t thread_per_block,
                          cudaStream_t stream_id);
extern void normalizeProbs_CUDA(float *probs, const int data_size,
                                const float sum,
                                const std::size_t thread_per_block,
                                cudaStream_t stream_id);
extern void normalizeProbs_CUDA(double *probs, const int data_size,
                                const double sum,
                                const std::size_t thread_per_block,
                                cudaStream_t stream_id);
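
// The probability kernels declared above are implemented in a separate CUDA
// translation unit. As a rough sketch of the per-element operation (an
// illustration of the expected behaviour, not the actual kernel), each thread
// computes probs[i] = |state[i]|^2:
//
//   __global__ void getProbsKernel(const cuDoubleComplex *state,
//                                  double *probs, const int data_size) {
//       const int i = blockIdx.x * blockDim.x + threadIdx.x;
//       if (i < data_size) {
//           probs[i] = state[i].x * state[i].x + state[i].y * state[i].y;
//       }
//   }
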
template <class TensorNetT> class MeasurementsTNCuda {
  private:
    using PrecisionT = typename TensorNetT::PrecisionT;
    using ComplexT = typename TensorNetT::ComplexT;
    using CFP_t = typename TensorNetT::CFP_t;

    const TensorNetT &tensor_network_;

  public:
    explicit MeasurementsTNCuda(const TensorNetT &tensor_network)
        : tensor_network_(tensor_network) {}

    template <std::size_t thread_per_block = 256>
    auto probs(const std::vector<std::size_t> &wires,
               const int32_t numHyperSamples = 1) -> std::vector<PrecisionT> {
        PL_ABORT_IF_NOT(std::is_sorted(wires.begin(), wires.end()),
                        "Invalid wire indices order. Please ensure that the "
                        "wire indices are in ascending order.");

        const std::size_t length = std::size_t{1} << wires.size();

        std::vector<PrecisionT> h_res(length, 0.0);

        DataBuffer<CFP_t, int> d_output_tensor(
            length, tensor_network_.getDevTag(), true);

        DataBuffer<PrecisionT, int> d_output_probs(
            length, tensor_network_.getDevTag(), true);

        d_output_tensor.zeroInit();
        d_output_probs.zeroInit();

        auto stateModes = cuUtil::NormalizeCastIndices<std::size_t, int32_t>(
            wires, tensor_network_.getNumQubits());

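        // Modes not listed in `wires` are "projected" modes: each is fixed to
        // a computational-basis value and summed over to produce the marginal
        // distribution of the requested wires.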
        std::vector<int32_t> projected_modes{};

        for (int32_t idx = 0;
             idx < static_cast<int32_t>(tensor_network_.getNumQubits());
             idx++) {
            auto it = std::find(stateModes.begin(), stateModes.end(), idx);
            if (it == stateModes.end()) {
                projected_modes.emplace_back(idx);
            }
        }

        std::vector<int64_t> projectedModeValues(projected_modes.size(), 0);

        if (projected_modes.empty()) {
            tensor_network_.get_state_tensor(d_output_tensor.getData(),
                                             d_output_tensor.getLength(), {},
                                             {}, numHyperSamples);
            getProbs_CUDA(d_output_tensor.getData(), d_output_probs.getData(),
                          static_cast<int>(length), thread_per_block,
                          tensor_network_.getDevTag().getStreamID());

        } else {
            PL_ABORT_IF(projected_modes.size() >= 64,
                        "The number of projected modes must be less than 64; "
                        "otherwise the shift computing projected_modes_size "
                        "would overflow std::size_t.");
            const std::size_t projected_modes_size = std::size_t(1U)
                                                     << projected_modes.size();

            DataBuffer<PrecisionT, int> tmp_probs(
                length, tensor_network_.getDevTag(), true);

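            // Enumerate every basis assignment of the projected modes; the
            // bits of `idx` supply the 0/1 value of each projected mode.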
            for (std::size_t idx = 0; idx < projected_modes_size; idx++) {
                for (std::size_t j = 0; j < projected_modes.size(); j++) {
                    projectedModeValues[j] = (idx >> j) & 1U;
                }

                tensor_network_.get_state_tensor(
                    d_output_tensor.getData(), length, projected_modes,
                    projectedModeValues, numHyperSamples);

                getProbs_CUDA(d_output_tensor.getData(), tmp_probs.getData(),
                              static_cast<int>(length), thread_per_block,
                              tensor_network_.getDevTag().getStreamID());

                // Accumulate the partial probabilities into the output buffer
                scaleAndAdd_CUDA(PrecisionT{1.0}, tmp_probs.getData(),
                                 d_output_probs.getData(),
                                 tmp_probs.getLength(),
                                 tensor_network_.getDevTag().getDeviceID(),
                                 tensor_network_.getDevTag().getStreamID(),
                                 tensor_network_.getCublasCaller());
            }
        }

        // More than `10` wires means more than 2^10 = `1024` elements, so the
        // normalization is worth performing on the GPU.
        // LCOV_EXCL_START
        if (wires.size() > 10) {
            PrecisionT sum;

            asum_CUDA_device<PrecisionT>(
                d_output_probs.getData(), length,
                tensor_network_.getDevTag().getDeviceID(),
                tensor_network_.getDevTag().getStreamID(),
                tensor_network_.getCublasCaller(), &sum);

            PL_ABORT_IF(sum == 0.0, "Sum of probabilities is zero.");

            normalizeProbs_CUDA(d_output_probs.getData(),
                                static_cast<int>(length), sum,
                                thread_per_block,
                                tensor_network_.getDevTag().getStreamID());

            d_output_probs.CopyGpuDataToHost(h_res.data(), h_res.size());
        } else {
            // LCOV_EXCL_STOP
            // This branch dispatches the calculation to the CPU for a small
            // number of wires. The CPU calculation is faster than the GPU
            // calculation for a small number of wires due to the overhead of
            // the GPU kernel launch.
            d_output_probs.CopyGpuDataToHost(h_res.data(), h_res.size());

            // TODO: OMP support
            PrecisionT sum =
                std::accumulate(h_res.begin(), h_res.end(), PrecisionT{0.0});

            PL_ABORT_IF(sum == 0.0, "Sum of probabilities is zero.");
            // TODO: OMP support
            for (std::size_t i = 0; i < length; i++) {
                h_res[i] /= sum;
            }
        }

        return h_res;
    }
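
    // Usage sketch (hypothetical; not part of the original header). Assuming
    // an MPSTNCuda<double> state `mps` has been prepared elsewhere:
    //
    //   MeasurementsTNCuda<MPSTNCuda<double>> measure(mps);
    //   auto p = measure.probs({0, 1}); // joint probabilities of wires 0 and
    //                                   // 1, ordered |00>, |01>, |10>, |11>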

    auto generate_samples(const std::vector<std::size_t> &wires,
                          const std::size_t num_samples,
                          const int32_t numHyperSamples = 1)
        -> std::vector<std::size_t> {
        std::vector<int64_t> samples(num_samples * wires.size());

        const std::vector<int32_t> modesToSample =
            cuUtil::NormalizeCastIndices<std::size_t, int32_t>(
                wires, tensor_network_.getNumQubits());

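        // cutensornet sampling workflow: create the sampler, configure it,
        // size and attach a workspace, draw the shots, then destroy the
        // sampler and the workspace descriptor.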
        cutensornetStateSampler_t sampler;

        PL_CUTENSORNET_IS_SUCCESS(cutensornetCreateSampler(
            /* const cutensornetHandle_t */ tensor_network_.getTNCudaHandle(),
            /* cutensornetState_t */ tensor_network_.getQuantumState(),
            /* int32_t numModesToSample */
            static_cast<int32_t>(modesToSample.size()),
            /* const int32_t *modesToSample */ modesToSample.data(),
            /* cutensornetStateSampler_t * */ &sampler));

        // Configure the quantum circuit sampler
        const cutensornetSamplerAttributes_t samplerAttributes =
            CUTENSORNET_SAMPLER_CONFIG_NUM_HYPER_SAMPLES;

        PL_CUTENSORNET_IS_SUCCESS(cutensornetSamplerConfigure(
            /* const cutensornetHandle_t */ tensor_network_.getTNCudaHandle(),
            /* cutensornetStateSampler_t */ sampler,
            /* cutensornetSamplerAttributes_t */ samplerAttributes,
            /* const void *attributeValue */ &numHyperSamples,
            /* size_t attributeSize */ sizeof(numHyperSamples)));

        cutensornetWorkspaceDescriptor_t workDesc;
        PL_CUTENSORNET_IS_SUCCESS(cutensornetCreateWorkspaceDescriptor(
            /* const cutensornetHandle_t */ tensor_network_.getTNCudaHandle(),
            /* cutensornetWorkspaceDescriptor_t * */ &workDesc));

        const std::size_t scratchSize = cuUtil::getFreeMemorySize() / 2;

        // Prepare the quantum circuit sampler for sampling
        PL_CUTENSORNET_IS_SUCCESS(cutensornetSamplerPrepare(
            /* const cutensornetHandle_t */ tensor_network_.getTNCudaHandle(),
            /* cutensornetStateSampler_t */ sampler,
            /* size_t maxWorkspaceSizeDevice */ scratchSize,
            /* cutensornetWorkspaceDescriptor_t */ workDesc,
            /* cudaStream_t unused as of v24.08 */ 0x0));

        std::size_t worksize =
            getWorkSpaceMemorySize(tensor_network_.getTNCudaHandle(), workDesc);

        PL_ABORT_IF(worksize > scratchSize,
                    "Insufficient workspace size on Device.\n");

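        // Round the required byte count up to a whole number of std::size_t
        // elements for the scratch buffer.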
        const std::size_t d_scratch_length = worksize / sizeof(std::size_t) + 1;
        DataBuffer<std::size_t> d_scratch(d_scratch_length,
                                          tensor_network_.getDevTag(), true);

        setWorkSpaceMemory(tensor_network_.getTNCudaHandle(), workDesc,
                           reinterpret_cast<void *>(d_scratch.getData()),
                           worksize);

        PL_CUTENSORNET_IS_SUCCESS(cutensornetSamplerSample(
            /* const cutensornetHandle_t */ tensor_network_.getTNCudaHandle(),
            /* cutensornetStateSampler_t */ sampler,
            /* int64_t numShots */ num_samples,
            /* cutensornetWorkspaceDescriptor_t */ workDesc,
            /* int64_t * */ samples.data(),
            /* cudaStream_t unused as of v24.08 */ 0x0));

        PL_CUTENSORNET_IS_SUCCESS(
            cutensornetDestroyWorkspaceDescriptor(workDesc));
        PL_CUTENSORNET_IS_SUCCESS(cutensornetDestroySampler(sampler));

        std::vector<std::size_t> samples_size_t(samples.size());

        std::transform(samples.begin(), samples.end(), samples_size_t.begin(),
                       [](int64_t x) { return static_cast<std::size_t>(x); });
        return samples_size_t;
    }
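
    // Usage sketch (hypothetical, reusing the `measure` object from the
    // sketch above): 1000 shots over wires {0, 1, 2}, returned as a flat
    // vector of num_samples * wires.size() binary (0/1) entries.
    //
    //   auto samples = measure.generate_samples({0, 1, 2}, 1000);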

    auto var(ObservableTNCuda<TensorNetT> &obs,
             const int32_t numHyperSamples = 1) -> PrecisionT {
        // The cutensornetNetworkOperator_t object created inside expval() is
        // released when that call returns.
        auto expectation_val = expval(obs, numHyperSamples);

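        // With val_cal set to true, the operator constructed below represents
        // the squared observable, so expval_() returns <O^2>.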
        const bool val_cal = true;
        auto tnObs2Operator =
            ObservableTNCudaOperator<TensorNetT>(tensor_network_, obs, val_cal);
        auto expectation_squared_obs =
            expval_(tnObs2Operator.getTNOperator(), numHyperSamples);

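        // Var(O) = <O^2> - <O>^2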
        return expectation_squared_obs - expectation_val * expectation_val;
    }

    auto expval(ObservableTNCuda<TensorNetT> &obs,
                const int32_t numHyperSamples = 1) -> PrecisionT {
        auto tnoperator =
            ObservableTNCudaOperator<TensorNetT>(tensor_network_, obs);

        return expval_(tnoperator.getTNOperator(), numHyperSamples);
    }
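
    // Usage sketch (hypothetical; not part of the original header). Assuming
    // NamedObsTNCuda from ObservablesTNCuda.hpp and the `measure` object from
    // the sketch above:
    //
    //   NamedObsTNCuda<MPSTNCuda<double>> obs("PauliZ", {0});
    //   auto e = measure.expval(obs); // <Z_0>
    //   auto v = measure.var(obs);    // <Z_0^2> - <Z_0>^2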

  private:
    auto expval_(cutensornetNetworkOperator_t tnoperator,
                 const int32_t numHyperSamples) -> PrecisionT {
        ComplexT expectation_val{0.0, 0.0};
        ComplexT state_norm2{0.0, 0.0};

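        // The expectation computation follows the same cutensornet workflow
        // as the sampler: create, configure, prepare a workspace, compute,
        // then destroy.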
        cutensornetStateExpectation_t expectation;

        PL_CUTENSORNET_IS_SUCCESS(cutensornetCreateExpectation(
            /* const cutensornetHandle_t */ tensor_network_.getTNCudaHandle(),
            /* cutensornetState_t */ tensor_network_.getQuantumState(),
            /* cutensornetNetworkOperator_t */ tnoperator,
            /* cutensornetStateExpectation_t * */ &expectation));

        PL_CUTENSORNET_IS_SUCCESS(cutensornetExpectationConfigure(
            /* const cutensornetHandle_t */ tensor_network_.getTNCudaHandle(),
            /* cutensornetStateExpectation_t */ expectation,
            /* cutensornetExpectationAttributes_t */
            CUTENSORNET_EXPECTATION_CONFIG_NUM_HYPER_SAMPLES,
            /* const void * */ &numHyperSamples,
            /* std::size_t */ sizeof(numHyperSamples)));

        cutensornetWorkspaceDescriptor_t workDesc;
        PL_CUTENSORNET_IS_SUCCESS(cutensornetCreateWorkspaceDescriptor(
            /* const cutensornetHandle_t */ tensor_network_.getTNCudaHandle(),
            /* cutensornetWorkspaceDescriptor_t * */ &workDesc));

        const std::size_t scratchSize = cuUtil::getFreeMemorySize() / 2;

        // Prepare the specified quantum circuit expectation value for
        // computation
        PL_CUTENSORNET_IS_SUCCESS(cutensornetExpectationPrepare(
            /* const cutensornetHandle_t */ tensor_network_.getTNCudaHandle(),
            /* cutensornetStateExpectation_t */ expectation,
            /* std::size_t maxWorkspaceSizeDevice */ scratchSize,
            /* cutensornetWorkspaceDescriptor_t */ workDesc,
            /* cudaStream_t [unused] */ 0x0));

        std::size_t worksize =
            getWorkSpaceMemorySize(tensor_network_.getTNCudaHandle(), workDesc);

        PL_ABORT_IF(worksize > scratchSize,
                    "Insufficient workspace size on Device.\n");

        const std::size_t d_scratch_length = worksize / sizeof(std::size_t) + 1;
        DataBuffer<std::size_t, int> d_scratch(
            d_scratch_length, tensor_network_.getDevTag(), true);

        setWorkSpaceMemory(tensor_network_.getTNCudaHandle(), workDesc,
                           reinterpret_cast<void *>(d_scratch.getData()),
                           worksize);

        PL_CUTENSORNET_IS_SUCCESS(cutensornetExpectationCompute(
            /* const cutensornetHandle_t */ tensor_network_.getTNCudaHandle(),
            /* cutensornetStateExpectation_t */ expectation,
            /* cutensornetWorkspaceDescriptor_t */ workDesc,
            /* void* */ static_cast<void *>(&expectation_val),
            /* void* */ static_cast<void *>(&state_norm2),
            /* cudaStream_t unused */ 0x0));

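        // Normalize by the squared state norm so that an unnormalized state
        // still yields <O> = <psi|O|psi> / <psi|psi>.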
        expectation_val /= state_norm2;

        PL_CUTENSORNET_IS_SUCCESS(
            cutensornetDestroyWorkspaceDescriptor(workDesc));
        PL_CUTENSORNET_IS_SUCCESS(cutensornetDestroyExpectation(expectation));

        return expectation_val.real();
    }
};
} // namespace Pennylane::LightningTensor::TNCuda::Measures