Program Listing for File LinearAlgebra.hpp

Source file: pennylane_lightning/core/src/simulators/lightning_qubit/utils/LinearAlgebra.hpp

// Copyright 2018-2023 Xanadu Quantum Technologies Inc.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

//     http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include <algorithm>
#include <complex>
#include <cstdlib>
#include <numeric>
#include <random>
#include <span>
#include <vector>

#include "Macros.hpp"
#include "TypeTraits.hpp" // remove_complex_t
#include "Util.hpp"       // ConstSum, ConstMult, ConstMultConj


#if __has_include(<cblas.h>) && defined _ENABLE_BLAS
#include <cblas.h>
constexpr bool USE_CBLAS = true;
#else
constexpr bool USE_CBLAS = false;
#ifndef CBLAS_TRANSPOSE
using CBLAS_TRANSPOSE = enum CBLAS_TRANSPOSE {
    CblasNoTrans = 111,
    CblasTrans = 112,
    CblasConjTrans = 113
};
#endif

#ifndef CBLAS_LAYOUT
using CBLAS_LAYOUT = enum CBLAS_LAYOUT {
    CblasRowMajor = 101,
    CblasColMajor = 102
};
#endif
#endif

namespace {
using namespace Pennylane::Util;
} // namespace


namespace Pennylane::LightningQubit::Util {
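// Transposition mode for matrix products; values mirror the corresponding
// CBLAS_TRANSPOSE constants so they can be passed to BLAS directly.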
enum class Trans : int {
    NoTranspose = CblasNoTrans,
    Transpose = CblasTrans,
    Adjoint = CblasConjTrans
};
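// OpenMP fallback for the unconjugated inner product: accumulates
// sum_i v1[i] * v2[i] into `result`, using roughly one thread per NTERMS
// elements when OpenMP is enabled.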
template <class T,
          size_t NTERMS = (1U << 19U)> // NOLINT(readability-magic-numbers)
inline static void
omp_innerProd(const std::complex<T> *v1, const std::complex<T> *v2,
              std::complex<T> &result, const size_t data_size) {
#if defined(_OPENMP)
#pragma omp declare reduction(sm : std::complex<T> : omp_out =                 \
                                  ConstSum(omp_out, omp_in))                   \
    initializer(omp_priv = std::complex<T>{0, 0})

    size_t nthreads = data_size / NTERMS;
    if (nthreads < 1) {
        nthreads = 1;
    }

#pragma omp parallel for num_threads(nthreads) default(none)                   \
    shared(v1, v2, data_size) reduction(sm : result)
#endif
    for (size_t i = 0; i < data_size; i++) {
        result = ConstSum(result, ConstMult(*(v1 + i), *(v2 + i)));
    }
}

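// Unconjugated inner product of two complex arrays. Uses cblas_{c,z}dotu_sub
// when BLAS is available; otherwise std::inner_product for inputs below
// STD_CROSSOVER elements and the OpenMP kernel above beyond that.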
template <class T,
          size_t STD_CROSSOVER = (1U
                                  << 20U)> // NOLINT(readability-magic-numbers)
inline auto innerProd(const std::complex<T> *v1, const std::complex<T> *v2,
                      const size_t data_size) -> std::complex<T> {
    std::complex<T> result(0, 0);

    if constexpr (USE_CBLAS) {
        if constexpr (std::is_same_v<T, float>) {
            cblas_cdotu_sub(data_size, v1, 1, v2, 1, &result);
        } else if constexpr (std::is_same_v<T, double>) {
            cblas_zdotu_sub(data_size, v1, 1, v2, 1, &result);
        }
    } else {
        if (data_size < STD_CROSSOVER) {
            result = std::inner_product(
                v1, v1 + data_size, v2, std::complex<T>(), ConstSum<T>,
                static_cast<std::complex<T> (*)(
                    std::complex<T>, std::complex<T>)>(&ConstMult<T>));
        } else {
            omp_innerProd(v1, v2, result, data_size);
        }
    }
    return result;
}

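// OpenMP fallback for the conjugated inner product: accumulates
// sum_i conj(v1[i]) * v2[i] into `result`.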
template <class T,
          size_t NTERMS = (1U << 19U)> // NOLINT(readability-magic-numbers)
inline static void
omp_innerProdC(const std::complex<T> *v1, const std::complex<T> *v2,
               std::complex<T> &result, const size_t data_size) {
#if defined(_OPENMP)
#pragma omp declare reduction(sm : std::complex<T> : omp_out =                 \
                                  ConstSum(omp_out, omp_in))                   \
    initializer(omp_priv = std::complex<T>{0, 0})

    size_t nthreads = data_size / NTERMS;
    if (nthreads < 1) {
        nthreads = 1;
    }

#pragma omp parallel for num_threads(nthreads) default(none)                   \
    shared(v1, v2, data_size) reduction(sm : result)
#endif
    for (size_t i = 0; i < data_size; i++) {
        result = ConstSum(result, ConstMultConj(*(v1 + i), *(v2 + i)));
    }
}

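// Conjugated inner product <v1|v2> = sum_i conj(v1[i]) * v2[i]. Uses
// cblas_{c,z}dotc_sub when BLAS is available; otherwise std::inner_product
// or the OpenMP kernel above, depending on the input size.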
template <class T,
          size_t STD_CROSSOVER = (1U
                                  << 20U)> // NOLINT(readability-magic-numbers)
inline auto innerProdC(const std::complex<T> *v1, const std::complex<T> *v2,
                       const size_t data_size) -> std::complex<T> {
    std::complex<T> result(0, 0);

    if constexpr (USE_CBLAS) {
        if constexpr (std::is_same_v<T, float>) {
            cblas_cdotc_sub(data_size, v1, 1, v2, 1, &result);
        } else if constexpr (std::is_same_v<T, double>) {
            cblas_zdotc_sub(data_size, v1, 1, v2, 1, &result);
        }
    } else {
        if (data_size < STD_CROSSOVER) {
            result =
                std::inner_product(v1, v1 + data_size, v2, std::complex<T>(),
                                   ConstSum<T>, ConstMultConj<T>);
        } else {
            omp_innerProdC(v1, v2, result, data_size);
        }
    }
    return result;
}

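// Convenience overload of innerProd for std::vector operands.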
template <class T, class AllocA, class AllocB>
inline auto innerProd(const std::vector<std::complex<T>, AllocA> &v1,
                      const std::vector<std::complex<T>, AllocB> &v2)
    -> std::complex<T> {
    return innerProd(v1.data(), v2.data(), v1.size());
}

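// Convenience overload of innerProdC for std::vector operands.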
template <class T, class AllocA, class AllocB>
inline auto innerProdC(const std::vector<std::complex<T>, AllocA> &v1,
                       const std::vector<std::complex<T>, AllocB> &v2)
    -> std::complex<T> {
    return innerProdC(v1.data(), v2.data(), v1.size());
}

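// OpenMP fallback for matrix-vector products: applies mat, its transpose, or
// its conjugate transpose (selected by `transpose`) to v_in and adds the
// result into v_out. The matrix is expected in row-major order.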
template <class T>
inline static void
omp_matrixVecProd(const std::complex<T> *mat, const std::complex<T> *v_in,
                  std::complex<T> *v_out, size_t m, size_t n, Trans transpose) {
    if (!v_out) {
        return;
    }

    size_t row;
    size_t col;

    {
        switch (transpose) {
        case Trans::NoTranspose:
#if defined(_OPENMP)
#pragma omp parallel for default(none) private(row, col) firstprivate(m, n)    \
    shared(v_out, mat, v_in)
#endif
            for (row = 0; row < m; row++) {
                for (col = 0; col < n; col++) {
                    v_out[row] += mat[row * n + col] * v_in[col];
                }
            }
            break;
        case Trans::Transpose:
#if defined(_OPENMP)
#pragma omp parallel for default(none) private(row, col) firstprivate(m, n)    \
    shared(v_out, mat, v_in)
#endif
            for (row = 0; row < m; row++) {
                for (col = 0; col < n; col++) {
                    v_out[row] += mat[col * m + row] * v_in[col];
                }
            }
            break;
        case Trans::Adjoint:
#if defined(_OPENMP)
#pragma omp parallel for default(none) private(row, col) firstprivate(m, n)    \
    shared(v_out, mat, v_in)
#endif
            for (row = 0; row < m; row++) {
                for (col = 0; col < n; col++) {
                    v_out[row] += std::conj(mat[col * m + row]) * v_in[col];
                }
            }
            break;
        }
    }
}

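// Matrix-vector product v_out = op(mat) * v_in, dispatching to
// cblas_{c,z}gemv when BLAS is available and to the OpenMP kernel above
// otherwise. v_out must hold m elements, zero-initialized for the non-BLAS
// path since results are accumulated into it.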
template <class T>
inline void matrixVecProd(const std::complex<T> *mat,
                          const std::complex<T> *v_in, std::complex<T> *v_out,
                          size_t m, size_t n,
                          Trans transpose = Trans::NoTranspose) {
    if (!v_out) {
        return;
    }

    if constexpr (USE_CBLAS) {
        constexpr std::complex<T> co{1, 0};
        constexpr std::complex<T> cz{0, 0};
        const auto tr = static_cast<CBLAS_TRANSPOSE>(transpose);
        if constexpr (std::is_same_v<T, float>) {
            cblas_cgemv(CblasRowMajor, tr, m, n, &co, mat, m, v_in, 1, &cz,
                        v_out, 1);
        } else if constexpr (std::is_same_v<T, double>) {
            cblas_zgemv(CblasRowMajor, tr, m, n, &co, mat, m, v_in, 1, &cz,
                        v_out, 1);
        }
    } else {
        omp_matrixVecProd(mat, v_in, v_out, m, n, transpose);
    }
}

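// std::vector overload of matrixVecProd; validates the operand dimensions and
// returns the resulting vector of length m.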
template <class T>
inline auto matrixVecProd(const std::vector<std::complex<T>> &mat,
                          const std::vector<std::complex<T>> &v_in, size_t m,
                          size_t n, Trans transpose = Trans::NoTranspose)
    -> std::vector<std::complex<T>> {
    if (mat.size() != m * n) {
        throw std::invalid_argument(
            "Invalid number of rows and columns for the input matrix");
    }
    if (v_in.size() != n) {
        throw std::invalid_argument("Invalid size for the input vector");
    }

    std::vector<std::complex<T>> v_out(m);
    matrixVecProd(mat.data(), v_in.data(), v_out.data(), m, n, transpose);
    return v_out;
}

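// Cache-friendly blocked transpose of a real m x n row-major matrix into the
// n x m buffer mat_t, recursing on the larger dimension until sub-blocks fit
// within BLOCKSIZE.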
template <class T, size_t BLOCKSIZE = 16> // NOLINT(readability-magic-numbers)
inline static void CFTranspose(const T *mat, T *mat_t, size_t m, size_t n,
                               size_t m1, size_t m2, size_t n1, size_t n2) {
    size_t r;
    size_t s;

    size_t r1;
    size_t s1;
    size_t r2;
    size_t s2;

    r1 = m2 - m1;
    s1 = n2 - n1;

    if (r1 >= s1 && r1 > BLOCKSIZE) {
        r2 = (m1 + m2) / 2;
        CFTranspose(mat, mat_t, m, n, m1, r2, n1, n2);
        m1 = r2;
        CFTranspose(mat, mat_t, m, n, m1, m2, n1, n2);
    } else if (s1 > BLOCKSIZE) {
        s2 = (n1 + n2) / 2;
        CFTranspose(mat, mat_t, m, n, m1, m2, n1, s2);
        n1 = s2;
        CFTranspose(mat, mat_t, m, n, m1, m2, n1, n2);
    } else {
        for (r = m1; r < m2; r++) {
            for (s = n1; s < n2; s++) {
                mat_t[s * m + r] = mat[r * n + s];
            }
        }
    }
}

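// Overload of CFTranspose for complex matrices.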
template <class T, size_t BLOCKSIZE = 16> // NOLINT(readability-magic-numbers)
inline static void CFTranspose(const std::complex<T> *mat,
                               std::complex<T> *mat_t, size_t m, size_t n,
                               size_t m1, size_t m2, size_t n1, size_t n2) {
    size_t r;
    size_t s;

    size_t r1;
    size_t s1;
    size_t r2;
    size_t s2;

    r1 = m2 - m1;
    s1 = n2 - n1;

    if (r1 >= s1 && r1 > BLOCKSIZE) {
        r2 = (m1 + m2) / 2;
        CFTranspose(mat, mat_t, m, n, m1, r2, n1, n2);
        m1 = r2;
        CFTranspose(mat, mat_t, m, n, m1, m2, n1, n2);
    } else if (s1 > BLOCKSIZE) {
        s2 = (n1 + n2) / 2;
        CFTranspose(mat, mat_t, m, n, m1, m2, n1, s2);
        n1 = s2;
        CFTranspose(mat, mat_t, m, n, m1, m2, n1, n2);
    } else {
        for (r = m1; r < m2; r++) {
            for (s = n1; s < n2; s++) {
                mat_t[s * m + r] = mat[r * n + s];
            }
        }
    }
}

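// Returns the transpose (n x m, row-major) of an m x n row-major matrix viewed
// through a span, using the blocked CFTranspose kernel.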
template <class T, class Allocator = std::allocator<T>>
inline auto Transpose(std::span<const T> mat, size_t m, size_t n,
                      Allocator allocator = std::allocator<T>())
    -> std::vector<T, Allocator> {
    if (mat.size() != m * n) {
        throw std::invalid_argument(
            "Invalid number of rows and columns for the input matrix");
    }

    std::vector<T, Allocator> mat_t(n * m, allocator);
    CFTranspose(mat.data(), mat_t.data(), m, n, 0, m, 0, n);
    return mat_t;
}

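// std::vector overload of Transpose, preserving the input vector's allocator.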
template <class T, class Allocator>
inline auto Transpose(const std::vector<T, Allocator> &mat, size_t m, size_t n)
    -> std::vector<T, Allocator> {
    return Transpose(std::span<const T>{mat}, m, n, mat.get_allocator());
}

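// Row-vector / matrix product v_out = v_in * mat for a length-m vector and an
// m x n row-major matrix. The matrix is transposed first so the accumulation
// runs over contiguous memory; if v_in is all zeros, the function returns
// early and v_out is left untouched.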
template <class T>
inline void vecMatrixProd(const T *v_in, const T *mat, T *v_out, size_t m,
                          size_t n) {
    if (!v_out) {
        return;
    }

    size_t i;
    size_t j;

    constexpr T z = static_cast<T>(0.0);
    bool allzero = true;
    for (j = 0; j < m; j++) {
        if (v_in[j] != z) {
            allzero = false;
            break;
        }
    }
    if (allzero) {
        return;
    }

    std::vector<T> mat_t(m * n);
    CFTranspose(mat, mat_t.data(), m, n, 0, m, 0, n);

    for (i = 0; i < n; i++) {
        T t = z;
        for (j = 0; j < m; j++) {
            t += mat_t[i * m + j] * v_in[j];
        }
        v_out[i] = t;
    }
}

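// std::vector overload of vecMatrixProd; validates dimensions and returns a
// newly allocated result of length n.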
template <class T, class AllocA, class AllocB>
inline auto vecMatrixProd(const std::vector<T, AllocA> &v_in,
                          const std::vector<T, AllocB> &mat, size_t m, size_t n)
    -> std::vector<T> {
    if (v_in.size() != m) {
        throw std::invalid_argument("Invalid size for the input vector");
    }
    if (mat.size() != m * n) {
        throw std::invalid_argument(
            "Invalid number of rows and columns for the input matrix");
    }

    std::vector<T> v_out(n);
    vecMatrixProd(v_in.data(), mat.data(), v_out.data(), m, n);

    return v_out;
}

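// Overload of vecMatrixProd writing into a preallocated output vector of
// length n.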
template <class T, class AllocA, class AllocB, class AllocC>
inline void
vecMatrixProd(std::vector<T, AllocA> &v_out, const std::vector<T, AllocB> &v_in,
              const std::vector<T, AllocC> &mat, size_t m, size_t n) {
    if (mat.size() != m * n) {
        throw std::invalid_argument(
            "Invalid number of rows and columns for the input matrix");
    }
    if (v_in.size() != m) {
        throw std::invalid_argument("Invalid size for the input vector");
    }
    if (v_out.size() != n) {
        throw std::invalid_argument("Invalid preallocated size for the result");
    }

    vecMatrixProd(v_in.data(), mat.data(), v_out.data(), m, n);
}

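// OpenMP fallback for matrix-matrix products: accumulates m_left * op(m_right)
// into m_out, with m_left of size m x k and m_out of size m x n, all
// row-major. The NoTranspose case uses a simple STRIDE x STRIDE cache-blocking
// scheme; for Transpose/Adjoint, m_right is taken to be stored as n x k.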
template <class T, size_t STRIDE = 2>
// NOLINTNEXTLINE(readability-function-cognitive-complexity)
inline void omp_matrixMatProd(const std::complex<T> *m_left,
                              const std::complex<T> *m_right,
                              std::complex<T> *m_out, size_t m, size_t n,
                              size_t k, Trans transpose) {
    if (!m_out) {
        return;
    }

    switch (transpose) {
    case Trans::Transpose:
#if defined(_OPENMP)
#pragma omp parallel for default(none) shared(m_left, m_right, m_out)          \
    firstprivate(m, n, k)
#endif
        for (size_t row = 0; row < m; row++) {
            for (size_t col = 0; col < n; col++) {
                for (size_t blk = 0; blk < k; blk++) {
                    m_out[row * n + col] +=
                        m_left[row * k + blk] * m_right[col * k + blk];
                }
            }
        }
        break;
    case Trans::Adjoint:
#if defined(_OPENMP)
#pragma omp parallel for default(none) shared(m_left, m_right, m_out)          \
    firstprivate(m, n, k)
#endif
        for (size_t row = 0; row < m; row++) {
            for (size_t col = 0; col < n; col++) {
                for (size_t blk = 0; blk < k; blk++) {
                    m_out[row * n + col] += m_left[row * k + blk] *
                                            std::conj(m_right[col * k + blk]);
                }
            }
        }
        break;
    case Trans::NoTranspose:
#if defined(_OPENMP)
#pragma omp parallel for default(none) shared(m_left, m_right, m_out)          \
    firstprivate(m, n, k)
#endif
        for (size_t row = 0; row < m; row += STRIDE) {
            size_t i;
            size_t j;
            size_t l;
            std::complex<T> t;
            for (size_t col = 0; col < n; col += STRIDE) {
                for (size_t blk = 0; blk < k; blk += STRIDE) {
                    // cache-blocking:
                    for (i = row; i < std::min(row + STRIDE, m); i++) {
                        for (j = col; j < std::min(col + STRIDE, n); j++) {
                            t = 0;
                            for (l = blk; l < std::min(blk + STRIDE, k); l++) {
                                t += m_left[i * k + l] * m_right[l * n + j];
                            }
                            m_out[i * n + j] += t;
                        }
                    }
                }
            }
        }
        break;
    }
}

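// Matrix-matrix product m_out = m_left * op(m_right), dispatching to
// cblas_{c,z}gemm when BLAS is available and to the OpenMP kernel above
// otherwise. m_out must hold m * n zero-initialized elements, since the
// non-BLAS path accumulates into it.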
template <class T>
inline void matrixMatProd(const std::complex<T> *m_left,
                          const std::complex<T> *m_right,
                          std::complex<T> *m_out, size_t m, size_t n, size_t k,
                          Trans transpose = Trans::NoTranspose) {
    if (!m_out) {
        return;
    }
    if constexpr (USE_CBLAS) {
        constexpr std::complex<T> co{1, 0};
        constexpr std::complex<T> cz{0, 0};
        const auto tr = static_cast<CBLAS_TRANSPOSE>(transpose);
        if constexpr (std::is_same_v<T, float>) {
            cblas_cgemm(CblasRowMajor, CblasNoTrans, tr, m, n, k, &co, m_left,
                        k, m_right, (transpose != Trans::NoTranspose) ? k : n,
                        &cz, m_out, n);
        } else if constexpr (std::is_same_v<T, double>) {
            cblas_zgemm(CblasRowMajor, CblasNoTrans, tr, m, n, k, &co, m_left,
                        k, m_right, (transpose != Trans::NoTranspose) ? k : n,
                        &cz, m_out, n);
        } else {
            static_assert(
                std::is_same_v<T, float> || std::is_same_v<T, double>,
                "This procedure only supports a single or double precision "
                "floating point types.");
        }
    } else {
        omp_matrixMatProd(m_left, m_right, m_out, m, n, k, transpose);
    }
}

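// std::vector overload of matrixMatProd; validates the operand dimensions and
// returns the m x n product.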
template <class T>
inline auto matrixMatProd(const std::vector<std::complex<T>> m_left,
                          const std::vector<std::complex<T>> m_right, size_t m,
                          size_t n, size_t k,
                          Trans transpose = Trans::NoTranspose)
    -> std::vector<std::complex<T>> {
    if (m_left.size() != m * k) {
        throw std::invalid_argument(
            "Invalid number of rows and columns for the input left matrix");
    }
    if (m_right.size() != k * n) {
        throw std::invalid_argument(
            "Invalid number of rows and columns for the input right matrix");
    }

    std::vector<std::complex<T>> m_out(m * n);
    matrixMatProd(m_left.data(), m_right.data(), m_out.data(), m, n, k,
                  transpose);

    return m_out;
}

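// Fallback axpy kernel: computes y += a * x over `dim` elements, parallelizing
// with OpenMP once the problem size reaches STD_CROSSOVER.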
template <class T,
          size_t STD_CROSSOVER = 1U << 12U> // NOLINT(readability-magic-numbers)
void omp_scaleAndAdd(size_t dim, std::complex<T> a, const std::complex<T> *x,
                     std::complex<T> *y) {
    if (dim < STD_CROSSOVER) {
        for (size_t i = 0; i < dim; i++) {
            y[i] += a * x[i];
        }
    } else {
#if defined(_OPENMP)
#pragma omp parallel for default(none) firstprivate(a, dim, x, y)
#endif
        for (size_t i = 0; i < dim; i++) {
            y[i] += a * x[i];
        }
    }
}

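// BLAS-backed axpy: y += a * x via cblas_{c,z}axpy. Intended to be called only
// when USE_CBLAS is true, so it is never instantiated without a linked CBLAS
// implementation.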
template <class T>
void blas_scaleAndAdd(size_t dim, std::complex<T> a, const std::complex<T> *x,
                      std::complex<T> *y) {
    if constexpr (std::is_same_v<T, float>) {
        cblas_caxpy(dim, &a, x, 1, y, 1);
    } else if constexpr (std::is_same_v<T, double>) {
        cblas_zaxpy(dim, &a, x, 1, y, 1);
    } else {
        static_assert(
            std::is_same_v<T, float> || std::is_same_v<T, double>,
            "This procedure only supports a single or double precision "
            "floating point types.");
    }
}

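// Computes y += a * x, selecting the BLAS or OpenMP implementation at compile
// time.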
template <class T>
void scaleAndAdd(size_t dim, std::complex<T> a, const std::complex<T> *x,
                 std::complex<T> *y) {
    if constexpr (USE_CBLAS) {
        blas_scaleAndAdd(dim, a, x, y);
    } else {
        omp_scaleAndAdd(dim, a, x, y);
    }
}
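
// std::vector overload of scaleAndAdd; the two vectors must have equal size.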
template <class T>
void scaleAndAdd(std::complex<T> a, const std::vector<std::complex<T>> &x,
                 std::vector<std::complex<T>> &y) {
    if (x.size() != y.size()) {
        throw std::invalid_argument("Dimensions of vectors mismatch");
    }
    scaleAndAdd(x.size(), a, x.data(), y.data());
}
} // namespace Pennylane::LightningQubit::Util