cuLite provides specialized classes for managing dense matrices on NVIDIA GPU devices. These classes are optimized for CUDA-accelerated linear algebra operations and support both real and complex numerical types with column-major storage layout.

Warning: Currently, only matrices with the General property are supported. Support for structured matrix types (Symmetric, Hermitian, etc.) is planned for future releases.

Available Dense Matrix Types

The following classes are defined within the culite::dns namespace:

cuLite Class	Numeric Type	Precision
culite::dns::RdMatrix	double	Double precision real
culite::dns::RfMatrix	float	Single precision real
culite::dns::CdMatrix	cuDoubleComplex	Double precision complex
culite::dns::CfMatrix	cuFloatComplex	Single precision complex

Example:

// Instantiate a double precision real matrix (e.g., 3x3)
culite::dns::RdMatrix Ad(3, 3);
 
// Instantiate a single precision complex matrix (e.g., 5x2)
culite::dns::CfMatrix Ac(5, 2);

Creating matrices

In cuLite, dense matrices can be instantiated through various methods, depending on whether you require fresh device memory allocation or a wrapper for existing GPU memory.

Default creation

1) Default Declaration: Creates an empty matrix object with zero rows and columns. No device memory is allocated.
2) Sized Declaration: Allocates device memory for a matrix of size (m x n) on the GPU. Elements are uninitialized and contain indeterminate values.

Code

Output

 
#include <iostream>
#include <culite/dense.hpp>
 
int main()
{
    /*
     * Default declaration: a double precision real matrix
     * with zero rows and columns
     */
    culite::dns::RdMatrix A;
    std::cout << A.info("A");

    /*
     * Sized declaration: a (3x4) single precision real matrix
     * whose values are left uninitialized
     */
    culite::dns::RfMatrix B(3, 4);
    std::cout << B.info("B");

    /*
     * Give A storage by assigning it a newly created
     * (5x2) matrix (values are again uninitialized)
     */
    A = culite::dns::RdMatrix(5, 2);
    std::cout << A.info("A");

    return 0;
}

==================== A ====================
  Datatype............. Real
  Precision............ Double (64bit)
  Number of rows....... 0
  Number of columns.... 0
  Leading dimension.... 0
  Values............... 0
  Property............. Unknown
  Owner................ No
===========================================
==================== B ====================
  Datatype............. Real
  Precision............ Single (32bit)
  Number of rows....... 3
  Number of columns.... 4
  Leading dimension.... 3
  Values............... 0x504c00000
  Property............. General Full
  Owner................ Yes
===========================================
==================== A ====================
  Datatype............. Real
  Precision............ Double (64bit)
  Number of rows....... 5
  Number of columns.... 2
  Leading dimension.... 5
  Values............... 0x504c00200
  Property............. General Full
  Owner................ Yes
===========================================

Create dense matrix from aux data

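Creates a matrix that "views" an existing device memory buffer (column-major order). This avoids copying large datasets and allows wrapping pre-allocated GPU memory. The matrix can either merely reference the buffer (the caller remains responsible for freeing it) or bind to it, in which case the matrix takes ownership and releases the buffer when it is destroyed.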

Code

Output

 
#include <iostream>
#include <cla3p/support.hpp>
#include <culite/support.hpp>
#include <culite/dense.hpp>
 
int main()
{
    /*
     * Allocate space for a & b and assume are filled with some values
     * Values in a and b are assumed to be in column-major order
     */
    cla3p::uint_t lda = 7;
    cla3p::uint_t ldb = 5;
    cla3p::real_t *a_host = cla3p::i_calloc_t<cla3p::real_t>(lda * 4); 
    cla3p::real_t *b_host = cla3p::i_calloc_t<cla3p::real_t>(ldb * 5); 
 
    for(cla3p::uint_t j = 0, icnt = 0; j < 4; j++)
        for(cla3p::uint_t i = 0; i < 3; i++)
            a_host[lda * j + i] = icnt++;
 
    for(cla3p::uint_t j = 0, icnt = 0; j < 5; j++)
        for(cla3p::uint_t i = 0; i < 5; i++)
            b_host[ldb * j + i] = icnt++;
 
    /*
     * Copy a & b to device
     */
    culite::real_t *a_device = culite::device_alloc_t<culite::real_t>(lda * 4);
    culite::real_t *b_device = culite::device_alloc_t<culite::real_t>(ldb * 5);
    culite::memCopyH2D(3, 4, a_host, lda, a_device, lda);
    culite::memCopyH2D(5, 5, b_host, ldb, b_device, ldb);
 
    /*
     * Assign pointer a in matrix A but do not bind
     * A simply hosts a, need to manually dealloc a
     */
    culite::dns::RdMatrix A(3, 4, a_device, lda, false);
    std::cout << A.info("A") << A;
 
    /*
     * Assign pointer b in matrix B with property and bind
     * B takes ownership of b, no free call for b is required
     */
    cla3p::Property prB = cla3p::Property::General();
    culite::dns::RdMatrix B(5, 5, b_device, ldb, true, prB);
    std::cout << B.info("B") << B;
 
    /* 
     * Free a and exit
     */
 
    cla3p::i_free(a_host);
    cla3p::i_free(b_host);
    culite::device_free(a_device);
    // b_device is freed by B's destructor
 
    return 0;
}

==================== A ====================
  Datatype............. Real
  Precision............ Double (64bit)
  Number of rows....... 3
  Number of columns.... 4
  Leading dimension.... 7
  Values............... 0x504c00000
  Property............. General Full
  Owner................ No
===========================================
                0             1             2             3
0 |  0.000000e+00  3.000000e+00  6.000000e+00  9.000000e+00
1 |  1.000000e+00  4.000000e+00  7.000000e+00  1.000000e+01
2 |  2.000000e+00  5.000000e+00  8.000000e+00  1.100000e+01
 
==================== B ====================
  Datatype............. Real
  Precision............ Double (64bit)
  Number of rows....... 5
  Number of columns.... 5
  Leading dimension.... 5
  Values............... 0x504c00200
  Property............. General Full
  Owner................ Yes
===========================================
                0             1             2             3             4
0 |  0.000000e+00  5.000000e+00  1.000000e+01  1.500000e+01  2.000000e+01
1 |  1.000000e+00  6.000000e+00  1.100000e+01  1.600000e+01  2.100000e+01
2 |  2.000000e+00  7.000000e+00  1.200000e+01  1.700000e+01  2.200000e+01
3 |  3.000000e+00  8.000000e+00  1.300000e+01  1.800000e+01  2.300000e+01
4 |  4.000000e+00  9.000000e+00  1.400000e+01  1.900000e+01  2.400000e+01
 

Algebra

cuLite provides a high-performance interface for dense matrix algebra on GPU, leveraging optimized CUDA backends including cuBLAS and cuSOLVER. Operations include standard arithmetic, linear combinations, and matrix-vector/matrix-matrix multiplications.

Scale

Matrices support in-place scaling and linear combinations using standard operators (such as *= and *) as well as the iscale() member function. GPU-accelerated scaling operations are performed efficiently via cuBLAS kernels.

Code

Output

 
#include <iostream>
#include <cla3p/dense.hpp>
#include <culite/dense.hpp>
#include <culite/algebra.hpp>
 
int main()
{
    cla3p::dns::RdMatrix hostA(3,3);
    hostA = 3.;
    culite::dns::RdMatrix A;
    hostA >> A; // Transfer to GPU
    std::cout << "A:\n" << A << "\n";
 
    /*
     * Scale A using operators and the scale function respectively
     */
    A *= 2.;
    std::cout << "A *= 2:\n" << A;
 
    A.iscale(.5);
    std::cout << "A.iscale(.5):\n" << A << "\n";
 
    culite::dns::RdMatrix B = 2. * A ;
    std::cout << "B:\n" << B;
 
    return 0;
}

A:
                0             1             2
0 |  3.000000e+00  3.000000e+00  3.000000e+00
1 |  3.000000e+00  3.000000e+00  3.000000e+00
2 |  3.000000e+00  3.000000e+00  3.000000e+00
 
 
A *= 2:
                0             1             2
0 |  6.000000e+00  6.000000e+00  6.000000e+00
1 |  6.000000e+00  6.000000e+00  6.000000e+00
2 |  6.000000e+00  6.000000e+00  6.000000e+00
 
A.iscale(.5):
                0             1             2
0 |  3.000000e+00  3.000000e+00  3.000000e+00
1 |  3.000000e+00  3.000000e+00  3.000000e+00
2 |  3.000000e+00  3.000000e+00  3.000000e+00
 
 
B:
                0             1             2
0 |  6.000000e+00  6.000000e+00  6.000000e+00
1 |  6.000000e+00  6.000000e+00  6.000000e+00
2 |  6.000000e+00  6.000000e+00  6.000000e+00
 

Add

Matrices can be added together using overloaded operators or the culite::ops::add and culite::ops::update functions. These operations leverage cuBLAS for high-performance GPU acceleration.

Code

Output

 
#include <iostream>
#include <cla3p/dense.hpp>
#include <culite/dense.hpp>
#include <culite/algebra.hpp>
 
int main()
{
    cla3p::dns::RdMatrix hostA(3, 3);
    cla3p::dns::RdMatrix hostB(3, 3);
    culite::dns::RdMatrix A;
    culite::dns::RdMatrix B;
 
    hostA = 3.;
    hostB = 2.;
    hostA >> A; // Transfer to GPU
    hostB >> B; // Transfer to GPU
    std::cout << "A:\n" << A;
    std::cout << "B:\n" << B << "\n";
 
    /*
     * Perform the operation (A + 2 * B) using operators and the add function respectively
     */
    culite::dns::RdMatrix C1 = A + 2. * B;
    std::cout << "C1:\n" << C1;
 
    culite::dns::RdMatrix C2(3, 3);
    culite::ops::add(cla3p::op_t::N, 1., A, 
                     cla3p::op_t::N, 2., B, C2);
    std::cout << "C2:\n" << C2 << "\n";
 
    /*
     * Perform the operation (Cx += 3 * A) using operators and the update function respectively
     */
    C1 += 3. * A;
    std::cout << "C1:\n" << C1;
 
    culite::ops::update(3., A, C2);
    std::cout << "C2:\n" << C2;
 
    return 0;
}

A:
                0             1             2
0 |  3.000000e+00  3.000000e+00  3.000000e+00
1 |  3.000000e+00  3.000000e+00  3.000000e+00
2 |  3.000000e+00  3.000000e+00  3.000000e+00
 
B:
                0             1             2
0 |  2.000000e+00  2.000000e+00  2.000000e+00
1 |  2.000000e+00  2.000000e+00  2.000000e+00
2 |  2.000000e+00  2.000000e+00  2.000000e+00
 
 
C1:
                0             1             2
0 |  7.000000e+00  7.000000e+00  7.000000e+00
1 |  7.000000e+00  7.000000e+00  7.000000e+00
2 |  7.000000e+00  7.000000e+00  7.000000e+00
 
C2:
                0             1             2
0 |  7.000000e+00  7.000000e+00  7.000000e+00
1 |  7.000000e+00  7.000000e+00  7.000000e+00
2 |  7.000000e+00  7.000000e+00  7.000000e+00
 
 
C1:
                0             1             2
0 |  1.600000e+01  1.600000e+01  1.600000e+01
1 |  1.600000e+01  1.600000e+01  1.600000e+01
2 |  1.600000e+01  1.600000e+01  1.600000e+01
 
C2:
                0             1             2
0 |  1.600000e+01  1.600000e+01  1.600000e+01
1 |  1.600000e+01  1.600000e+01  1.600000e+01
2 |  1.600000e+01  1.600000e+01  1.600000e+01
 

Matrix-Vector Product

You can perform matrix-vector products using the * operator or the culite::ops::mult function. The resulting object is a dense vector on GPU of the appropriate precision. These operations are accelerated via cuBLAS GEMV routines.

Code

Output

 
#include <iostream>
#include <cla3p/dense.hpp>
#include <culite/dense.hpp>
#include <culite/algebra.hpp>
 
int main()
{
    cla3p::dns::RdMatrix hostA(3, 3);
    cla3p::dns::RdVector hostX(3);
    hostA = 3.;
    hostX = 2.;
 
    culite::dns::RdMatrix A;
    culite::dns::RdVector x;
    hostA >> A; // Transfer to GPU
    hostX >> x; // Transfer to GPU
 
    std::cout << "A:\n" << A;
    std::cout << "x:\n" << x << "\n";
 
    /*
     * Perform the operation (A * x) using operators and the mult function respectively
     */
    culite::dns::RdVector y1 = A * x;
    std::cout << "y1:\n" << y1;
 
    culite::dns::RdVector y2(3);
    culite::ops::mult(1., cla3p::op_t::N, A, x, 0., y2);
    std::cout << "y2:\n" << y2 << "\n";
 
    /*
     * Perform the operation (y1 += A * x) using operators and the mult function respectively
     */
    y1 += A * x;
    std::cout << "y1:\n" << y1;
 
    culite::ops::mult(1., cla3p::op_t::N, A, x, 1., y2);
    std::cout << "y2:\n" << y2;
 
    return 0;
}

A:
                0             1             2
0 |  3.000000e+00  3.000000e+00  3.000000e+00
1 |  3.000000e+00  3.000000e+00  3.000000e+00
2 |  3.000000e+00  3.000000e+00  3.000000e+00
 
x:
                0
0 |  2.000000e+00
1 |  2.000000e+00
2 |  2.000000e+00
 
 
y1:
                0
0 |  1.800000e+01
1 |  1.800000e+01
2 |  1.800000e+01
 
y2:
                0
0 |  1.800000e+01
1 |  1.800000e+01
2 |  1.800000e+01
 
 
y1:
                0
0 |  3.600000e+01
1 |  3.600000e+01
2 |  3.600000e+01
 
y2:
                0
0 |  3.600000e+01
1 |  3.600000e+01
2 |  3.600000e+01
 

Matrix-Matrix Product

Dense matrices can be multiplied together on GPU, provided their dimensions are compatible. Matrix multiplication is performed using high-performance cuBLAS GEMM routines.

Code

Output

 
#include <iostream>
#include <cla3p/dense.hpp>
#include <culite/dense.hpp>
#include <culite/algebra.hpp>
 
int main()
{
    cla3p::dns::RdMatrix hostA(3, 3);
    cla3p::dns::RdMatrix hostB(3, 3);
    hostA = 3.;
    hostB = 2.;
 
    culite::dns::RdMatrix A;
    culite::dns::RdMatrix B;
    hostA >> A; // Transfer to GPU
    hostB >> B; // Transfer to GPU
    std::cout << "A:\n" << A;
    std::cout << "B:\n" << B << "\n";
 
    /*
     * Perform the operation (A * B) using operators and the mult function respectively
     */
    culite::dns::RdMatrix C1 = A * B;
    std::cout << "C1:\n" << C1;
 
    culite::dns::RdMatrix C2(3, 3);
    culite::ops::mult(1., cla3p::op_t::N, A, cla3p::op_t::N, B, 0., C2);
    std::cout << "C2:\n" << C2 << "\n";
 
    /*
     * Perform the operation (Cx += A * B) using operators and the mult function respectively
     */
    C1 += A * B;
    std::cout << "C1:\n" << C1;
 
    culite::ops::mult(1., cla3p::op_t::N, A, cla3p::op_t::N, B, 1., C2);
    std::cout << "C2:\n" << C2;
 
    return 0;
}

A:
                0             1             2
0 |  3.000000e+00  3.000000e+00  3.000000e+00
1 |  3.000000e+00  3.000000e+00  3.000000e+00
2 |  3.000000e+00  3.000000e+00  3.000000e+00
 
B:
                0             1             2
0 |  2.000000e+00  2.000000e+00  2.000000e+00
1 |  2.000000e+00  2.000000e+00  2.000000e+00
2 |  2.000000e+00  2.000000e+00  2.000000e+00
 
 
C1:
                0             1             2
0 |  1.800000e+01  1.800000e+01  1.800000e+01
1 |  1.800000e+01  1.800000e+01  1.800000e+01
2 |  1.800000e+01  1.800000e+01  1.800000e+01
 
C2:
                0             1             2
0 |  1.800000e+01  1.800000e+01  1.800000e+01
1 |  1.800000e+01  1.800000e+01  1.800000e+01
2 |  1.800000e+01  1.800000e+01  1.800000e+01
 
 
C1:
                0             1             2
0 |  3.600000e+01  3.600000e+01  3.600000e+01
1 |  3.600000e+01  3.600000e+01  3.600000e+01
2 |  3.600000e+01  3.600000e+01  3.600000e+01
 
C2:
                0             1             2
0 |  3.600000e+01  3.600000e+01  3.600000e+01
1 |  3.600000e+01  3.600000e+01  3.600000e+01
2 |  3.600000e+01  3.600000e+01  3.600000e+01
 

Transposed Matrix-Matrix Product

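You can perform matrix-matrix products involving the transpose or conjugate transpose of matrices without explicitly forming the transposed matrix in memory. In the example below, the transposed operand is requested either by calling transpose() on the matrix in an operator expression or by passing cla3p::op_t::T to culite::ops::mult.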

Code

Output

 
#include <iostream>
#include <cla3p/dense.hpp>
#include <culite/dense.hpp>
#include <culite/algebra.hpp>
 
int main()
{
    cla3p::dns::RdMatrix hostA(3, 3);
    cla3p::dns::RdMatrix hostB(3, 3);
    hostA = 3.;
    hostB = 2.;
 
    culite::dns::RdMatrix A;
    culite::dns::RdMatrix B;
    hostA >> A; // Transfer to GPU
    hostB >> B; // Transfer to GPU
    std::cout << "A:\n" << A;
    std::cout << "B:\n" << B << "\n";
 
    /*
     * Perform the operation (A' * B) using operators and the mult function respectively
     */
    culite::dns::RdMatrix C1 = A.transpose() * B;
    std::cout << "C1:\n" << C1;
 
    culite::dns::RdMatrix C2(3, 3);
    culite::ops::mult(1., cla3p::op_t::T, A, cla3p::op_t::N, B, 0., C2);
    std::cout << "C2:\n" << C2;
 
    return 0;
}

A:
                0             1             2
0 |  3.000000e+00  3.000000e+00  3.000000e+00
1 |  3.000000e+00  3.000000e+00  3.000000e+00
2 |  3.000000e+00  3.000000e+00  3.000000e+00
 
B:
                0             1             2
0 |  2.000000e+00  2.000000e+00  2.000000e+00
1 |  2.000000e+00  2.000000e+00  2.000000e+00
2 |  2.000000e+00  2.000000e+00  2.000000e+00
 
 
C1:
                0             1             2
0 |  1.800000e+01  1.800000e+01  1.800000e+01
1 |  1.800000e+01  1.800000e+01  1.800000e+01
2 |  1.800000e+01  1.800000e+01  1.800000e+01
 
C2:
                0             1             2
0 |  1.800000e+01  1.800000e+01  1.800000e+01
1 |  1.800000e+01  1.800000e+01  1.800000e+01
2 |  1.800000e+01  1.800000e+01  1.800000e+01