cuLite provides specialized classes for managing sparse matrices on NVIDIA GPU devices, optimized for matrices where most elements are zero. These classes support multiple compressed storage formats and leverage cuSPARSE for accelerated sparse linear algebra operations.

Warning: Currently, only matrices with General property are supported. Support for structured matrix types (Symmetric, Hermitian, etc.) is planned for future releases.

Available Sparse Matrix Formats

The following storage formats are defined within the culite::csr and culite::csc namespaces:

Format	Description
CSR (Compressed Sparse Row)	Row-compressed format, efficient for row-wise operations and SpMV
CSC (Compressed Sparse Column)	Column-compressed format, efficient for column-wise operations

Available Sparse Matrix Types

cuLite Class	Numeric Type	Precision
culite::csr::RdMatrix, culite::csc::RdMatrix	double	Double precision real
culite::csr::RfMatrix, culite::csc::RfMatrix	float	Single precision real
culite::csr::CdMatrix, culite::csc::CdMatrix	cuDoubleComplex	Double precision complex
culite::csr::CfMatrix, culite::csc::CfMatrix	cuFloatComplex	Single precision complex

Example:

// Instantiate a double precision real matrix (e.g., 3x3)
culite::csr::RdMatrix A(3, 3, 5); // 5 non-zeros
 
// Instantiate a single precision complex matrix (e.g., 5x2)
culite::csc::CfMatrix Ac(5, 2, 7); // 7 non-zeros

Creating matrices

In cuLite, sparse matrices can be instantiated through various methods, depending on the storage format and whether you require fresh device memory allocation or a wrapper for existing GPU memory.

Default creation

1) Default Declaration: Creates an empty sparse matrix object with zero rows, columns, and non-zeros. No device memory is allocated.
2) Sized Declaration: Allocates device memory for a sparse matrix of size (m x n) with a specified number of non-zero elements (nnz). The compressed storage arrays are allocated but contain uninitialized values.

Code

Output

 
#include <iostream>
#include <culite/sparse.hpp>
 
int main()
{
    /*
     * Double precision real empty matrix
     */
 
    culite::csr::RdMatrix A;
    culite::csc::RdMatrix B;
 
    std::cout << A.info("A");
    std::cout << B.info("B");
 
    /*
     * (3x4) single precision real matrix
     */
 
    culite::csr::RfMatrix C(3,4,7);
 
    std::cout << C.info("C");
 
    /*
     * Initialize a (5x2) empty sparse matrix with space for 7 non-zeros
     */
 
    B = culite::csc::RdMatrix(5,2,7);
 
    std::cout << B.info("B");
 
    return 0;
}

==================== A ====================
  Datatype............. Real
  Precision............ Double (64bit)
  Index Precision...... Single (32bit)
  Number of rows....... 0
  Number of columns.... 0
  Number of non zeros.. 0
  Rowptr............... 0
  Colidx............... 0
  Values............... 0
  Property............. Unknown
  Owner................ No
===========================================
==================== B ====================
  Datatype............. Real
  Precision............ Double (64bit)
  Index Precision...... Single (32bit)
  Number of rows....... 0
  Number of columns.... 0
  Number of non zeros.. 0
  Colptr............... 0
  Rowidx............... 0
  Values............... 0
  Property............. Unknown
  Owner................ No
===========================================
==================== C ====================
  Datatype............. Real
  Precision............ Single (32bit)
  Index Precision...... Single (32bit)
  Number of rows....... 3
  Number of columns.... 4
  Number of non zeros.. 7
  Rowptr............... 0x504c00400
  Colidx............... 0x504c00200
  Values............... 0x504c00000
  Property............. General Full
  Owner................ Yes
===========================================
==================== B ====================
  Datatype............. Real
  Precision............ Double (64bit)
  Index Precision...... Single (32bit)
  Number of rows....... 5
  Number of columns.... 2
  Number of non zeros.. 7
  Colptr............... 0x504c00a00
  Rowidx............... 0x504c00800
  Values............... 0x504c00600
  Property............. General Full
  Owner................ Yes
===========================================

Create sparse matrix from aux data

Creates a sparse matrix that "views" existing device memory buffers containing the compressed storage arrays (row/column pointers, indices, and values). This avoids copying large datasets and allows wrapping pre-allocated GPU memory.

Code

Output

 
#include <iostream>
#include <culite/support.hpp>
#include <culite/sparse.hpp>
 
int main()
{
    /*
     * Allocate space for two csc matrix representations and assume are filled with some values
     */
 
    culite::uint_t nr  = 15; // number of rows
    culite::uint_t nc  = 12; // number of columns
    culite::uint_t nnz = 10; // number of non-zeros
 
    culite::int_t  *rowptrA = culite::device_alloc_t<culite::int_t >(nr + 1); 
    culite::int_t  *colidxA = culite::device_alloc_t<culite::int_t >(nnz); 
    culite::real_t *valuesA = culite::device_alloc_t<culite::real_t>(nnz); 
 
    culite::int_t  *colptrB = culite::device_alloc_t<culite::int_t >(nc + 1); 
    culite::int_t  *rowidxB = culite::device_alloc_t<culite::int_t >(nnz); 
    culite::real_t *valuesB = culite::device_alloc_t<culite::real_t>(nnz); 
 
    /*
     * Assign csr pointers in matrix A but do not bind
     * A simply hosts, need to manually dealloc csr vectors
     */
 
    culite::csr::RdMatrix A(nr, nc, rowptrA, colidxA, valuesA, false);
 
    std::cout << A.info("A");
 
    /*
     * Assign pointer b in matrix B with property and bind
     * Assign csc pointers in matrix B with property and bind 
     * B takes ownership of csc vectors, no free call for the csc vectors is required
     */
 
    cla3p::Property prB = cla3p::Property::General();
    culite::csc::RdMatrix B(nc, nc, colptrB, rowidxB, valuesB, true, prB);
 
    std::cout << B.info("B");
 
    /* 
     * Free a and exit
     */
 
    culite::device_free(rowptrA);
    culite::device_free(colidxA);
    culite::device_free(valuesA);
 
    return 0;
}

==================== A ====================
  Datatype............. Real
  Precision............ Double (64bit)
  Index Precision...... Single (32bit)
  Number of rows....... 15
  Number of columns.... 12
  Number of non zeros.. 0
  Rowptr............... 0x504c00000
  Colidx............... 0x504c00200
  Values............... 0x504c00400
  Property............. General Full
  Owner................ No
===========================================
==================== B ====================
  Datatype............. Real
  Precision............ Double (64bit)
  Index Precision...... Single (32bit)
  Number of rows....... 12
  Number of columns.... 12
  Number of non zeros.. 0
  Colptr............... 0x504c00600
  Rowidx............... 0x504c00800
  Values............... 0x504c00a00
  Property............. General Full
  Owner................ Yes
===========================================

Algebra

In cuLite, sparse algebraic operations are performed using matrices in CSR or CSC format. These operations are highly optimized for matrices with a large percentage of zero entries and leverage cuSPARSE for GPU acceleration.

Scale

Sparse matrices support in-place scaling using standard operators. GPU-accelerated scaling operations are performed efficiently via cuSPARSE kernels.

Code

Output

 
#include <iostream>
#include <cla3p/sparse.hpp>
#include <culite/sparse.hpp>
#include <culite/algebra.hpp>
 
int main()
{
    cla3p::coo::RdMatrix hostAcoo(5, 5);
    hostAcoo.insert(0,0,1.0);
    hostAcoo.insert(1,1,2.0);
    hostAcoo.insert(2,1,3.0);
    hostAcoo.insert(1,3,4.0);
    hostAcoo.insert(0,0,5.0);
    cla3p::csr::RdMatrix hostA = hostAcoo.toCsr();
 
    culite::csr::RdMatrix A;
    hostA >> A; // Transfer to GPU
    std::cout << "A:\n" << A << "\n";
 
    /*
     * Scale A using operators and the scale function respectively
     */
    A *= 2.;
    std::cout << "A *= 2:\n" << A << "\n";
 
    A.iscale(.5);
    std::cout << "A.iscale(.5):\n" << A << "\n";
 
    culite::csr::RdMatrix B = 2. * A ;
    std::cout << "B:\n" << B << "\n";
 
    return 0;
}

A:
  #nz |   row   column   value
------------------------------
    0 |     0        0   6.000000e+00
    1 |     1        1   2.000000e+00
    2 |     1        3   4.000000e+00
    3 |     2        1   3.000000e+00
 
A *= 2:
  #nz |   row   column   value
------------------------------
    0 |     0        0   1.200000e+01
    1 |     1        1   4.000000e+00
    2 |     1        3   8.000000e+00
    3 |     2        1   6.000000e+00
 
A.iscale(.5):
  #nz |   row   column   value
------------------------------
    0 |     0        0   6.000000e+00
    1 |     1        1   2.000000e+00
    2 |     1        3   4.000000e+00
    3 |     2        1   3.000000e+00
 
B:
  #nz |   row   column   value
------------------------------
    0 |     0        0   1.200000e+01
    1 |     1        1   4.000000e+00
    2 |     1        3   8.000000e+00
    3 |     2        1   6.000000e+00
 

Add

Sparse matrices can be added together using overloaded operators or member functions. These operations leverage cuSPARSE for high-performance sparse-sparse addition.

Code

Output

 
#include <iostream>
#include <cla3p/sparse.hpp>
#include <culite/sparse.hpp>
#include <culite/algebra.hpp>
 
int main()
{
    cla3p::coo::RdMatrix hostAcoo(5, 5);
    cla3p::coo::RdMatrix hostBcoo(5, 5);
 
    hostAcoo.insert(0,0,1.0);
    hostAcoo.insert(1,1,2.0);
    hostAcoo.insert(2,1,3.0);
    hostAcoo.insert(1,3,4.0);
    hostAcoo.insert(0,0,5.0);
 
    hostBcoo.insert(0,0,-1.3);
    hostBcoo.insert(1,2, 2.2);
    hostBcoo.insert(2,1,-3.1);
    hostBcoo.insert(2,4, 4.6);
    hostBcoo.insert(1,1,-5.7);
 
    cla3p::csr::RdMatrix hostA = hostAcoo.toCsr();
    cla3p::csr::RdMatrix hostB = hostBcoo.toCsr();
 
    culite::csr::RdMatrix A;
    culite::csr::RdMatrix B;
    hostA >> A; // Transfer to GPU
    hostB >> B; // Transfer to GPU
    std::cout << A.info("A") << A << "\n";
    std::cout << B.info("B") << B << "\n";
 
    /*
     * Perform the operation (A + 2 * B) using operators and the add function respectively
     */
    culite::csr::RdMatrix C1 = 3. * A + 2. * B;
    std::cout << "C1:\n" << C1 << "\n";
 
    culite::csr::RdMatrix C2 = culite::ops::add(3., A, 2., B);
    std::cout << "C2:\n" << C2 << "\n";
 
    /*
     * Perform the operation (Cx += 3 * A) using operators and the update function respectively
     */
    C1 += 3. * A;
    std::cout << "C1:\n" << C1 << "\n";
 
    culite::ops::update(3., A, C2);
    std::cout << "C2:\n" << C2 << "\n";
 
    return 0;
}

==================== A ====================
  Datatype............. Real
  Precision............ Double (64bit)
  Index Precision...... Single (32bit)
  Number of rows....... 5
  Number of columns.... 5
  Number of non zeros.. 4
  Rowptr............... 0x504c00400
  Colidx............... 0x504c00200
  Values............... 0x504c00000
  Property............. General Full
  Owner................ Yes
===========================================
  #nz |   row   column   value
------------------------------
    0 |     0        0   6.000000e+00
    1 |     1        1   2.000000e+00
    2 |     1        3   4.000000e+00
    3 |     2        1   3.000000e+00
 
==================== B ====================
  Datatype............. Real
  Precision............ Double (64bit)
  Index Precision...... Single (32bit)
  Number of rows....... 5
  Number of columns.... 5
  Number of non zeros.. 5
  Rowptr............... 0x504c00a00
  Colidx............... 0x504c00800
  Values............... 0x504c00600
  Property............. General Full
  Owner................ Yes
===========================================
  #nz |   row   column   value
------------------------------
    0 |     0        0  -1.300000e+00
    1 |     1        1  -5.700000e+00
    2 |     1        2   2.200000e+00
    3 |     2        1  -3.100000e+00
    4 |     2        4   4.600000e+00
 
C1:
  #nz |   row   column   value
------------------------------
    0 |     0        0   1.540000e+01
    1 |     1        1  -5.400000e+00
    2 |     1        2   4.400000e+00
    3 |     1        3   1.200000e+01
    4 |     2        1   2.800000e+00
    5 |     2        4   9.200000e+00
 
C2:
  #nz |   row   column   value
------------------------------
    0 |     0        0   1.540000e+01
    1 |     1        1  -5.400000e+00
    2 |     1        2   4.400000e+00
    3 |     1        3   1.200000e+01
    4 |     2        1   2.800000e+00
    5 |     2        4   9.200000e+00
 
C1:
  #nz |   row   column   value
------------------------------
    0 |     0        0   3.340000e+01
    1 |     1        1   6.000000e-01
    2 |     1        2   4.400000e+00
    3 |     1        3   2.400000e+01
    4 |     2        1   1.180000e+01
    5 |     2        4   9.200000e+00
 
C2:
  #nz |   row   column   value
------------------------------
    0 |     0        0   3.340000e+01
    1 |     1        1   6.000000e-01
    2 |     1        2   4.400000e+00
    3 |     1        3   2.400000e+01
    4 |     2        1   1.180000e+01
    5 |     2        4   9.200000e+00
 

Matrix-Vector Product

You can perform multiplication between a sparse matrix and a dense vector using the * operator. The result is always a dense vector on GPU. These operations use cuSPARSE SpMV routines.

Code

Output

 
#include <iostream>
#include <cla3p/dense.hpp>
#include <cla3p/sparse.hpp>
#include <culite/dense.hpp>
#include <culite/sparse.hpp>
#include <culite/algebra.hpp>
 
int main()
{
    cla3p::coo::RdMatrix hostAcoo(5, 5);
    hostAcoo.insert(0,0,1.0);
    hostAcoo.insert(1,1,2.0);
    hostAcoo.insert(2,1,3.0);
    hostAcoo.insert(1,3,4.0);
    hostAcoo.insert(0,0,5.0);
    cla3p::csr::RdMatrix hostA = hostAcoo.toCsr();
 
    cla3p::dns::RdVector hostX(5);
    hostX = 2.;
 
    culite::csr::RdMatrix A;
    hostA >> A; // Transfer to GPU
    std::cout << "A:\n" << A << "\n";
 
    culite::dns::RdVector x(5);
    hostX >> x; // Transfer to GPU
    std::cout << "x:\n" << x << "\n";
 
    /*
     * Perform the operation (A * x) using operators and the mult function respectively
     */
    culite::dns::RdVector y1 = A * x;
    std::cout << "y1:\n" << y1 << "\n";
 
    culite::dns::RdVector y2(5);
    culite::ops::mult(1., cla3p::op_t::N, A, x, 0., y2);
    std::cout << "y2:\n" << y2 << "\n";
 
    /*
     * Perform the operation (y1 += A * x) using operators and the mult function respectively
     */
    y1 += A * x;
    std::cout << "y1:\n" << y1 << "\n";
 
    culite::ops::mult(1., cla3p::op_t::N, A, x, 1., y2);
    std::cout << "y2:\n" << y2 << "\n";
 
    return 0;
}

A:
  #nz |   row   column   value
------------------------------
    0 |     0        0   6.000000e+00
    1 |     1        1   2.000000e+00
    2 |     1        3   4.000000e+00
    3 |     2        1   3.000000e+00
 
x:
                0
0 |  2.000000e+00
1 |  2.000000e+00
2 |  2.000000e+00
3 |  2.000000e+00
4 |  2.000000e+00
 
 
y1:
                0
0 |  1.200000e+01
1 |  1.200000e+01
2 |  6.000000e+00
3 |  0.000000e+00
4 |  0.000000e+00
 
 
y2:
                0
0 |  1.200000e+01
1 |  1.200000e+01
2 |  6.000000e+00
3 |  0.000000e+00
4 |  0.000000e+00
 
 
y1:
                0
0 |  2.400000e+01
1 |  2.400000e+01
2 |  1.200000e+01
3 |  0.000000e+00
4 |  0.000000e+00
 
 
y2:
                0
0 |  2.400000e+01
1 |  2.400000e+01
2 |  1.200000e+01
3 |  0.000000e+00
4 |  0.000000e+00
 
 

Matrix-Matrix Product

Sparse matrices can be multiplied with other matrices, provided their dimensions are compatible. cuSPARSE provides optimized kernels for various sparse-dense and sparse-sparse products.
1) Multiplication between a sparse matrix and a dense matrix is supported, resulting in a dense matrix.
2) Multiplication involving the transpose of a sparse matrix is also supported.

Note: Sparse-sparse matrix multiplication is currently not available. Support for this operation is planned for an upcoming release.

Code

Output

 
#include <iostream>
#include <cla3p/dense.hpp>
#include <cla3p/sparse.hpp>
#include <culite/dense.hpp>
#include <culite/sparse.hpp>
#include <culite/algebra.hpp>
 
int main()
{
    cla3p::coo::RdMatrix hostAcoo(5, 5);
    cla3p::coo::RdMatrix hostBcoo(5, 3);
 
    hostAcoo.insert(0,0,1.0);
    hostAcoo.insert(1,1,2.0);
    hostAcoo.insert(2,1,3.0);
    hostAcoo.insert(1,3,4.0);
    hostAcoo.insert(0,0,5.0);
 
    hostBcoo.insert(1,2, 1.3);
    hostBcoo.insert(2,1,-2.1);
    hostBcoo.insert(2,0, 3.5);
    hostBcoo.insert(1,1,-4.4);
    hostBcoo.insert(0,0, 5.3);
 
    cla3p::csr::RdMatrix hostAcsr = hostAcoo.toCsr();
    cla3p::csr::RdMatrix hostBcsr = hostBcoo.toCsr();
 
    cla3p::dns::RdMatrix hostBdns(5, 3);
    hostBdns = 2.;
 
    culite::csr::RdMatrix Acsr;
    culite::csr::RdMatrix Bcsr;
    hostAcsr >> Acsr; // Transfer to GPU
    hostBcsr >> Bcsr; // Transfer to GPU
    std::cout << "A:\n" << Acsr << "\n";
    std::cout << "B:\n" << Bcsr << "\n";
 
    culite::dns::RdMatrix Bdns(5, 3);
    hostBdns >> Bdns; // Transfer to GPU
    std::cout << "B:\n" << Bdns << "\n";
 
    /*
     * Perform the operation (A * B) using operators and the mult function respectively
     */
 
    cla3p::op_t opA = cla3p::op_t::N;
    // cla3p::op_t opB = cla3p::op_t::N;
 
    culite::dns::RdMatrix C1 = Acsr * Bdns;
    std::cout << "C1:\n" << C1 << "\n";
 
    culite::dns::RdMatrix C2(5, 3);
    culite::ops::mult(1., opA, Acsr, Bdns, 0., C2);
    std::cout << "C2:\n" << C2 << "\n";
 
    // Sparse-sparse multiplication is currently not supported...
    // culite::csr::RdMatrix C3 = Acsr * Bcsr;
    // std::cout << "C3:\n" << C3 << "\n";
    //
    // culite::csr::RdMatrix C4 = culite::ops::mult(1., opA, Acsr, opB, Bcsr);
    // std::cout << "C4:\n" << C4 << "\n";
 
    /*
     * Perform the operation (Cx += A * B) using operators and the mult function respectively
     */
    C1 += Acsr * Bdns;
    std::cout << "C1:\n" << C1 << "\n";
 
    culite::ops::mult(1., opA, Acsr, Bdns, 1., C2);
    std::cout << "C2:\n" << C2 << "\n";
 
    // Sparse-sparse addition is currently not supported...
    // C3 += Acsr * Bcsr;
    // std::cout << "C3:\n" << C3 << "\n";
 
    return 0;
}

A:
  #nz |   row   column   value
------------------------------
    0 |     0        0   6.000000e+00
    1 |     1        1   2.000000e+00
    2 |     1        3   4.000000e+00
    3 |     2        1   3.000000e+00
 
B:
  #nz |   row   column   value
------------------------------
    0 |     0        0   5.300000e+00
    1 |     1        1  -4.400000e+00
    2 |     1        2   1.300000e+00
    3 |     2        0   3.500000e+00
    4 |     2        1  -2.100000e+00
 
B:
                0             1             2
0 |  2.000000e+00  2.000000e+00  2.000000e+00
1 |  2.000000e+00  2.000000e+00  2.000000e+00
2 |  2.000000e+00  2.000000e+00  2.000000e+00
3 |  2.000000e+00  2.000000e+00  2.000000e+00
4 |  2.000000e+00  2.000000e+00  2.000000e+00
 
 
C1:
                0             1             2
0 |  1.200000e+01  1.200000e+01  1.200000e+01
1 |  1.200000e+01  1.200000e+01  1.200000e+01
2 |  6.000000e+00  6.000000e+00  6.000000e+00
3 |  0.000000e+00  0.000000e+00  0.000000e+00
4 |  0.000000e+00  0.000000e+00  0.000000e+00
 
 
C2:
                0             1             2
0 |  1.200000e+01  1.200000e+01  1.200000e+01
1 |  1.200000e+01  1.200000e+01  1.200000e+01
2 |  6.000000e+00  6.000000e+00  6.000000e+00
3 |  0.000000e+00  0.000000e+00  0.000000e+00
4 |  0.000000e+00  0.000000e+00  0.000000e+00
 
 
C1:
                0             1             2
0 |  2.400000e+01  2.400000e+01  2.400000e+01
1 |  2.400000e+01  2.400000e+01  2.400000e+01
2 |  1.200000e+01  1.200000e+01  1.200000e+01
3 |  0.000000e+00  0.000000e+00  0.000000e+00
4 |  0.000000e+00  0.000000e+00  0.000000e+00
 
 
C2:
                0             1             2
0 |  2.400000e+01  2.400000e+01  2.400000e+01
1 |  2.400000e+01  2.400000e+01  2.400000e+01
2 |  1.200000e+01  1.200000e+01  1.200000e+01
3 |  0.000000e+00  0.000000e+00  0.000000e+00
4 |  0.000000e+00  0.000000e+00  0.000000e+00