![]() |
cuLite v0.3.1
A lite CUDA C++ Interface
|
cuLite provides specialized classes for managing sparse matrices on NVIDIA GPU devices, optimized for matrices where most elements are zero. These classes support multiple compressed storage formats and leverage cuSPARSE for accelerated sparse linear algebra operations.
The following storage formats are defined within the culite::csr and culite::csc namespaces:
| Format | Description |
|---|---|
| CSR (Compressed Sparse Row) | Row-compressed format, efficient for row-wise operations and SpMV |
| CSC (Compressed Sparse Column) | Column-compressed format, efficient for column-wise operations |
| cuLite Class | Numeric Type | Precision |
|---|---|---|
| culite::csr::RdMatrix, culite::csc::RdMatrix | double | Double precision real |
| culite::csr::RfMatrix, culite::csc::RfMatrix | float | Single precision real |
| culite::csr::CdMatrix, culite::csc::CdMatrix | cuDoubleComplex | Double precision complex |
| culite::csr::CfMatrix, culite::csc::CfMatrix | cuFloatComplex | Single precision complex |
Example:
In cuLite, sparse matrices can be instantiated through various methods, depending on the storage format and whether you require fresh device memory allocation or a wrapper for existing GPU memory.
1) Default Declaration: Creates an empty sparse matrix object with zero rows, columns, and non-zeros. No device memory is allocated.
2) Sized Declaration: Allocates device memory for a sparse matrix of size (m x n) with a specified number of non-zero elements (nnz). The compressed storage arrays are allocated but contain uninitialized values.
| Code | Output |
|---|---|
#include <iostream>
#include <culite/sparse.hpp>
int main()
{
/*
* Double precision real empty matrix
*/
std::cout << A.info("A");
std::cout << B.info("B");
/*
* (3x4) single precision real matrix
*/
culite::csr::RfMatrix C(3,4,7);
std::cout << C.info("C");
/*
* Initialize a (5x2) empty sparse matrix with space for 7 non-zeros
*/
B = culite::csc::RdMatrix(5,2,7);
std::cout << B.info("B");
return 0;
}
std::string info(const std::string &header="") const Get information about the sparse matrix. std::string info(const std::string &header="") const Get information about the sparse matrix. XxMatrix< int_t, real_t > RdMatrix Double precision real matrix. Definition sparse.hpp:62 XxMatrix< int_t, real4_t > RfMatrix Single precision real matrix. Definition sparse.hpp:38 | ==================== A ====================
Datatype............. Real
Precision............ Double (64bit)
Index Precision...... Single (32bit)
Number of rows....... 0
Number of columns.... 0
Number of non zeros.. 0
Rowptr............... 0
Colidx............... 0
Values............... 0
Property............. Unknown
Owner................ No
===========================================
==================== B ====================
Datatype............. Real
Precision............ Double (64bit)
Index Precision...... Single (32bit)
Number of rows....... 0
Number of columns.... 0
Number of non zeros.. 0
Colptr............... 0
Rowidx............... 0
Values............... 0
Property............. Unknown
Owner................ No
===========================================
==================== C ====================
Datatype............. Real
Precision............ Single (32bit)
Index Precision...... Single (32bit)
Number of rows....... 3
Number of columns.... 4
Number of non zeros.. 7
Rowptr............... 0x504c00400
Colidx............... 0x504c00200
Values............... 0x504c00000
Property............. General Full
Owner................ Yes
===========================================
==================== B ====================
Datatype............. Real
Precision............ Double (64bit)
Index Precision...... Single (32bit)
Number of rows....... 5
Number of columns.... 2
Number of non zeros.. 7
Colptr............... 0x504c00a00
Rowidx............... 0x504c00800
Values............... 0x504c00600
Property............. General Full
Owner................ Yes
===========================================
|
Creates a sparse matrix that "views" existing device memory buffers containing the compressed storage arrays (row/column pointers, indices, and values). This avoids copying large datasets and allows wrapping pre-allocated GPU memory.
| Code | Output |
|---|---|
#include <iostream>
#include <culite/support.hpp>
#include <culite/sparse.hpp>
int main()
{
/*
* Allocate space for two csc matrix representations and assume are filled with some values
*/
culite::uint_t nr = 15; // number of rows
culite::uint_t nc = 12; // number of columns
culite::uint_t nnz = 10; // number of non-zeros
culite::int_t *rowptrA = culite::device_alloc_t<culite::int_t >(nr + 1);
culite::int_t *colidxA = culite::device_alloc_t<culite::int_t >(nnz);
culite::real_t *valuesA = culite::device_alloc_t<culite::real_t>(nnz);
culite::int_t *colptrB = culite::device_alloc_t<culite::int_t >(nc + 1);
culite::int_t *rowidxB = culite::device_alloc_t<culite::int_t >(nnz);
culite::real_t *valuesB = culite::device_alloc_t<culite::real_t>(nnz);
/*
* Assign csr pointers in matrix A but do not bind
* A simply hosts, need to manually dealloc csr vectors
*/
culite::csr::RdMatrix A(nr, nc, rowptrA, colidxA, valuesA, false);
std::cout << A.info("A");
/*
* Assign pointer b in matrix B with property and bind
* Assign csc pointers in matrix B with property and bind
* B takes ownership of csc vectors, no free call for the csc vectors is required
*/
cla3p::Property prB = cla3p::Property::General();
culite::csc::RdMatrix B(nc, nc, colptrB, rowidxB, valuesB, true, prB);
std::cout << B.info("B");
/*
* Free a and exit
*/
culite::device_free(rowptrA);
culite::device_free(colidxA);
culite::device_free(valuesA);
return 0;
}
static Property General() T * device_alloc_t(std::size_t n) Allocates typed memory on the device. Definition imalloc.hpp:86 | ==================== A ====================
Datatype............. Real
Precision............ Double (64bit)
Index Precision...... Single (32bit)
Number of rows....... 15
Number of columns.... 12
Number of non zeros.. 0
Rowptr............... 0x504c00000
Colidx............... 0x504c00200
Values............... 0x504c00400
Property............. General Full
Owner................ No
===========================================
==================== B ====================
Datatype............. Real
Precision............ Double (64bit)
Index Precision...... Single (32bit)
Number of rows....... 12
Number of columns.... 12
Number of non zeros.. 0
Colptr............... 0x504c00600
Rowidx............... 0x504c00800
Values............... 0x504c00a00
Property............. General Full
Owner................ Yes
===========================================
|
In cuLite, sparse algebraic operations are performed using matrices in CSR or CSC format. These operations are highly optimized for matrices with a large percentage of zero entries and leverage cuSPARSE for GPU acceleration.
Sparse matrices support in-place scaling using standard operators. GPU-accelerated scaling operations are performed efficiently via cuSPARSE kernels.
| Code | Output |
|---|---|
#include <iostream>
#include <cla3p/sparse.hpp>
#include <culite/sparse.hpp>
#include <culite/algebra.hpp>
int main()
{
cla3p::coo::RdMatrix hostAcoo(5, 5);
hostAcoo.insert(0,0,1.0);
hostAcoo.insert(1,1,2.0);
hostAcoo.insert(2,1,3.0);
hostAcoo.insert(1,3,4.0);
hostAcoo.insert(0,0,5.0);
cla3p::csr::RdMatrix hostA = hostAcoo.toCsr();
hostA >> A; // Transfer to GPU
std::cout << "A:\n" << A << "\n";
/*
* Scale A using operators and the scale function respectively
*/
A *= 2.;
std::cout << "A *= 2:\n" << A << "\n";
A.iscale(.5);
std::cout << "A.iscale(.5):\n" << A << "\n";
culite::csr::RdMatrix B = 2. * A ;
std::cout << "B:\n" << B << "\n";
return 0;
}
XxMatrix< int_t, real_t > RdMatrix XxMatrix< int_t, real_t > RdMatrix | A:
#nz | row column value
------------------------------
0 | 0 0 6.000000e+00
1 | 1 1 2.000000e+00
2 | 1 3 4.000000e+00
3 | 2 1 3.000000e+00
A *= 2:
#nz | row column value
------------------------------
0 | 0 0 1.200000e+01
1 | 1 1 4.000000e+00
2 | 1 3 8.000000e+00
3 | 2 1 6.000000e+00
A.iscale(.5):
#nz | row column value
------------------------------
0 | 0 0 6.000000e+00
1 | 1 1 2.000000e+00
2 | 1 3 4.000000e+00
3 | 2 1 3.000000e+00
B:
#nz | row column value
------------------------------
0 | 0 0 1.200000e+01
1 | 1 1 4.000000e+00
2 | 1 3 8.000000e+00
3 | 2 1 6.000000e+00
|
Sparse matrices can be added together using overloaded operators or member functions. These operations leverage cuSPARSE for high-performance sparse-sparse addition.
| Code | Output |
|---|---|
#include <iostream>
#include <cla3p/sparse.hpp>
#include <culite/sparse.hpp>
#include <culite/algebra.hpp>
int main()
{
cla3p::coo::RdMatrix hostAcoo(5, 5);
cla3p::coo::RdMatrix hostBcoo(5, 5);
hostAcoo.insert(0,0,1.0);
hostAcoo.insert(1,1,2.0);
hostAcoo.insert(2,1,3.0);
hostAcoo.insert(1,3,4.0);
hostAcoo.insert(0,0,5.0);
hostBcoo.insert(0,0,-1.3);
hostBcoo.insert(1,2, 2.2);
hostBcoo.insert(2,1,-3.1);
hostBcoo.insert(2,4, 4.6);
hostBcoo.insert(1,1,-5.7);
cla3p::csr::RdMatrix hostA = hostAcoo.toCsr();
cla3p::csr::RdMatrix hostB = hostBcoo.toCsr();
hostA >> A; // Transfer to GPU
hostB >> B; // Transfer to GPU
/*
* Perform the operation (A + 2 * B) using operators and the add function respectively
*/
culite::csr::RdMatrix C1 = 3. * A + 2. * B;
std::cout << "C1:\n" << C1 << "\n";
culite::csr::RdMatrix C2 = culite::ops::add(3., A, 2., B);
std::cout << "C2:\n" << C2 << "\n";
/*
* Perform the operation (Cx += 3 * A) using operators and the update function respectively
*/
C1 += 3. * A;
std::cout << "C1:\n" << C1 << "\n";
culite::ops::update(3., A, C2);
std::cout << "C2:\n" << C2 << "\n";
return 0;
}
void add(T_Scalar alpha, const dns::XxVector< T_Scalar > &x, T_Scalar beta, const dns::XxVector< T_Scalar > &y, dns::XxVector< T_Scalar > &z, CuBlasHandler &cublasHandler=globalCuBlasHandler()) Adds two compatible scaled dense vectors. void update(T_Scalar alpha, const dns::XxVector< T_Scalar > &x, dns::XxVector< T_Scalar > &y, CuBlasHandler &cublasHandler=globalCuBlasHandler()) Update a dense vector with a compatible scaled dense vector. | ==================== A ====================
Datatype............. Real
Precision............ Double (64bit)
Index Precision...... Single (32bit)
Number of rows....... 5
Number of columns.... 5
Number of non zeros.. 4
Rowptr............... 0x504c00400
Colidx............... 0x504c00200
Values............... 0x504c00000
Property............. General Full
Owner................ Yes
===========================================
#nz | row column value
------------------------------
0 | 0 0 6.000000e+00
1 | 1 1 2.000000e+00
2 | 1 3 4.000000e+00
3 | 2 1 3.000000e+00
==================== B ====================
Datatype............. Real
Precision............ Double (64bit)
Index Precision...... Single (32bit)
Number of rows....... 5
Number of columns.... 5
Number of non zeros.. 5
Rowptr............... 0x504c00a00
Colidx............... 0x504c00800
Values............... 0x504c00600
Property............. General Full
Owner................ Yes
===========================================
#nz | row column value
------------------------------
0 | 0 0 -1.300000e+00
1 | 1 1 -5.700000e+00
2 | 1 2 2.200000e+00
3 | 2 1 -3.100000e+00
4 | 2 4 4.600000e+00
C1:
#nz | row column value
------------------------------
0 | 0 0 1.540000e+01
1 | 1 1 -5.400000e+00
2 | 1 2 4.400000e+00
3 | 1 3 1.200000e+01
4 | 2 1 2.800000e+00
5 | 2 4 9.200000e+00
C2:
#nz | row column value
------------------------------
0 | 0 0 1.540000e+01
1 | 1 1 -5.400000e+00
2 | 1 2 4.400000e+00
3 | 1 3 1.200000e+01
4 | 2 1 2.800000e+00
5 | 2 4 9.200000e+00
C1:
#nz | row column value
------------------------------
0 | 0 0 3.340000e+01
1 | 1 1 6.000000e-01
2 | 1 2 4.400000e+00
3 | 1 3 2.400000e+01
4 | 2 1 1.180000e+01
5 | 2 4 9.200000e+00
C2:
#nz | row column value
------------------------------
0 | 0 0 3.340000e+01
1 | 1 1 6.000000e-01
2 | 1 2 4.400000e+00
3 | 1 3 2.400000e+01
4 | 2 1 1.180000e+01
5 | 2 4 9.200000e+00
|
You can perform multiplication between a sparse matrix and a dense vector using the * operator. The result is always a dense vector on GPU. These operations use cuSPARSE SpMV routines.
| Code | Output |
|---|---|
#include <iostream>
#include <cla3p/dense.hpp>
#include <cla3p/sparse.hpp>
#include <culite/dense.hpp>
#include <culite/sparse.hpp>
#include <culite/algebra.hpp>
int main()
{
cla3p::coo::RdMatrix hostAcoo(5, 5);
hostAcoo.insert(0,0,1.0);
hostAcoo.insert(1,1,2.0);
hostAcoo.insert(2,1,3.0);
hostAcoo.insert(1,3,4.0);
hostAcoo.insert(0,0,5.0);
cla3p::csr::RdMatrix hostA = hostAcoo.toCsr();
cla3p::dns::RdVector hostX(5);
hostX = 2.;
hostA >> A; // Transfer to GPU
std::cout << "A:\n" << A << "\n";
culite::dns::RdVector x(5);
hostX >> x; // Transfer to GPU
std::cout << "x:\n" << x << "\n";
/*
* Perform the operation (A * x) using operators and the mult function respectively
*/
culite::dns::RdVector y1 = A * x;
std::cout << "y1:\n" << y1 << "\n";
culite::dns::RdVector y2(5);
culite::ops::mult(1., cla3p::op_t::N, A, x, 0., y2);
std::cout << "y2:\n" << y2 << "\n";
/*
* Perform the operation (y1 += A * x) using operators and the mult function respectively
*/
y1 += A * x;
std::cout << "y1:\n" << y1 << "\n";
culite::ops::mult(1., cla3p::op_t::N, A, x, 1., y2);
std::cout << "y2:\n" << y2 << "\n";
return 0;
}
XxVector< real_t > RdVector void mult(T_Scalar alpha, op_t opA, const dns::XxMatrix< T_Scalar > &A, op_t opB, const dns::XxMatrix< T_Scalar > &B, T_Scalar beta, dns::XxMatrix< T_Scalar > &C, CuBlasHandler &cuBlasHandler=globalCuBlasHandler()) Updates a general matrix with a matrix-matrix product. XxVector< real_t > RdVector Double precision real device vector. Definition dense.hpp:31 | A:
#nz | row column value
------------------------------
0 | 0 0 6.000000e+00
1 | 1 1 2.000000e+00
2 | 1 3 4.000000e+00
3 | 2 1 3.000000e+00
x:
0
0 | 2.000000e+00
1 | 2.000000e+00
2 | 2.000000e+00
3 | 2.000000e+00
4 | 2.000000e+00
y1:
0
0 | 1.200000e+01
1 | 1.200000e+01
2 | 6.000000e+00
3 | 0.000000e+00
4 | 0.000000e+00
y2:
0
0 | 1.200000e+01
1 | 1.200000e+01
2 | 6.000000e+00
3 | 0.000000e+00
4 | 0.000000e+00
y1:
0
0 | 2.400000e+01
1 | 2.400000e+01
2 | 1.200000e+01
3 | 0.000000e+00
4 | 0.000000e+00
y2:
0
0 | 2.400000e+01
1 | 2.400000e+01
2 | 1.200000e+01
3 | 0.000000e+00
4 | 0.000000e+00
|
Sparse matrices can be multiplied with other matrices, provided their dimensions are compatible. cuSPARSE provides optimized kernels for various sparse-dense and sparse-sparse products.
1) Multiplication between a sparse matrix and a dense matrix is supported, resulting in a dense matrix.
2) Multiplication involving the transpose of a sparse matrix is also supported.
| Code | Output |
|---|---|
#include <iostream>
#include <cla3p/dense.hpp>
#include <cla3p/sparse.hpp>
#include <culite/dense.hpp>
#include <culite/sparse.hpp>
#include <culite/algebra.hpp>
int main()
{
cla3p::coo::RdMatrix hostAcoo(5, 5);
cla3p::coo::RdMatrix hostBcoo(5, 3);
hostAcoo.insert(0,0,1.0);
hostAcoo.insert(1,1,2.0);
hostAcoo.insert(2,1,3.0);
hostAcoo.insert(1,3,4.0);
hostAcoo.insert(0,0,5.0);
hostBcoo.insert(1,2, 1.3);
hostBcoo.insert(2,1,-2.1);
hostBcoo.insert(2,0, 3.5);
hostBcoo.insert(1,1,-4.4);
hostBcoo.insert(0,0, 5.3);
cla3p::csr::RdMatrix hostAcsr = hostAcoo.toCsr();
cla3p::csr::RdMatrix hostBcsr = hostBcoo.toCsr();
cla3p::dns::RdMatrix hostBdns(5, 3);
hostBdns = 2.;
culite::csr::RdMatrix Acsr;
culite::csr::RdMatrix Bcsr;
hostAcsr >> Acsr; // Transfer to GPU
hostBcsr >> Bcsr; // Transfer to GPU
std::cout << "A:\n" << Acsr << "\n";
std::cout << "B:\n" << Bcsr << "\n";
culite::dns::RdMatrix Bdns(5, 3);
hostBdns >> Bdns; // Transfer to GPU
std::cout << "B:\n" << Bdns << "\n";
/*
* Perform the operation (A * B) using operators and the mult function respectively
*/
cla3p::op_t opA = cla3p::op_t::N;
// cla3p::op_t opB = cla3p::op_t::N;
culite::dns::RdMatrix C1 = Acsr * Bdns;
std::cout << "C1:\n" << C1 << "\n";
culite::dns::RdMatrix C2(5, 3);
culite::ops::mult(1., opA, Acsr, Bdns, 0., C2);
std::cout << "C2:\n" << C2 << "\n";
// Sparse-sparse multiplication is currently not supported...
// culite::csr::RdMatrix C3 = Acsr * Bcsr;
// std::cout << "C3:\n" << C3 << "\n";
//
// culite::csr::RdMatrix C4 = culite::ops::mult(1., opA, Acsr, opB, Bcsr);
// std::cout << "C4:\n" << C4 << "\n";
/*
* Perform the operation (Cx += A * B) using operators and the mult function respectively
*/
C1 += Acsr * Bdns;
std::cout << "C1:\n" << C1 << "\n";
culite::ops::mult(1., opA, Acsr, Bdns, 1., C2);
std::cout << "C2:\n" << C2 << "\n";
// Sparse-sparse addition is currently not supported...
// C3 += Acsr * Bcsr;
// std::cout << "C3:\n" << C3 << "\n";
return 0;
}
op_t XxMatrix< real_t > RdMatrix XxMatrix< real_t > RdMatrix Double precision real matrix. Definition dense.hpp:55 | A:
#nz | row column value
------------------------------
0 | 0 0 6.000000e+00
1 | 1 1 2.000000e+00
2 | 1 3 4.000000e+00
3 | 2 1 3.000000e+00
B:
#nz | row column value
------------------------------
0 | 0 0 5.300000e+00
1 | 1 1 -4.400000e+00
2 | 1 2 1.300000e+00
3 | 2 0 3.500000e+00
4 | 2 1 -2.100000e+00
B:
0 1 2
0 | 2.000000e+00 2.000000e+00 2.000000e+00
1 | 2.000000e+00 2.000000e+00 2.000000e+00
2 | 2.000000e+00 2.000000e+00 2.000000e+00
3 | 2.000000e+00 2.000000e+00 2.000000e+00
4 | 2.000000e+00 2.000000e+00 2.000000e+00
C1:
0 1 2
0 | 1.200000e+01 1.200000e+01 1.200000e+01
1 | 1.200000e+01 1.200000e+01 1.200000e+01
2 | 6.000000e+00 6.000000e+00 6.000000e+00
3 | 0.000000e+00 0.000000e+00 0.000000e+00
4 | 0.000000e+00 0.000000e+00 0.000000e+00
C2:
0 1 2
0 | 1.200000e+01 1.200000e+01 1.200000e+01
1 | 1.200000e+01 1.200000e+01 1.200000e+01
2 | 6.000000e+00 6.000000e+00 6.000000e+00
3 | 0.000000e+00 0.000000e+00 0.000000e+00
4 | 0.000000e+00 0.000000e+00 0.000000e+00
C1:
0 1 2
0 | 2.400000e+01 2.400000e+01 2.400000e+01
1 | 2.400000e+01 2.400000e+01 2.400000e+01
2 | 1.200000e+01 1.200000e+01 1.200000e+01
3 | 0.000000e+00 0.000000e+00 0.000000e+00
4 | 0.000000e+00 0.000000e+00 0.000000e+00
C2:
0 1 2
0 | 2.400000e+01 2.400000e+01 2.400000e+01
1 | 2.400000e+01 2.400000e+01 2.400000e+01
2 | 1.200000e+01 1.200000e+01 1.200000e+01
3 | 0.000000e+00 0.000000e+00 0.000000e+00
4 | 0.000000e+00 0.000000e+00 0.000000e+00
|