CUDA C++#
The sphericart::cuda::SphericalHarmonics
class automatically initializes and internally stores
pre-factors and buffers, and its usage is similar to the C++ API, although here the class provides
a single unified function for all purposes (values, gradients, and Hessians). This is
illustrated in the example below. The CUDA C++ API is subject to change, but the example below
should be sufficient to get started.
/** @file example.cu
* @brief Usage example for the CUDA C++ API
*/
#include "sphericart_cuda.hpp"
#include <cmath>
#include <cstdio>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
/*host macro that checks for errors in CUDA calls, and prints the file + line
* and error string if one occurs
*/
#define CUDA_CHECK(call) \
do { \
cudaError_t cudaStatus = (call); \
if (cudaStatus != cudaSuccess) { \
std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << " - " \
<< cudaGetErrorString(cudaStatus) << std::endl; \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} while (0)
int main() {
/* ===== set up the calculation ===== */
// hard-coded parameters for the example
size_t n_samples = 10000;
size_t l_max = 10;
// initializes samples
auto xyz = std::vector<double>(n_samples * 3, 0.0);
for (size_t i = 0; i < n_samples * 3; ++i) {
xyz[i] = (double)rand() / (double)RAND_MAX * 2.0 - 1.0;
}
// to avoid unnecessary allocations, calculators can use pre-allocated
// memory, one also can provide uninitialized vectors that will be
// automatically reshaped
auto sph = std::vector<double>(n_samples * (l_max + 1) * (l_max + 1), 0.0);
auto dsph = std::vector<double>(n_samples * 3 * (l_max + 1) * (l_max + 1), 0.0);
auto ddsph = std::vector<double>(n_samples * 3 * 3 * (l_max + 1) * (l_max + 1), 0.0);
// the class is templated, so one can also use 32-bit float operations
auto xyz_f = std::vector<float>(n_samples * 3, 0.0);
for (size_t i = 0; i < n_samples * 3; ++i) {
xyz_f[i] = (float)xyz[i];
}
auto sph_f = std::vector<float>(n_samples * (l_max + 1) * (l_max + 1), 0.0);
auto dsph_f = std::vector<float>(n_samples * 3 * (l_max + 1) * (l_max + 1), 0.0);
auto ddsph_f = std::vector<float>(n_samples * 3 * 3 * (l_max + 1) * (l_max + 1), 0.0);
/* ===== API calls ===== */
// internal buffers and numerical factors are initalized at construction
sphericart::cuda::SphericalHarmonics<double> calculator_cuda(l_max);
double* xyz_cuda;
CUDA_CHECK(cudaMalloc(&xyz_cuda, n_samples * 3 * sizeof(double)));
CUDA_CHECK(
cudaMemcpy(xyz_cuda, xyz.data(), n_samples * 3 * sizeof(double), cudaMemcpyHostToDevice)
);
double* sph_cuda;
CUDA_CHECK(cudaMalloc(&sph_cuda, n_samples * (l_max + 1) * (l_max + 1) * sizeof(double)));
calculator_cuda.compute(xyz_cuda, n_samples, false, false,
sph_cuda); // no gradients */
CUDA_CHECK(cudaMemcpy(
sph.data(), sph_cuda, n_samples * (l_max + 1) * (l_max + 1) * sizeof(double), cudaMemcpyDeviceToHost
));
// float version
sphericart::cuda::SphericalHarmonics<float> calculator_cuda_f(l_max);
float* xyz_cuda_f;
CUDA_CHECK(cudaMalloc(&xyz_cuda_f, n_samples * 3 * sizeof(float)));
CUDA_CHECK(
cudaMemcpy(xyz_cuda_f, xyz_f.data(), n_samples * 3 * sizeof(float), cudaMemcpyHostToDevice)
);
float* sph_cuda_f;
CUDA_CHECK(cudaMalloc(&sph_cuda_f, n_samples * (l_max + 1) * (l_max + 1) * sizeof(float)));
calculator_cuda_f.compute(xyz_cuda_f, n_samples, false, false,
sph_cuda_f); // no gradients */
CUDA_CHECK(cudaMemcpy(
sph_f.data(),
sph_cuda_f,
n_samples * (l_max + 1) * (l_max + 1) * sizeof(float),
cudaMemcpyDeviceToHost
));
/* ===== check results ===== */
double sph_error = 0.0, sph_norm = 0.0;
for (size_t i = 0; i < n_samples * (l_max + 1) * (l_max + 1); ++i) {
sph_error += (sph_f[i] - sph[i]) * (sph_f[i] - sph[i]);
sph_norm += sph[i] * sph[i];
}
printf("Float vs double relative error: %12.8e\n", sqrt(sph_error / sph_norm));
return 0;
}