Hello-
I'm just starting to learn CUDA programming. I wrote a program that simply adds two arrays and then checks the resulting sums for errors. It works fine without CUDA. When I add the code to run the "add" function on the GPU, the function does not appear to get called. The program compiles and runs, but the output is incorrect. I installed all the Visual Studio CUDA extensions. The add function is called with "add<<<1, 1>>>(N, x, y);".
Can anyone help me out? Below is the code. I'm running a Quadro RTX 4000.
Thanks in advance.
#include<iostream>
#include<math.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
// Element-wise vector add kernel: y[i] = x[i] + y[i] for every i in [0, n).
//
// Parameters:
//   n - number of elements to process; nothing is written when n <= 0.
//   x - input array; must hold at least n floats readable from the device.
//   y - input/output array; must hold at least n floats, updated in place.
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its flat global
// index and advances by the total number of launched threads, so the result
// is correct for any launch configuration, including <<<1, 1>>>.
//
// The `__global__` qualifier (with both pairs of underscores) is what makes
// this function a kernel that can be launched with the <<<...>>> syntax.
__global__ void add(int n, float *x, float *y)
{
    // 64-bit index math so neither the start index nor `i += stride` can
    // overflow, even for large grids or n close to INT_MAX.
    const long long stride = (long long)gridDim.x * blockDim.x;
    const long long first = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    for (long long i = first; i < n; i += stride)
        y[i] = x[i] + y[i];
}
// Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
// `what` names the operation that produced `status`.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Adds two 1M-element arrays with the `add` kernel and verifies the result.
// Returns 0 when every CUDA call succeeded, 1 otherwise. The status of every
// runtime call is checked because a failed kernel launch is otherwise silent:
// the program would keep running and simply print unchanged data.
int main(void) {
    const int N = 1 << 20;
    float* x = nullptr;
    float* y = nullptr;

    // Allocate unified memory - accessible from the CPU or GPU
    if (!cudaCallSucceeded(cudaMallocManaged(&x, N * sizeof(float)),
                           "cudaMallocManaged(x)"))
        return 1;
    if (!cudaCallSucceeded(cudaMallocManaged(&y, N * sizeof(float)),
                           "cudaMallocManaged(y)")) {
        cudaFree(x);
        return 1;
    }

    // Initialize x and y arrays on the host
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    // Run kernel on 1M elements on the GPU
    add<<<1, 1>>>(N, x, y);

    // A launch does not return a status itself: launch-time failures (bad
    // configuration, no kernel image for this GPU architecture, ...) are
    // reported by cudaGetLastError(), and faults during execution are
    // reported by the synchronizing call that waits for the GPU to finish.
    bool ok = cudaCallSucceeded(cudaGetLastError(), "add kernel launch");
    ok = ok && cudaCallSucceeded(cudaDeviceSynchronize(),
                                 "cudaDeviceSynchronize");

    if (ok) {
        // Check for errors (all values should be 3.0f)
        float maxError = 0.0f;
        for (int i = 0; i < N; i++)
            maxError = fmaxf(maxError, fabsf(y[i] - 3.0f));
        std::cout << "Max error: " << maxError << std::endl;
        std::cout << y[0] << std::endl;
    }

    // Free memory
    cudaFree(x);
    cudaFree(y);
    return ok ? 0 : 1;
}