I wrote a CUDA kernel to copy data from one GPU memory buffer to another buffer on the same GPU. Ultimately I need to apply a window function to the data before performing an FFT, so as a first step I measured how fast a plain device-to-device copy can go and compared its effective bandwidth against the cudaMemcpy library function. My kernels achieve only about 20 GB/s, while cudaMemcpy achieves about 98 GB/s (the theoretical limit is 112.128 GB/s). How can I speed up my kernels?
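For context, the eventual kernel will fuse the windowing into the move, roughly like this sketch (the per-column window array of width real coefficients is a hypothetical placeholder; for now I am only benchmarking the raw copy):

#include <cuComplex.h>

// Sketch of the end goal: copy each complex sample while scaling it by the
// real window coefficient of its column (`window` is hypothetical here).
__global__ void window_copy(const cuComplex *in, cuComplex *__restrict__ out,
                            const float *window, int width, int height)
{
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= width * height)
        return;
    float w = window[gid % width];  // column index = gid % width (row-major)
    out[gid] = make_cuComplex(in[gid].x * w, in[gid].y * w);
}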
My kernels are as follows:
#include <cuComplex.h>

__global__ void copy_kernel1(const cuComplex *in, cuComplex *__restrict__ out, int width, int height)
{
    __shared__ cuComplex shData[16][16];
    // map from threadIdx/blockIdx to the data position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    // calculate the global id into the one-dimensional array
    int gid = x + y * width;
    // guard with a flag rather than an early return, so that every thread
    // in the block still reaches the __syncthreads() barrier
    bool inBounds = (x < width && y < height);
    // load shared memory
    if (inBounds)
        shData[threadIdx.y][threadIdx.x] = in[gid];
    // synchronize threads; not really needed for a pure copy, kept for convenience
    __syncthreads();
    // write data back to global memory
    if (inBounds)
        out[gid] = shData[threadIdx.y][threadIdx.x];
}
__global__ void copy_kernel2(const cuComplex *in, cuComplex *__restrict__ out, int width, int height)
{
    // flatten the 2D grid/block indices into one linear thread id
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
    if (threadId >= width * height)
        return;
    // stage the element through a register, then write it back
    cuComplex data = in[threadId];
    out[threadId] = data;
}
__global__ void copy_kernel3(const cuComplex *in, cuComplex *__restrict__ out, int width, int height)
{
    // one thread per row: each thread copies all `width` elements of its row
    int threadId = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (threadId >= height)
        return;
    for (int i = 0; i < width; i++)
        out[threadId * width + i] = in[threadId * width + i];
}
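As a next step I am also considering a vectorized, grid-stride variant along these lines (untested sketch: it assumes the buffers are 16-byte aligned, which cudaMalloc guarantees, and that M * N is even so the data divides evenly into float4 elements):

__global__ void copy_kernel4(const float4 *in, float4 *__restrict__ out, int n4)
{
    // grid-stride loop: each iteration moves 16 bytes (two cuComplex values)
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n4; i += gridDim.x * blockDim.x)
        out[i] = in[i];
}

// would be launched as, e.g.:
// int n4 = (M * N) / 2;  // two 8-byte cuComplex per 16-byte float4
// copy_kernel4<<<(n4 + 255) / 256, 256>>>((const float4 *)d_inp, (float4 *)d_out, n4);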
I launch the three kernels above as follows:
#define M 1024 // columns (width)
#define N 3036 // rows (height)
cuComplex *d_inp, *d_out;
cudaMalloc((void **)&d_inp, sizeof(cuComplex) * M * N);
cudaMalloc((void **)&d_out, sizeof(cuComplex) * M * N);
cudaEvent_t start, stop;
float userMemcpyTime;
cudaEventCreate(&start);
cudaEventCreate(&stop);
dim3 dimBlock(16, 16);
dim3 dimGrid((M + 15) / 16, (N + 15) / 16);
cudaEventRecord(start, 0);
copy_kernel1<<<dimGrid, dimBlock>>>(d_inp, d_out, M, N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&userMemcpyTime, start, stop);
printf("User Memcpy1 (D2D) Bandwidth = %.1f GB/s\n", M * N * 8 * 2 / (userMemcpyTime * 1e6));
cudaEventRecord(start, 0);
copy_kernel2<<<dimGrid, dimBlock>>>(d_inp, d_out, M, N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&userMemcpyTime, start, stop);
printf("User Memcpy2 (D2D) Bandwidth = %.1f GB/s\n", M * N * 8 * 2 / (userMemcpyTime * 1e6));
dimBlock.x = 256;
dimBlock.y = 1;
dimGrid.x = (N + 255) / 256;
dimGrid.y = 1;
cudaEventRecord(start, 0);
copy_kernel3<<<dimGrid, dimBlock>>>(d_inp, d_out, M, N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&userMemcpyTime, start, stop);
printf("User Memcpy3 (D2D) Bandwidth = %.1f GB/s\n", M * N * 8 * 2 / (userMemcpyTime * 1e6));
cudaEventRecord(start, 0);
cudaMemcpy(d_out, d_inp, sizeof(cuComplex) * M * N, cudaMemcpyDeviceToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&userMemcpyTime, start, stop);
printf("Lib Memcpy (D2D) Bandwidth = %.1f GB/s\n", M * N * 8 * 2 / (userMemcpyTime * 1e6));
I am getting the following results:
User Memcpy1 (D2D) Bandwidth = 20.9 GB/s
User Memcpy2 (D2D) Bandwidth = 23.7 GB/s
User Memcpy3 (D2D) Bandwidth = 23.3 GB/s
Lib Memcpy (D2D) Bandwidth = 97.9 GB/s
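(So cudaMemcpy reaches 97.9 / 112.128 ≈ 87% of the theoretical peak, while my kernels manage only about 20% of it.)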
Why are the user kernels about 4 times slower than cudaMemcpy? How can I speed them up?