I wrote a CUDA kernel to copy data from one GPU memory buffer to another buffer on the same GPU. Ultimately I need to apply a window function to the data before performing an FFT, so as a first step I measured how fast a plain device-to-device copy can go and compared its effective bandwidth against the cudaMemcpy library function. My kernels achieve only about 20 GB/s, while cudaMemcpy achieves about 98 GB/s (the theoretical limit is 112.128 GB/s). How can I speed up my kernels?
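For context, the eventual kernel will fuse the windowing into the move, roughly like this sketch (the per-column window array of width real coefficients is a hypothetical placeholder; for now I am only benchmarking the raw copy):

#include <cuComplex.h>

// Sketch of the end goal: copy each complex sample while scaling it by the
// real window coefficient of its column (`window` is hypothetical here).
__global__ void window_copy(const cuComplex *in, cuComplex *__restrict__ out,
                            const float *window, int width, int height)
{
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= width * height)
        return;
    float w = window[gid % width];  // column index = gid % width (row-major)
    out[gid] = make_cuComplex(in[gid].x * w, in[gid].y * w);
}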
My kernels are as follows:
#include <cuComplex.h>

__global__ void copy_kernel1(const cuComplex *in, cuComplex *__restrict__ out, int width, int height)
{
    __shared__ cuComplex shData[16][16];
    // map from threadIdx/blockIdx to the data position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    // calculate the global id into the one-dimensional array
    int gid = x + y * width;
    // guard with a flag rather than an early return, so that every thread
    // in the block still reaches the __syncthreads() barrier
    bool inBounds = (x < width && y < height);
    // load shared memory
    if (inBounds)
        shData[threadIdx.y][threadIdx.x] = in[gid];
    // synchronize threads; not really needed for a pure copy, kept for convenience
    __syncthreads();
    // write data back to global memory
    if (inBounds)
        out[gid] = shData[threadIdx.y][threadIdx.x];
}
__global__ void copy_kernel2(const cuComplex *in, cuComplex *__restrict__ out, int width, int height)
{
    // flatten the 2D grid/block indices into one linear thread id
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
    if (threadId >= width * height)
        return;
    // stage the element through a register, then write it back
    cuComplex data = in[threadId];
    out[threadId] = data;
}
__global__ void copy_kernel3(const cuComplex *in, cuComplex *__restrict__ out, int width, int height)
{
    // one thread per row: each thread copies all `width` elements of its row
    int threadId = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (threadId >= height)
        return;
    for (int i = 0; i < width; i++)
        out[threadId * width + i] = in[threadId * width + i];
}
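As a next step I am also considering a vectorized, grid-stride variant along these lines (untested sketch: it assumes the buffers are 16-byte aligned, which cudaMalloc guarantees, and that M * N is even so the data divides evenly into float4 elements):

__global__ void copy_kernel4(const float4 *in, float4 *__restrict__ out, int n4)
{
    // grid-stride loop: each iteration moves 16 bytes (two cuComplex values)
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n4; i += gridDim.x * blockDim.x)
        out[i] = in[i];
}

// would be launched as, e.g.:
// int n4 = (M * N) / 2;  // two 8-byte cuComplex per 16-byte float4
// copy_kernel4<<<(n4 + 255) / 256, 256>>>((const float4 *)d_inp, (float4 *)d_out, n4);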
I launch the three kernels above as follows:
#define M 1024 // columns (width)
#define N 3036 // rows (height)
cuComplex *d_inp, *d_out;
cudaMalloc((void **)&d_inp, sizeof(cuComplex) * M * N);
cudaMalloc((void **)&d_out, sizeof(cuComplex) * M * N);
cudaEvent_t start, stop;
float userMemcpyTime;
cudaEventCreate(&start);
cudaEventCreate(&stop);
dim3 dimBlock(16, 16);
dim3 dimGrid((M + 15) / 16, (N + 15) / 16);
cudaEventRecord(start, 0);
copy_kernel1<<<dimGrid, dimBlock>>>(d_inp, d_out, M, N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&userMemcpyTime, start, stop);
printf("User Memcpy1 (D2D) Bandwidth = %.1f GB/s\n", M * N * 8 * 2 / (userMemcpyTime * 1e6));
cudaEventRecord(start, 0);
copy_kernel2<<<dimGrid, dimBlock>>>(d_inp, d_out, M, N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&userMemcpyTime, start, stop);
printf("User Memcpy2 (D2D) Bandwidth = %.1f GB/s\n", M * N * 8 * 2 / (userMemcpyTime * 1e6));
dimBlock.x = 256;
dimBlock.y = 1;
dimGrid.x = (N + 255) / 256;
dimGrid.y = 1;
cudaEventRecord(start, 0);
copy_kernel3<<<dimGrid, dimBlock>>>(d_inp, d_out, M, N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&userMemcpyTime, start, stop);
printf("User Memcpy3 (D2D) Bandwidth = %.1f GB/s\n", M * N * 8 * 2 / (userMemcpyTime * 1e6));
cudaEventRecord(start, 0);
cudaMemcpy(d_out, d_inp, sizeof(cuComplex) * M * N, cudaMemcpyDeviceToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&userMemcpyTime, start, stop);
printf("Lib Memcpy (D2D) Bandwidth = %.1f GB/s\n", M * N * 8 * 2 / (userMemcpyTime * 1e6));
I am getting the following results:
User Memcpy1 (D2D) Bandwidth = 20.9 GB/s
User Memcpy2 (D2D) Bandwidth = 23.7 GB/s
User Memcpy3 (D2D) Bandwidth = 23.3 GB/s
Lib Memcpy (D2D) Bandwidth = 97.9 GB/s
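(So cudaMemcpy reaches 97.9 / 112.128 ≈ 87% of the theoretical peak, while my kernels manage only about 20% of it.)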
Why are the user kernels about 4 times slower than cudaMemcpy? How can I speed them up?