Welcome to OGeek Q&A Community for programmer and developer-Open, Learning and Share
Welcome To Ask or Share your Answers For Others

Categories

0 votes
236 views
in Technique[技术] by (71.8m points)

2D Cuda array device to device (D2D) memcopy is slower

I tried writing a CUDA kernel to move data from one memory region on the GPU to another. Basically I need to apply a window function to that data before performing an FFT. I was checking how fast we can move data, and compared the achieved bandwidth against the cudaMemcpy library function. My kernel achieves only about ~20 GB/s while cudaMemcpy achieves about 98 GB/s (the theoretical limit is 112.128 GB/s). How can I speed up my kernel?

My kernel is as follows

// Tile-staged device-to-device copy: each 16x16 thread block stages one
// tile of `in` through shared memory and writes it back to `out`.
// Expects a 2D launch with blockDim == (16,16); `width`/`height` are the
// element dimensions of the row-major 2D array.
__global__ void copy_kernel1(const cuComplex *in, cuComplex *__restrict__ out, int width, int height)
{
    __shared__ cuComplex shData[16][16];

    // map from threadIdx/blockIdx to a 2D element position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard each coordinate separately. The original single test
    // `gid > width*height` was off by one (allowed gid == width*height),
    // and a padding thread with x >= width wrapped into the next row
    // (gid = x + y*width), producing overlapping reads/writes.
    bool inBounds = (x < width) && (y < height);

    // global id into the one-dimensional array
    int gid = x + y * width;

    // Load the tile. Do NOT early-return before the barrier:
    // __syncthreads() must be reached by every thread in the block, so
    // out-of-range threads skip the memory accesses instead of exiting.
    if (inBounds)
        shData[threadIdx.y][threadIdx.x] = in[gid];

    __syncthreads();

    // Write the tile back to global memory. Accesses are coalesced:
    // threadIdx.x is the fastest-varying index in both the shared tile
    // and the global array, so no bank conflicts and contiguous stores.
    if (inBounds)
        out[gid] = shData[threadIdx.y][threadIdx.x];
}

// Flat device-to-device copy: each thread copies exactly one cuComplex
// element. Works with any 2D grid / 2D block launch; block and thread
// indices are linearized into a single element index.
__global__ void copy_kernel2(const cuComplex *in, cuComplex *__restrict__ out, int width, int height)
{
    int blockId  = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = blockId * (blockDim.x * blockDim.y)
                 + threadIdx.y * blockDim.x + threadIdx.x;

    // Off-by-one fix: the original test `threadId > width*height` let
    // threadId == width*height (one past the last element) through,
    // causing an out-of-bounds access in the tail block.
    if (threadId >= width * height)
        return;

    // One 8-byte load and one 8-byte store per thread; the intermediate
    // `vx`/`data` structs in the original were redundant copies.
    out[threadId] = in[threadId];
}

// Flat, coalesced device-to-device copy using a grid-stride loop.
//
// The original version assigned each thread one *row* of `width`
// consecutive elements, so the threads of a warp accessed addresses
// `width` elements apart on every iteration — fully uncoalesced, which is
// the main reason measured bandwidth sat ~4x below cudaMemcpy. It also
// had an off-by-one guard (`threadId > height` admitted row == height).
// Here consecutive threads touch consecutive elements on every iteration,
// and the grid-stride loop is correct for any launch configuration.
__global__ void copy_kernel3(const cuComplex *in, cuComplex *__restrict__ out, int width, int height)
{
    size_t total  = (size_t)width * height;
    size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < total;
         i += stride)
    {
        out[i] = in[i];
    }
}

I am launching the kernel as

#define M     1024   // columns or width
#define N     3036   // rows or height

cuComplex *d_inp, *d_out;
cudaMalloc((void **)&d_inp, sizeof(cuComplex) * M * N);
cudaMalloc((void **)&d_out, sizeof(cuComplex) * M * N);

dim3 dimBlock(16, 16);
dim3 dimGrid((M + 15) / 16, (N + 15) / 16);

// Bytes moved by one full copy: M*N elements * sizeof(cuComplex) (8 bytes
// each), counted twice — one global read plus one global write.
const double bytesMoved = (double)M * N * sizeof(cuComplex) * 2;

// NOTE(review): the first timed launch also pays one-time setup cost
// (module load, caches cold). For stable numbers, run one untimed
// warm-up launch of each kernel before timing.

cudaEventRecord(start, 0);   // was missing a ';' in the original
copy_kernel1<<<dimGrid, dimBlock>>>(d_inp, d_out, M, N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&userMemcpyTime, start, stop);   // milliseconds
// ms * 1e6 converts to GB/s (bytes / (ms * 1e6) == GB / s).
printf("User Memcpy1 (D2D) Bandwidth = %.1f GB/s\n",
       bytesMoved / (userMemcpyTime * 1e6));

cudaEventRecord(start, 0);   // was missing a ';' in the original
copy_kernel2<<<dimGrid, dimBlock>>>(d_inp, d_out, M, N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&userMemcpyTime, start, stop);
printf("User Memcpy2 (D2D) Bandwidth = %.1f GB/s\n",
       bytesMoved / (userMemcpyTime * 1e6));

// copy_kernel3 uses a 1D launch: one thread per row in the original
// formulation (256 threads per block over N rows).
dimBlock.x = 256;
dimBlock.y = 1;
dimGrid.x = (N + 255) / 256;
dimGrid.y = 1;
cudaEventRecord(start, 0);
copy_kernel3<<<dimGrid, dimBlock>>>(d_inp, d_out, M, N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&userMemcpyTime, start, stop);
printf("User Memcpy3 (D2D) Bandwidth = %.1f GB/s\n",
       bytesMoved / (userMemcpyTime * 1e6));

// Reference: the library D2D copy.
cudaEventRecord(start, 0);
cudaMemcpy(d_out, d_inp, sizeof(cuComplex) * M * N, cudaMemcpyDeviceToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&userMemcpyTime, start, stop);
printf("Lib Memcpy (D2D) Bandwidth = %.1f GB/s\n",
       bytesMoved / (userMemcpyTime * 1e6));

I am getting the following results

User Memcpy1 (D2D) Bandwidth = 20.9 GB/s
User Memcpy2 (D2D) Bandwidth = 23.7 GB/s
User Memcpy3 (D2D) Bandwidth = 23.3 GB/s
Lib Memcpy (D2D) Bandwidth = 97.9 GB/s

Why are the user kernels about 4 times slower? How can we speed them up?


与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome To Ask or Share your Answers For Others

1 Reply

0 votes
by (71.8m points)
等待大神答复

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
OGeek|极客中国-欢迎来到极客的世界,一个免费开放的程序员编程交流平台!开放,进步,分享!让技术改变生活,让极客改变未来! Welcome to OGeek Q&A Community for programmer and developer-Open, Learning and Share
Click Here to Ask a Question

1.4m articles

1.4m replys

5 comments

57.0k users

...