c++ - CUDA在内核函数中嵌套循环后未执行代码(CUDA not executing code after nested loop in kernel function)

Question

Welcome To Ask or Share your Answers For Others

c++ - CUDA在内核函数中嵌套循环后未执行代码(CUDA not executing code after nested loop in kernel function)

posted Feb 21, 2021 in Technique[技术] by 深蓝 (71.8m points)

c++ - CUDA在内核函数中嵌套循环后未执行代码(CUDA not executing code after nested loop in kernel function)

I am trying to apply Gaussian Filter on image using CUDA.

(我正在尝试使用CUDA在图像上应用高斯滤波器。)

// Host driver from the question: builds a 5x5 filter, loads "konik.bmp",
// runs the greyScale and CreateGaussFilter kernels on the GPU and writes the
// result to "output.bmp".
int main() {

    double GKernel[5][5];
    FilterCreation(GKernel);
    // Flatten the 5x5 filter row by row into a 25-element array for upload.
    double * kernel = new double[25];
    // NOTE(review): the outer i and j declared here are shadowed by the loop
    // variables below and never used; only k is.
    int i,j,k = 0;
    for( int i = 0; i < 5; i++){
     for( int j = 0; j < 5; j++){
       kernel[k++] = GKernel[i][j];
     }
    }

    double * deviceKernel;

    // NOTE(review): none of the CUDA calls in this function have their return
    // status checked.
    cudaMalloc((void **)&deviceKernel, 25 * sizeof(double));
    cudaMemcpy(deviceKernel, kernel, 25 * sizeof(double), cudaMemcpyHostToDevice);

    Info dat = readBMP("konik.bmp");
    unsigned char * devPtr;
    unsigned char * devPtrFilter;
    // NOTE(review): pitch is declared but never used.
    size_t pitch;

    // Host buffer for the filtered pixels: dat.size - dat.offset bytes.
    unsigned char * test= new unsigned char  [dat.size - dat.offset ];

    cudaMalloc (( void **)& devPtr , dat.size * sizeof ( unsigned char  ));
    cudaMalloc (( void **)& devPtrFilter , dat.size * sizeof ( unsigned char  ));

    // NOTE(review): this reads dat.size bytes from dat.data, but readBMP
    // allocates dat.data with only (file size - offset + 1) bytes.
    cudaMemcpy ( devPtr , dat.data , sizeof ( unsigned char ) *  dat.size , cudaMemcpyHostToDevice );

        // NOTE(review): both launch arguments are scalars, so this is a 1D
        // grid of dat.height blocks with dat.width threads each; the .y
        // indices used inside the kernels are therefore always 0.
        greyScale<<<dat.height,dat.width>>>(devPtr,dat.height,dat.width);
        CreateGaussFilter<<<dat.height,dat.width>>>(devPtr,devPtrFilter,5,5,deviceKernel,dat.height,dat.width);

    // NOTE(review): copies dat.size bytes into test, which was allocated with
    // dat.size - dat.offset bytes.
    cudaMemcpy ( test, devPtrFilter , sizeof ( unsigned char )  *  dat.size ,cudaMemcpyDeviceToHost );
    cudaDeviceSynchronize ();

    // Write the saved header bytes followed by the filtered pixel bytes.
    ofstream fout;
    fout.open("output.bmp", ios::binary | ios::out);
    fout.write( reinterpret_cast<char *>(dat.info), dat.offset);

    fout.write( reinterpret_cast<char *>(test), dat.size - dat.offset );
    fout.close();
    return 0;
}

First I call the greyScale kernel, which works as it should.

(首先，我调用greyScale内核，它可以正常工作。)

// Kernel from the question: overwrites the three bytes of pixel (i, j) in an
// interleaved 3-byte-per-pixel buffer with a single intensity value.
//   src  - device pixel buffer, modified in place
//   rows - image height, bound for i
//   cols - image width, bound for j
__global__ void greyScale( unsigned char * src , int rows, int cols){

        // NOTE(review): the three terms are multiplied together; the usual
        // global index is blockDim * blockIdx + threadIdx.
        int i = blockDim.x * blockIdx.x * threadIdx.x;
        int j = blockDim.y * blockIdx.y * threadIdx.y;

        // Threads that fall outside the image do nothing.
        if( i >= rows || j >= cols ) {
                return;
        }
            unsigned char r = src[3 * (i * cols + j)];
            unsigned char g = src[3 * (i * cols + j) + 1];
            unsigned char b = src[3 * (i * cols + j) + 2];

            // NOTE(review): b is read but contributes nothing (the last term
            // is the constant 0); 0.7512 may be a transposition of the
            // Rec.709 green weight 0.7152 - confirm.
            unsigned char linearIntensity = (unsigned char)(0.2126f * r + 0.7512f * g + 0);

            // Store the same intensity in all three channels.
            src[3 * (i * cols + j)] = linearIntensity;
            src[3 * (i * cols + j) + 1] = linearIntensity;
            src[3 * (i * cols + j) + 2] = linearIntensity;

}

But what bothers me is my CreateGaussFilter kernel function:

(但是令我困扰的是我的CreateGaussFilter内核函数：)

// Kernel from the question: for one interior pixel, sums channel-0 values of
// the surrounding kernalHeight x kernalWidth window weighted by kernalArray
// and stores the rounded sum in all three channels of that pixel in dst.
//   src         - device input image, 3 bytes per pixel
//   dst         - device output image, 3 bytes per pixel
//   kernalArray - device array of filter weights
//   rows, cols  - image height and width
__global__ void CreateGaussFilter(unsigned char * src, unsigned char * dst, int kernalHeight, int kernalWidth, double *kernalArray, int rows, int cols){
    // Half-sizes of the window; pixels closer than this to an edge are skipped.
    int verticleImageBound=(kernalHeight-1)/2;
    int horizontalImageBound=(kernalWidth-1)/2;

    // NOTE(review): the three terms are multiplied together; the usual
    // global index is blockDim * blockIdx + threadIdx.
    int row = ( blockDim.x * blockIdx.x * threadIdx.x ) + verticleImageBound;
    int col = ( blockDim.y * blockIdx.y * threadIdx.y ) + horizontalImageBound;

    if ( row >= rows - verticleImageBound || col >= cols - horizontalImageBound ) return;
            float  value=0;
            for(int kRow=0;kRow<kernalHeight;kRow++){
                  for(int kCol=0;kCol<kernalWidth;kCol++){
                  // NOTE(review): the weight index is kRow + kCol * kernalWidth,
                  // i.e. transposed relative to a row-by-row flattening.
                  float pixel = src[ 3 * ((kRow+row-verticleImageBound ) * cols + (kCol+col-horizontalImageBound)) ] * kernalArray[kRow + kCol * kernalWidth];
                  value+=pixel;
             }
            }

            // NOTE(review): %i is used to print a float, and the string
            // literal is split across two lines in this listing.
            printf("value = %i
",value);

            // Only pixels that passed the interior test above are written.
            dst[3 * ( row * cols + col )] = round(value);
            dst[3 * ( row * cols + col ) + 1] = round(value);
            dst[3 * ( row * cols + col ) + 2] = round(value);

}

It actually outputs a black image (the same code works when I run it on the CPU without CUDA). After debugging with printf, it seems that the code after the for loop is never executed, and I'm not sure why: the boundaries work in the non-CUDA version, and the block sizes are the same as for the greyScale kernel, which works.

(它实际上输出黑色图像（当我在没有CUDA的CPU上运行它时可以正常工作），使用printf调试后，似乎代码不会在for循环后执行，我不确定为什么，边界在非CUDA版本中有效并且块大小相同适用于有效的greyScale内核。)

Why isn't the code after the for loop executed?

(为什么在for循环之后不执行代码？)

Is an error occurring somewhere and shutting down the thread?

(是否在某处发生错误并关闭了线程？)

I'm quite confused.

(我很困惑。)

Code for loading images:

(加载图像的代码：)

// Result of readBMP: selected header fields plus two heap buffers holding
// the file's leading bytes and the bytes that follow them.
struct Info{
    int width;             // 32-bit value read from file byte 18
    int height;            // 32-bit value read from file byte 22
    int offset;            // 32-bit value read from file byte 10; split point between info and data
    unsigned char * info;  // copy of the first `offset` bytes of the file
    unsigned char * data;  // copy of the file bytes from `offset` to the end

    int size;              // total file size in bytes (not just the pixel bytes)
};

// Reads the whole file `filename` into memory and returns an Info holding
// three header fields, the total file size, and separate copies of the bytes
// before and after the header's data offset.
// NOTE(review): nothing checks that the file opened or that the read succeeded.
Info readBMP(char* filename)
{
    int i;
    std::ifstream is(filename, std::ifstream::binary);
    // Seek to the end to learn the file size, then rewind.
    is.seekg(0, is.end);
    i = is.tellg();
    is.seekg(0);
    unsigned char *info = new unsigned char[i];
    is.read((char *)info,i);

    // Header fields are read straight out of the byte buffer through int*
    // casts at byte offsets 18, 22 and 10.
    int width = *(int*)&info[18];
    int height = *(int*)&info[22];
    int offset = *(int*)&info[10];

    // NOTE(review): runtime-sized stack array (not standard C++).
    unsigned char a[offset];
    unsigned char *b = new unsigned char[i - offset];
    // a receives the first `offset` bytes, b the remainder of the file.
    std::copy(info,
              info + offset,
              a);

    std::copy(info + offset,
              info + i,
              b + 0);

    Info dat;
    dat.width = width;
    dat.height = height;
    dat.offset = offset;
    dat.size = i;
    // NOTE(review): offset - 1 bytes are allocated here, but the loop below
    // writes `offset` bytes into dat.info.
    dat.info = new unsigned char[offset - 1];
    dat.data = new unsigned char[i - offset + 1];

    for( int j = 0; j < offset ; j++ ){
        dat.info[j] = a[j];
    }

    for( int j = 0; j < i - offset; j++ ){
        dat.data[j] = b[j];
    }
    // NOTE(review): `info` and `b` are never released in this function.
    return dat;

}

Thanks for help!

(感谢帮助！)

ask by Darlyn translate from so

与恶龙缠斗过久,自身亦成为恶龙；凝视深渊过久,深渊将回以凝视…

1 Reply

深蓝 · Answer 1 · 2021-02-20T21:51:48+0000

I doubt that either of your kernels is working correctly.

(我怀疑您的两个内核都没有正常工作。)

You have at least 2 issues:

(您至少有2个问题：)

This is not how to create a proper index:
(这不是创建适当索引的方法：)
```
 blockDim.x * blockIdx.x * threadIdx.x 
```
a proper index looks like this:
(正确的索引如下所示：)
```
 blockDim.x * blockIdx.x + threadIdx.x 
```
this error is evident for both .x and .y indices, in both kernels.
(对于两个内核中的.x和.y索引，此错误都很明显。)
You are confused about CUDA kernel launch syntax, eg here: CreateGaussFilter<<<dat.height,dat.width>>> .
(您对CUDA内核启动语法感到困惑，例如： CreateGaussFilter<<<dat.height,dat.width>>> 。)
The first <<<...>>> argument is the number of blocks per grid.
(第一个<<<...>>>变量是每个网格的块数。)
The second is the number of threads per block.
(第二个是每个块的线程数。)
If you pass scalar quantities for both of these arguments (which you do) you will get a 1D grid of 1D threadblocks.
(如果您为这两个参数都传递标量（您正是这样做的），您将得到一个由一维线程块组成的一维网格。)
1D here means that in-kernel, your .y values will always be zero, so this statement: int col = ( blockDim.y * blockIdx.y * threadIdx.y ) + horizontalImageBound;
(1D在这里意味着在内核中，您的.y值将始终为零，因此该语句： int col = ( blockDim.y * blockIdx.y * threadIdx.y ) + horizontalImageBound;)
will result in every thread in your kernel having a col value of horizontalImageBound
(将导致内核中的每个线程的col值均为horizontalImageBound)
In your in-kernel printf statement, %i is not the correct format specifier for a float quantity.
(在内核中的printf语句中， %i不是float的正确格式参数。)

You should study any CUDA 2D kernel code for proper usage.

(您应该学习任何CUDA 2D内核代码以正确使用。)

Changes are needed in both your host code as well as your kernel code.

(您的主机代码和内核代码都需要进行更改。)

A few other notes.

(其他一些注意事项。)

It's nice if you don't strip off the headers your code uses.
(如果您不剥离代码使用的标头，那就太好了。)
Some people trying to help you will want to run your code .
(一些试图帮助您的人将要运行您的代码 。)
Make it easy for them (if you want help from them).
(使他们变得容易（如果您需要他们的帮助）。)
Just my suggestion, as is this entire post of mine.
(这只是我的建议，我的整篇帖子也是如此。)
You are expected to provide a complete example.
(您应该提供一个完整的示例。)
See item 1 here .
(请参阅此处的项目1。)
For example, your posted code does not indicate a definition of FilterCreation anywhere.
(例如，您发布的代码未在任何地方指示FilterCreation的定义。)
And I don't have your konik.bmp , so either indicate how I can get it, or even better, write your code that you post here in such a way that it does not depend on an external file.
(而且我没有您的konik.bmp ，所以要么说明我如何获取它，要么更好地编写您在此处发布的代码，而不依赖于外部文件。)
For example create a dummy image in code, and skip the file load process.
(例如，在代码中创建虚拟图像，然后跳过文件加载过程。)
This doesn't have to be that hard.
(这不必那么难。)
Take what you've posted and create a new project with just that code.
(使用您发布的内容并仅使用该代码创建一个新项目。)
Does it compile?
(它可以编译吗？)
If not, keep adding to your posting until it compiles.
(如果没有，请继续添加到您的帖子中，直到它被编译为止。)
Then does your posted code reproduce the issue?
(然后，您发布的代码是否可以重现问题？)
If not, keep adjusting till it does.
(如果不能，请继续调整，直到能够重现问题为止。)
In other words, put yourself in the place of those trying to help you.
(换句话说，请设身处地为那些试图帮助您的人着想。)
Again, just suggestions.
(同样，只是建议。)

What follows is a code I attempted to build around what you have shown, while avoiding the issues I mentioned above.

(接下来的代码是我尝试围绕您所显示的内容构建的，同时避免了上面提到的问题。)

I make no claim that it produces the correct output, but should give you an idea how to fix some of the mistakes indicated above.

(我不声称它会产生正确的输出，但是应该给您一个解决上面指出的一些错误的想法。)

#include <iostream>
#include <fstream>
// A BMP file split into its header bytes and its pixel bytes, as produced by
// readBMP (or filled in by hand for a synthetic image).
struct Info {
    int width;              // image width taken from header byte 18
    int height;             // image height taken from header byte 22
    int offset;             // header byte 10: where the pixel bytes start in the file
    unsigned char* info;    // heap copy of the file's first `offset` bytes
    unsigned char* data;    // heap copy of the bytes from `offset` to end of file

    int size;               // size of the whole file in bytes, header included
};

// Decode a 32-bit little-endian integer starting at p. Assembling it from
// bytes avoids the unaligned int* casts and works on any host byte order.
static int readLE32(const unsigned char* p)
{
    const unsigned int v = static_cast<unsigned int>(p[0])
                         | (static_cast<unsigned int>(p[1]) << 8)
                         | (static_cast<unsigned int>(p[2]) << 16)
                         | (static_cast<unsigned int>(p[3]) << 24);
    return static_cast<int>(v);
}

/**
 * Load a BMP file and split it into header bytes and pixel bytes.
 *
 * @param filename  Path of the file to read.
 * @return An Info where
 *           - width / height / offset are the 32-bit header fields at file
 *             bytes 18, 22 and 10,
 *           - size is the size of the WHOLE file in bytes,
 *           - info owns a copy of the first `offset` bytes,
 *           - data owns a copy of the remaining `size - offset` bytes.
 *         Both buffers are allocated with new[]; the caller releases them
 *         with delete[].
 *         On any failure (file cannot be opened, is shorter than the header
 *         fields, is too large for an int, cannot be read completely, or
 *         carries an offset outside the file) every integer field is 0 and
 *         both pointers are null.
 */
Info readBMP(const char* filename)
{
    // Failure value; overwritten only once the file has been validated.
    Info dat;
    dat.width = 0;
    dat.height = 0;
    dat.offset = 0;
    dat.size = 0;
    dat.info = nullptr;
    dat.data = nullptr;

    std::ifstream is(filename, std::ifstream::binary);
    if (!is) {
        return dat;
    }

    // Seek to the end to learn the file size, then rewind.
    is.seekg(0, std::ios::end);
    const std::streamoff length = is.tellg();
    is.seekg(0, std::ios::beg);

    // The last field parsed below ends at byte 25, so at least 26 bytes are
    // needed. The round-trip cast rejects sizes that do not fit in an int
    // (Info::size is an int); a failed tellg() yields -1 and is rejected too.
    const int minHeaderBytes = 26;
    const int fileSize = static_cast<int>(length);
    if (length < minHeaderBytes || static_cast<std::streamoff>(fileSize) != length) {
        return dat;
    }

    unsigned char* raw = new unsigned char[fileSize];
    is.read(reinterpret_cast<char*>(raw), fileSize);
    if (is.gcount() != fileSize) {
        delete[] raw;
        return dat;
    }

    const int offset = readLE32(raw + 10);
    // A split point outside the file would make the copies below run out of
    // bounds.
    if (offset < 0 || offset > fileSize) {
        delete[] raw;
        return dat;
    }

    dat.width = readLE32(raw + 18);
    dat.height = readLE32(raw + 22);
    dat.offset = offset;
    dat.size = fileSize;

    // Allocate exactly what is written: `offset` header bytes and
    // `fileSize - offset` pixel bytes.
    dat.info = new unsigned char[offset];
    dat.data = new unsigned char[fileSize - offset];
    std::copy(raw, raw + offset, dat.info);
    std::copy(raw + offset, raw + fileSize, dat.data);

    delete[] raw;  // the staging buffer is no longer needed
    return dat;
}

/**
 * Convolve channel 0 of an interleaved 3-byte-per-pixel image with a
 * kernalHeight x kernalWidth filter and store the rounded, clamped result in
 * all three channels of the same pixel in dst.
 *
 * Launch layout: 2D grid of 2D blocks. The x dimension walks image rows and
 * the y dimension walks image columns; thread (0, 0) handles the first
 * interior pixel. The grid must cover at least the interior rows/columns;
 * surplus threads return immediately. No shared memory is used.
 *
 * Only interior pixels (those whose whole window lies inside the image) are
 * written. Border pixels of dst are left untouched, so the caller should
 * initialise dst beforehand.
 *
 * @param src           device image, rows * cols * 3 bytes, read only
 * @param dst           device image, rows * cols * 3 bytes
 * @param kernalHeight  filter height in taps
 * @param kernalWidth   filter width in taps
 * @param kernalArray   device array of kernalHeight * kernalWidth weights,
 *                      flattened row by row
 * @param rows          image height in pixels
 * @param cols          image width in pixels
 */
__global__ void CreateGaussFilter(unsigned char * src, unsigned char * dst, int kernalHeight, int kernalWidth, double *kernalArray, int rows, int cols){
    // Taps before the window centre ...
    const int verticleImageBound = (kernalHeight - 1) / 2;
    const int horizontalImageBound = (kernalWidth - 1) / 2;
    // ... and taps after it. These equal the values above for odd sizes but
    // are one larger for even sizes, which the interior test must respect.
    const int tapsBelow = kernalHeight - 1 - verticleImageBound;
    const int tapsRight = kernalWidth - 1 - horizontalImageBound;

    const int row = ( blockDim.x * blockIdx.x + threadIdx.x ) + verticleImageBound;
    const int col = ( blockDim.y * blockIdx.y + threadIdx.y ) + horizontalImageBound;

    // Skip threads whose window would reach past the bottom or right edge.
    if ( row >= rows - tapsBelow || col >= cols - tapsRight ) return;

    float value = 0.0f;
    for (int kRow = 0; kRow < kernalHeight; kRow++) {
        // size_t keeps the byte index from overflowing int on large images.
        const size_t srcRowStart = (size_t)(row + kRow - verticleImageBound) * cols;
        for (int kCol = 0; kCol < kernalWidth; kCol++) {
            const size_t srcCol = (size_t)(col + kCol - horizontalImageBound);
            // Row-major weight lookup, matching the row-by-row flattening on
            // the host. kRow + kCol * kernalWidth would transpose the filter
            // and read out of bounds when kernalHeight < kernalWidth.
            const float pixel = src[3 * (srcRowStart + srcCol)] * kernalArray[kRow * kernalWidth + kCol];
            value += pixel;
        }
    }

    const float rounded = roundf(value);

    // Debug trace only: device printf is serialised and very slow.
    printf("value = %f\n", rounded);

    // Clamp before narrowing: converting an out-of-range float to
    // unsigned char is undefined.
    const unsigned char out = (unsigned char)fminf(fmaxf(rounded, 0.0f), 255.0f);

    const size_t dstBase = 3 * ((size_t)row * cols + col);
    dst[dstBase]     = out;
    dst[dstBase + 1] = out;
    dst[dstBase + 2] = out;
}

/**
 * Convert an interleaved 3-byte-per-pixel image to grey in place: each
 * pixel's three bytes are replaced by its Rec.709 luma,
 * 0.2126 * R + 0.7152 * G + 0.0722 * B, rounded to the nearest integer.
 *
 * Launch layout: 2D grid of 2D blocks. The x dimension walks image rows and
 * the y dimension walks image columns; threads outside the image return
 * immediately. No shared memory is used.
 *
 * @param src   device image, rows * cols * 3 bytes, modified in place;
 *              byte 0 of each pixel is treated as R, byte 1 as G, byte 2 as B
 * @param rows  image height in pixels
 * @param cols  image width in pixels
 *
 * NOTE(review): 24-bit BMP pixel data is normally stored B, G, R. If src is
 * fed straight from a BMP file, the R and B weights are applied to swapped
 * channels - confirm against the caller.
 */
__global__ void greyScale( unsigned char * src , int rows, int cols){

    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    const int j = blockDim.y * blockIdx.y + threadIdx.y;

    if (i >= rows || j >= cols) {
        return;
    }

    // Byte index of this pixel's first channel, computed once; size_t keeps
    // it from overflowing int on large images.
    const size_t base = 3 * ((size_t)i * cols + j);

    const float r = src[base];
    const float g = src[base + 1];
    const float b = src[base + 2];

    // Rec.709 luma weights. The previous code used 0.7512 for green and
    // dropped blue entirely.
    const float luma = 0.2126f * r + 0.7152f * g + 0.0722f * b;

    // Round to nearest instead of truncating, so white stays 255 even when
    // the float sum lands just below it; the clamp guards the narrowing cast.
    const unsigned char linearIntensity = (unsigned char)fminf(luma + 0.5f, 255.0f);

    src[base]     = linearIntensity;
    src[base + 1] = linearIntensity;
    src[base + 2] = linearIntensity;
}

// Evaluate a CUDA runtime call inside main(). On failure, report the call and
// the runtime's error string, then leave main() with exit status 1 (the
// process exits, so buffers still allocated at that point are reclaimed by
// the OS and driver).
#define CUDA_TRY(call)                                                         \
    do {                                                                       \
        const cudaError_t cudaTryStatus_ = (call);                             \
        if (cudaTryStatus_ != cudaSuccess) {                                   \
            std::cerr << "CUDA error at line " << __LINE__ << " (" #call "): " \
                      << cudaGetErrorString(cudaTryStatus_) << std::endl;      \
            return 1;                                                          \
        }                                                                      \
    } while (0)

// Host driver: uploads a 5x5 filter and a synthetic 766x511 image, runs the
// greyScale and CreateGaussFilter kernels, and copies the filtered image
// back to the host. Returns 0 on success, 1 if any CUDA call fails.
int main() {

    // Placeholder filter: this initialiser sets only GKernel[0][0] to 0.1 and
    // leaves the other 24 weights at zero. Substitute real Gaussian weights
    // (FilterCreation in the question) for a meaningful blur.
    double GKernel[5][5] = {0.1};
    //FilterCreation(GKernel);

    // A 2D array is contiguous and row-major, so it can be uploaded directly
    // without an intermediate flattened copy.
    double * deviceKernel = nullptr;
    CUDA_TRY(cudaMalloc((void **)&deviceKernel, sizeof(GKernel)));
    CUDA_TRY(cudaMemcpy(deviceKernel, &GKernel[0][0], sizeof(GKernel), cudaMemcpyHostToDevice));

    // Synthetic stand-in for readBMP("konik.bmp"), so the example runs
    // without an external file.
    Info dat;
    dat.width = 766;
    dat.height = 511;
    dat.size = dat.width*dat.height*3;
    dat.offset = 0;
    dat.info = nullptr;
    // dat.size counts the whole file; the pixel payload is what remains
    // after the header. Sizing every buffer by this keeps the code correct
    // if readBMP (offset > 0) is re-enabled.
    const size_t pixelBytes = (size_t)(dat.size - dat.offset);
    dat.data = new unsigned char[pixelBytes]();   // () zero-fills the image

    unsigned char * test = new unsigned char[pixelBytes];

    unsigned char * devPtr = nullptr;
    unsigned char * devPtrFilter = nullptr;
    CUDA_TRY(cudaMalloc((void **)&devPtr, pixelBytes));
    CUDA_TRY(cudaMalloc((void **)&devPtrFilter, pixelBytes));

    CUDA_TRY(cudaMemcpy(devPtr, dat.data, pixelBytes, cudaMemcpyHostToDevice));
    // CreateGaussFilter writes interior pixels only; clear the output so the
    // border is black rather than uninitialised device memory.
    CUDA_TRY(cudaMemset(devPtrFilter, 0, pixelBytes));

    // Both kernels map x to image rows and y to image columns, so the grid is
    // (ceil(height / block.x), ceil(width / block.y)). 16x16 = 256 threads
    // per block stays well inside the per-block thread limit.
    const dim3 block(16, 16);
    const dim3 grid((dat.height + block.x - 1) / block.x,
                    (dat.width  + block.y - 1) / block.y);

    greyScale<<<grid,block>>>(devPtr,dat.height,dat.width);
    CUDA_TRY(cudaGetLastError());        // launch-configuration errors
    CreateGaussFilter<<<grid,block>>>(devPtr,devPtrFilter,5,5,deviceKernel,dat.height,dat.width);
    CUDA_TRY(cudaGetLastError());
    CUDA_TRY(cudaDeviceSynchronize());   // errors raised while the kernels ran

    CUDA_TRY(cudaMemcpy(test, devPtrFilter, pixelBytes, cudaMemcpyDeviceToHost));
#if 0
    std::ofstream fout;
    fout.open("output.bmp", std::ios::binary | std::ios::out);
    fout.write( reinterpret_cast<char *>(dat.info), dat.offset);

    fout.write( reinterpret_cast<char *>(test), dat.size - dat.offset );
    fout.close();
#endif

    CUDA_TRY(cudaFree(devPtrFilter));
    CUDA_TRY(cudaFree(devPtr));
    CUDA_TRY(cudaFree(deviceKernel));
    delete[] test;
    delete[] dat.data;
    delete[] dat.info;
    return 0;
}

#undef CUDA_TRY

Categories

c++ - CUDA在内核函数中嵌套循环后未执行代码(CUDA not executing code after nested loop in kernel function)

c++ - CUDA在内核函数中嵌套循环后未执行代码(CUDA not executing code after nested loop in kernel function)

Please log in or register to add a comment.

Please log in or register to reply this article.

1 Reply

Please log in or register to add a comment.

Just Browsing Browsing

Most popular tags