Black screen when running moving window model

Sep 21, 2015 at 9:56 AM
Edited Sep 21, 2015 at 10:08 AM
When running the GPU code below I get a black screen. Graphics card: Nvidia Quadro K620.

The code is not complete, but the critical stuff is included.

Problem description: A moving circular window scans over a GIS raster, so that for each cell in the raster a neighborhood is evaluated. This evaluation consists of summing the values for all cells in the neighborhood, if the center cell is larger than zero. If the sum exceeds a certain threshold value, the cell at the center of the window will be assigned value 1, otherwise 0. The result is later saved in a new two-dimensional array that is used to create a new raster.

The source raster in this case is a DotSpatial raster object. Raster size = 6784 rows, 3480 columns. The neighborhood radius is 15 cells long.

The CPU-implementation works as expected. The GPU version however gives a black screen at runtime expect for very small neighborhoods (1-2 cells radius). Could there be some error when trying to access indata in the neighborhood?

(Note: For some reason all plus-symbols in the code below are replaced with &#43 when submittting this post, although it looks prefect in Preview mode)

CPU-implementation:
int threshold = 50;
int radiusNbCells = sourceRaster.CellWidth;  // (= 20)
int radiusNbCellsSqr = radiusNbCells * radiusNbCells;
Parallel.For(0, lastRow+1, i =>
{
   for (int j = 0; j <= lastCol; j++)
   {
      if (raster.Value[i, j] > 0)
      {
         var val = getNeighborhoodValue(i, j, radiusNbCells, 
        lastRow, lastCol, sourceRaster, thresHold);
         resultRaster.Value[i, j] = val;
      }
      else
      {
         resultRaster.Value[i, j] = 0;
      }
   }
});
private double getNeighborhoodValue(int i, int j, int radiusNbCells, 
   int lastRow, int lastCol,  IRaster raster, int thresHold)
{
   double tot = 0.0;
   int maxRadiusNbCellsSqr = radiusNbCells* radiusNbCells;
   // Bounding box = cell row +/- radusNbCells, cell column +/- radusNbCells
   for (int row = i - radiusNbCells; row <= i + radiusNbCells; row++)
   {
      if (row > lastRow || row < 0)
         continue;
      int y2 = (row - i) * (row - i);
      for (int col = j - radiusNbCells; col <= j + radiusNbCells; col++)
      {
         if (col > lastCol || col < 0)
            continue;
         if (raster.Value[row, col] > 0.0)
         {
            var x2 = (col - j)*(col - j);
            var distSqr = (x2 + y2);
            if (distSqr <= maxRadiusNbCellsSqr)
               tot += raster.Value[row, col];
         }
      }
   }
   return (tot >= thresHold) ? 1.0 : 0.0;
}
GPU-implementation:
public class HabitatModeGpu
{ 
   public IRaster Execute(IRaster raster, int thresHold)
   { 
     var mb = new float[nbRows, nbCols];
     var cpuMemIn = new float[nbRows, nbCols];
     var resultArray = new float[nbRows, nbCols];
     var resultRaster = new Raster(…)
    
…

     int i2 = 0;
     for (var i = 0; i <= lastRow; i++)
     {
     for (var j = 0; j <= lastCol; j++)
        {
           cpuMemIn[i, j] = (float) raster.Value[i, j];
        }
     }

     var gpu = NewCuda();
     var gpuMemIn = gpu.Allocate(mb);
     var gpuMemOut = gpu.Allocate(resultArray);
     ..

     gpu.CopyToDevice(cpuMemIn, gpuMemIn);

     gpu.Launch(nbRows, 1, NeighborhoodKernel, gpuMemIn, gpuMemOut, radiusNbCells, 
        lastRow, lastCol, thresHold);     
     gpu.Synchronize();  // Required??
     gpu.CopyFromDevice(gpuMemOut, resultArray);

     // Copy to new raster
     Parallel.For(0, lastRow+1, (i) =>
     {
        for (var j = firstCol; j <= lastCol; j++)
        {
          resultRaster.Value[i, j] = resultArray[i, j];
         }
     });
     gpu.FreeAll();
     
     return resultRaster;
   
   }
   public static GPGPU NewCuda()
   {
     GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, 0);
     eArchitecture arch = gpu.GetArchitecture();
     CudafyModule km = CudafyTranslator.Cudafy(arch);
     gpu.LoadModule(km);
     return gpu;
   }
   [Cudafy]
   public static float GetNeighborhoodValue(int i, int j, int radiusNbCells, int lastRow, 
      int lastCol, float[,] dataIn, int thresHold)
   {
      float tot = 0;
      int radiusNbCellsSqr =  radiusNbCells * radiusNbCells;
      for (int row = i - radiusNbCells; row <= i + radiusNbCells; row++)
      {
         if (row > lastRow || row < 0)  continue;
         
         int y2 = (row - i) * (row - i);
         for (int col = j - radiusNbCells; col <= j + radiusNbCells; col++)
         {
            if (col > lastCol || col < 0) continue;

            if (dataIn[row,col] > 0.0f)
            {
               int x2 = (col - j)*(col - j);
               // Check within circle x2 + y2 = maxRadiusSqr expressed in terms of number 
               // of cells
               int dist2 = (x2 + y2);
               if (dist2 <= radiusNbCellsSqr) tot += dataIn[row, col];
            }
         }
      }
      // Reclassify
      return (tot >= thresHold) ? 1.0f : 0.0f;
    }
    [Cudafy]
    public static void NeighborhoodKernel(GThread gThread, float[,] dataIn, float[,]
        dataOut, int radiusNbCells, int lastRow, int lastCol, int thresHold)
    {
       int x = gThread.blockIdx.x;
       int nbCols = lastCol + 1;
       for (int y = 0; y <= lastCol; y++)
       {
          dataOut[x, y] = GetNeighborhoodValue(x, y, radiusNbCells, lastRow, 
            lastCol, dataIn, thresHold);
          
       }
    }
Coordinator
Sep 22, 2015 at 10:23 AM
Could it be that your code makes intensive use of GPU and takes a long time to execute? If so that could be the cause. Options: dedicate a GPU for processing or split up your algorithm such that each launch takes a shorter amount of time.
Sep 25, 2015 at 10:56 AM
Do you have any newbie example for how to dedicate a gpu for processing?
Coordinator
Sep 25, 2015 at 1:44 PM
Don't plug a monitor into the GPU you want to dedicate.
Sep 28, 2015 at 7:01 AM
I forgot to mention that the problem is not only a black screen, but a Cudafy ErrorUnknown exception occurs. Tried to unplug the monitor from the GPU used for processing, but the problem persists.
Sep 29, 2015 at 12:47 PM
Hi
Is nbRows=6784 ?
Anyhow, your problem is here:
gpu.Launch(nbRows, 1, ...
you are launching 6784 blocks of 1 thread each.
Btw, if you were to do the opposite (gpu.Launch(1, nbRows, ...) , in theory it would work, but in practice you wont' be allowed to have more than 1024 (device-dependent) threads per block. Besides the ideal number of threads per block is something that needs fine-tuning. Finally, using only 1 block means using only a small% of your gpu capacity.
You should combine threads with blocks and within your kernel compute the overall index of your row based on thread and block id's. There are many such cases within the cudafy examples.
Oct 6, 2015 at 10:42 AM
Thanks for your efforts to help!

Yes, nbRows = 6784. The raster represents a large geographic area.
I have tried with:
gridSize = nbCols*nbRows/512 = 46110 , and
blockSize = 512

Have also tried to make the code faster by creating a precalculated distance array, and use one-dimensional arrays instead of two-dimensional (if that is more efficient?)
But without success.

I am running as an NUnit test, is that a problem?
Oct 7, 2015 at 8:37 AM
Hi
46110 is huge, try, shall we say, [gridsize=512, blocksize=512], and call your kernel several times (make sure you call gpu.Synchronize() between them).
Pass as an extra argument to the kernel with the index of this outer loop, so you may compute the absolute index.
The last time you call your kernel, just use a smaller gridsize to account for the leftovers.
Each kernel run should take you less than 1 second, to keep the OS stable.
Oct 8, 2015 at 6:47 AM
Hi
How does the kernel function know which block to work with when looping? I have looked over the examples but could not identify one that resembles this case. Is there any chance you could give me a small example to solve this? Ignoring the leftovers at the moment, I have done this:
             int firstRow = raster.StartRow;
            int lastRow = raster.EndRow;
            int firstCol = raster.StartColumn;
            int lastCol = raster.EndColumn;
            int nbRows = lastRow - firstRow + 1;
            int nbCols = lastCol - firstCol + 1;
            int thresHold = 50;
            
            var cpuMemIn = new float[nbRows*nbCols];
            int i2 = 0;
            for (var i = firstRow; i <= lastRow; i++)
                for (var j = firstCol; j <= lastCol; j++)
                    if (raster.Value[i, j] > 0)
                        cpuMemIn[i2++] = (float) raster.Value[i, j];
                    else
                        cpuMemIn[i2++] = -1;

            // Precalculate relative distances to spare workload for gpu
            int radiusNbCells = 15; // Number of cells that corresponds to the radius of the neigborhood circle
            var windowWidth = radiusNbCells*2 + 1;
            var cpuDistInSqr = new int[windowWidth*windowWidth];  // Squared distance between two cells with running index distindex
            var distIndex = 0;
            for (int i = -radiusNbCells; i <= radiusNbCells; i++)
                for (int j = -radiusNbCells; j <= radiusNbCells; j++)
                    cpuDistInSqr [distIndex++] = i * i + j * j;

            var gpuMemIn = gpu.Allocate(cpuMemIn);
            var gpuMemDistIn = gpu.Allocate(cpuDistInSqr);
            var resultArray = new float[nbRows*nbCols];
            var gpuMemOut = gpu.Allocate(resultArray);
            var constValue = new int[4];
            constValue[0] = lastRow;
            constValue[1] = lastCol;
            constValue[2] = thresHold;
            constValue[3] = radiusNbCells;
            var gpuConstValue = gpu.Allocate(constValue);

            int nbLoops = nbCols*nbRows/512;
            for (int i = 0; i < nbLoops; i++)
            {
                gpu.Launch(512, 512, NeighborhoodKernel, i, gpuConstValue, gpuMemIn, gpuMemDistIn, gpuMemOut);
                gpu.Synchronize(); // Required??
            }
            gpu.CopyFromDevice(gpuMemOut, resultArray);
        [Cudafy]
        public static void NeighborhoodKernel(GThread thread, int loopCounter, int[] constValues, float[] dataIn, int[] distInSqr, float[] dataOut)
        {            
             int x = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x;  // Correct???
             int y = thread.threadIdx.y + thread.blockIdx.y * thread.blockDim.y;   // Correct???
             
             int lastRow = constValues[0];
             int lastCol = constValues[1];
             int thresHold = constValues[2];
             int radiusNbCells = constValues[3];

             // Loop required?
             dataOut[???] = GetNeighborhoodValue(y, x, radiusNbCells, distInSqr, lastRow, lastCol, dataIn, thresHold);  // Correct passing y as row index?
        }
        [Cudafy]
        public static float GetNeighborhoodValue(int i, int j, int radiusNbCells, int[] distInSqr, int lastRow, int lastCol,
             float[] dataIn, int thresHold)
        {
            float tot = 0;
            int maxRadiusNbCellsSqr = radiusNbCells * radiusNbCells;
            int relRowNum = -1;
            for (int row = i - radiusNbCells; row <= i + radiusNbCells; row++)
            {
                relRowNum++;
                int distIndex = relRowNum * (2 * radiusNbCells + 1) - 1;
                if (row > lastRow || row < 0) continue;

               // int y2 = (row - i) * (row - i);
                for (int col = j - radiusNbCells; col <= j + radiusNbCells; col++)
                {
                    int rasterIndex = row * (lastCol + 1) + col;
                    distIndex++;
                    if (col > lastCol || col < 0)
                        continue;
                    if (distInSqr[distIndex] <= maxRadiusNbCellsSqr && dataIn[rasterIndex] > 0.0f)
                    {
                        tot += dataIn[rasterIndex];
                    }

                }
            }
            // Reclassify
            return (tot >= thresHold) ? 1.0f : 0.0f;
        }
Oct 9, 2015 at 5:27 PM
Hi
It seems you need to understand the basics, I recommend you read (at least) this chapter of the cuda toolkit docs:
https://docs.nvidia.com/cuda/cuda-c-programming-guide/#thread-hierarchy
Oct 23, 2015 at 1:29 PM
I am trying to understand the basics but I find it hard. That it why I call for help. I have not found examples, especially with 2D arrays. I have tried to solve it with the following code but the problem remains:
            gpu.Launch(512, 512, NeighborhoodKernel, gpuConstValue, gpuMemIn, gpuMemDistIn, gpuMemOut);
            gpu.CopyFromDevice(gpuMemOut, resultArray);
        [Cudafy]
        public static void NeighborhoodKernel(GThread thread, int[] constValues, float[,] dataIn, int[] distInSqr, float[,] dataOut)
        {
            int nbRows = constValues[0];
            int nbCols = constValues[1];
            int thresHold = constValues[2];
            int radiusNbCells = constValues[3];

            int x = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x;
            int y = thread.threadIdx.y + thread.blockIdx.y * thread.blockDim.y;
            while (x < nbRows)
            {
                while (y < nbCols)
                {
                    dataOut[x, y] = GetNeighborhoodValue(x, y, radiusNbCells, distInSqr, nbRows, nbCols, dataIn, thresHold);
                    thread.SyncThreads();
                    y += thread.blockDim.y * thread.gridDim.y;
                }
                x += thread.blockDim.x*thread.gridDim.x;
            }
        }
        [Cudafy]
        public static float GetNeighborhoodValue(int i, int j, int radiusNbCells, int[] distInSqr, int nbRows, int nbCols,
             float[,] dataIn, int thresHold)
        {
            float tot = 0;
            int maxRadiusNbCellsSqr = radiusNbCells * radiusNbCells;
            int relRowNum = -1;
            for (int row = i - radiusNbCells; row <= i + radiusNbCells; row++)
            {
                relRowNum++;
                int distIndex = relRowNum * (2 * radiusNbCells + 1) - 1;
                if (row >= nbRows)
                    break;
                if (row < 0) 
                    continue;

                for (int col = j - radiusNbCells; col <= j + radiusNbCells; col++)
                {
                    distIndex++;
                    if (col >= nbCols)
                        break;
                    if (col < 0)
                        continue;
                    if (distInSqr[distIndex] <= maxRadiusNbCellsSqr && dataIn[row,col] > 0.0f)
                    {
                        tot += dataIn[row,col];
                    }
                }
            }
            // Reclassify
            return (tot >= thresHold) ? 1.0f : 0.0f;
        }