This project is read-only.

GPU vs CPU Performance

Mar 21, 2014 at 12:07 AM
Hi Nick

Thanks for your continued support and patience - it must be tedious dealing with us new players with our VERY limited knowledge.

Anyway, as outlined in my "directory not found" post my results for my GPU experiment are not favourable against the CPU. Here is the output of my code on an HP8770w laptop which has a K3000M GPU and a i7-3720QM, 2.6GHz CPU:


Hardware supports Hi resolution timing
Running examples using Quadro K3000M
GPU Elapsed 00:00:09.0348769
CPU Elapsed 00:00:03.2237525
Done - Press <CR> to exit

If these results are to be believed (which I did based on the fact that my GPU was at 100% utilisation during the GPU test) the CPU wins hands down. This surprised me after all the hype about GPU performance so I am inclined to believe I've done something wrong.

Below is my code and I would very much appreciate any opinions and corrections anyone may have:

Version Info:
VS2012 SP4
Win 8.1
Cudy.Net v1.26
Cudy SDK v5.5.31


using System;
using System.Diagnostics;
using System.Threading.Tasks;
using Cudafy;
using Cudafy.Host;
using Cudafy.Translator;

namespace CudaTest
{
class Program
{
const Int32   N         = 20000000;   // number of samples that can be processed at a time
const Int32   SAMPS     = 200000000;  // total samples to be processed for this experiment
const Single  FS        = 100E6f;     // clock speed
const Single  FREQ      = 26.1E6f;    // test frequency
const Single  FIXED_ARG = (2.0f * (Single)Math.PI * FREQ) / FS;

static void Main (string[] args)
{
  // get timer initialised
  Stopwatch st = new Stopwatch(); 
  if (Stopwatch.IsHighResolution) Console.WriteLine ("Hardware supports Hi resolution timing");

// CudafyModes.Target = eGPUType.Cuda; // To use OpenCL, change this enum
  CudafyModes.Target = eGPUType.OpenCL; // To use OpenCL, change this enum
  CudafyTranslator.Language = CudafyModes.Target == eGPUType.OpenCL ? eLanguage.OpenCL : eLanguage.Cuda;
  int deviceCount = CudafyHost.GetDeviceCount(CudafyModes.Target);
  if (deviceCount == 0)
  {
      Console.WriteLine("No suitable {0} devices found.", CudafyModes.Target);
      return;
  }
  CudafyModes.DeviceId = 0;
  GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
  Console.WriteLine("Running examples using {0}", gpu.GetDeviceProperties(false).Name);
  CudafyModule km = CudafyTranslator.Cudafy();
  gpu.LoadModule(km);

  Single[] a = new Single[N];
  Single[] b = new Single[N];
  Single[] c = new Single[N];

  // allocate the memory on the GPU
  Single[] dev_a = gpu.Allocate<Single>(a);
  Single[] dev_b = gpu.Allocate<Single>(b);
  Single[] dev_c = gpu.Allocate<Single>(c);

  // total number of samples to be processed
  st.Reset ();
  Int32 j = 0;
  for (Int32 k = 0; k < SAMPS; k++)
  {
    // get the next amplitude sample
    a[j] = (Single)(k % 360); // dummy in some data
    b[j] = k; // i value
    j++;

    // load up 'N' samples for processing as Cuda can't deal with all samples at once
    if (j < N) continue;

    // start the time monitor
    st.Start ();

    // copy the arrays 'a' and 'b' to the GPU
    gpu.CopyToDevice(a, dev_a);
    gpu.CopyToDevice(b, dev_b);

    // launch add on N threads
    gpu.Launch(N, 1).adder(dev_a, dev_b, dev_c);

    // copy the array 'c' back from the GPU to the CPU
    gpu.CopyFromDevice(dev_c, c);

    // stop the monitor
    st.Stop ();

    // store the results
// for (j = 0; j < N; j++) binOut.Write ((Int16)c[j]);
    j = 0;
  }

  st.Stop ();
  Console.WriteLine("{1}GPU Elapsed {0}{1}", st.Elapsed.ToString(), Environment.NewLine) ; 

  // free the memory allocated on the GPU
  gpu.Free(dev_a);
  gpu.Free(dev_b);
  gpu.Free(dev_c);

  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  #region CPU based code

  Double [] xa = new double[SAMPS];
  try
  {
    // load some dummy data
    for (Int32 k = 0; k < SAMPS; k++)
    {
      // get the next amplitude sample
      xa[k] = (Double)(k % 360);
    }

    // setup the time monitor
    st.Reset ();
    st.Start (); 

    // parallelise the processing
    ParallelLoopResult result = Parallel.For (0, SAMPS, k =>
    {
      // load to register
      Double x = xa[k];

      // calc interval
      Double i = k / FS;

      // calc cos & sin wave
      Double y = x * Math.Cos (FIXED_ARG * i);
      Double z = x * Math.Sin (FIXED_ARG * i);

      // mix and save
      xa[k] = y - z;
    });

    st.Stop ();
    Console.WriteLine("CPU Elapsed {0}{1}", st.Elapsed.ToString(), Environment.NewLine) ; 
  }
  catch (Exception ex)
  {
    Console.WriteLine (ex.Message);
  }

  Console.WriteLine ("Done - Press <CR> to exit");
  Console.Read();

  #endregion
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[Cudafy]
public static void adder(GThread thread, Single[] a, Single[] b, Single[] c)
{
  Int32 tid = thread.blockIdx.x;

  // calc interval
  Single i = b[tid] / FS;

  // calc cos & sin wave
  Single y = a[tid] * GMath.Cos (FIXED_ARG * i);
  Single z = a[tid] * GMath.Sin (FIXED_ARG * i);

  // mix and save
  c[tid] = y - z;
}
}
}
Mar 21, 2014 at 10:42 AM
I would recommend playing with the number of blocks and threads. A good all round value for threads per block is 256. You need to then change the first line of adder to calculate based on block ID * block size + thread ID.
```