Different results in Emulator and on Device

Dec 28, 2011 at 6:46 AM

Hi. I'm trying to use CUDAfy.net. Here is my code:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Cudafy;
using Cudafy.Host;
using Cudafy.Translator;

namespace CudafyTest
{
    /// <summary>
    /// Console host: translates the kernel types, runs the "impliedVolatile" kernel once
    /// on the selected device and prints the iteration count and solved value.
    /// </summary>
    class Program
    {
        private static GPGPU _gpu;

        /// <summary>
        /// Entry point. Uploads a single parameter record, launches one block with one
        /// thread, reads the record back and writes its <c>i</c> and <c>B</c> fields.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Translate the [Cudafy]-marked code and load it onto the emulated device.
            CudafyModule module = CudafyTranslator.Cudafy(typeof(ParamsStruct), typeof(ImpliedVolatile));
            _gpu = CudafyHost.GetDevice(eGPUType.Emulator);
            _gpu.LoadModule(module);

            // The single problem instance handed to the kernel.
            ParamsStruct input = new ParamsStruct
            {
                OP = 96.95,
                Price = 1332.24,
                Strike = 1235,
                TD = 31,
                R = 0.0001355,
                Q = 0.0166,
                N = 1000,
                kind = 1
            };
            ParamsStruct[] hostInput = { input };
            ParamsStruct[] hostOutput = new ParamsStruct[1];

            // Device copies: the parameter record plus a 1001-element scratch buffer
            // (FCRR indexes PA[0..N] and N is 1000 here).
            ParamsStruct[] deviceParams = _gpu.CopyToDevice(hostInput);
            float[] deviceScratch = _gpu.Allocate<float>(1001);

            _gpu.Launch(1, 1, "impliedVolatile", deviceParams, deviceScratch);
            _gpu.CopyFromDevice(deviceParams, hostOutput);

            ParamsStruct solved = hostOutput[0];
            Console.WriteLine("I={0}, B={1}", solved.i, solved.B);
            Console.ReadKey();
        }

    }

    // Parameter/result record exchanged between the host and the impliedVolatile kernel.
    // The host fills the input fields, copies the array to the device, and reads the
    // same array back to obtain the output fields i and B.
    [Cudafy]
    public struct ParamsStruct
    {
        //public double[] PA;
        // Target value: the kernel subtracts it from every FCRR result and searches for
        // the sigma that drives the difference to zero (presumably the quoted option
        // price - confirm).
        public double OP;
        // Passed to FCRR as S (presumably the underlying's spot price - confirm).
        public double Price;
        // Passed to FCRR as X (the strike).
        public double Strike;
        // Divided by 365 and then by N to form the per-step delta
        // (presumably days to expiry - confirm).
        public double TD;
        // Rate used by FCRR in exp(-R * delta) and in exp((R - Q) * delta).
        public double R;
        // Rate subtracted from R in exp((R - Q) * delta) (presumably a dividend yield - confirm).
        public double Q;
        // Number of lattice steps; FCRR reads and writes PA[0..N].
        public int N;
        // Output: number of solver iterations performed by the kernel.
        public int i;
        // Output: the solver's estimate of sigma; the kernel sets it to 0 when it gives
        // up after more than 30 iterations.
        public double B;
        // Contract selector read by FCRR: 0 or 3 flips the payoff sign, 2 or 3 enables
        // the early-exercise comparison at every lattice node.
        public int kind;
    }        


    /// <summary>
    /// CUDAfy device code: a binomial-lattice pricer (<c>FCRR</c>) and a kernel
    /// (<c>impliedVolatile</c>) that searches for the sigma at which the lattice price
    /// equals the target value stored in <c>ParamsStruct.OP</c>.
    /// </summary>
    public class ImpliedVolatile
    {
        /// <summary>
        /// Prices a contract on an N-step recombining binomial lattice with up factor
        /// exp(sigma * sqrt(delta)) and down factor 1 / Up.
        /// </summary>
        /// <param name="PA">Scratch buffer of at least N + 1 elements; overwritten.</param>
        /// <param name="N">Number of lattice steps.</param>
        /// <param name="S">Starting value of the underlying.</param>
        /// <param name="X">Strike.</param>
        /// <param name="sigma">Volatility used to build the up/down factors.</param>
        /// <param name="R">Rate used for discounting and for the drift term.</param>
        /// <param name="Q">Rate subtracted from R in the drift term.</param>
        /// <param name="delta">Length of one lattice step.</param>
        /// <param name="kind">
        /// 0 or 3 selects the payoff (X - S), otherwise (S - X); 2 or 3 additionally
        /// compares every node with its immediate-exercise value.
        /// </param>
        /// <returns>
        /// The value at the lattice root (rounded through float storage), or -1 when
        /// the up-move probability falls outside [0, 1].
        /// </returns>
        [Cudafy]
        public static double FCRR(float[] PA, int N, double S, double X, double sigma, double R, double Q, double delta, int kind)
        {
            double Up = Math.Exp(sigma * Math.Sqrt(delta));
            // Probability of an up move; reject parameter sets where it is not a probability.
            double p0 = (Math.Exp((R - Q) * delta) - 1 / Up) / (Up - 1 / Up);
            if (p0 > 1 || p0 < 0)
                return -1;
            double p1 = 1 - p0;

            // Sign of the payoff: +1 gives (S - X), -1 gives (X - S).
            int bm = 1;
            if (kind == 0 || kind == 3)
                bm = -1;

            // Terminal payoffs, floored at zero. Index 0 is the highest terminal node.
            for (int i = 0; i <= N; ++i)
            {
                PA[i] = (float)((S * Math.Pow(Up, N - 2 * i) - X) * bm);
                if (PA[i] < 0)
                    PA[i] = 0;
            }

            // Roll the lattice back one step at a time, discounting each expectation.
            double mexp = Math.Exp(-R * delta);
            double exercize = 0;
            for (int k = N - 1; k >= 0; k--)
                for (int i = 0; i <= k; ++i)
                {
                    PA[i] = (float)((p0 * PA[i] + p1 * PA[i + 1]) * mexp);
                    if (kind == 3 || kind == 2)
                    {
                        // Early exercise: keep the larger of continuation and immediate value.
                        exercize = (S * Math.Pow(Up, k - 2 * i) - X) * bm;
                        if (PA[i] < exercize)
                            PA[i] = (float)exercize;
                    }
                }

            return PA[0];
        }

        /// <summary>
        /// Kernel: finds the sigma for which FCRR(...) equals <c>par[0].OP</c> using
        /// Brent's root-finding method on the bracket [0.001, 3].
        /// </summary>
        /// <param name="par">
        /// Single-element array. Input fields are read from <c>par[0]</c>; on return
        /// <c>par[0].B</c> holds the estimate (0 if more than 30 iterations were needed)
        /// and <c>par[0].i</c> the number of iterations performed.
        /// </param>
        /// <param name="PA">Scratch buffer of at least N + 1 elements used by FCRR.</param>
        [Cudafy]
        public static void impliedVolatile(ParamsStruct[] par, float[] PA)
        {
            // Read the inputs once so the solver works on locals instead of
            // re-reading (and re-writing) the parameter record on every step.
            int N = par[0].N;
            int kind = par[0].kind;
            double price = par[0].Price;
            double strike = par[0].Strike;
            double R = par[0].R;
            double Q = par[0].Q;
            double FF = par[0].OP;
            double delta = par[0].TD / 365 / N;

            double A = 0.001;
            double B = 3;

            // FCRR returns -1 when sigma yields an invalid probability; move the
            // bracket ends inwards until both produce a usable price.
            double FA = FCRR(PA, N, price, strike, A, R, Q, delta, kind) - FF;
            while (FA == -1 - FF)
            {
                A += 0.001;
                FA = FCRR(PA, N, price, strike, A, R, Q, delta, kind) - FF;
            }
            double FB = FCRR(PA, N, price, strike, B, R, Q, delta, kind) - FF;
            while (FB == -1 - FF)
            {
                B -= 0.001;
                FB = FCRR(PA, N, price, strike, B, R, Q, delta, kind) - FF;
            }

            // c starts at the (possibly adjusted) a so that C and FC describe the same point.
            double C = A;
            double FC = FA;
            double D = 0;      // only read once MFlag has been cleared
            double S;
            double FS = 1;     // forces at least one pass through the loop
            double swap;
            bool MFlag = true; // true when the previous step was a bisection
            int iterations = 0;

            while ((Math.Abs(FB) > 0.01) || (Math.Abs(FS) > 0.01) || (Math.Abs(B - C) > 0.001))
            {
                if ((FA != FC) && (FB != FC))
                    // Inverse quadratic interpolation: every term is divided by BOTH differences.
                    S = A * FB * FC / (FA - FB) / (FA - FC)
                      + B * FA * FC / (FB - FA) / (FB - FC)
                      + C * FA * FB / (FC - FA) / (FC - FB);
                else
                    // Secant step.
                    S = B - FB * (B - A) / (FB - FA);

                // The candidate must lie between (3a + b) / 4 and b, whichever is larger.
                // Written so that a NaN/infinite candidate also counts as "outside".
                double lo = (3 * A + B) / 4;
                double hi = B;
                if (lo > hi)
                {
                    swap = lo;
                    lo = hi;
                    hi = swap;
                }
                bool inside = (S >= lo) && (S <= hi);

                if (
                    !inside ||
                    (MFlag && (Math.Abs(S - B) >= Math.Abs(B - C) / 2)) ||
                    (!MFlag && (Math.Abs(S - B) >= Math.Abs(C - D) / 2)) ||
                    (MFlag && (Math.Abs(B - C) <= 0.001)) ||
                    (!MFlag && (Math.Abs(C - D) <= 0.001))
                    )
                {
                    // Interpolation rejected: fall back to bisection.
                    S = (A + B) / 2;
                    MFlag = true;
                }
                else
                {
                    MFlag = false;
                }

                FS = FCRR(PA, N, price, strike, S, R, Q, delta, kind) - FF;
                D = C;
                C = B;
                FC = FB;
                // Keep the sub-interval whose end points still have opposite signs.
                if ((FA * FS) < 0)
                {
                    B = S;
                    FB = FS;
                }
                else
                {
                    A = S;
                    FA = FS;
                }
                // Keep b as the end point with the smaller residual.
                if (Math.Abs(FA) <= Math.Abs(FB))
                {
                    swap = B;
                    B = A;
                    A = swap;
                    swap = FB;
                    FB = FA;
                    FA = swap;
                }
                iterations++;
                if (iterations > 30)
                {
                    // Did not converge: report 0 as the failure value.
                    B = 0;
                    break;
                }
            }

            par[0].i = iterations;
            par[0].B = B;
        }
    }
}

But the result is right in the emulator and wrong on the device. What am I doing wrong?

Coordinator
Dec 28, 2011 at 5:43 PM

Hi there,

I compiled your code and indeed there is an issue.  When targeting the device a launch timeout error occurred (and a dramatic blackening of the screen followed by a message from the NVIDIA driver saying it had recovered).  I reduced host_par[0].N to 500 and this worked fine, giving the same result on the device and in the emulator.  Kernel routines should not take too long, so you may need to split your algorithm or limit N.

Let me know how it works out. Which finance algorithm is this by the way?  A variation on Black-Scholes?

Regards,

Nick

Dec 29, 2011 at 9:19 AM

After installing NSight tonight it gave a tip to disable "WDDM TDR" because when debugging on the GPU it will constantly reset the graphics card after 2 seconds when stopped at a breakpoint.

This isn't something you need NSight to do though. There is a registry edit you can do; check out this Microsoft documentation: http://msdn.microsoft.com/en-us/windows/hardware/gg487368 It happens to be for Vista SP1, but if you're using Win7 it should also apply. My recommendation is to adjust the timeout to something greater than 2 seconds. Disabling it completely seems harsh to me, as it can actually lock up your machine if you accidentally write an endless loop or something like that.

Coordinator
Dec 29, 2011 at 10:20 AM

That's a fair enough tip if you will be the only one using your application, but generally if your kernel function is taking so long then you should also ask if the algorithm can be better implemented. 

By the way you can use NSight with CUDAfy fairly easily, see: http://www.hybriddsp.com/Support/CudafyTutorials/UsingtheNVIDIANSightDebugger.aspx

 

Jan 3, 2012 at 4:58 AM

The problem is that reading from memory takes time. More than 2 seconds.

Why does _gpu.CopyFromDevice(dev_par, result); take so long?

Jan 3, 2012 at 5:05 AM

Did the tip I mentioned work for you?

Coordinator
Jan 3, 2012 at 5:42 AM

Kernel calls are asynchronous, so do not put CPU side timing code around it since this will make no sense.  The CopyFromDevice routine will be performed after the launch, therefore appearing to take all the time.  Try using the gpu timing methods or running your code through NVIDIA Visual Compute tool - it's trivial to use and powerful.

Jan 3, 2012 at 6:53 AM
xer21 wrote:

Did the tip I mentioned work for you?

I tried, but it did not help.

Jan 3, 2012 at 6:59 AM
NickKopp wrote:

Kernel calls are asynchronous, so do not put CPU side timing code around it since this will make no sense.  The CopyFromDevice routine will be performed after the launch, therefore appearing to take all the time.  Try using the gpu timing methods or running your code through NVIDIA Visual Compute tool - it's trivial to use and powerful.

I use gpu timing methods. 

_gpu.Launch(1,1, "impliedVolatile", dev_par, PA); takes 4ms
_gpu.CopyFromDevice(dev_par, result); takes > 2000ms
Jan 3, 2012 at 7:33 AM

Can you run this code? How much time does it take?

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Cudafy;
using Cudafy.Host;
using Cudafy.Translator;
using System.Diagnostics;

namespace CudafyTest
{
    /// <summary>
    /// Console host: runs the "impliedVolatile" kernel for several identical problem
    /// instances (one block each) on the CUDA device and prints the measured time
    /// together with the result of the last instance.
    /// </summary>
    class Program
    {
        private static GPGPU _gpu;

        /// <summary>
        /// Entry point. Uploads the inputs, times the launch plus both result copies
        /// with the device timer, and prints the outcome.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            CudafyModule module = CudafyTranslator.Cudafy(typeof(ParamsStruct), typeof(ImpliedVolatile));
            _gpu = CudafyHost.GetDevice(eGPUType.Cuda);
            _gpu.LoadModule(module);

            // Number of problem instances; the kernel is launched with one block per instance.
            int count = 10;

            // Every instance uses the same parameter set.
            ParamsStruct template = new ParamsStruct
            {
                OP = 96.95F,
                Price = 1332.24F,
                Strike = 1235,
                TD = 31,
                R = 0.0001355F,
                Q = 0.0166F,
                N = 1000,
                kind = 1
            };

            ParamsStruct[] hostParams = new ParamsStruct[count];
            for (int block = 0; block < hostParams.Length; block++)
            {
                hostParams[block] = template;
            }

            int[] hostIterations = new int[count];
            float[] hostSigma = new float[count];
            int[] iterationsOut = new int[count];
            float[] sigmaOut = new float[count];

            ParamsStruct[] deviceParams = _gpu.CopyToDevice(hostParams);
            int[] deviceIterations = _gpu.CopyToDevice(hostIterations);
            float[] deviceSigma = _gpu.CopyToDevice(hostSigma);

            // The timed region covers the launch and both copies back to the host.
            _gpu.StartTimer();
            _gpu.Launch(count, 1, "impliedVolatile", deviceParams, deviceIterations, deviceSigma);
            _gpu.CopyFromDevice(deviceIterations, iterationsOut);
            _gpu.CopyFromDevice(deviceSigma, sigmaOut);
            float elapsed = _gpu.StopTimer();

            int last = count - 1;
            Console.WriteLine("Time {0}", elapsed);
            Console.WriteLine("I={0}, B={1}", iterationsOut[last], sigmaOut[last]);
            Console.ReadKey();
        }

    }

    // Input record for one problem instance of the impliedVolatile kernel. The kernel
    // reads element [blockIdx.x]; results are returned through the separate dev_i and
    // dev_B arrays rather than through this struct.
    [Cudafy]
    public struct ParamsStruct
    {
        // Target value: the kernel subtracts it from every FCRR result and searches for
        // the sigma that drives the difference to zero (presumably the quoted option
        // price - confirm).
        public float OP;
        // Passed to FCRR as S (presumably the underlying's spot price - confirm).
        public float Price;
        // Passed to FCRR as X (the strike).
        public float Strike;
        // Divided by 365 and then by N to form the per-step delta
        // (presumably days to expiry - confirm).
        public float TD;
        // Rate used by FCRR in exp(-R * delta) and in exp((R - Q) * delta).
        public float R;
        // Rate subtracted from R in exp((R - Q) * delta) (presumably a dividend yield - confirm).
        public float Q;
        // Number of lattice steps; FCRR reads and writes PA[0..N], and the kernel's
        // shared PA buffer holds 1001 elements.
        public int N;
        //public int i;
        //public float B;
        // Contract selector read by FCRR: 0 or 3 flips the payoff sign, 2 or 3 enables
        // the early-exercise comparison at every lattice node.
        public int kind;
    }        


    /// <summary>
    /// CUDAfy device code (single precision): a binomial-lattice pricer (<c>FCRR</c>)
    /// and a kernel (<c>impliedVolatile</c>) that searches, one block per problem
    /// instance, for the sigma at which the lattice price equals <c>ParamsStruct.OP</c>.
    /// </summary>
    public class ImpliedVolatile
    {
        /// <summary>
        /// Prices a contract on an N-step recombining binomial lattice with up factor
        /// exp(sigma * sqrt(delta)) and down factor 1 / Up.
        /// </summary>
        /// <param name="PA">Scratch buffer of at least N + 1 elements; overwritten.</param>
        /// <param name="N">Number of lattice steps.</param>
        /// <param name="S">Starting value of the underlying.</param>
        /// <param name="X">Strike.</param>
        /// <param name="sigma">Volatility used to build the up/down factors.</param>
        /// <param name="R">Rate used for discounting and for the drift term.</param>
        /// <param name="Q">Rate subtracted from R in the drift term.</param>
        /// <param name="delta">Length of one lattice step.</param>
        /// <param name="kind">
        /// 0 or 3 selects the payoff (X - S), otherwise (S - X); 2 or 3 additionally
        /// compares every node with its immediate-exercise value.
        /// </param>
        /// <returns>
        /// The value at the lattice root, or -1 when the up-move probability falls
        /// outside [0, 1].
        /// </returns>
        [Cudafy]
        public static float FCRR(float[] PA, int N, float S, float X, float sigma, float R, float Q, float delta, int kind)
        {
            float Up = GMath.Exp(sigma * GMath.Sqrt(delta));
            // Probability of an up move; reject parameter sets where it is not a probability.
            float p0 = (GMath.Exp((R - Q) * delta) - 1 / Up) / (Up - 1 / Up);
            if (p0 > 1 || p0 < 0)
                return -1;
            float p1 = 1 - p0;

            // Sign of the payoff: +1 gives (S - X), -1 gives (X - S).
            int bm = 1;
            if (kind == 0 || kind == 3)
                bm = -1;

            // Terminal payoffs, floored at zero. Index 0 is the highest terminal node.
            for (int i = 0; i <= N; ++i)
            {
                PA[i] = (S * GMath.Pow(Up, N - 2 * i) - X) * bm;
                if (PA[i] < 0)
                    PA[i] = 0;
            }

            // Roll the lattice back one step at a time, discounting each expectation.
            float mexp = GMath.Exp(-R * delta);
            float exercize = 0;
            for (int k = N - 1; k >= 0; k--)
                for (int i = 0; i <= k; ++i)
                {
                    PA[i] = (p0 * PA[i] + p1 * PA[i + 1]) * mexp;
                    if (kind == 3 || kind == 2)
                    {
                        // Early exercise: keep the larger of continuation and immediate value.
                        exercize = (S * GMath.Pow(Up, k - 2 * i) - X) * bm;
                        if (PA[i] < exercize)
                            PA[i] = exercize;
                    }
                }

            return PA[0];
        }

        /// <summary>
        /// Kernel: for the problem instance selected by <c>blockIdx.x</c>, finds the
        /// sigma for which FCRR(...) equals <c>par[idx].OP</c> using Brent's
        /// root-finding method on the bracket [0.001, 3].
        /// </summary>
        /// <param name="thread">CUDAfy thread context (block index, shared memory).</param>
        /// <param name="par">Input parameters, one element per block.</param>
        /// <param name="dev_i">Output: iterations performed, one element per block.</param>
        /// <param name="dev_B">
        /// Output: sigma estimate per block; 0 when more than 30 iterations were needed.
        /// </param>
        [Cudafy]
        public static void impliedVolatile(GThread thread, ParamsStruct[] par, int[] dev_i, float[] dev_B)
        {
            // Per-block scratch lattice. It holds 1001 values, so par[idx].N must not
            // exceed 1000 (FCRR indexes PA[0..N]).
            float[] PA = thread.AllocateShared<float>("PA", 1001);
            int idx = thread.blockIdx.x;
            for (int i = 0; i <= 1000; i++)
            {
                PA[i] = 0;
            }

            // Read the inputs once so the solver works on locals instead of
            // re-reading the parameter record and re-writing dev_B/dev_i on every step.
            int N = par[idx].N;
            int kind = par[idx].kind;
            float price = par[idx].Price;
            float strike = par[idx].Strike;
            float R = par[idx].R;
            float Q = par[idx].Q;
            float FF = par[idx].OP;
            float delta = par[idx].TD / 365 / N;

            float A = 0.001F;
            float B = 3;

            // FCRR returns -1 when sigma yields an invalid probability; move the
            // bracket ends inwards until both produce a usable price.
            float FA = FCRR(PA, N, price, strike, A, R, Q, delta, kind) - FF;
            while (FA == -1 - FF)
            {
                A += 0.001F;
                FA = FCRR(PA, N, price, strike, A, R, Q, delta, kind) - FF;
            }
            float FB = FCRR(PA, N, price, strike, B, R, Q, delta, kind) - FF;
            while (FB == -1 - FF)
            {
                B -= 0.001F;
                FB = FCRR(PA, N, price, strike, B, R, Q, delta, kind) - FF;
            }

            // c starts at the (possibly adjusted) a so that C and FC describe the same point.
            float C = A;
            float FC = FA;
            float D = 0;       // only read once MFlag has been cleared
            float S;
            float FS = 1;      // forces at least one pass through the loop
            float swap;
            bool MFlag = true; // true when the previous step was a bisection
            int iterations = 0;

            while ((GMath.Abs(FB) > 0.01) || (GMath.Abs(FS) > 0.01) || (GMath.Abs(B - C) > 0.001))
            {
                if ((FA != FC) && (FB != FC))
                    // Inverse quadratic interpolation: every term is divided by BOTH differences.
                    S = A * FB * FC / (FA - FB) / (FA - FC)
                      + B * FA * FC / (FB - FA) / (FB - FC)
                      + C * FA * FB / (FC - FA) / (FC - FB);
                else
                    // Secant step.
                    S = B - FB * (B - A) / (FB - FA);

                // The candidate must lie between (3a + b) / 4 and b, whichever is larger.
                // Written so that a NaN/infinite candidate also counts as "outside".
                float lo = (3 * A + B) / 4;
                float hi = B;
                if (lo > hi)
                {
                    swap = lo;
                    lo = hi;
                    hi = swap;
                }
                bool inside = (S >= lo) && (S <= hi);

                if (
                    !inside ||
                    (MFlag && (GMath.Abs(S - B) >= GMath.Abs(B - C) / 2)) ||
                    (!MFlag && (GMath.Abs(S - B) >= GMath.Abs(C - D) / 2)) ||
                    (MFlag && (GMath.Abs(B - C) <= 0.001)) ||
                    (!MFlag && (GMath.Abs(C - D) <= 0.001))
                    )
                {
                    // Interpolation rejected: fall back to bisection.
                    S = (A + B) / 2;
                    MFlag = true;
                }
                else
                {
                    MFlag = false;
                }

                FS = FCRR(PA, N, price, strike, S, R, Q, delta, kind) - FF;
                D = C;
                C = B;
                FC = FB;
                // Keep the sub-interval whose end points still have opposite signs.
                if ((FA * FS) < 0)
                {
                    B = S;
                    FB = FS;
                }
                else
                {
                    A = S;
                    FA = FS;
                }
                // Keep b as the end point with the smaller residual.
                if (GMath.Abs(FA) <= GMath.Abs(FB))
                {
                    swap = B;
                    B = A;
                    A = swap;
                    swap = FB;
                    FB = FA;
                    FA = swap;
                }
                iterations++;
                if (iterations > 30)
                {
                    // Did not converge: report 0 as the failure value.
                    B = 0;
                    break;
                }
            }

            dev_i[idx] = iterations;
            dev_B[idx] = B;
        }
    }
}


Coordinator
Jan 5, 2012 at 3:37 PM

After the launch you need to synchronize the context [gpu.SynchronizeStream(0);] and then put your timing check.

Apr 28, 2013 at 1:54 AM
Hi, I am having the same issue. I am writing a Gaussian image filter; my code gives correct results on the emulator, but with CUDA itself the result is not correct. This is my code:

Download Link to the cs file:
TEXT

I am testing this filter on small images because the image indexing on the GPU using blocks and threads together is not finished yet. I am launching one block per pixel, so the maximum image size that can be tested is 300x200 = 60000 blocks; please try it on small images only.

I am really stuck with it, so thanks for any reply :)