I ran that and getting the same post 50k errors.

I understand the concept, split the array into 40k sections and feed them to the GPU. What I fail to understand is how you are stopping the GPU from calculating past your desired amount. I.E. When the loop starts out you are passing just "a_d" bc i = 0; so the pass would look like (got rid of N, I think it was a unnecessary remnant of something previously done):

double_array <<< 40000, 1 >>> (a_d);

Now that I think of it, this way SHOULD work, but is inefficient. Correct me if i am wrong, but this way would start at 0 and go to N, then 40k to N, then 80k to N...till the last loop, overwriting the previous incorrect calculations with correct ones.

Here is the output from when it works to when it errs, no idea why it just stops working past 50k...Maybe you can make sense of it with the code and output provided.

Code:

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <iostream>

#include <math.h>

#include <stdio.h>

#include <fstream>

using namespace std;

__global__ void double_array( float *a)

{

int idx = blockIdx.x;

a[idx] = sqrt(sqrt(sqrt(sqrt(sqrt(sqrt(sqrt(sqrt(sqrt(sqrt(sqrt(sqrt(sqrt(sqrt(sqrt(sqrt(sqrt(sqrt(sqrt(sqrt(sqrt(a[idx])))))))))))))))))))));

}

int main( void )

{

float *a_h, *a_d;

const int N = 200000;

ofstream ofs("C:\\Test\\test.txt");

a_h = (float *)malloc( N * sizeof(float) );

cudaMalloc( &a_d, N * sizeof(float) );

for ( int i = 0; i < N; i++ )

a_h[i] = (float)i;

cudaMemcpy( a_d, a_h, N, cudaMemcpyHostToDevice );

for (int i = 0; i < N / 40000 + 1; i++)

{

double_array <<< 40000, 1 >>> (a_d + i * 40000);

}

cudaMemcpy( a_h, a_d, sizeof( float ) * N, cudaMemcpyDeviceToHost );

for ( int i = 0; i < N; i++ )

{

ofs << i << " " << fixed << a_h[i] << endl;

}

free( a_h );

cudaFree( a_d );

}

Output:

49990 1.000005

49991 1.000005

49992 1.000005

49993 1.000005

49994 1.000005

49995 1.000005

49996 1.000005

49997 1.000005

49998 1.000005

49999 1.000005

50000 1.000000

50001 0.000000

50002 1.000000

50003 1.#QNAN0

50004 1.000000

50005 0.000000

50006 1.000000

50007 1.#QNAN0

50008 1.000000

50009 0.000000

50010 1.000000

50011 1.#QNAN0

50012 1.000000

50013 1.000000

50014 1.#QNAN0

50015 1.#QNAN0

50016 0.000000

50017 0.000000

50018 0.000000

50019 0.000000

50020 0.000000

50021 0.000000

50022 0.000000

50023 0.000000

I know I say it a lot but thank you so much, I really appreciate the time you are putting in to help me.