2013年08月03日

CUDAでの配列とCURAND

__device__で宣言した配列を使用して,カーネル内で計算するので,既存のCPUコードを1次元配列に直す必要性が無い.

randをCUDAで使用するために,Hostから生成.
0〜1までの乱数と,正規分布(平均0,標準偏差1.0)を生成,度数分布をFileout.



//cf. cuda toolkit curand guide

// NOTE
// compute capability 1.3 over; USING double (-floating decimal)

#include
#include

#include
#include //using cutilsafecall
#include
#pragma comment (lib, "curand.lib")

int H_a[4][5][6];
double H_b[4][5][6];
double H_c[4][5][6];
__device__ int D_a[4][5][6];
__device__ double D_b[4][5][6];
__device__ double D_c[4][5][6];


__global__ void d_add()
{
//BlockIdx.x= .y= .z= 0;
int i, j, k;
i= threadIdx.x;
j= threadIdx.y;
k= threadIdx.z;

D_c[i][j][k]= (double)D_a[i][j][k]+ D_b[i][j][k]; //__device__
return;
}


int main()
{
std::cout << "main start." << std::endl;

for( int i= 0; i < 4; i++){
for( int j= 0; j < 5; j++){
for(int k= 0; k < 6; k++){
H_a[i][j][k]= i+ j+ k;
H_b[i][j][k]= (i+ j+ k)/ 2.0;
}
}
}

int size_a= sizeof(H_a);
int size_b= sizeof(H_b);
int size_c= sizeof(H_c);

std::cout << "memcpy to device from host, start." << std::endl;

// case: __device__ array Declared
cutilSafeCall(cudaMemcpyToSymbol(D_a, H_a, size_a, 0, cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpyToSymbol(D_b, H_b, size_b, 0, cudaMemcpyHostToDevice));
dim3 BlockNum(4, 5, 6);
d_add<<<1, BlockNum>>>();

cutilSafeCall(cudaMemcpyFromSymbol(H_c, D_c, size_c, 0, cudaMemcpyDeviceToHost)); //__device__
//check the "H_c" using debugger, or console out
for( int i= 0; i < 4; i++){
for( int j= 0; j < 5; j++){
for(int k= 0; k < 6; k++){
if(H_c[i][j][k] != ((double)H_a[i][j][k]+ H_b[i][j][k]) ){
std::cout << "device kernel error\t" << std::endl;
}
}
}
}


//distribution
std::cout << "curand start." << std::endl;
double *H_rand; //40million arrays -> dynamic memory allocate;
double *D_rand;

int count= 80000; //80,000blocks, sum 40,960,000threads
int size_rand= sizeof(double)*count* 512; //512threads/block
cutilSafeCall(cudaMallocHost(&H_rand, size_rand));
cutilSafeCall(cudaMalloc (&D_rand, size_rand));

std::cout << size_rand/1024/1024 << "MB" << std::endl;
// 8Byte * 40,960,000elements = 327e+6[Byte] = 312.5MB

// rand generate INITIALIZE
curandGenerator_t generator;
curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_XORWOW); //create generator
curandSetPseudoRandomGeneratorSeed(generator, 1234ULL); //set seed

//** 0 to 1 **
curandGenerateUniformDouble(generator, D_rand, count*512);//generate 0~1 random num.
cutilSafeCall(cudaMemcpy(H_rand, D_rand, size_rand, cudaMemcpyDeviceToHost));

int distribution_01[1000];
int n, temp;
for(n= 0; n < 1000; n++){ //Initialize
distribution_01[n]= 0;
}
for(n= 0; n < count*512; n++){
temp= (int)floor(H_rand[n]*1000); //1,000division
if(0<=temp && temp<1000){
distribution_01[temp]++;
}
}
//file out
std::ofstream foutd("rand.txt");
if( !foutd ){
return 0;
}
for(n= 0; n < 1000; n++){
foutd << (double)(n/1000.0) << "\t" << (double)(distribution_01[n])/(double)(count*512.0) << std::endl;
}
foutd.close();


//** normal distribution **
std::cout << "normal distribution using curand starts." << std::endl;
curandGenerateNormalDouble(generator, D_rand, count* 512, 0.0, 1.0);
//mean=0.0, standard deviation= 1.0
cutilSafeCall(cudaMemcpy(H_rand, D_rand, size_rand, cudaMemcpyDeviceToHost));

int distribution_norm[3000]; //500divisons/1.0, -3.0~+3.0
int count_out= 0; // ~-3.0, and +3.0~ ; 0.36%
for(n= 0; n < 3000; n++){
distribution_norm[n]= 0; //initialize
}
for(n= 0; n < count*512; n++){
temp= (int)floor(H_rand[n]*500)+ 1500;
//500division; -3.0~+3.0 -> 1500divs(-3.0~0.0)
if(0<=temp && temp<3000){
distribution_norm[temp]++;
}else{
count_out++;
}
}
//FO
std::ofstream foutn("rand_normal.txt");
if( !foutn ){
return 0;
}
for(n= 0; n < 3000; n++){
foutn << (double)(n/500.0)-3.0 << "\t" << (double)(distribution_norm[n]*500)/(double)(count*512) << std::endl;
//multiple 500, then sum is 1.0
}
foutn.close();

std::cout << (double)(count_out)*100.0/(double)(count*512) << "%" << std::endl;
//inrange +-3sd -> 99.73%; outrange is 0.27%

curandDestroyGenerator(generator);
cutilSafeCall(cudaFreeHost(H_rand));
cutilSafeCall(cudaFree(D_rand));

return 0;
}
posted by にゃんこ at 20:41| Comment(0) | CUDA C