2013年08月09日

DeviceQueryとCurand


//cf. cuda toolkit curand guide

// NOTE
// Requires compute capability 1.3 or higher: this sample uses double-precision floating point.


// NOTE(review): the header names on these #include lines were lost (text inside
// angle brackets was stripped). They are reconstructed from the names the code
// actually uses -- confirm against the original source.
#include <climits>   // INT_MAX
#include <cmath>     // floor, ceil
#include <cstdlib>   // exit, EXIT_FAILURE
#include <iostream>
#include <string>

#include <cuda_runtime.h>
#include <cutil_inline.h> //using cutilsafecall
#include <curand.h>
#pragma comment (lib, "curand.lib")

using namespace std;

// Edge length of the cubic test volume: used as all three extents of H_a / D_a
// and as the bounds for the host fill loops and the kernel's range check.
#define is_110 110

// Plain triple of ints. Used both for per-axis device limits (threads per
// block, blocks per grid) and for the (i, j, k) index returned to the kernel.
struct integer3d{
    int x;
    int y;
    int z;
};
// Per-device properties cached by cuda_initial_chk() from cudaDeviceProp,
// one instance per CUDA device found.
struct struct_cuda_device_info{
string devName;                   // copied from cudaDeviceProp::name
int MaxThreadsNum_perBlock;       // cudaDeviceProp::maxThreadsPerBlock (per block, not per multiprocessor)
integer3d MaxThreads3D_perBlock;  // cudaDeviceProp::maxThreadsDim[0..2]
integer3d MaxBlocks3D_perGrid;    // cudaDeviceProp::maxGridSize[0..2]
int computeCapability_Major;      // cudaDeviceProp::major
int computeCapability_Minor;      // cudaDeviceProp::minor
int sharedMem_perBlock;           // cudaDeviceProp::sharedMemPerBlock, stored as int
int register_perBlock;            // cudaDeviceProp::regsPerBlock
int warpSize;                     // cudaDeviceProp::warpSize
};

// Set by cuda_initial_chk(); main() stops early when it is false.
bool success;
// Index of the device chosen by the user (read from cin in cuda_initial_chk());
// main() uses it to index cuda_dev_info.
int devSlect;
// Array with one entry per CUDA device, allocated with new[] in
// cuda_initial_chk() and released with delete[] at the end of main().
struct_cuda_device_info *cuda_dev_info;

// Host-side volume: main() fills it with a sentinel, uploads it to the two
// input buffers, and finally receives the contents of D_a into it.
double H_a[is_110][is_110][is_110];
// Device-side result volume written by the d_add kernel.
__device__ double D_a[is_110][is_110][is_110];
// Device-side counters bumped by d_add when an input sample equals exactly
// 0.0 (D_count_0) or exactly 1.0 (D_count_1); main() initialises both to -1.
__device__ int D_count_0, D_count_1;

// Maps (block index, thread index) to a 3D element index; the bool swaps x and z.
__device__ integer3d RTN_index_ijk(dim3, dim3, bool);
// Element-wise sum of the two input buffers into D_a.
__global__ void d_add(double*, double*);
// Enumerates CUDA devices, fills cuda_dev_info and sets success / devSlect.
void cuda_initial_chk();


// Abort with a readable message when a cuRAND host-API call does not succeed.
// cutilSafeCall only understands cudaError_t, and cudaGetLastError() does not
// report cuRAND status codes, so cuRAND results need their own check.
static void curandSafeCall(curandStatus_t status, const char *what)
{
    if(status != CURAND_STATUS_SUCCESS){
        cout << "CURAND ERROR! " << what << " failed. status code:\t" << (int)status << endl;
        exit(EXIT_FAILURE);
    }
}

// Entry point.
//  1. Queries the CUDA devices (cuda_initial_chk) and stops if that failed.
//  2. Fills two device buffers with uniform random doubles via cuRAND.
//  3. Launches d_add over a 3D grid covering the is_110^3 volume.
//  4. Copies the result and the 0.0 / 1.0 hit counters back and prints them.
// Returns 0 in every case (unchanged from the original behaviour).
int main()
{
    cout << "main start." << endl;
    cuda_initial_chk();
    if(success == false){
        return 0;
    }

    cout << "curand start." << endl;
    const size_t size_a= sizeof(H_a);
    double *D_b, *D_c;
    cutilSafeCall(cudaMalloc(&D_b, size_a));
    cutilSafeCall(cudaMalloc(&D_c, size_a));

    // Sentinel value outside the generator's output range.
    for(int i= 0; i < is_110; i++){
        for(int j= 0; j < is_110; j++){
            for(int k= 0; k < is_110; k++){
                H_a[i][j][k]= -2.0;
            }
        }
    }

    // Start the counters at -1: with 0 one could not tell "kernel never ran /
    // bug" apart from "ran, but nothing matched".
    int H_0= -1;
    cutilSafeCall(cudaMemcpyToSymbol(D_count_0, &H_0, sizeof(int)));
    cutilSafeCall(cudaMemcpyToSymbol(D_count_1, &H_0, sizeof(int)));

    // Pre-load both buffers with the sentinel so a failed generation is easy to spot.
    cutilSafeCall(cudaMemcpy(D_b, H_a, size_a, cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(D_c, H_a, size_a, cudaMemcpyHostToDevice));

    // rand generate INITIALIZE
    curandGenerator_t generator;
    curandSafeCall(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_XORWOW),
                   "curandCreateGenerator");
    curandSafeCall(curandSetPseudoRandomGeneratorSeed(generator, 1234ULL),
                   "curandSetPseudoRandomGeneratorSeed");

    //** 0 to 1 **
    // (the cuRAND documentation states that 0.0 is excluded and 1.0 is included)
    const size_t count= (size_t)is_110* is_110* is_110;
    curandSafeCall(curandGenerateUniformDouble(generator, D_b, count),
                   "curandGenerateUniformDouble(D_b)");
    curandSafeCall(curandGenerateUniformDouble(generator, D_c, count),
                   "curandGenerateUniformDouble(D_c)");
    // Generation runs asynchronously; synchronise here so that an execution
    // failure is reported at this point instead of surfacing later as an
    // unrelated-looking error from a cudaMemcpy.
    cutilSafeCall(cudaDeviceSynchronize());

    cout << "curand generated." << endl;

    dim3 blockNum, threadNum; // i.e. gridDim and blockDim
    threadNum.x= threadNum.y= 16;
    const int MaxThreadsNum= cuda_dev_info[devSlect].MaxThreadsNum_perBlock;
    // Largest z such that x*y*z stays within the per-block thread limit.
    // Integer division truncates, e.g. 1024/(16*16) = 4.
    threadNum.z= MaxThreadsNum/ (threadNum.x* threadNum.y);
    if(threadNum.z < 1){threadNum.z= 1;} // never divide by zero below

    //check threads size
    const integer3d MaxThreads= cuda_dev_info[devSlect].MaxThreads3D_perBlock;
    if(threadNum.x > MaxThreads.x){threadNum.x= MaxThreads.x;}
    if(threadNum.y > MaxThreads.y){threadNum.y= MaxThreads.y;}
    if(threadNum.z > MaxThreads.z){threadNum.z= MaxThreads.z;}
    cout << "Threads SIZE;" << endl;
    cout << "\tx:" << threadNum.x << "\ty:" << threadNum.y << "\tz:" << threadNum.z << endl;

    // Integer ceil-division: enough blocks to cover is_110 elements per axis,
    // without any floating-point rounding concerns.
    blockNum.x= (is_110+ threadNum.x- 1)/ threadNum.x;
    blockNum.y= (is_110+ threadNum.y- 1)/ threadNum.y;
    blockNum.z= (is_110+ threadNum.z- 1)/ threadNum.z;
    cout << "Blocks SIZE;" << endl;
    cout << "\tx:" << blockNum.x << "\ty:" << blockNum.y << "\tz:" << blockNum.z << endl;

    const integer3d MaxBlocks= cuda_dev_info[devSlect].MaxBlocks3D_perGrid;
    if(blockNum.x>MaxBlocks.x || blockNum.y>MaxBlocks.y || blockNum.z>MaxBlocks.z){
        cout << "Block size ERROR! over the maxblock size of current GPU-device." << endl;
        cout << "Current device name:\t" << cuda_dev_info[devSlect].devName << endl;
        cout << "Max Block SIZE;" << endl;
        cout << "\tx:" << MaxBlocks.x << "\ty:" << MaxBlocks.y << "\tz:" << MaxBlocks.z << endl;
        // Release everything acquired so far before bailing out.
        curandDestroyGenerator(generator);
        cudaFree(D_b);
        cudaFree(D_c);
        delete [] cuda_dev_info;
        return 0;
    }

    d_add<<<blockNum, threadNum>>>(D_b, D_c);
    cutilSafeCall(cudaGetLastError());        // launch-configuration errors
    cutilSafeCall(cudaDeviceSynchronize());   // errors raised while the kernel ran
    cutilSafeCall(cudaMemcpyFromSymbol(H_a, D_a, size_a));

    cout << "curand value;" << endl;
    cutilSafeCall(cudaMemcpyFromSymbol(&H_0, D_count_0, sizeof(int)));
    cout << "\t0.0 hits:\t" << H_0+1 << endl; // +1 because the counter started at -1
    cutilSafeCall(cudaMemcpyFromSymbol(&H_0, D_count_1, sizeof(int)));
    cout << "\t1.0 hits:\t" << H_0+1 << endl;
    // Zero hits for both means every sample was strictly between 0 and 1.

    curandSafeCall(curandDestroyGenerator(generator), "curandDestroyGenerator");
    cutilSafeCall(cudaFree(D_b));
    cutilSafeCall(cudaFree(D_c));
    delete [] cuda_dev_info;

    return 0;
}


// Kernel: D_a[i][j][k] = D_b[...] + D_c[...] for one element per thread, and
// counts how many input samples are exactly 0.0 or exactly 1.0.
//
// Launch layout: a 3D grid of 3D blocks whose total extent is at least is_110
// along every axis; threads that fall outside the volume return immediately.
// D_b, D_c: device buffers holding is_110^3 doubles each, laid out with k as
//           the fastest-varying index (same layout as D_a).
// NOTE(review): k is derived from threadIdx.z, so neighbouring threads in x
// touch addresses is_110*is_110 elements apart; passing `true` to
// RTN_index_ijk would make x the fastest index -- confirm the launch covers
// all axes before changing.
__global__ void d_add(double *D_b, double *D_c)
{
    const integer3d ijk= RTN_index_ijk(blockIdx, threadIdx, false);
    const int i= ijk.x, j= ijk.y, k= ijk.z;
    if(is_110<=i || is_110<=j || is_110<=k){
        return;
    }

    const int id_array= (is_110* is_110* i)+ (is_110* j)+ k;
    const double b= D_b[id_array];
    const double c= D_c[id_array];
    D_a[i][j][k]= b+ c;

    // The counters live in global memory and are shared by every thread, so
    // the increment must be atomic: a plain ++ is an unsynchronised
    // read-modify-write and can lose updates. Matches are expected to be
    // rare, so contention on these two addresses is negligible.
    if(b == 0.0){ atomicAdd(&D_count_0, 1); }
    if(c == 0.0){ atomicAdd(&D_count_0, 1); }
    if(b == 1.0){ atomicAdd(&D_count_1, 1); }
    if(c == 1.0){ atomicAdd(&D_count_1, 1); }
    return;
}

// Converts a block index and a thread index into a 3D element index.
//   B_Idx, T_Idx : zero-based block / thread indices (callers pass blockIdx
//                  and threadIdx); blockDim supplies the threads per block.
//   Reverce      : false -> (x, y, z) map to the launch's (x, y, z) axes;
//                  true  -> x and z are swapped, y is unchanged.
__device__ integer3d RTN_index_ijk(dim3 B_Idx, dim3 T_Idx, bool Reverce)
{
    // Position along each launch axis: block offset plus offset inside the block.
    const int alongX= (blockDim.x* B_Idx.x)+ T_Idx.x;
    const int alongY= (blockDim.y* B_Idx.y)+ T_Idx.y;
    const int alongZ= (blockDim.z* B_Idx.z)+ T_Idx.z;

    integer3d result;
    result.x= Reverce ? alongZ : alongX;
    result.y= alongY;
    result.z= Reverce ? alongX : alongZ;
    return result;
}

void cuda_initial_chk()
{
success= true;

int devCount; //何個?
cutilSafeCall(cudaGetDeviceCount(&devCount));
// n個のdevice情報を保存するために,動的確保
cuda_dev_info= new struct_cuda_device_info[devCount];

if(devCount == 0){
cout << "Your Computer has not the CUDA device (GPU-card). halt." << endl;
success= false;
return;
}else{
cout << "Your computer has CUDA devices, count is:\t" << devCount << endl;
}

int devNo;
cudaDeviceProp deviceProp;

for(devNo= 0; devNo < devCount; devNo++){ //n個繰り返し
cout << endl;
cout << "**** Device No.\t" << devNo << endl;
cutilSafeCall(cudaGetDeviceProperties(&deviceProp, devNo)); //型番取得
cuda_dev_info[devNo].devName= deviceProp.name;
cout << "Device name:\t" << deviceProp.name << endl;

//compute Capability
cuda_dev_info[devNo].computeCapability_Major= deviceProp.major;
cuda_dev_info[devNo].computeCapability_Minor= deviceProp.minor;
cout << "Compute Capability:\t" << deviceProp.major << "." << deviceProp.minor << endl;
if( (deviceProp.major<1) || (deviceProp.major==1 && deviceProp.minor<3) ){
//0. or 1.3<
cout << "**CATION** This GPU is disabled \"Double-precision floating\"." << endl;
}

cuda_dev_info[devNo].sharedMem_perBlock= deviceProp.sharedMemPerBlock;
cout << "Shared memory per block:\t"<< deviceProp.sharedMemPerBlock/1024 << " KB" << endl;

cuda_dev_info[devNo].register_perBlock= deviceProp.regsPerBlock;
cout << "Register size per block;\t" << deviceProp.regsPerBlock << endl;

cuda_dev_info[devNo].warpSize= deviceProp.warpSize;
cout << "Warp size:\t" << deviceProp.warpSize << endl;

cuda_dev_info[devNo].MaxThreadsNum_perBlock= deviceProp.maxThreadsPerBlock;
// ****perBlock と perMultiProcessorとは違うよ!!****
cout << "Max threads per block:\t" << deviceProp.maxThreadsPerBlock << endl;

cuda_dev_info[devNo].MaxThreads3D_perBlock.x= deviceProp.maxThreadsDim[0];
cuda_dev_info[devNo].MaxThreads3D_perBlock.y= deviceProp.maxThreadsDim[1];
cuda_dev_info[devNo].MaxThreads3D_perBlock.z= deviceProp.maxThreadsDim[2];
cout << "Max Threads per block, SIZE;" << endl;
cout << "\tx:\t" << deviceProp.maxThreadsDim[0];
cout << "\ty:\t" << deviceProp.maxThreadsDim[1];
cout << "\tz:\t" << deviceProp.maxThreadsDim[2] << endl;

cuda_dev_info[devNo].MaxBlocks3D_perGrid.x= deviceProp.maxGridSize[0];
cuda_dev_info[devNo].MaxBlocks3D_perGrid.y= deviceProp.maxGridSize[1];
cuda_dev_info[devNo].MaxBlocks3D_perGrid.z= deviceProp.maxGridSize[2];
cout << "Max Blocks per grid, SIZE;" << endl;
cout << "\tx:\t" << deviceProp.maxGridSize[0];
cout << "\ty:\t" << deviceProp.maxGridSize[1];
cout << "\tz:\t" << deviceProp.maxGridSize[2] << endl;
}

for( ; ; ){
cout << "For lunch CUDA device No, Please Input..." << endl;
cin >> devSlect;
if( cin.fail() ){
cout << "ConsoleIn has the fail state." << endl;
cin.clear(); //error clear
cin.ignore(INT_MAX, '\n'); //buffer clear
}else if(0<=devSlect && devSlect
posted by にゃんこ at 11:03| Comment(0) | CUDA C
この記事へのコメント
コメントを書く
お名前:

メールアドレス:

ホームページアドレス:

コメント: [必須入力]