CUDA-二维kernel的全局ID

确定 2D kernel 中 thread 的全局 ID

假如我configure 了一个kernel：

// Problem dimensions. They are only declared here; both must hold their real
// values before the grid size below is computed.
int row, col;

// Each block is 12 x 12 = 144 threads.
dim3 block(12, 12);

// Ceiling division, so that a dimension that is not a multiple of the block
// size still gets a final, partially used block.
// The x axis of the grid spans `row`, the y axis spans `col`.
const unsigned int blocksX = (row + block.x - 1) / block.x;
const unsigned int blocksY = (col + block.y - 1) / block.y;
dim3 grid(blocksX, blocksY);

那么在 __global__ 函数中，thread 的全局 ID 用如下方法确定：

// Per-element kernel for a 2D launch: each thread derives an (x, y) position
// from its block/thread indices, flattens it to x * col + y, and processes the
// element at that flat index.
//
//   dev_a    : input points, read at the flat index
//   dev_b    : output points, written at the flat index
//   p1, p2   : points passed by value (the original note says reference
//              parameters are not supported here); p1 is added to the input
//   dev_c    : float buffer handed to getValue() at the flat index
//   row, col : extents checked against the x and y positions respectively
//
// Line() and getValue() are defined outside this snippet.
__global__ void func(struct Points* dev_a,
                     struct Points* dev_b,
                     struct Points p1,
                     struct Points p2,
                     float* dev_c,
                     const int row,
                     const int col)
{
	// Position of this thread within the whole grid, one value per axis.
	const int gx = blockIdx.x * blockDim.x + threadIdx.x;
	const int gy = blockIdx.y * blockDim.y + threadIdx.y;

	// The grid is rounded up to whole blocks, so threads past either extent
	// have no element to work on.
	if (gx >= row || gy >= col) {
		return;
	}

	// Global ID: gx full runs of `col` elements, then gy within the run.
	const int gid = gx * col + gy;

	dev_b[gid].x = dev_a[gid].x + p1.x;
	dev_b[gid].y = dev_a[gid].y + p1.y;

	// Apply the per-element operations to the freshly written point.
	Line(dev_b[gid], p1, p2);
	getValue(dev_b[gid], dev_c[gid]);
}

其中 int tid = ix * col + iy; 用 x 和 y 两个方向的分量来确定 thread 的全局 ID：ix 是行号，每行有 col 个元素，所以第 ix 行、第 iy 列的元素在一维数组中的下标就是 ix * col + iy。

同理，在求矩阵转置时的kernel是如下实现的：

// Copies the N x N matrix `m` into `mt` with rows and columns swapped.
// Each thread of a 2D launch moves at most one element; N is defined outside
// this snippet.
//
//   m  : source matrix, read at x * N + y
//   mt : destination matrix, written at y * N + x
__global__ void transpose(int *m, int *mt)
{
	// Position of this thread within the whole grid, one value per axis.
	const int gx = blockIdx.x * blockDim.x + threadIdx.x;
	const int gy = blockIdx.y * blockDim.y + threadIdx.y;

	// Threads that fall outside the N x N range do nothing.
	if (gx >= N || gy >= N) {
		return;
	}

	const int srcIndex = gx * N + gy;   // element (gx, gy) of the original matrix
	const int dstIndex = gy * N + gx;   // element (gy, gx) of the transposed matrix

	// Copy the value from the original matrix to the transposed matrix.
	mt[dstIndex] = m[srcIndex];
}

其中 tidM = idx * N + idy; 是该 thread 在原矩阵中对应元素的一维下标，tidT = idy * N + idx; 是同一元素在转置后矩阵中的一维下标（行、列互换）。