-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmultidimentionalKernelLaunch.cu
64 lines (50 loc) · 1.64 KB
/
multidimentionalKernelLaunch.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#include <stdlib.h>
#include <stdio.h>
// Fills `array` so that every element holds the linear index of the thread
// block that wrote it.
//
// Launch layout: a 2D grid of 2D blocks. Each thread writes exactly one int,
// at the row-major offset of its global (x, y) coordinate, where one row is
// gridDim.x * blockDim.x elements long.
//
// Precondition: there is no bounds check, so `array` must provide room for
// one int per launched thread.
__global__ void kernel(int *array)
{
  // value to store: this block's position in the grid, flattened row-major
  const int block_id = blockIdx.y * gridDim.x + blockIdx.x;

  // number of threads along x across the whole grid (length of one row)
  const int row_pitch = gridDim.x * blockDim.x;

  // global 2D coordinate of this thread
  const int col = blockIdx.x * blockDim.x + threadIdx.x;
  const int row = blockIdx.y * blockDim.y + threadIdx.y;

  // flatten (row, col) into a 1D offset and store the block id there
  array[row * row_pitch + col] = block_id;
}
// Prints a diagnostic to stderr and terminates the program when a CUDA
// runtime call did not return cudaSuccess. `what` names the failed operation.
static void check_cuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess)
  {
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Launches `kernel` over a 16x16 element array using 4x4 thread blocks,
// copies the result back to the host and prints it as a matrix in which each
// entry is the linear index of the block that wrote it.
//
// Returns 0 on success; exits with EXIT_FAILURE if an allocation, the kernel
// launch, or a copy fails.
int main(void)
{
  const int num_elements_x = 16;
  const int num_elements_y = 16;
  // size_t avoids narrowing the sizeof() product into an int
  const size_t num_bytes =
      (size_t)num_elements_x * (size_t)num_elements_y * sizeof(int);

  // create two dimensional 4x4 thread blocks
  dim3 block_size;
  block_size.x = 4;
  block_size.y = 4;

  // The kernel writes one element per thread without a bounds check, so the
  // blocks must tile the array exactly; refuse to launch otherwise instead
  // of silently leaving part of the array unwritten.
  if ((unsigned)num_elements_x % block_size.x != 0 ||
      (unsigned)num_elements_y % block_size.y != 0)
  {
    fprintf(stderr, "array dimensions must be a multiple of the block dimensions\n");
    return EXIT_FAILURE;
  }

  // configure a two dimensional grid as well
  dim3 grid_size;
  grid_size.x = num_elements_x / block_size.x;
  grid_size.y = num_elements_y / block_size.y;

  // allocate memory in either space
  int *host_array = (int *)malloc(num_bytes);
  if (host_array == NULL)
  {
    fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)num_bytes);
    return EXIT_FAILURE;
  }

  int *device_array = 0;
  check_cuda(cudaMalloc((void **)&device_array, num_bytes), "cudaMalloc");

  // grid_size & block_size are passed as arguments to the triple chevrons as usual
  kernel<<<grid_size, block_size>>>(device_array);
  // a launch does not return a status itself; query it explicitly
  check_cuda(cudaGetLastError(), "kernel launch");

  // download and inspect the result on the host; this blocking copy also
  // reports any error raised while the kernel was executing
  check_cuda(cudaMemcpy(host_array, device_array, num_bytes, cudaMemcpyDeviceToHost),
             "cudaMemcpy (device to host)");

  // print out the result element by element
  for (int row = 0; row < num_elements_y; ++row)
  {
    for (int col = 0; col < num_elements_x; ++col)
    {
      printf("%2d ", host_array[row * num_elements_x + col]);
    }
    printf("\n");
  }
  printf("\n");

  // deallocate memory
  free(host_array);
  check_cuda(cudaFree(device_array), "cudaFree");

  return 0;
}