Overview
This part is intentionally practical. Instead of staying in theory, we will focus on complete, runnable CUDA C++ examples and use them to build a clear mental model of kernels, indexing, and host-device execution.
The examples are organized to increase in complexity:
- Array summation (1D indexing)
- Matrix summation (2D indexing)
- Matrix transpose (an indexing-heavy pattern that appears often in real workloads)
Each example uses the same basic structure—allocate memory, copy inputs to the GPU, launch a kernel, copy results back—so the differences stand out for the right reasons.
Example 1: Summation of Two Arrays Using CUDA
Array addition is a clean starting point for CUDA programming because it makes the mapping obvious: one GPU thread can compute one array element. The kernel in this example calculates a global index and uses that index to read from a and b and write the sum into c.
#include <cuda_runtime.h>
#include <iostream>
using namespace std;

#define N 1000
#define BLOCK_DIM 256

// CUDA kernel to add two arrays: one thread computes one output element
__global__ void arraySum(int *a, int *b, int *c)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N)
    {
        c[idx] = a[idx] + b[idx];
    }
}
int main()
{
    // Host arrays for input, output, and a CPU reference result
    int* h_a = (int*)malloc(N * sizeof(int));
    int* h_b = (int*)malloc(N * sizeof(int));
    int* h_c = (int*)malloc(N * sizeof(int));
    int* h_sum = (int*)malloc(N * sizeof(int));

    // Initialize host arrays and compute the reference sum on the CPU
    for (int i = 0; i < N; i++)
    {
        h_a[i] = i;
        h_b[i] = i;
        h_sum[i] = h_a[i] + h_b[i];
    }

    // Device arrays for input and output
    int *d_a;
    int *d_b;
    int *d_c;
    cudaMalloc((void**)&d_a, N * sizeof(int));
    cudaMalloc((void**)&d_b, N * sizeof(int));
    cudaMalloc((void**)&d_c, N * sizeof(int));

    // Copy data from host to device
    cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, N * sizeof(int), cudaMemcpyHostToDevice);

    // Calculate block size and grid size for the CUDA kernel
    dim3 dimBlock(BLOCK_DIM);
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x);

    // Launch the CUDA kernel
    arraySum<<<dimGrid, dimBlock>>>(d_a, d_b, d_c);

    // Copy the result from device to host
    cudaMemcpy(h_c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost);

    // Check the GPU result against the CPU reference
    for (int i = 0; i < N; i++)
    {
        if (h_c[i] != h_sum[i])
        {
            cout << "Mismatch at index " << i << endl;
        }
    }

    // Output the result
    for (int i = 0; i < N; i++)
    {
        cout << h_c[i] << " ";
    }

    // Free device and host memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(h_a);
    free(h_b);
    free(h_c);
    free(h_sum);
    return 0;
}
Output:
0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74 76 78 80 82 84 86 88 90 92 94 96 98 100 102 104 106 108 110 112 114 116 118 120 122 124 126 128 130 132 134 136 138 140 142 144 146 148 150 152 154 156 158 160 162 164 166 168 170 172 174 176 178 180 182 184 186 188 190 192 194 196 198 200 202 204 206 208 210 212 214 216 218 220 222 224 226 228 230 232 234 236 238 240 242 244 246 248 250 252 254 256 258 260 262 264 266 268 270 272 274 276 278 280 282 284 286 288 290 292 294 296 298 300 302 304 306 308 310 312 314 316 318 320 322 324 326 328 330 332 334 336 338 340 342 344 346 348 350 352 354 356 358 360 362 364 366 368 370 372 374 376 378 380 382 384 386 388 390 392 394 396 398 400 402 404 406 408 410 412 414 416 418 420 422 424 426 428 430 432 434 436 438 440 442 444 446 448 450 452 454 456 458 460 462 464 466 468 470 472 474 476 478 480 482 484 486 488 490 492 494 496 498 500 502 504 506 508 510 512 514 516 518 520 522 524 526 528 530 532 534 536 538 540 542 544 546 548 550 552 554 556 558 560 562 564 566 568 570 572 574 576 578 580 582 584 586 588 590 592 594 596 598 600 602 604 606 608 610 612 614 616 618 620 622 624 626 628 630 632 634 636 638 640 642 644 646 648 650 652 654 656 658 660 662 664 666 668 670 672 674 676 678 ...
Explanation
This example follows the standard CUDA workflow and highlights the core idea of 1D indexing.
- Allocates input/output arrays on the host, then allocates matching buffers on the device using cudaMalloc.
- Copies the input arrays from host to device using cudaMemcpy.
- Computes the launch configuration (dimGrid, dimBlock) so there are enough threads to cover N elements.
- In the kernel, each thread computes a global index idx and guards with if (idx < N) to avoid out-of-bounds access.
- Copies the result back to the host, checks it against the CPU reference in h_sum, prints it, and releases device memory with cudaFree.
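To make the launch configuration concrete, here is a small host-only sketch (standalone, with n and blockSize mirroring the N and BLOCK_DIM values above) that reproduces the ceiling-division arithmetic:
#include <iostream>
using namespace std;

int main()
{
    const int n = 1000;        // number of array elements (N in the example)
    const int blockSize = 256; // threads per block (BLOCK_DIM in the example)

    // Ceiling division: the smallest number of blocks that covers all n elements
    int gridSize = (n + blockSize - 1) / blockSize; // (1000 + 255) / 256 = 4
    int totalThreads = gridSize * blockSize;        // 4 * 256 = 1024

    cout << "blocks: " << gridSize << endl;
    cout << "threads launched: " << totalThreads << endl;
    cout << "surplus threads: " << totalThreads - n << endl; // 24
    return 0;
}
Those 24 surplus threads compute an idx of 1000 or more, fail the if (idx < N) guard, and do nothing, which is exactly why the bounds check belongs in the kernel.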
Example 2: Summation of Two Matrices Using CUDA
Once the 1D case feels straightforward, the natural next step is 2D data. Matrix addition uses the same idea as arrays—one thread per element—but the indexing expands into two dimensions (x for columns and y for rows).
// Add two matrices
#include <cuda_runtime.h>
#include <iostream>

#define N 10         // number of rows
#define M 20         // number of columns
#define BLOCK_DIM 16

using namespace std;

// CUDA kernel to add two matrices: one thread per element,
// with x mapping to columns and y mapping to rows
__global__ void matrixAdd(int *a, int *b, int *c)
{
    int ix = blockIdx.x * blockDim.x + threadIdx.x;   // column
    int iy = blockIdx.y * blockDim.y + threadIdx.y;   // row
    int index = iy * M + ix;                          // row-major linear index
    if (ix < M && iy < N)
    {
        c[index] = a[index] + b[index];
    }
}
int main()
{
    // Host matrices and device pointers
    int h_a[N][M], h_b[N][M], h_c[N][M];
    int *d_a, *d_b, *d_c;
    int size = N * M * sizeof(int);

    // Initialize host matrices h_a and h_b
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < M; j++)
        {
            h_a[i][j] = 1;
            h_b[i][j] = 2;
        }
    }

    // Allocate device memory for the matrices
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    // Copy data from host to device
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    // Define the grid and block dimensions: x spans the M columns, y spans the N rows
    dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
    dim3 dimGrid((M + dimBlock.x - 1) / dimBlock.x, (N + dimBlock.y - 1) / dimBlock.y);

    // Launch the kernel to add the matrices
    matrixAdd<<<dimGrid, dimBlock>>>(d_a, d_b, d_c);
    cudaDeviceSynchronize();

    // Copy the result from device to host
    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    // Output the result
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < M; j++)
        {
            cout << h_c[i][j] << " ";
        }
        cout << endl;
    }

    // Free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
Output:
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
Explanation
This example demonstrates 2D indexing and how a 2D grid maps naturally onto a matrix.
- The kernel computes (ix, iy) using blockIdx, blockDim, and threadIdx, then converts that 2D coordinate into a 1D row-major linear index for memory access.
- The if (ix < M && iy < N) check ensures threads outside the matrix bounds do not write invalid memory.
- The host code allocates the matrices, copies them to the GPU, launches the kernel, and copies the result back.
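The 2D-to-1D conversion is the heart of this example, so here is a small host-only sketch (the rowMajorIndex helper is illustrative, not part of the CUDA code above) that mirrors the kernel's index computation for a matrix with 10 rows and 20 columns:
#include <iostream>
using namespace std;

// Convert a (row, column) coordinate into a row-major linear index
int rowMajorIndex(int row, int col, int numCols)
{
    return row * numCols + col;
}

int main()
{
    const int rows = 10; // N in the example
    const int cols = 20; // M in the example

    // Element (2, 5): skip 2 full rows of 20 elements, then move 5 columns in
    cout << rowMajorIndex(2, 5, cols) << endl; // 2 * 20 + 5 = 45

    // The last element (9, 19) maps to the last slot of the rows * cols buffer
    cout << rowMajorIndex(rows - 1, cols - 1, cols) << endl; // 9 * 20 + 19 = 199
    return 0;
}
In the kernel, iy plays the role of the row and ix the column, so index = iy * M + ix reaches exactly the same element the host code addresses as h_a[iy][ix].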
Example 3: Matrix Transpose Using CUDA
Matrix transpose is a useful example because it forces careful thinking about indexing: each thread reads one location in the input and writes to a different location in the output. This pattern shows up in many performance-sensitive kernels, especially when data layout matters.
// Transpose of a matrix
#include <cuda_runtime.h>
#include <iostream>

#define N 4          // rows of the input matrix
#define M 7          // columns of the input matrix
#define BLOCK_DIM 32

using namespace std;

// CUDA kernel to transpose a matrix: each thread reads one element of a
// and writes it to the mirrored position in b
__global__ void matrixTranspose(int *a, int *b)
{
    int ix = threadIdx.x + blockIdx.x * blockDim.x;   // column of a
    int iy = threadIdx.y + blockIdx.y * blockDim.y;   // row of a
    int in_index = iy * M + ix;                       // a[iy][ix]
    int out_index = ix * N + iy;                      // b[ix][iy]
    // Check boundaries to avoid out-of-bounds memory access
    if (ix < M && iy < N)
    {
        b[out_index] = a[in_index];
    }
}
int main()
{
    int h_a[N][M], h_b[M][N];
    int *d_a, *d_b;
    int size = N * M * sizeof(int);

    // Initialize the host matrix h_a
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < M; j++)
        {
            h_a[i][j] = i + j;
        }
    }

    // Allocate device memory for the matrices
    cudaMalloc((void**)&d_a, size);
    cudaMalloc((void**)&d_b, size);

    // Copy data from host to device
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);

    // Define grid and block dimensions: x spans the M columns, y spans the N rows
    dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
    dim3 dimGrid((M + dimBlock.x - 1) / dimBlock.x, (N + dimBlock.y - 1) / dimBlock.y);

    // Launch the kernel to transpose the matrix
    matrixTranspose<<<dimGrid, dimBlock>>>(d_a, d_b);
    cudaDeviceSynchronize();

    // Copy the result from device to host
    cudaMemcpy(h_b, d_b, size, cudaMemcpyDeviceToHost);

    cout << "Matrix=>" << endl;
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < M; j++)
        {
            cout << h_a[i][j] << " ";
        }
        cout << endl;
    }

    cout << "Transposed Matrix=>" << endl;
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < N; j++)
        {
            cout << h_b[i][j] << " ";
        }
        cout << endl;
    }

    // Free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    return 0;
}
Output:
Matrix=>
0 1 2 3 4 5 6
1 2 3 4 5 6 7
2 3 4 5 6 7 8
3 4 5 6 7 8 9
Transposed Matrix=>
0 1 2 3
1 2 3 4
2 3 4 5
3 4 5 6
4 5 6 7
5 6 7 8
6 7 8 9
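Before moving on, it can help to confirm the transpose index mapping without the GPU at all. The short host-only sketch below (an illustrative check, not part of the original example) applies the same in_index/out_index formulas the kernel uses and verifies that every element lands in the right place:
#include <iostream>
using namespace std;

int main()
{
    const int N = 4;   // rows of the input matrix
    const int M = 7;   // columns of the input matrix

    int a[N * M], b[M * N];

    // Fill a with the same values as the CUDA example: a[row][col] = row + col
    for (int row = 0; row < N; row++)
        for (int col = 0; col < M; col++)
            a[row * M + col] = row + col;

    // Apply the kernel's index mapping on the CPU
    for (int iy = 0; iy < N; iy++)        // row of a
        for (int ix = 0; ix < M; ix++)    // column of a
            b[ix * N + iy] = a[iy * M + ix];

    // Verify: b, read as an M x N matrix, must be the transpose of a
    bool ok = true;
    for (int row = 0; row < M; row++)
        for (int col = 0; col < N; col++)
            if (b[row * N + col] != a[col * M + row])
                ok = false;

    cout << (ok ? "transpose indexing is correct" : "indexing mismatch") << endl;
    return 0;
}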
Summary
In this CUDA C++ tutorial, we worked through three practical GPU programming examples and reinforced the core patterns that show up in most CUDA applications.
- CUDA programs commonly follow a repeatable flow: allocate device memory, copy inputs to the GPU, launch a kernel, and copy results back.
- For 1D data (arrays), a single global thread index is often enough to map threads to elements.
- For 2D data (matrices), it is common to use blockIdx.{x,y} and threadIdx.{x,y} to compute (row, column) positions.
- Indexing is not just a detail; it determines correctness, and it strongly influences performance.
What’s Next
The next step in the series focuses on how CUDA actually executes work on the GPU (warps, scheduling, and the execution model) and how that maps to GPU architecture. Those ideas make it easier to reason about performance and explain why certain kernel configurations are faster than others.
