Dr. Partha Majumder is a Gen-AI product engineer and research scientist with a Ph.D. from IIT Bombay. He specializes in building end-to-end, production-ready Gen-AI applications using Python, FastAPI, LangChain/LangGraph, and Next.js.
This part is intentionally practical. Instead of staying in theory, we will focus on complete, runnable CUDA C++ examples and use them to build a clear mental model of kernels, indexing, and host-device execution.
The examples are organized to increase in complexity:
Array summation (1D indexing)
Matrix summation (2D indexing)
Matrix transpose (an indexing-heavy pattern that appears often in real workloads)
Each example uses the same basic structure—allocate memory, copy inputs to the GPU, launch a kernel, copy results back—so the differences stand out for the right reasons.
Example 1: Summation of Two Arrays Using CUDA
Array addition is a clean starting point for CUDA programming because it makes the mapping obvious: one GPU thread can compute one array element. The kernel in this example calculates a global index and uses that index to read from a and b and write the sum into c.
#include <iostream>

#define N 1000
#define BLOCK_DIM 256

using namespace std;

// CUDA kernel to add two arrays element-wise
__global__ void arraySum(int *a, int *b, int *c)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N)
    {
        c[idx] = a[idx] + b[idx];
    }
}

int main()
{
    // Host arrays for input, output, and a CPU reference result
    int *h_a = (int*)malloc(N * sizeof(int));
    int *h_b = (int*)malloc(N * sizeof(int));
    int *h_c = (int*)malloc(N * sizeof(int));
    int *h_sum = (int*)malloc(N * sizeof(int));

    // Initialize host arrays and compute the reference sum on the CPU
    for (int i = 0; i < N; i++)
    {
        h_a[i] = i;
        h_b[i] = i;
        h_sum[i] = h_a[i] + h_b[i];
    }

    // Device arrays for input and output
    int *d_a, *d_b, *d_c;
    cudaMalloc((void**)&d_a, N * sizeof(int));
    cudaMalloc((void**)&d_b, N * sizeof(int));
    cudaMalloc((void**)&d_c, N * sizeof(int));

    // Copy data from host to device
    cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, N * sizeof(int), cudaMemcpyHostToDevice);

    // Calculate block size and grid size for the CUDA kernel
    dim3 dimBlock(BLOCK_DIM);
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x);

    // Launch the CUDA kernel
    arraySum<<<dimGrid, dimBlock>>>(d_a, d_b, d_c);

    // Copy the result from device to host
    cudaMemcpy(h_c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost);

    // Print the result and check it against the CPU reference
    bool ok = true;
    for (int i = 0; i < N; i++)
    {
        cout << h_c[i] << " ";
        if (h_c[i] != h_sum[i]) ok = false;
    }
    cout << endl << (ok ? "Result matches CPU reference" : "Result mismatch") << endl;

    // Free device and host memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(h_a);
    free(h_b);
    free(h_c);
    free(h_sum);
    return 0;
}
This example follows the standard CUDA workflow and highlights the core idea of 1D indexing.
Allocates input/output arrays on the host, then allocates matching buffers on the device using cudaMalloc.
Copies input arrays from host to device using cudaMemcpy.
Computes launch configuration (dimGrid, dimBlock) so there are enough threads to cover N elements.
In the kernel, each thread computes a global index idx and guards with if (idx < N) to avoid out-of-bounds access.
Copies results back to the host and prints them, then releases device memory with cudaFree.
Example 2: Summation of Two Matrices Using CUDA
Once the 1D case feels straightforward, the natural next step is 2D data. Matrix addition uses the same idea as arrays—one thread per element—but the indexing expands into two dimensions (x for columns and y for rows).
// Add two matrices
#include <iostream>

#define N 10
#define M 20
#define BLOCK_DIM 16

using namespace std;

// CUDA kernel to add two matrices element-wise
__global__ void matrixAdd(int *a, int *b, int *c)
{
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    // Linearize the 2D coordinate; a, b, and c share this layout
    int index = ix + iy * N;
    if (ix < N && iy < M)
    {
        c[index] = a[index] + b[index];
    }
}

int main()
{
    // Host matrices and device pointers
    int h_a[N][M], h_b[N][M], h_c[N][M];
    int *d_a, *d_b, *d_c;
    int size = N * M * sizeof(int);

    // Initialize host matrices h_a and h_b
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < M; j++)
        {
            h_a[i][j] = 1;
            h_b[i][j] = 2;
        }
    }

    // Allocate device memory for the matrices
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    // Copy data from host to device
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    // Define the grid and block dimensions
    dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x,
                 (M + dimBlock.y - 1) / dimBlock.y);

    // Launch the kernel to add the matrices
    matrixAdd<<<dimGrid, dimBlock>>>(d_a, d_b, d_c);
    cudaDeviceSynchronize();

    // Copy the result from device to host
    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    // Output the result
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < M; j++)
        {
            cout << h_c[i][j] << " ";
        }
        cout << endl;
    }

    // Free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
This example demonstrates 2D indexing and how a 2D grid maps naturally onto a matrix.
The kernel computes (ix, iy) using blockIdx, blockDim, and threadIdx, then converts that 2D coordinate into a 1D linear index for memory access.
The if (ix < N && iy < M) check ensures that threads outside the matrix bounds do not read or write out-of-bounds memory; because the grid is rounded up, some threads in edge blocks have no element to process.
The host code allocates matrices, copies them to the GPU, launches the kernel, and copies the result back.
Example 3: Matrix Transpose Using CUDA
Matrix transpose is a useful example because it forces careful thinking about indexing: each thread reads one location in the input and writes to a different location in the output. This pattern shows up in many performance-sensitive kernels, especially when data layout matters.
// Transpose of a matrix
#include <iostream>

#define N 4
#define M 7
#define BLOCK_DIM 32

using namespace std;

// CUDA kernel to transpose a matrix
__global__ void matrixTranspose(int *a, int *b)
{
    int ix = threadIdx.x + blockIdx.x * blockDim.x;
    int iy = threadIdx.y + blockIdx.y * blockDim.y;
    int index = ix + iy * N;            // write position in b (M x N)
    int transposed_index = iy + ix * M; // read position in a (N x M)

    // Check boundaries to avoid out-of-bounds memory access
    if (ix < N && iy < M)
    {
        b[index] = a[transposed_index];
    }
}

int main()
{
    int h_a[N][M], h_b[M][N];
    int *d_a, *d_b;
    int size = N * M * sizeof(int);

    // Initialize the host matrix h_a
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < M; j++)
        {
            h_a[i][j] = i * j;
        }
    }

    // Allocate device memory for the matrices
    cudaMalloc((void**)&d_a, size);
    cudaMalloc((void**)&d_b, size);

    // Copy data from host to device
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);

    // Define grid and block dimensions
    dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x,
                 (M + dimBlock.y - 1) / dimBlock.y);

    // Launch the kernel to transpose the matrix
    matrixTranspose<<<dimGrid, dimBlock>>>(d_a, d_b);
    cudaDeviceSynchronize();

    // Copy the result from device to host
    cudaMemcpy(h_b, d_b, size, cudaMemcpyDeviceToHost);

    cout << "Matrix=>" << endl;
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < M; j++)
        {
            cout << h_a[i][j] << " ";
        }
        cout << endl;
    }

    cout << "Transposed Matrix=>" << endl;
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < N; j++)
        {
            cout << h_b[i][j] << " ";
        }
        cout << endl;
    }

    // Free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    return 0;
}
In this CUDA C++ tutorial, we worked through three practical GPU programming examples and reinforced the core patterns that show up in most CUDA applications.
CUDA programs commonly follow a repeatable flow: allocate device memory, copy inputs to the GPU, launch a kernel, and copy results back.
For 1D data (arrays), a single global thread index is often enough to map threads to elements.
For 2D data (matrices), it is common to use blockIdx.{x,y} and threadIdx.{x,y} to compute (row, column) positions.
Indexing is not just a detail; it determines correctness, and it strongly influences performance.
What’s Next
The next step in the series focuses on how CUDA actually executes work on the GPU (warps, scheduling, and the execution model) and how that maps to GPU architecture. Those ideas make it easier to reason about performance and explain why certain kernel configurations are faster than others.