Simple CUDA Program Example: GPU Programming Fundamentals with Parallel Computing
This blog post is an introduction to the world of GPU programming with CUDA. We will cover the fundamental concepts and walk through simple, complete example programs before moving on to more advanced material.
Building upon our foundational CUDA programming concepts, this advanced guide explores sophisticated GPU computing techniques essential for high-performance applications. We’ll cover memory optimization patterns, advanced parallel algorithms, and production-ready CUDA development practices.
#include <cuda_runtime.h>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <map>
#include <memory>
#include <mutex>
#include <stdexcept>
#include <string>
#include <vector>

// RAII wrapper around a single CUDA unified-memory allocation plus a private
// stream used for prefetch operations.
class UnifiedMemoryManager {
private:
    size_t totalSize;      // bytes of managed memory owned by this object
    void* managedPtr;      // unified-memory allocation (host- and device-visible)
    cudaStream_t stream;   // stream on which prefetches are issued

public:
    // Allocates `size` bytes of unified memory and creates the internal
    // stream, then applies advice hints: preferred location device 0, with
    // the CPU registered as an accessor.
    // Throws std::runtime_error when the allocation fails.
    explicit UnifiedMemoryManager(size_t size)
        : totalSize(size), managedPtr(nullptr), stream(nullptr) {
        cudaError_t err = cudaMallocManaged(&managedPtr, totalSize);
        if (err != cudaSuccess) {
            throw std::runtime_error(std::string("cudaMallocManaged failed: ") +
                                     cudaGetErrorString(err));
        }
        cudaStreamCreate(&stream);

        // Advice is a performance hint only; failures (e.g. on devices that
        // lack concurrent managed access) are deliberately non-fatal.
        cudaMemAdvise(managedPtr, totalSize, cudaMemAdviseSetPreferredLocation, 0);
        cudaMemAdvise(managedPtr, totalSize, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId);
    }

    // Owns raw CUDA resources: copying would free them twice.
    UnifiedMemoryManager(const UnifiedMemoryManager&) = delete;
    UnifiedMemoryManager& operator=(const UnifiedMemoryManager&) = delete;

    ~UnifiedMemoryManager() {
        // Drain in-flight prefetches before releasing the memory they touch.
        if (stream) cudaStreamSynchronize(stream);
        if (managedPtr) cudaFree(managedPtr);
        if (stream) cudaStreamDestroy(stream);
    }

    // Typed view of the managed allocation; must not outlive this object.
    template<typename T>
    T* getPointer() {
        return static_cast<T*>(managedPtr);
    }

    // Asynchronously migrate the pages to `device` on the internal stream.
    void prefetchToGPU(int device = 0) {
        cudaMemPrefetchAsync(managedPtr, totalSize, device, stream);
    }

    // Asynchronously migrate the pages back to host memory.
    void prefetchToCPU() {
        cudaMemPrefetchAsync(managedPtr, totalSize, cudaCpuDeviceId, stream);
    }

    // Block until every prefetch issued on the internal stream has finished.
    void synchronize() {
        cudaStreamSynchronize(stream);
    }
};
46
// Advanced matrix multiplication with memory optimization.
//
// Computes C = A * B where A is N x K, B is K x M, and C is N x M, all
// row-major. Each thread block produces one TILE_SIZE x TILE_SIZE tile of C
// using shared-memory staging of A and B tiles.
// Precondition: launched with blockDim = (TILE_SIZE, TILE_SIZE) and a grid
// of ceil(M/TILE_SIZE) x ceil(N/TILE_SIZE) blocks.
template<typename T>
class AdvancedMatrixMultiply {
private:
    // Tile edge length; must equal both block dimensions at launch.
    // (An unused BLOCK_SIZE constant from the original was removed.)
    static const int TILE_SIZE = 16;

public:
    __device__ static void tiledMatMul(const T* A, const T* B, T* C,
                                       int N, int M, int K) {
        // Per-block staging tiles for A and B.
        __shared__ T As[TILE_SIZE][TILE_SIZE];
        __shared__ T Bs[TILE_SIZE][TILE_SIZE];

        int bx = blockIdx.x, by = blockIdx.y;
        int tx = threadIdx.x, ty = threadIdx.y;

        // Output element this thread is responsible for.
        int row = by * TILE_SIZE + ty;
        int col = bx * TILE_SIZE + tx;

        T sum = 0;

        // March a pair of tiles across the K dimension.
        for (int t = 0; t < (K + TILE_SIZE - 1) / TILE_SIZE; ++t) {
            // Stage one tile of A, zero-padding past the matrix edge so the
            // inner product below needs no bounds checks.
            if (row < N && (t * TILE_SIZE + tx) < K) {
                As[ty][tx] = A[row * K + t * TILE_SIZE + tx];
            } else {
                As[ty][tx] = 0;
            }

            // Stage one tile of B, likewise zero-padded.
            if (col < M && (t * TILE_SIZE + ty) < K) {
                Bs[ty][tx] = B[(t * TILE_SIZE + ty) * M + col];
            } else {
                Bs[ty][tx] = 0;
            }

            // Every thread reaches this barrier: the staging branches above
            // do not skip it.
            __syncthreads();

            // Accumulate the partial dot product from this tile pair.
            for (int k = 0; k < TILE_SIZE; ++k) {
                sum += As[ty][k] * Bs[k][tx];
            }

            // Barrier before the next iteration overwrites the tiles.
            __syncthreads();
        }

        // Guarded write: edge blocks can cover out-of-range elements.
        if (row < N && col < M) {
            C[row * M + col] = sum;
        }
    }
};
100
// Thin launch wrapper: one thread per element of C, computed via the tiled
// shared-memory path. Expects blockDim = (16, 16); see
// AdvancedMatrixMultiply::tiledMatMul for the full contract.
__global__ void optimizedMatMulKernel(const float* A, const float* B, float* C,
                                      int N, int M, int K) {
    AdvancedMatrixMultiply<float>::tiledMatMul(A, B, C, N, M, K);
}
// Thread-safe first-fit pool of device allocations. Blocks are reused when a
// free one is at least as large as the request; the pool grows up to a byte
// cap and can release idle blocks via cleanup().
class CUDAMemoryPool {
private:
    // One device allocation tracked by the pool.
    struct MemoryBlock {
        void* ptr = nullptr;   // initialized so a failed cudaMalloc cannot
                               // leave garbage for the destructor to free
        size_t size;
        bool inUse;
        cudaStream_t stream;

        explicit MemoryBlock(size_t sz) : size(sz), inUse(false), stream(nullptr) {
            cudaMalloc(&ptr, size);   // on failure ptr stays nullptr
        }

        ~MemoryBlock() {
            if (ptr) cudaFree(ptr);
        }

        // Owns a raw device pointer; copying would double-free.
        MemoryBlock(const MemoryBlock&) = delete;
        MemoryBlock& operator=(const MemoryBlock&) = delete;
    };

    std::vector<std::unique_ptr<MemoryBlock>> blocks;
    mutable std::mutex poolMutex;   // mutable so const getters can lock too
    size_t totalAllocated = 0;      // bytes currently held by live blocks
    size_t maxPoolSize;             // hard cap on pool growth

public:
    explicit CUDAMemoryPool(size_t maxSize = 1024 * 1024 * 1024) : maxPoolSize(maxSize) {} // 1GB default

    // Returns a device pointer of at least `size` bytes: first-fit reuse of a
    // free block, else a fresh allocation while under the cap.
    // Returns nullptr when the cap would be exceeded or cudaMalloc fails.
    void* allocate(size_t size, cudaStream_t stream = nullptr) {
        std::lock_guard<std::mutex> lock(poolMutex);

        // Reuse: first free block that is large enough.
        for (auto& block : blocks) {
            if (!block->inUse && block->size >= size) {
                block->inUse = true;
                block->stream = stream;
                return block->ptr;
            }
        }

        // Grow: create a new block if it fits under the cap.
        if (totalAllocated + size <= maxPoolSize) {
            auto newBlock = std::make_unique<MemoryBlock>(size);
            if (!newBlock->ptr) {
                return nullptr;   // device allocation failed
            }
            void* ptr = newBlock->ptr;
            newBlock->inUse = true;
            newBlock->stream = stream;
            totalAllocated += size;
            blocks.push_back(std::move(newBlock));
            return ptr;
        }

        return nullptr; // Pool exhausted
    }

    // Marks the block owning `ptr` as free again. When a stream is supplied,
    // waits for it first so in-flight work on the block completes before the
    // memory can be handed out again.
    void deallocate(void* ptr, cudaStream_t stream = nullptr) {
        std::lock_guard<std::mutex> lock(poolMutex);

        for (auto& block : blocks) {
            if (block->ptr == ptr) {
                if (stream) {
                    cudaStreamSynchronize(stream);
                }
                block->inUse = false;
                block->stream = nullptr;
                break;
            }
        }
    }

    // Frees every currently-unused block back to the driver.
    void cleanup() {
        std::lock_guard<std::mutex> lock(poolMutex);

        auto firstDead = std::remove_if(blocks.begin(), blocks.end(),
            [](const std::unique_ptr<MemoryBlock>& block) {
                return !block->inUse;
            });

        // Bug fix: keep the byte accounting in sync with the blocks being
        // dropped; the original never decremented totalAllocated, so the
        // pool eventually reported itself exhausted forever.
        for (auto it = firstDead; it != blocks.end(); ++it) {
            totalAllocated -= (*it)->size;
        }
        blocks.erase(firstDead, blocks.end());
    }

    size_t getTotalAllocated() const {
        std::lock_guard<std::mutex> lock(poolMutex);
        return totalAllocated;
    }
    size_t getBlockCount() const {
        std::lock_guard<std::mutex> lock(poolMutex);
        return blocks.size();
    }
};

// Global memory pool instance
CUDAMemoryPool g_memoryPool;
// Fixed-size pool of CUDA streams (plus one event each) for overlapping
// chunked H2D copy / kernel / D2H copy work.
class CUDAStreamPipeline {
private:
    std::vector<cudaStream_t> streams;
    std::vector<cudaEvent_t> events;   // one completion event per stream
    int numStreams;                    // always >= 1

public:
    // Creates `streamCount` streams and events. The count is clamped to at
    // least 1 so the chunking division in processDataPipeline cannot divide
    // by zero.
    explicit CUDAStreamPipeline(int streamCount = 4)
        : numStreams(streamCount < 1 ? 1 : streamCount) {
        streams.resize(numStreams);
        events.resize(numStreams);

        for (int i = 0; i < numStreams; ++i) {
            cudaStreamCreate(&streams[i]);
            cudaEventCreate(&events[i]);
        }
    }

    // Owns raw stream/event handles; copying would destroy them twice.
    CUDAStreamPipeline(const CUDAStreamPipeline&) = delete;
    CUDAStreamPipeline& operator=(const CUDAStreamPipeline&) = delete;

    ~CUDAStreamPipeline() {
        for (int i = 0; i < numStreams; ++i) {
            cudaStreamDestroy(streams[i]);
            cudaEventDestroy(events[i]);
        }
    }

    // Splits dataSize elements into one contiguous chunk per stream; on each
    // stream queues H2D copy -> kernel -> D2H copy, then blocks until every
    // stream's completion event fires. The last chunk absorbs any remainder.
    // `kernel` must enqueue its work onto the stream it receives.
    // NOTE(review): real copy/compute overlap requires hostData/resultData
    // to be pinned (cudaMallocHost); with pageable host memory these async
    // copies largely serialize — confirm how callers allocate.
    template<typename T>
    void processDataPipeline(T* hostData, T* deviceData, size_t dataSize,
                             void(*kernel)(T*, size_t, cudaStream_t),
                             T* resultData) {

        const size_t chunkSize = dataSize / numStreams;

        for (int i = 0; i < numStreams; ++i) {
            size_t offset = i * chunkSize;
            size_t currentChunkSize = (i == numStreams - 1) ?
                                      dataSize - offset : chunkSize;

            // Host -> device, asynchronous on this chunk's stream.
            cudaMemcpyAsync(deviceData + offset, hostData + offset,
                            currentChunkSize * sizeof(T),
                            cudaMemcpyHostToDevice, streams[i]);

            // Kernel runs after the copy: same-stream ordering guarantees it.
            kernel(deviceData + offset, currentChunkSize, streams[i]);

            // Device -> host, asynchronous, ordered after the kernel.
            cudaMemcpyAsync(resultData + offset, deviceData + offset,
                            currentChunkSize * sizeof(T),
                            cudaMemcpyDeviceToHost, streams[i]);

            // Event marks completion of this stream's copy/compute chain.
            cudaEventRecord(events[i], streams[i]);
        }

        // Wait for all streams to complete.
        for (int i = 0; i < numStreams; ++i) {
            cudaEventSynchronize(events[i]);
        }
    }

    // Blocks until every stream in the pipeline has drained.
    void synchronizeAll() {
        for (int i = 0; i < numStreams; ++i) {
            cudaStreamSynchronize(streams[i]);
        }
    }

    // Round-robin access to the pipeline's streams.
    cudaStream_t getStream(int index) {
        return streams[index % numStreams];
    }
};
71
// Example: Parallel signal processing.
//
// Grid-stride kernel: applies a one-pole smoothing filter and then a Hamming
// window to `data` in place.
// NOTE(review): data[i] reads data[i-1], which a neighboring thread may have
// already overwritten — the filter output is racy and nondeterministic. A
// correct version needs separate input/output buffers; confirm intent.
// The `stream` parameter is unused inside the kernel (streams are a
// host-side launch concept); it is kept only for signature compatibility
// with existing callers.
__global__ void signalProcessingKernel(float* data, size_t size,
                                       float filter_coef, cudaStream_t stream) {
    // size_t arithmetic: avoids signed/unsigned comparison and index
    // overflow for signals longer than 2^31 samples.
    size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = (size_t)blockDim.x * gridDim.x;

    for (size_t i = idx; i < size; i += stride) {
        // One-pole IIR smoothing (see the race note above).
        if (i > 0) {
            data[i] = data[i] * filter_coef + data[i - 1] * (1.0f - filter_coef);
        }

        // Hamming window: 0.54 - 0.46 * cos(2*pi*i/size).
        float window = 0.54f - 0.46f * cosf(2.0f * (float)M_PI * (float)i / (float)size);
        data[i] *= window;
    }
}
89
// Runs signalProcessingKernel over `hostSignal` using an 8-stream pipeline.
// NOTE(review): the processed samples land in a local buffer that is then
// discarded — the caller never sees the result. This mirrors the original
// behavior, but is probably not the intent; confirm whether results should
// be copied back into hostSignal or returned.
void processSignalData(float* hostSignal, size_t signalLength) {
    CUDAStreamPipeline pipeline(8); // 8 concurrent streams

    float* deviceSignal = nullptr;
    if (cudaMalloc(&deviceSignal, signalLength * sizeof(float)) != cudaSuccess) {
        return;   // device allocation failed; nothing to process
    }

    // std::vector instead of raw new[]: no leak if anything below throws.
    std::vector<float> processedSignal(signalLength);

    // Captureless lambda decays to the function pointer the pipeline expects.
    auto kernelWrapper = [](float* data, size_t size, cudaStream_t stream) {
        int blockSize = 256;
        int gridSize = (int)((size + blockSize - 1) / blockSize);
        signalProcessingKernel<<<gridSize, blockSize, 0, stream>>>(
            data, size, 0.8f, stream
        );
    };

    pipeline.processDataPipeline(hostSignal, deviceSignal, signalLength,
                                 kernelWrapper, processedSignal.data());

    cudaFree(deviceSignal);
}
1#include <cooperative_groups.h>
2namespace cg = cooperative_groups;
3
// Advanced reduction using cooperative groups.
//
// Block-level sum of data[0..size), returned to every thread in the block.
// Launch with dynamic shared memory of blockDim.x * sizeof(T).
// Precondition: block.size() is a power of two (the halving loop assumes it).
template<typename T>
__device__ T cooperativeReduce(cg::thread_block block, T* data, int size) {
    // Untyped extern shared buffer: a typed `extern __shared__ T sdata[]`
    // inside a template yields conflicting definitions when instantiated
    // with more than one T, so cast from raw bytes instead.
    extern __shared__ unsigned char sdataRaw[];
    T* sdata = reinterpret_cast<T*>(sdataRaw);

    int tid = threadIdx.x;
    int blockSize = block.size();

    // Each thread accumulates a strided slice of the input.
    T sum = 0;
    for (int i = tid; i < size; i += blockSize) {
        sum += data[i];
    }
    sdata[tid] = sum;

    cg::sync(block);

    // Tree reduction in shared memory down to 64 partial sums.
    for (int s = blockSize / 2; s > 32; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        cg::sync(block);
    }

    // Final warp. Bug fix: the loop above stops before the s == 32 step, so
    // the upper half (sdata[32..63]) must be folded in here — the original
    // skipped this and silently dropped those partials for blockSize >= 64.
    if (tid < 32) {
        cg::coalesced_group active = cg::coalesced_threads();
        T warpSum = sdata[tid];
        if (blockSize > 32) {
            warpSum += sdata[tid + 32];
        }

        for (int offset = active.size() / 2; offset > 0; offset /= 2) {
            warpSum += active.shfl_down(warpSum, offset);
        }

        if (active.thread_rank() == 0) {
            sdata[0] = warpSum;
        }
    }

    cg::sync(block);
    return sdata[0];
}
47
// Tiled matrix transpose: output (cols x rows) = transpose of input
// (rows x cols), both row-major. Launch with blockDim = (32, 32); shared
// memory staging keeps both the load and the store phases coalesced.
__global__ void cooperativeMatrixTranspose(float* input, float* output,
                                           int rows, int cols) {
    cg::thread_block block = cg::this_thread_block();

    const int TILE_SIZE = 32;
    __shared__ float tile[TILE_SIZE][TILE_SIZE + 1]; // +1 to avoid bank conflicts

    // Source element for the load phase.
    int in_row = blockIdx.y * TILE_SIZE + threadIdx.y;
    int in_col = blockIdx.x * TILE_SIZE + threadIdx.x;
    // Destination element for the store phase; block coordinates are swapped
    // so consecutive threads still write consecutive addresses.
    int out_row = blockIdx.x * TILE_SIZE + threadIdx.y;
    int out_col = blockIdx.y * TILE_SIZE + threadIdx.x;

    // Load tile from the input matrix (zero-pad outside the matrix).
    if (in_row < rows && in_col < cols) {
        tile[threadIdx.y][threadIdx.x] = input[in_row * cols + in_col];
    } else {
        tile[threadIdx.y][threadIdx.x] = 0.0f;
    }

    cg::sync(block);

    // Store the transposed tile; indices are swapped inside shared memory.
    if (out_row < cols && out_col < rows) {
        output[out_row * rows + out_col] = tile[threadIdx.x][threadIdx.y];
    }

    // Bug fix: the original ended with cg::sync(cg::this_grid()). A grid
    // barrier is undefined behavior when this kernel is launched with
    // <<<...>>> (the non-cooperative fallback path in
    // launchCooperativeTranspose), and it was a no-op here anyway since no
    // code followed it — so it has been removed.
}
86
// Launch cooperative kernel.
//
// Launches cooperativeMatrixTranspose, preferring a cooperative launch when
// the device supports it and falling back to a regular launch otherwise.
// NOTE(review): cudaLaunchCooperativeKernel requires every block to be
// co-resident on the device; for large grids it fails with
// cudaErrorCooperativeLaunchTooLarge — the error check below surfaces that
// instead of failing silently as the original did.
void launchCooperativeTranspose(float* d_input, float* d_output,
                                int rows, int cols) {
    const int TILE_SIZE = 32;
    dim3 blockSize(TILE_SIZE, TILE_SIZE);
    dim3 gridSize((cols + TILE_SIZE - 1) / TILE_SIZE,
                  (rows + TILE_SIZE - 1) / TILE_SIZE);

    // Check if device supports cooperative launch.
    int device = 0;
    cudaGetDevice(&device);
    int supportsCoopLaunch = 0;
    cudaDeviceGetAttribute(&supportsCoopLaunch,
                           cudaDevAttrCooperativeLaunch, device);

    cudaError_t err = cudaSuccess;
    if (supportsCoopLaunch) {
        void* kernelArgs[] = { &d_input, &d_output, &rows, &cols };
        err = cudaLaunchCooperativeKernel((void*)cooperativeMatrixTranspose,
                                          gridSize, blockSize, kernelArgs, 0, 0);
    } else {
        // Fallback to a regular kernel launch.
        cooperativeMatrixTranspose<<<gridSize, blockSize>>>(
            d_input, d_output, rows, cols
        );
        err = cudaGetLastError();   // launches do not return errors directly
    }

    if (err != cudaSuccess) {
        std::cerr << "transpose launch failed: "
                  << cudaGetErrorString(err) << std::endl;
    }
}
// Named GPU timers built on CUDA events; measures device-side elapsed time
// between startTimer/stopTimer pairs and reports bandwidth.
class CUDAPerformanceProfiler {
private:
    std::map<std::string, cudaEvent_t> startEvents;
    std::map<std::string, cudaEvent_t> stopEvents;
    std::map<std::string, float> timings;   // last elapsed time per timer, ms

public:
    CUDAPerformanceProfiler() {}

    // Owns raw event handles; copying would destroy them twice.
    CUDAPerformanceProfiler(const CUDAPerformanceProfiler&) = delete;
    CUDAPerformanceProfiler& operator=(const CUDAPerformanceProfiler&) = delete;

    ~CUDAPerformanceProfiler() {
        for (auto& pair : startEvents) {
            cudaEventDestroy(pair.second);
        }
        for (auto& pair : stopEvents) {
            cudaEventDestroy(pair.second);
        }
    }

    // Records the start event for `name` on `stream`, lazily creating the
    // event pair on first use.
    void startTimer(const std::string& name, cudaStream_t stream = 0) {
        if (startEvents.find(name) == startEvents.end()) {
            cudaEventCreate(&startEvents[name]);
            cudaEventCreate(&stopEvents[name]);
        }
        cudaEventRecord(startEvents[name], stream);
    }

    // Records the stop event, blocks the host until it completes, and caches
    // the elapsed GPU time. Must be paired with a prior startTimer(name).
    void stopTimer(const std::string& name, cudaStream_t stream = 0) {
        cudaEventRecord(stopEvents[name], stream);
        cudaEventSynchronize(stopEvents[name]);

        float milliseconds = 0;
        cudaEventElapsedTime(&milliseconds, startEvents[name], stopEvents[name]);
        timings[name] = milliseconds;
    }

    // Last recorded time for `name` in ms, or 0 for an unknown timer.
    // Read-only lookup: the original's operator[] silently inserted a
    // zero entry for every unknown name queried.
    float getElapsedTime(const std::string& name) {
        auto it = timings.find(name);
        return (it != timings.end()) ? it->second : 0.0f;
    }

    void printTimings() {
        std::cout << "CUDA Performance Report:" << std::endl;
        std::cout << "========================" << std::endl;
        for (const auto& pair : timings) {
            std::cout << pair.first << ": " << pair.second << " ms" << std::endl;
        }
    }

    // Effective bandwidth in GiB/s for `bytes` moved during `timerName`.
    // Returns 0 when the timer is missing or recorded a zero duration
    // (the original divided by zero in that case).
    double calculateBandwidth(const std::string& timerName, size_t bytes) {
        float timeMs = getElapsedTime(timerName);
        if (timeMs <= 0.0f) {
            return 0.0;
        }
        double timeS = timeMs / 1000.0;
        return (bytes / (1024.0 * 1024.0 * 1024.0)) / timeS;
    }
};
56
// Usage example with comprehensive optimization.
//
// Computes c[i] = a[i] + b[i] on the device, timing allocation, transfers
// and the kernel with CUDAPerformanceProfiler, then prints timings and the
// kernel's effective bandwidth.
// NOTE(review): vectorAddKernel is not defined in this file — a __global__
// kernel with a (const float*, const float*, float*, int)-compatible
// signature must be declared elsewhere; confirm.
void optimizedVectorAddition(float* a, float* b, float* c, int n) {
    if (n <= 0) return;   // avoid a zero-block launch and pointless copies

    CUDAPerformanceProfiler profiler;
    const size_t bytes = static_cast<size_t>(n) * sizeof(float);

    // Memory allocation timing (short-circuit on first failure; later
    // pointers stay nullptr and cudaFree(nullptr) is a safe no-op).
    profiler.startTimer("memory_allocation");
    float *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
    bool ok = cudaMalloc(&d_a, bytes) == cudaSuccess &&
              cudaMalloc(&d_b, bytes) == cudaSuccess &&
              cudaMalloc(&d_c, bytes) == cudaSuccess;
    profiler.stopTimer("memory_allocation");
    if (!ok) {
        std::cout << "Device allocation failed" << std::endl;
        cudaFree(d_a);
        cudaFree(d_b);
        cudaFree(d_c);
        return;
    }

    // Host -> device transfers (synchronous: pageable host memory).
    profiler.startTimer("memory_transfer_h2d");
    cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice);
    profiler.stopTimer("memory_transfer_h2d");

    // Kernel execution timing.
    profiler.startTimer("kernel_execution");
    int threadsPerBlock = 256;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
    vectorAddKernel<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
    cudaError_t launchErr = cudaGetLastError();   // launches don't return errors
    if (launchErr != cudaSuccess) {
        std::cout << "Kernel launch failed: "
                  << cudaGetErrorString(launchErr) << std::endl;
    }
    profiler.stopTimer("kernel_execution");

    // Device -> host transfer timing.
    profiler.startTimer("memory_transfer_d2h");
    cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);
    profiler.stopTimer("memory_transfer_d2h");

    // Calculate and report performance metrics.
    profiler.printTimings();

    size_t totalBytes = 3 * bytes; // Read a, b; Write c
    double bandwidth = profiler.calculateBandwidth("kernel_execution", totalBytes);
    std::cout << "Effective Bandwidth: " << bandwidth << " GB/s" << std::endl;

    // Cleanup
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
}
This advanced CUDA programming guide provides sophisticated techniques for high-performance GPU computing. The patterns and optimizations shown here enable efficient utilization of modern GPU architectures for compute-intensive applications.
For foundational CUDA concepts, refer to our basic CUDA programming tutorial and explore related parallel computing techniques.