Simple CUDA Program Example: GPU Programming Fundamentals with Parallel Computing
This blog post is an introduction to the world of GPU programming with CUDA. We will cover the fundamental concepts and walk through simple, complete example programs before moving on to more advanced material.
Building upon our foundational CUDA programming concepts, this advanced guide explores sophisticated GPU computing techniques essential for high-performance applications. We’ll cover memory optimization patterns, advanced parallel algorithms, and production-ready CUDA development practices.
#include <cuda_runtime.h>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <map>
#include <memory>
#include <mutex>
#include <stdexcept>
#include <string>
#include <vector>

// RAII wrapper around a single CUDA unified-memory allocation plus a private
// stream used for prefetch operations.
class UnifiedMemoryManager {
private:
    size_t totalSize;      // bytes of managed memory owned by this object
    void* managedPtr;      // unified-memory allocation (host- and device-visible)
    cudaStream_t stream;   // stream on which prefetches are issued

public:
    // Allocates `size` bytes of unified memory and creates the internal
    // stream, then applies advice hints: preferred location device 0, with
    // the CPU registered as an accessor.
    // Throws std::runtime_error when the allocation fails.
    explicit UnifiedMemoryManager(size_t size)
        : totalSize(size), managedPtr(nullptr), stream(nullptr) {
        cudaError_t err = cudaMallocManaged(&managedPtr, totalSize);
        if (err != cudaSuccess) {
            throw std::runtime_error(std::string("cudaMallocManaged failed: ") +
                                     cudaGetErrorString(err));
        }
        cudaStreamCreate(&stream);

        // Advice is a performance hint only; failures (e.g. on devices that
        // lack concurrent managed access) are deliberately non-fatal.
        cudaMemAdvise(managedPtr, totalSize, cudaMemAdviseSetPreferredLocation, 0);
        cudaMemAdvise(managedPtr, totalSize, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId);
    }

    // Owns raw CUDA resources: copying would free them twice.
    UnifiedMemoryManager(const UnifiedMemoryManager&) = delete;
    UnifiedMemoryManager& operator=(const UnifiedMemoryManager&) = delete;

    ~UnifiedMemoryManager() {
        // Drain in-flight prefetches before releasing the memory they touch.
        if (stream) cudaStreamSynchronize(stream);
        if (managedPtr) cudaFree(managedPtr);
        if (stream) cudaStreamDestroy(stream);
    }

    // Typed view of the managed allocation; must not outlive this object.
    template<typename T>
    T* getPointer() {
        return static_cast<T*>(managedPtr);
    }

    // Asynchronously migrate the pages to `device` on the internal stream.
    void prefetchToGPU(int device = 0) {
        cudaMemPrefetchAsync(managedPtr, totalSize, device, stream);
    }

    // Asynchronously migrate the pages back to host memory.
    void prefetchToCPU() {
        cudaMemPrefetchAsync(managedPtr, totalSize, cudaCpuDeviceId, stream);
    }

    // Block until every prefetch issued on the internal stream has finished.
    void synchronize() {
        cudaStreamSynchronize(stream);
    }
};
46
// Advanced matrix multiplication with memory optimization.
//
// Computes C = A * B where A is N x K, B is K x M, and C is N x M, all
// row-major. Each thread block produces one TILE_SIZE x TILE_SIZE tile of C
// using shared-memory staging of A and B tiles.
// Precondition: launched with blockDim = (TILE_SIZE, TILE_SIZE) and a grid
// of ceil(M/TILE_SIZE) x ceil(N/TILE_SIZE) blocks.
template<typename T>
class AdvancedMatrixMultiply {
private:
    // Tile edge length; must equal both block dimensions at launch.
    // (An unused BLOCK_SIZE constant from the original was removed.)
    static const int TILE_SIZE = 16;

public:
    __device__ static void tiledMatMul(const T* A, const T* B, T* C,
                                       int N, int M, int K) {
        // Per-block staging tiles for A and B.
        __shared__ T As[TILE_SIZE][TILE_SIZE];
        __shared__ T Bs[TILE_SIZE][TILE_SIZE];

        int bx = blockIdx.x, by = blockIdx.y;
        int tx = threadIdx.x, ty = threadIdx.y;

        // Output element this thread is responsible for.
        int row = by * TILE_SIZE + ty;
        int col = bx * TILE_SIZE + tx;

        T sum = 0;

        // March a pair of tiles across the K dimension.
        for (int t = 0; t < (K + TILE_SIZE - 1) / TILE_SIZE; ++t) {
            // Stage one tile of A, zero-padding past the matrix edge so the
            // inner product below needs no bounds checks.
            if (row < N && (t * TILE_SIZE + tx) < K) {
                As[ty][tx] = A[row * K + t * TILE_SIZE + tx];
            } else {
                As[ty][tx] = 0;
            }

            // Stage one tile of B, likewise zero-padded.
            if (col < M && (t * TILE_SIZE + ty) < K) {
                Bs[ty][tx] = B[(t * TILE_SIZE + ty) * M + col];
            } else {
                Bs[ty][tx] = 0;
            }

            // Every thread reaches this barrier: the staging branches above
            // do not skip it.
            __syncthreads();

            // Accumulate the partial dot product from this tile pair.
            for (int k = 0; k < TILE_SIZE; ++k) {
                sum += As[ty][k] * Bs[k][tx];
            }

            // Barrier before the next iteration overwrites the tiles.
            __syncthreads();
        }

        // Guarded write: edge blocks can cover out-of-range elements.
        if (row < N && col < M) {
            C[row * M + col] = sum;
        }
    }
};
100
// Thin launch wrapper: one thread per element of C, computed via the tiled
// shared-memory path. Expects blockDim = (16, 16); see
// AdvancedMatrixMultiply::tiledMatMul for the full contract.
__global__ void optimizedMatMulKernel(const float* A, const float* B, float* C,
                                      int N, int M, int K) {
    AdvancedMatrixMultiply<float>::tiledMatMul(A, B, C, N, M, K);
}
// Thread-safe first-fit pool of device allocations. Blocks are reused when a
// free one is at least as large as the request; the pool grows up to a byte
// cap and can release idle blocks via cleanup().
class CUDAMemoryPool {
private:
    // One device allocation tracked by the pool.
    struct MemoryBlock {
        void* ptr = nullptr;   // initialized so a failed cudaMalloc cannot
                               // leave garbage for the destructor to free
        size_t size;
        bool inUse;
        cudaStream_t stream;

        explicit MemoryBlock(size_t sz) : size(sz), inUse(false), stream(nullptr) {
            cudaMalloc(&ptr, size);   // on failure ptr stays nullptr
        }

        ~MemoryBlock() {
            if (ptr) cudaFree(ptr);
        }

        // Owns a raw device pointer; copying would double-free.
        MemoryBlock(const MemoryBlock&) = delete;
        MemoryBlock& operator=(const MemoryBlock&) = delete;
    };

    std::vector<std::unique_ptr<MemoryBlock>> blocks;
    mutable std::mutex poolMutex;   // mutable so const getters can lock too
    size_t totalAllocated = 0;      // bytes currently held by live blocks
    size_t maxPoolSize;             // hard cap on pool growth

public:
    explicit CUDAMemoryPool(size_t maxSize = 1024 * 1024 * 1024) : maxPoolSize(maxSize) {} // 1GB default

    // Returns a device pointer of at least `size` bytes: first-fit reuse of a
    // free block, else a fresh allocation while under the cap.
    // Returns nullptr when the cap would be exceeded or cudaMalloc fails.
    void* allocate(size_t size, cudaStream_t stream = nullptr) {
        std::lock_guard<std::mutex> lock(poolMutex);

        // Reuse: first free block that is large enough.
        for (auto& block : blocks) {
            if (!block->inUse && block->size >= size) {
                block->inUse = true;
                block->stream = stream;
                return block->ptr;
            }
        }

        // Grow: create a new block if it fits under the cap.
        if (totalAllocated + size <= maxPoolSize) {
            auto newBlock = std::make_unique<MemoryBlock>(size);
            if (!newBlock->ptr) {
                return nullptr;   // device allocation failed
            }
            void* ptr = newBlock->ptr;
            newBlock->inUse = true;
            newBlock->stream = stream;
            totalAllocated += size;
            blocks.push_back(std::move(newBlock));
            return ptr;
        }

        return nullptr; // Pool exhausted
    }

    // Marks the block owning `ptr` as free again. When a stream is supplied,
    // waits for it first so in-flight work on the block completes before the
    // memory can be handed out again.
    void deallocate(void* ptr, cudaStream_t stream = nullptr) {
        std::lock_guard<std::mutex> lock(poolMutex);

        for (auto& block : blocks) {
            if (block->ptr == ptr) {
                if (stream) {
                    cudaStreamSynchronize(stream);
                }
                block->inUse = false;
                block->stream = nullptr;
                break;
            }
        }
    }

    // Frees every currently-unused block back to the driver.
    void cleanup() {
        std::lock_guard<std::mutex> lock(poolMutex);

        auto firstDead = std::remove_if(blocks.begin(), blocks.end(),
            [](const std::unique_ptr<MemoryBlock>& block) {
                return !block->inUse;
            });

        // Bug fix: keep the byte accounting in sync with the blocks being
        // dropped; the original never decremented totalAllocated, so the
        // pool eventually reported itself exhausted forever.
        for (auto it = firstDead; it != blocks.end(); ++it) {
            totalAllocated -= (*it)->size;
        }
        blocks.erase(firstDead, blocks.end());
    }

    size_t getTotalAllocated() const {
        std::lock_guard<std::mutex> lock(poolMutex);
        return totalAllocated;
    }
    size_t getBlockCount() const {
        std::lock_guard<std::mutex> lock(poolMutex);
        return blocks.size();
    }
};

// Global memory pool instance
CUDAMemoryPool g_memoryPool;
// Fixed-size pool of CUDA streams (plus one event each) for overlapping
// chunked H2D copy / kernel / D2H copy work.
class CUDAStreamPipeline {
private:
    std::vector<cudaStream_t> streams;
    std::vector<cudaEvent_t> events;   // one completion event per stream
    int numStreams;                    // always >= 1

public:
    // Creates `streamCount` streams and events. The count is clamped to at
    // least 1 so the chunking division in processDataPipeline cannot divide
    // by zero.
    explicit CUDAStreamPipeline(int streamCount = 4)
        : numStreams(streamCount < 1 ? 1 : streamCount) {
        streams.resize(numStreams);
        events.resize(numStreams);

        for (int i = 0; i < numStreams; ++i) {
            cudaStreamCreate(&streams[i]);
            cudaEventCreate(&events[i]);
        }
    }

    // Owns raw stream/event handles; copying would destroy them twice.
    CUDAStreamPipeline(const CUDAStreamPipeline&) = delete;
    CUDAStreamPipeline& operator=(const CUDAStreamPipeline&) = delete;

    ~CUDAStreamPipeline() {
        for (int i = 0; i < numStreams; ++i) {
            cudaStreamDestroy(streams[i]);
            cudaEventDestroy(events[i]);
        }
    }

    // Splits dataSize elements into one contiguous chunk per stream; on each
    // stream queues H2D copy -> kernel -> D2H copy, then blocks until every
    // stream's completion event fires. The last chunk absorbs any remainder.
    // `kernel` must enqueue its work onto the stream it receives.
    // NOTE(review): real copy/compute overlap requires hostData/resultData
    // to be pinned (cudaMallocHost); with pageable host memory these async
    // copies largely serialize — confirm how callers allocate.
    template<typename T>
    void processDataPipeline(T* hostData, T* deviceData, size_t dataSize,
                             void(*kernel)(T*, size_t, cudaStream_t),
                             T* resultData) {

        const size_t chunkSize = dataSize / numStreams;

        for (int i = 0; i < numStreams; ++i) {
            size_t offset = i * chunkSize;
            size_t currentChunkSize = (i == numStreams - 1) ?
                                      dataSize - offset : chunkSize;

            // Host -> device, asynchronous on this chunk's stream.
            cudaMemcpyAsync(deviceData + offset, hostData + offset,
                            currentChunkSize * sizeof(T),
                            cudaMemcpyHostToDevice, streams[i]);

            // Kernel runs after the copy: same-stream ordering guarantees it.
            kernel(deviceData + offset, currentChunkSize, streams[i]);

            // Device -> host, asynchronous, ordered after the kernel.
            cudaMemcpyAsync(resultData + offset, deviceData + offset,
                            currentChunkSize * sizeof(T),
                            cudaMemcpyDeviceToHost, streams[i]);

            // Event marks completion of this stream's copy/compute chain.
            cudaEventRecord(events[i], streams[i]);
        }

        // Wait for all streams to complete.
        for (int i = 0; i < numStreams; ++i) {
            cudaEventSynchronize(events[i]);
        }
    }

    // Blocks until every stream in the pipeline has drained.
    void synchronizeAll() {
        for (int i = 0; i < numStreams; ++i) {
            cudaStreamSynchronize(streams[i]);
        }
    }

    // Round-robin access to the pipeline's streams.
    cudaStream_t getStream(int index) {
        return streams[index % numStreams];
    }
};
71
// Example: Parallel signal processing.
//
// Grid-stride kernel: applies a one-pole smoothing filter and then a Hamming
// window to `data` in place.
// NOTE(review): data[i] reads data[i-1], which a neighboring thread may have
// already overwritten — the filter output is racy and nondeterministic. A
// correct version needs separate input/output buffers; confirm intent.
// The `stream` parameter is unused inside the kernel (streams are a
// host-side launch concept); it is kept only for signature compatibility
// with existing callers.
__global__ void signalProcessingKernel(float* data, size_t size,
                                       float filter_coef, cudaStream_t stream) {
    // size_t arithmetic: avoids signed/unsigned comparison and index
    // overflow for signals longer than 2^31 samples.
    size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = (size_t)blockDim.x * gridDim.x;

    for (size_t i = idx; i < size; i += stride) {
        // One-pole IIR smoothing (see the race note above).
        if (i > 0) {
            data[i] = data[i] * filter_coef + data[i - 1] * (1.0f - filter_coef);
        }

        // Hamming window: 0.54 - 0.46 * cos(2*pi*i/size).
        float window = 0.54f - 0.46f * cosf(2.0f * (float)M_PI * (float)i / (float)size);
        data[i] *= window;
    }
}
89
// Runs signalProcessingKernel over `hostSignal` using an 8-stream pipeline.
// NOTE(review): the processed samples land in a local buffer that is then
// discarded — the caller never sees the result. This mirrors the original
// behavior, but is probably not the intent; confirm whether results should
// be copied back into hostSignal or returned.
void processSignalData(float* hostSignal, size_t signalLength) {
    CUDAStreamPipeline pipeline(8); // 8 concurrent streams

    float* deviceSignal = nullptr;
    if (cudaMalloc(&deviceSignal, signalLength * sizeof(float)) != cudaSuccess) {
        return;   // device allocation failed; nothing to process
    }

    // std::vector instead of raw new[]: no leak if anything below throws.
    std::vector<float> processedSignal(signalLength);

    // Captureless lambda decays to the function pointer the pipeline expects.
    auto kernelWrapper = [](float* data, size_t size, cudaStream_t stream) {
        int blockSize = 256;
        int gridSize = (int)((size + blockSize - 1) / blockSize);
        signalProcessingKernel<<<gridSize, blockSize, 0, stream>>>(
            data, size, 0.8f, stream
        );
    };

    pipeline.processDataPipeline(hostSignal, deviceSignal, signalLength,
                                 kernelWrapper, processedSignal.data());

    cudaFree(deviceSignal);
}
1#include <cooperative_groups.h>
2namespace cg = cooperative_groups;
3
// Advanced reduction using cooperative groups.
//
// Block-level sum of data[0..size), returned to every thread in the block.
// Launch with dynamic shared memory of blockDim.x * sizeof(T).
// Precondition: block.size() is a power of two (the halving loop assumes it).
template<typename T>
__device__ T cooperativeReduce(cg::thread_block block, T* data, int size) {
    // Untyped extern shared buffer: a typed `extern __shared__ T sdata[]`
    // inside a template yields conflicting definitions when instantiated
    // with more than one T, so cast from raw bytes instead.
    extern __shared__ unsigned char sdataRaw[];
    T* sdata = reinterpret_cast<T*>(sdataRaw);

    int tid = threadIdx.x;
    int blockSize = block.size();

    // Each thread accumulates a strided slice of the input.
    T sum = 0;
    for (int i = tid; i < size; i += blockSize) {
        sum += data[i];
    }
    sdata[tid] = sum;

    cg::sync(block);

    // Tree reduction in shared memory down to 64 partial sums.
    for (int s = blockSize / 2; s > 32; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        cg::sync(block);
    }

    // Final warp. Bug fix: the loop above stops before the s == 32 step, so
    // the upper half (sdata[32..63]) must be folded in here — the original
    // skipped this and silently dropped those partials for blockSize >= 64.
    if (tid < 32) {
        cg::coalesced_group active = cg::coalesced_threads();
        T warpSum = sdata[tid];
        if (blockSize > 32) {
            warpSum += sdata[tid + 32];
        }

        for (int offset = active.size() / 2; offset > 0; offset /= 2) {
            warpSum += active.shfl_down(warpSum, offset);
        }

        if (active.thread_rank() == 0) {
            sdata[0] = warpSum;
        }
    }

    cg::sync(block);
    return sdata[0];
}
47
// Tiled matrix transpose: output (cols x rows) = transpose of input
// (rows x cols), both row-major. Launch with blockDim = (32, 32); shared
// memory staging keeps both the load and the store phases coalesced.
__global__ void cooperativeMatrixTranspose(float* input, float* output,
                                           int rows, int cols) {
    cg::thread_block block = cg::this_thread_block();

    const int TILE_SIZE = 32;
    __shared__ float tile[TILE_SIZE][TILE_SIZE + 1]; // +1 to avoid bank conflicts

    // Source element for the load phase.
    int in_row = blockIdx.y * TILE_SIZE + threadIdx.y;
    int in_col = blockIdx.x * TILE_SIZE + threadIdx.x;
    // Destination element for the store phase; block coordinates are swapped
    // so consecutive threads still write consecutive addresses.
    int out_row = blockIdx.x * TILE_SIZE + threadIdx.y;
    int out_col = blockIdx.y * TILE_SIZE + threadIdx.x;

    // Load tile from the input matrix (zero-pad outside the matrix).
    if (in_row < rows && in_col < cols) {
        tile[threadIdx.y][threadIdx.x] = input[in_row * cols + in_col];
    } else {
        tile[threadIdx.y][threadIdx.x] = 0.0f;
    }

    cg::sync(block);

    // Store the transposed tile; indices are swapped inside shared memory.
    if (out_row < cols && out_col < rows) {
        output[out_row * rows + out_col] = tile[threadIdx.x][threadIdx.y];
    }

    // Bug fix: the original ended with cg::sync(cg::this_grid()). A grid
    // barrier is undefined behavior when this kernel is launched with
    // <<<...>>> (the non-cooperative fallback path in
    // launchCooperativeTranspose), and it was a no-op here anyway since no
    // code followed it — so it has been removed.
}
86
// Launch cooperative kernel.
//
// Launches cooperativeMatrixTranspose, preferring a cooperative launch when
// the device supports it and falling back to a regular launch otherwise.
// NOTE(review): cudaLaunchCooperativeKernel requires every block to be
// co-resident on the device; for large grids it fails with
// cudaErrorCooperativeLaunchTooLarge — the error check below surfaces that
// instead of failing silently as the original did.
void launchCooperativeTranspose(float* d_input, float* d_output,
                                int rows, int cols) {
    const int TILE_SIZE = 32;
    dim3 blockSize(TILE_SIZE, TILE_SIZE);
    dim3 gridSize((cols + TILE_SIZE - 1) / TILE_SIZE,
                  (rows + TILE_SIZE - 1) / TILE_SIZE);

    // Check if device supports cooperative launch.
    int device = 0;
    cudaGetDevice(&device);
    int supportsCoopLaunch = 0;
    cudaDeviceGetAttribute(&supportsCoopLaunch,
                           cudaDevAttrCooperativeLaunch, device);

    cudaError_t err = cudaSuccess;
    if (supportsCoopLaunch) {
        void* kernelArgs[] = { &d_input, &d_output, &rows, &cols };
        err = cudaLaunchCooperativeKernel((void*)cooperativeMatrixTranspose,
                                          gridSize, blockSize, kernelArgs, 0, 0);
    } else {
        // Fallback to a regular kernel launch.
        cooperativeMatrixTranspose<<<gridSize, blockSize>>>(
            d_input, d_output, rows, cols
        );
        err = cudaGetLastError();   // launches do not return errors directly
    }

    if (err != cudaSuccess) {
        std::cerr << "transpose launch failed: "
                  << cudaGetErrorString(err) << std::endl;
    }
}
// Named GPU timers built on CUDA events; measures device-side elapsed time
// between startTimer/stopTimer pairs and reports bandwidth.
class CUDAPerformanceProfiler {
private:
    std::map<std::string, cudaEvent_t> startEvents;
    std::map<std::string, cudaEvent_t> stopEvents;
    std::map<std::string, float> timings;   // last elapsed time per timer, ms

public:
    CUDAPerformanceProfiler() {}

    // Owns raw event handles; copying would destroy them twice.
    CUDAPerformanceProfiler(const CUDAPerformanceProfiler&) = delete;
    CUDAPerformanceProfiler& operator=(const CUDAPerformanceProfiler&) = delete;

    ~CUDAPerformanceProfiler() {
        for (auto& pair : startEvents) {
            cudaEventDestroy(pair.second);
        }
        for (auto& pair : stopEvents) {
            cudaEventDestroy(pair.second);
        }
    }

    // Records the start event for `name` on `stream`, lazily creating the
    // event pair on first use.
    void startTimer(const std::string& name, cudaStream_t stream = 0) {
        if (startEvents.find(name) == startEvents.end()) {
            cudaEventCreate(&startEvents[name]);
            cudaEventCreate(&stopEvents[name]);
        }
        cudaEventRecord(startEvents[name], stream);
    }

    // Records the stop event, blocks the host until it completes, and caches
    // the elapsed GPU time. Must be paired with a prior startTimer(name).
    void stopTimer(const std::string& name, cudaStream_t stream = 0) {
        cudaEventRecord(stopEvents[name], stream);
        cudaEventSynchronize(stopEvents[name]);

        float milliseconds = 0;
        cudaEventElapsedTime(&milliseconds, startEvents[name], stopEvents[name]);
        timings[name] = milliseconds;
    }

    // Last recorded time for `name` in ms, or 0 for an unknown timer.
    // Read-only lookup: the original's operator[] silently inserted a
    // zero entry for every unknown name queried.
    float getElapsedTime(const std::string& name) {
        auto it = timings.find(name);
        return (it != timings.end()) ? it->second : 0.0f;
    }

    void printTimings() {
        std::cout << "CUDA Performance Report:" << std::endl;
        std::cout << "========================" << std::endl;
        for (const auto& pair : timings) {
            std::cout << pair.first << ": " << pair.second << " ms" << std::endl;
        }
    }

    // Effective bandwidth in GiB/s for `bytes` moved during `timerName`.
    // Returns 0 when the timer is missing or recorded a zero duration
    // (the original divided by zero in that case).
    double calculateBandwidth(const std::string& timerName, size_t bytes) {
        float timeMs = getElapsedTime(timerName);
        if (timeMs <= 0.0f) {
            return 0.0;
        }
        double timeS = timeMs / 1000.0;
        return (bytes / (1024.0 * 1024.0 * 1024.0)) / timeS;
    }
};
56
// Usage example with comprehensive optimization.
//
// Computes c[i] = a[i] + b[i] on the device, timing allocation, transfers
// and the kernel with CUDAPerformanceProfiler, then prints timings and the
// kernel's effective bandwidth.
// NOTE(review): vectorAddKernel is not defined in this file — a __global__
// kernel with a (const float*, const float*, float*, int)-compatible
// signature must be declared elsewhere; confirm.
void optimizedVectorAddition(float* a, float* b, float* c, int n) {
    if (n <= 0) return;   // avoid a zero-block launch and pointless copies

    CUDAPerformanceProfiler profiler;
    const size_t bytes = static_cast<size_t>(n) * sizeof(float);

    // Memory allocation timing (short-circuit on first failure; later
    // pointers stay nullptr and cudaFree(nullptr) is a safe no-op).
    profiler.startTimer("memory_allocation");
    float *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
    bool ok = cudaMalloc(&d_a, bytes) == cudaSuccess &&
              cudaMalloc(&d_b, bytes) == cudaSuccess &&
              cudaMalloc(&d_c, bytes) == cudaSuccess;
    profiler.stopTimer("memory_allocation");
    if (!ok) {
        std::cout << "Device allocation failed" << std::endl;
        cudaFree(d_a);
        cudaFree(d_b);
        cudaFree(d_c);
        return;
    }

    // Host -> device transfers (synchronous: pageable host memory).
    profiler.startTimer("memory_transfer_h2d");
    cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice);
    profiler.stopTimer("memory_transfer_h2d");

    // Kernel execution timing.
    profiler.startTimer("kernel_execution");
    int threadsPerBlock = 256;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
    vectorAddKernel<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
    cudaError_t launchErr = cudaGetLastError();   // launches don't return errors
    if (launchErr != cudaSuccess) {
        std::cout << "Kernel launch failed: "
                  << cudaGetErrorString(launchErr) << std::endl;
    }
    profiler.stopTimer("kernel_execution");

    // Device -> host transfer timing.
    profiler.startTimer("memory_transfer_d2h");
    cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);
    profiler.stopTimer("memory_transfer_d2h");

    // Calculate and report performance metrics.
    profiler.printTimings();

    size_t totalBytes = 3 * bytes; // Read a, b; Write c
    double bandwidth = profiler.calculateBandwidth("kernel_execution", totalBytes);
    std::cout << "Effective Bandwidth: " << bandwidth << " GB/s" << std::endl;

    // Cleanup
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
}
This advanced CUDA programming guide provides sophisticated techniques for high-performance GPU computing. The patterns and optimizations shown here enable efficient utilization of modern GPU architectures for compute-intensive applications.
For foundational CUDA concepts, refer to our basic CUDA programming tutorial and explore related parallel computing techniques.