多 stream 用于 overlap data transfer(用多个流重叠数据传输与计算)

并发、隐藏延时:要实现数据传输与其他操作的重叠,需要使用 CUDA 流。

CUDA 中的流是一系列按主机代码发出的顺序在设备上执行的运算。虽然流内的运算保证按预定顺序执行,但不同流中的运算可以交错执行,并且在可能的情况下,它们甚至可以并行运行。

所有 CUDA 中在 device 中的操作(内核和数据传输)都在流中运行。当未指定流时,使用默认流(也称为“空流”)。默认流与其他流不同,因为它是一个与设备操作同步的流。

  // Allocate pinned host memory and device memory, then create the timing
  // events and one stream per chunk.
  // NOTE(review): cudaMallocHost is NOT deprecated; cudaHostAlloc is simply
  // more flexible (it accepts extra flags). Pinned host memory is required
  // for cudaMemcpyAsync to be truly asynchronous and overlap with kernels.
  float *a, *d_a;
  checkCuda( cudaMallocHost((void**)&a, bytes) );  // pinned host buffer
  checkCuda( cudaMalloc((void**)&d_a, bytes) );    // device buffer

  // create events (timing) and streams — every call checked, consistent
  // with the rest of the file
  cudaEvent_t startEvent, stopEvent, dummyEvent;
  cudaStream_t stream[nStreams];
  checkCuda( cudaEventCreate(&startEvent) );
  checkCuda( cudaEventCreate(&stopEvent) );
  checkCuda( cudaEventCreate(&dummyEvent) );
  for (int i = 0; i < nStreams; ++i)
    checkCuda( cudaStreamCreate(&stream[i]) );

在默认流中:

  // baseline case - sequential transfer and execute: everything is issued
  // to the default stream, so the H2D copy, the kernel, and the D2H copy
  // serialize — no overlap. This is the reference timing.
  memset(a, 0, bytes);
  checkCuda( cudaEventRecord(startEvent,0) );
  checkCuda( cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice) );
  kernel<<<n/blockSize, blockSize>>>(d_a, 0);
  // kernel launches do not return an error status directly; poll for
  // launch-configuration errors explicitly
  checkCuda( cudaGetLastError() );
  checkCuda( cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost) );
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );  // wait so elapsed time is valid
  checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
  printf("Time for sequential transfer and execute (ms): %f\n", ms);
  printf("  max error: %e\n", maxError(a, n));

版本 1:asynchronous version 1 — 每个 stream 内循环执行 {copy, kernel-exec, copy-back}

  // asynchronous version 1: for each chunk i, issue {H2D copy, kernel,
  // D2H copy} into stream[i]. Work in different streams may overlap;
  // operations within one stream still execute in issue order.
  memset(a, 0, bytes);
  checkCuda( cudaEventRecord(startEvent,0) );
  for (int i = 0; i < nStreams; ++i) {
    int offset = i * streamSize;  // element offset of this chunk
    checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset],
                               streamBytes, cudaMemcpyHostToDevice,
                               stream[i]) );
    // kernel reads its chunk via the offset argument
    kernel<<<streamSize/blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
    checkCuda( cudaMemcpyAsync(&a[offset], &d_a[offset],
                               streamBytes, cudaMemcpyDeviceToHost,
                               stream[i]) );
  }
  // launches return no status; surface any launch-configuration error
  checkCuda( cudaGetLastError() );
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
  printf("Time for asynchronous V1 transfer and execute (ms): %f\n", ms);
  printf("  max error: %e\n", maxError(a, n));

版本 2:loop over copy → loop over kernel-exec → loop over copy-back。

将 3 类操作分别成批发射到各自的 stream 中,使不同 stream 上的传输与计算可以并发执行。

  // asynchronous version 2: batch all H2D copies, then all kernels, then
  // all D2H copies (one chunk per stream). Grouping same-kind operations
  // can improve overlap on devices with a single copy engine.
  memset(a, 0, bytes);
  checkCuda( cudaEventRecord(startEvent,0) );
  for (int i = 0; i < nStreams; ++i) {
    int offset = i * streamSize;
    checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset],
                               streamBytes, cudaMemcpyHostToDevice,
                               stream[i]) );
  }
  for (int i = 0; i < nStreams; ++i) {
    int offset = i * streamSize;
    kernel<<<streamSize/blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
  }
  // launches return no status; surface any launch-configuration error
  checkCuda( cudaGetLastError() );
  for (int i = 0; i < nStreams; ++i) {
    int offset = i * streamSize;
    checkCuda( cudaMemcpyAsync(&a[offset], &d_a[offset],
                               streamBytes, cudaMemcpyDeviceToHost,
                               stream[i]) );
  }
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
  printf("Time for asynchronous V2 transfer and execute (ms): %f\n", ms);
  printf("  max error: %e\n", maxError(a, n));

  // cleanup — the original leaked the events, the streams, and d_a;
  // destroy/free everything created above, and check cudaFreeHost too
  checkCuda( cudaEventDestroy(startEvent) );
  checkCuda( cudaEventDestroy(stopEvent) );
  checkCuda( cudaEventDestroy(dummyEvent) );
  for (int i = 0; i < nStreams; ++i)
    checkCuda( cudaStreamDestroy(stream[i]) );
  checkCuda( cudaFree(d_a) );       // free device memory
  checkCuda( cudaFreeHost(a) );     // free pinned host memory

申请 host pinned 内存更推荐使用 cudaHostAlloc,它更灵活:可以通过 flags 指定额外属性(如 cudaHostAllocPortable、cudaHostAllocMapped、cudaHostAllocWriteCombined)。