    SunnySun
    How do I use async copy so that it actually gives an optimization benefit? [Solved] September 2, 2025, 16:48
    #include <cooperative_groups.h>
    #include <maca_async.h>
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <cstdio>
    #include <cmath>
    
    namespace cg = cooperative_groups;
    
    constexpr int BLOCK_SIZE = 256;
    constexpr int TILE_SIZE = 6144;       // elements per tile
    constexpr int NUM_TILES = 2048;       // tiles processed per block
    constexpr int COMPUTE_ITERATIONS = 1; // compute iterations per element of a tile
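    // Each block walks through NUM_TILES * TILE_SIZE * sizeof(float) = 2048 * 6144 * 4 B = 48 MiB of data.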
    
    // per-element compute work applied to each tile
    __device__ float compute_kernel(float data, int iterations) {
        float result = data;
        for (int i = 0; i < iterations; ++i) {
            result = result * 0.99f + 0.01f;
        }
        return result;
    }
    
    __global__ void async_copy_pipeline(float* global_data, float* global_output) {
        auto cta = cg::this_thread_block();
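        // Double buffer: 2 * TILE_SIZE * sizeof(float) = 2 * 6144 * 4 B = 48 KiB of shared memory per block.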
        __shared__ float smem[2][TILE_SIZE];
        // per-block offset into the global input/output arrays
        const int cta_offset = blockIdx.x * NUM_TILES * TILE_SIZE;
    
        float *src = global_data + cta_offset;
    
        int read_buf  = 0;   // buffer holding the tile currently being computed on
        int write_buf = 1;   // buffer being filled by the in-flight async copy
    
        // asynchronously copy the first tile into shared memory
        cg::memcpy_async(cta, smem[read_buf], src, sizeof(float) * TILE_SIZE);
        cg::wait(cta);
        cta.sync();
    
        for (int tile_idx = 0; tile_idx < NUM_TILES; ++tile_idx) {
            if (tile_idx < NUM_TILES - 1) {
                // kick off the async copy of the next tile into the other buffer
                float* next_src = src + (tile_idx + 1) * TILE_SIZE;
                cg::memcpy_async(cta, smem[write_buf], next_src, sizeof(float) * TILE_SIZE);
            }
            // compute on the current tile in read_buf
            for (int i = threadIdx.x; i < TILE_SIZE; i += blockDim.x) {
                smem[read_buf][i] = compute_kernel(smem[read_buf][i], COMPUTE_ITERATIONS);
            }
    
            if (tile_idx < NUM_TILES - 1) {
                cg::wait(cta);
            }
            cta.sync();
    
            float *dst = global_output + cta_offset + tile_idx * TILE_SIZE;
            for (int i = threadIdx.x; i < TILE_SIZE; i += blockDim.x) {
                dst[i] = smem[read_buf][i];
            }
            // Make sure every thread has finished writing read_buf back to global
            // memory before the next iteration's async copy overwrites that buffer.
            cta.sync();
            if (tile_idx < NUM_TILES - 1) {
                int temp = read_buf;
                read_buf = write_buf;
                write_buf = temp;
            }
        }
    }
    
    // synchronous version for comparison
    __global__ void sync_copy_sequential(float* global_data, float* global_output) {
        __shared__ float smem[TILE_SIZE];
    
        float* src = global_data + blockIdx.x * NUM_TILES * TILE_SIZE;
        float* dst = global_output + blockIdx.x * NUM_TILES * TILE_SIZE;
    
        for (int tile_idx = 0; tile_idx < NUM_TILES; ++tile_idx) {
            float* current_src = src + tile_idx * TILE_SIZE;
            float* current_dst = dst + tile_idx * TILE_SIZE;
    
            // copy the tile into shared memory
            for (int i = threadIdx.x; i < TILE_SIZE; i += blockDim.x) {
                smem[i] = current_src[i];
            }
            __syncthreads();
    
            for (int i = threadIdx.x; i < TILE_SIZE; i += blockDim.x) {
                smem[i] = compute_kernel(smem[i], COMPUTE_ITERATIONS);
            }
            __syncthreads();
    
            for (int i = threadIdx.x; i < TILE_SIZE; i += blockDim.x) {
                current_dst[i] = smem[i];
            }
            __syncthreads();
        }
    }
    
    int main() {
        cudaError_t cudaStatus = cudaSetDevice(0);
        if (cudaStatus != cudaSuccess) {
            printf("cudaSetDevice failed: %s\n", cudaGetErrorString(cudaStatus));
            return 1;
        }
        const int num_blocks = 4;
        const int total_elements = num_blocks * NUM_TILES * TILE_SIZE;
    
        printf("Testing async copy pipeline with %d blocks, %d tiles per block, %d elements per tile\n",
               num_blocks, NUM_TILES, TILE_SIZE);
    
        thrust::host_vector<float> h_input(total_elements);
        thrust::host_vector<float> h_output_async(total_elements);
        thrust::host_vector<float> h_output_sync(total_elements);
        for (int j = 0; j < total_elements; ++j) h_output_async[j] = static_cast<float>(-1);
        for (int j = 0; j < total_elements; ++j) h_output_sync[j] = static_cast<float>(-1);
    
        for (int i = 0; i < total_elements; ++i) {
            h_input[i] = static_cast<float>(i) * 0.1f;
        }
    
        thrust::device_vector<float> d_input = h_input;
        thrust::device_vector<float> d_output_async = h_output_async;
        thrust::device_vector<float> d_output_sync = h_output_sync;
    
        // time the async pipeline kernel
        cudaEvent_t start_async, stop_async;
        cudaEventCreate(&start_async);
        cudaEventCreate(&stop_async);
    
        cudaEventRecord(start_async);
        async_copy_pipeline<<<num_blocks, BLOCK_SIZE>>>(
            d_input.data().get(), d_output_async.data().get()
        );
        cudaEventRecord(stop_async);
        cudaDeviceSynchronize();
    
        float async_ms = 0;
        cudaEventElapsedTime(&async_ms, start_async, stop_async);
    
        // time the synchronous kernel
        cudaEvent_t start_sync, stop_sync;
        cudaEventCreate(&start_sync);
        cudaEventCreate(&stop_sync);
    
        cudaEventRecord(start_sync);
        sync_copy_sequential<<<num_blocks, BLOCK_SIZE>>>(
            d_input.data().get(), d_output_sync.data().get()
        );
        cudaEventRecord(stop_sync);
        cudaDeviceSynchronize();
    
        float sync_ms = 0;
        cudaEventElapsedTime(&sync_ms, start_sync, stop_sync);
    
        // verify that both versions produce the same results
        thrust::host_vector<float> h_result_async = d_output_async;
        thrust::host_vector<float> h_result_sync = d_output_sync;
    
        bool results_match = true;
        for (int i = 0; i < total_elements; ++i) {
            if (fabs(h_result_async[i] - h_result_sync[i]) > 1e-6) {
                results_match = false;
                break;
            }
        }
    
        printf("Results: %s\n", results_match ? "MATCH" : "MISMATCH");
        printf("Async pipeline time: %.3f ms\n", async_ms);
        printf("Sync sequential time: %.3f ms\n", sync_ms);
        printf("Speedup: %.2fx\n", sync_ms / async_ms);
    
        // clean up
        cudaEventDestroy(start_async);
        cudaEventDestroy(stop_async);
        cudaEventDestroy(start_sync);
        cudaEventDestroy(stop_sync);
    
        return 0;
    }
    

    The code above measures the runtime of asynchronous copy plus compute versus synchronous copy plus compute. The expected result is that the async-copy version outperforms the synchronous one, but when tested on a MetaX C500 the results are as follows (the async version actually takes longer):

    Testing async copy pipeline with 4 blocks, 2048 tiles per block, 6144 elements per tile
    Results: MATCH
    Async pipeline time: 30.342 ms
    Sync sequential time: 9.460 ms
    Speedup: 0.31x

    The same code, tested on an L20, gives the following results (there the async copy takes less time than the synchronous version):

    Testing async copy pipeline with 4 blocks, 2048 tiles per block, 6144 elements per tile
    Results: MATCH
    Async pipeline time: 4.221 ms
    Sync sequential time: 17.118 ms
    Speedup: 4.06x

    The code uses asynchronous copy, mainly following the usage described in developer.metax-tech.com/api/client/document/preview/559/C500_MXMACAC%2B%2BProgrammingGuide_CN.html#ehg6356t2io61
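
    In essence the pattern being followed is just the one below (a minimal, illustrative sketch distilled from the pipeline kernel above, not an extra benchmark; the kernel name basic_async_copy is made up here):

    // Minimal sketch of the basic memcpy_async usage as I understand it from the
    // guide: the whole block collectively issues the copy, waits for it, then
    // computes on shared memory. The pipeline kernel above only adds double
    // buffering on top of this.
    __global__ void basic_async_copy(const float* src, float* dst) {
        auto cta = cg::this_thread_block();
        __shared__ float tile[TILE_SIZE];

        cg::memcpy_async(cta, tile, src + blockIdx.x * TILE_SIZE,
                         sizeof(float) * TILE_SIZE);   // collective async copy
        cg::wait(cta);                                 // block until the tile has landed

        for (int i = threadIdx.x; i < TILE_SIZE; i += blockDim.x) {
            dst[blockIdx.x * TILE_SIZE + i] = compute_kernel(tile[i], COMPUTE_ITERATIONS);
        }
    }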

    Question: why is the async copy slower than the synchronous version on the C500? Is there a problem with how the async-copy API is being used, or is something else going on? Any guidance would be appreciated, thanks.
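
    One way to narrow this down (a hypothetical diagnostic sketch, not something measured above; the names copy_only_async / copy_only_sync are made up) would be to drop the compute step and time copy-only variants with the same cudaEvent_t harness, to see whether cg::memcpy_async itself is slower than plain strided loads on the C500:

    // Copy-only variants for isolating the copy cost from the compute/pipelining.
    __global__ void copy_only_async(const float* src, float* dst) {
        auto cta = cg::this_thread_block();
        __shared__ float tile[TILE_SIZE];
        const int base = blockIdx.x * NUM_TILES * TILE_SIZE;
        for (int t = 0; t < NUM_TILES; ++t) {
            cg::memcpy_async(cta, tile, src + base + t * TILE_SIZE,
                             sizeof(float) * TILE_SIZE);
            cg::wait(cta);                            // copy into tile[] has completed
            for (int i = threadIdx.x; i < TILE_SIZE; i += blockDim.x)
                dst[base + t * TILE_SIZE + i] = tile[i];
            cta.sync();                               // tile[] is overwritten next iteration
        }
    }

    __global__ void copy_only_sync(const float* src, float* dst) {
        __shared__ float tile[TILE_SIZE];
        const int base = blockIdx.x * NUM_TILES * TILE_SIZE;
        for (int t = 0; t < NUM_TILES; ++t) {
            for (int i = threadIdx.x; i < TILE_SIZE; i += blockDim.x)
                tile[i] = src[base + t * TILE_SIZE + i];
            __syncthreads();
            for (int i = threadIdx.x; i < TILE_SIZE; i += blockDim.x)
                dst[base + t * TILE_SIZE + i] = tile[i];
            __syncthreads();
        }
    }

    If the copy-only async kernel were still much slower than the plain-load one on the C500, that would point at the async copy path itself rather than at how the double-buffered pipeline overlaps copy and compute.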
