MetaX-Tech Developer Forum 论坛首页
  • 沐曦开发者
search
Sign in

jasongideon

  • Members
  • Joined 2026年1月6日
  • message 帖子
  • forum 主题
  • favorite 关注者
  • favorite_border Follows
  • person_outline 详细信息

jasongideon has posted 9 messages.

  • See post chevron_right
    jasongideon
    Members
    沐曦的__ballot_sync应该是有问题 已解决 2026年1月23日 16:21
    /*
     * 测试用例:演示沐曦GPU __ballot_sync bug及workaround验证
     * 
     * 问题:在warp 1中,线程59-63(lane_id 27-31)执行__ballot_sync时,
     *       即使predicate为真,返回0x00000000而不是正确的bitmask
     * 
     */
    
    #include <mc_runtime.h>
    #include <stdio.h>
    
    #define USE_METAX  // 启用workaround
    
    
    // 原始的__ballot_sync实现(NVIDIA GPU正常,沐曦GPU有bug)
    __device__ uint32_t ballot_original(bool pred) {
        uint32_t FULL_WARP_ACTIVE_BMSK = 0xFFFFFFFF;
        return __ballot_sync(FULL_WARP_ACTIVE_BMSK, pred);
    }
    
    // Workaround:使用共享内存实现ballot(沐曦GPU专用)
    __device__ uint32_t ballot_workaround(bool pred) {
        uint32_t lane_id = threadIdx.x % 32;
        uint32_t thrdIdxInBlk = threadIdx.x + threadIdx.y * blockDim.x;
        uint32_t warp_id = thrdIdxInBlk / 32;
    
        __shared__ uint32_t s_warp_pred[512];
    
        uint32_t idx = warp_id * 32 + lane_id;
        s_warp_pred[idx] = pred ? 1 : 0;
        __syncthreads();
    
        uint32_t ballotResult = 0;
        for (int i = 0; i < 32; i++) {
            if (s_warp_pred[warp_id * 32 + i]) {
                ballotResult |= (1U << i);
            }
        }
        __syncthreads();
    
        return ballotResult;
    }
    
    // 测试内核:重现生产环境中的bug场景
    __global__ void test_ballot_bug(
        uint32_t* d_original_results,
        uint32_t* d_workaround_results,
        uint32_t nInfoBits
    ) {
        uint32_t thrdIdxInBlk = threadIdx.x + threadIdx.y * blockDim.x;
        uint32_t lane_id = threadIdx.x % 32;
        uint32_t warp_id = thrdIdxInBlk / 32;
    
        // 生产环境中的精确predicate逻辑
        uint32_t N_MAX_INFO_BITS = 128;
        int16_t interleaverTblStartIdx = N_MAX_INFO_BITS - nInfoBits;
        bool pred = ((thrdIdxInBlk < N_MAX_INFO_BITS) && (thrdIdxInBlk >= interleaverTblStartIdx)) ? true : false;
    
        // 测试原始实现
        uint32_t original_ballot = ballot_original(pred);
    
        // 测试workaround
        uint32_t workaround_ballot = ballot_workaround(pred);
    
        // 保存结果(每个warp只保存第一个线程的结果)
        if (lane_id == 0 && warp_id < 2) {  // 只保存warp 0和warp 1
            d_original_results[warp_id] = original_ballot;
            d_workaround_results[warp_id] = workaround_ballot;
        }
    
        // 打印调试信息
        if (pred) {
            printf("threadIdx.x=%d, threadIdx.y=%d, thrdIdxInBlk=%d, lane_id=%d, warp_id=%d, pred=%d\n",
                   threadIdx.x, threadIdx.y, thrdIdxInBlk, lane_id, warp_id, pred);
            printf("  original_ballot=0x%08x, workaround_ballot=0x%08x\n",
                   original_ballot, workaround_ballot);
        }
    }
    
    // 验证结果的辅助函数
    void verify_results(uint32_t* original, uint32_t* workaround, int nWarps, uint32_t expected_mask) {
        printf("\n========== 验证结果 ==========\n");
        printf("预期bitmask (所有pred为true的线程): 0x%08x\n", expected_mask);
        printf("\n原始 __ballot_sync 结果:\n");
        for (int i = 0; i < nWarps; i++) {
            printf("  Warp %d: 0x%08x %s\n", i, original[i],
                   (original[i] == expected_mask) ? "✓ 正确" : "✗ 错误");
        }
    
        printf("\nWorkaround 结果:\n");
        for (int i = 0; i < nWarps; i++) {
            printf("  Warp %d: 0x%08x %s\n", i, workaround[i],
                   (workaround[i] == expected_mask) ? "✓ 正确" : "✗ 错误");
        }
    }
    
    int main() {
        // 测试参数:重现生产环境中的bug场景
        uint32_t nInfoBits = 69;  // 导致线程59-63的predicate为true
        uint32_t N_MAX_INFO_BITS = 128;
    
        printf("========== 测试配置 ==========\n");
        printf("nInfoBits = %d, N_MAX_INFO_BITS = %d\n", nInfoBits, N_MAX_INFO_BITS);
        printf("Block dim: 32x16x1 (512 threads)\n");
        printf("Grid dim: 1x1x1\n\n");
    
        // 计算预期的bitmask
        // 线程59-63 (lane 27-31 in warp 1) 应该有 pred=true
        // 预期bitmask = 0xF8000000 (lanes 27-31 set)
        uint32_t expected_mask = 0xF8000000;
    
        // 分配设备内存
        uint32_t *d_original, *d_workaround;
        uint32_t *h_original, *h_workaround;
    
        mcMalloc(&d_original, 2 * sizeof(uint32_t));
        mcMalloc(&d_workaround, 2 * sizeof(uint32_t));
        h_original = (uint32_t*)malloc(2 * sizeof(uint32_t));
        h_workaround = (uint32_t*)malloc(2 * sizeof(uint32_t));
    
        // 初始化
        mcMemset(d_original, 0, 2 * sizeof(uint32_t));
        mcMemset(d_workaround, 0, 2 * sizeof(uint32_t));
    
        // 启动kernel
        printf("========== 启动Kernel ==========\n");
        dim3 blockDim(32, 16, 1);  // 32x16x1 = 512 threads
        dim3 gridDim(1, 1, 1);
    
        test_ballot_bug<<<gridDim, blockDim>>>(d_original, d_workaround, nInfoBits);
    
        // 同步并检查错误
        mcError_t err = mcDeviceSynchronize();
        if (err != mcSuccess) {
            printf("CUDA Error: %s\n", mcGetErrorString(err));
            return -1;
        }
    
        // 复制结果回主机
        mcMemcpy(h_original, d_original, 2 * sizeof(uint32_t), mcMemcpyDeviceToHost);
        mcMemcpy(h_workaround, d_workaround, 2 * sizeof(uint32_t), mcMemcpyDeviceToHost);
    
        // 验证结果
        verify_results(h_original, h_workaround, 2, expected_mask);
    
        // 清理
        mcFree(d_original);
        mcFree(d_workaround);
        free(h_original);
        free(h_workaround);
    
        printf("\n========== 测试完成 ==========\n");
        return 0;
    }
    

    Warp 0, 2, 3的__ballot_sync工作正常
    只有Warp 1的特定lane范围(27-31)受影响
    Warp 1的lanes 27-31 (线程59-63)在predicate=true时,__ballot_sync返回0x00000000

  • See post chevron_right
    jasongideon
    Members
    mma.h与wmma与 namespace mxmaca相关问题 已解决 2026年1月23日 10:13
    mx-smi
    mx-smi  version: 2.1.10
    
    =================== MetaX System Management Interface Log ===================
    Timestamp                                         : Fri Jan 23 02:13:41 2026
    
    Attached GPUs                                     : 1
    +---------------------------------------------------------------------------------+
    | MX-SMI 2.1.10                       Kernel Mode Driver Version: 2.11.12         |
    | MACA Version: 3.3.0.15              BIOS Version: 1.20.3.0                      |
    |------------------------------------+---------------------+----------------------+
    | GPU         NAME                   | Bus-id              | GPU-Util             |
    | Temp        Pwr:Usage/Cap          | Memory-Usage        |                      |
    |====================================+=====================+======================|
    | 0           MetaX C500             | 0000:19:00.0        | 0%                   |
    | 54C         48W / 350W             | 858/65536 MiB       |                      |
    +------------------------------------+---------------------+----------------------+
    
    +---------------------------------------------------------------------------------+
    | Process:                                                                        |
    |  GPU                    PID         Process Name                 GPU Memory     |
    |                                                                  Usage(MiB)     |
    |=================================================================================|
    |  no process found                                                               |
    +---------------------------------------------------------------------------------+
    
    End of Log
    
  • See post chevron_right
    jasongideon
    Members
    mma.h与wmma与 namespace mxmaca相关问题 已解决 2026年1月23日 10:06
    #include <cmath>
    #include <maca_cooperative_groups.h>
    #include <maca_reduce.h>
    #include <maca_fp16.h>
    #include <assert.h>
    #include <stdio.h>
    #include <string.h>
    #include <stdlib.h>
    
    //#include <__clang_maca_mma_functions.h>
    #include "mctlass/arch/wmma.h"
    
    using namespace mxmaca;
    
    // WMMA kernel: 16x16x16 matrix multiplication
    //__device__ void wmma_ker(half *a, half *b, float *c) {
    __host__ __device__ void wmma_ker(half *a, half *b, float *c) {
        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag;
        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
        wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag;
    
        wmma::fill_fragment(c_frag, 0.0f);
    
        wmma::load_matrix_sync(a_frag, a, 16);
        wmma::load_matrix_sync(b_frag, b, 16);
        wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
    
        wmma::store_matrix_sync(c, c_frag, 16, wmma::mem_row_major);
    }
    
    
    int main() {
    }
    
    root@master2:/opt/caosiyuan/share/gitee.com/maca-samples/0_Introduction/asyncExec# make
    /opt/maca/mxgpu_llvm/bin/mxcc -x maca  -offload-arch native  asyncExec.cpp -o asyncExec --maca-path=/opt/maca
    asyncExec.cpp:39:17: error: expected namespace name
       39 | using namespace mxmaca;
          |                 ^
    asyncExec.cpp:44:5: error: use of undeclared identifier 'wmma'
       44 |     wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag;
          |     ^
    asyncExec.cpp:44:20: error: use of undeclared identifier 'wmma'
       44 |     wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag;
          |                    ^
    asyncExec.cpp:45:5: error: use of undeclared identifier 'wmma'
       45 |     wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
          |     ^
    asyncExec.cpp:45:20: error: use of undeclared identifier 'wmma'
       45 |     wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
          |                    ^
    asyncExec.cpp:46:5: error: use of undeclared identifier 'wmma'
       46 |     wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag;
          |     ^
    asyncExec.cpp:46:20: error: use of undeclared identifier 'wmma'
       46 |     wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag;
          |                    ^
    asyncExec.cpp:48:5: error: use of undeclared identifier 'wmma'
       48 |     wmma::fill_fragment(c_frag, 0.0f);
          |     ^
    asyncExec.cpp:50:5: error: use of undeclared identifier 'wmma'
       50 |     wmma::load_matrix_sync(a_frag, a, 16);
          |     ^
    asyncExec.cpp:51:5: error: use of undeclared identifier 'wmma'
       51 |     wmma::load_matrix_sync(b_frag, b, 16);
          |     ^
    asyncExec.cpp:52:5: error: use of undeclared identifier 'wmma'
       52 |     wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
          |     ^
    asyncExec.cpp:54:5: error: use of undeclared identifier 'wmma'
       54 |     wmma::store_matrix_sync(c, c_frag, 16, wmma::mem_row_major);
          |     ^
    asyncExec.cpp:54:44: error: use of undeclared identifier 'wmma'
       54 |     wmma::store_matrix_sync(c, c_frag, 16, wmma::mem_row_major);
          |                                            ^
    13 errors generated when compiling for host.
    make: *** [Makefile:44: asyncExec] Error 1
    
    
    # cat Makefile
    # Location of the MACA Toolkit
    MACA_PATH ?= /opt/maca
    
    # Compiler
    MXCC           = $(MACA_PATH)/mxgpu_llvm/bin/mxcc
    
    # Internel flags
    MXCCFLAGS     := -x maca
    #ARCHFLAGS     := -offload-arch native
    ARCHFLAGS     := -offload-arch native
    #ARCHFLAGS     := -offload-arch mx1
    ################################################################################
    # Target rules
    all: asyncExec
    
    asyncExec: asyncExec.cpp
            $(MXCC) $(MXCCFLAGS) $(ARCHFLAGS) $< -o $@ --maca-path=$(MACA_PATH)
    
    run: asyncExec
            ./asyncExec
    
    clean:
            rm -rf asyncExec asyncExec.o tdump.mem tdump.txt
    
  • See post chevron_right
    jasongideon
    Members
    mma.h与wmma与 namespace mxmaca相关问题 已解决 2026年1月20日 11:49
    INCLUDE_DIRECTORIES(${MACA_CLANG_PATH}/../lib)
    INCLUDE_DIRECTORIES(${MACA_PATH}/include/)
    INCLUDE_DIRECTORIES(${MACA_PATH}/include/mcfft/)
    INCLUDE_DIRECTORIES(${MACA_PATH}/include/mcr/)
    INCLUDE_DIRECTORIES(${MACA_PATH}/include/common/)
    INCLUDE_DIRECTORIES(${MACA_PATH}/include/mctlass/arch)
    LINK_DIRECTORIES(${MACA_PATH}/lib)
    

    指定${MACA_PATH}/include/mctlass/arch下的mma.h,编译报错

    .cpp:22:17: error: expected namespace name
       22 | using namespace mxmaca;
          |                 ^
    .cpp:405:13: error: use of undeclared identifier 'wmma'
      405 |             wmma::fragment<wmma::matrix_a, m, n, k, half, wmma::row_major> a_frag;
          |             ^
    :405:28: error: use of undeclared identifier 'wmma'
      405 |             wmma::fragment<wmma::matrix_a, m, n, k, half, wmma::row_major> a_frag;
          |                            ^
    .cpp:412:13: error: use of undeclared identifier 'wmma'
      412 |             wmma::fill_fragment(c_frag, 0);
    
    .cpp:419:16: error: use of undeclared identifier 'wmma'
      419 |                wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
          |                ^
    .cpp:433:13: error: use of undeclared identifier 'wmma'
      433 |             wmma::store_matrix_sync(store_addr, c_frag, store_ldm, wmma::mem_row_major);
          |             ^
    .cpp:433:68: error: use of undeclared identifier 'wmma'
      433 |             wmma::store_matrix_sync(store_addr, c_frag, store_ldm, wmma::mem_row_major);
    
  • See post chevron_right
    jasongideon
    Members
    mma.h与wmma与 namespace mxmaca相关问题 已解决 2026年1月20日 11:44
    #ifdef USE_METAX
    #include <maca_cooperative_groups.h>
    #include <maca_reduce.h>
    #include <maca_fp16.h>
    #include "math_utils.cuh"
    #include <mma.h>
    #include <assert.h>
    // for wmma
    using namespace mxmaca;
    #else
    #include <cooperative_groups.h>
    #include <cooperative_groups/reduce.h>
    #include "cuda_fp16.h"
    #include "math_utils.cuh"
    #include "mma.h"
    #include <assert.h>
    // for wmma
    using namespace nvcuda;
    #endif
    

    我这边在做国产化的适配,nvidia是mma.h和 namespace nvcude。请问沐曦对应的头文件在哪个path?哪个文件?namespace是mxmaca吗?

    找到一堆mma.h,有没有例子?找的太费劲了

    find /opt/maca-3.3.0/ -name *mma.h
    /opt/maca-3.3.0/include/mctlass/arch/mma.h
    /opt/maca-3.3.0/include/mctlass/arch/wmma.h
    /opt/maca-3.3.0/include/mctlass/gemm/warp/mma_tensor_op_wmma.h
    /opt/maca-3.3.0/include/mctlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h
    /opt/maca-3.3.0/include/mctlass/gemm/warp/mma.h
    /opt/maca-3.3.0/include/mctlass/gemm/thread/mma.h
    /opt/maca-3.3.0/include/mctlass/gemm/threadblock/maca_default_mma.h
    /opt/maca-3.3.0/include/mctlass/gemm/threadblock/maca_default_moe_mma.h
    /opt/maca-3.3.0/include/mctlass/gemm/threadblock/default_mma_core_wmma.h
    /opt/maca-3.3.0/include/mctlass/gemm/threadblock/maca_default_masked_grouped_mma.h
    /opt/maca-3.3.0/include/mctlass/gemm/threadblock/default_sparse_mma.h
    /opt/maca-3.3.0/include/mctlass/gemm/threadblock/maca_default_contiguous_grouped_mma.h
    /opt/maca-3.3.0/include/mctlass/gemm/threadblock/default_mma.h
    /opt/maca-3.3.0/include/mctlass/gemm/threadblock/maca_default_group_mma.h
    
  • See post chevron_right
    jasongideon
    Members
    cudaMipmappedArray相关能力有吗? 已解决 2026年1月6日 16:10
    @shuai_chen has written:

    尊敬的开发者您好,请参考developer.metax-tech.com/api/client/document/preview/885/index.html

    这个地址搜了下,我看是没有MipmappedArray相关的API。这个后续支持吗?我这边在做很重要的国产化工作,咱们有开发者微信群吗?

  • See post chevron_right
    jasongideon
    Members
    cudaMipmappedArray相关能力有吗? 已解决 2026年1月6日 15:37

    是这个吗?developer.metax-tech.com/search?q=MipmappedArray
    没有显示相关的信息,确定有这个能力吗?

  • See post chevron_right
    jasongideon
    Members
    cudaMipmappedArray相关能力有吗? 已解决 2026年1月6日 15:15

    谢谢恢复,但是我搜索了下,没有看到相关API,请问在哪里呢?我用的images是 cr.metax-tech.com/public-library/maca-c500:2.33.0.6-ubuntu20.04-amd64

    root@master2:/opt/maca-3.3.0/samples/0_Introduction/asyncExec# grep -rn Mipmapped  /opt/maca-3.3.0/include/
    /opt/maca-3.3.0/include/mcr/mc_runtime_api.h:167: * - ::mcDevAttrMaxTexture1DMipmappedWidth: Maximum mipmapped 1D texture width
    /opt/maca-3.3.0/include/mcr/mc_runtime_api.h:176: * - ::mcDevAttrMaxTexture2DMipmappedWidth: Maximum mipmapped 2D texture
    /opt/maca-3.3.0/include/mcr/mc_runtime_api.h:178: * - ::mcDevAttrMaxTexture2DMipmappedHeight: Maximum mipmapped 2D texture
    /opt/maca-3.3.0/include/mcr/mc_runtime_api.h:782: * such as ::mcStream_t, ::mcEvent_t, ::mcArray_t, ::mcMipmappedArray_t,
    /opt/maca-3.3.0/include/mcr/mc_runtime_api.h:4806: * types such as mcModule_t, mcFunction_t, mcStream_t, mcEvent_t, mcArray, mcMipmappedArray,
    /opt/maca-3.3.0/include/mcr/mc_runtime_types.h:698:    mcDeviceAttributeMaxTexture2DMipmappedWidth,       // not implement
    /opt/maca-3.3.0/include/mcr/mc_runtime_types.h:699:    mcDeviceAttributeMaxTexture2DMipmappedHeight,      // not implement
    /opt/maca-3.3.0/include/mcr/mc_runtime_types.h:702:    mcDeviceAttrMaxTexture1DMipmappedWidth,            // not implement
    /opt/maca-3.3.0/include/mcr/mc_runtime_types.h:2236:typedef struct _mcMipmappedArray_t {
    /opt/maca-3.3.0/include/mcr/mc_runtime_types.h:2242:} * mcMipmappedArray_t;
    /opt/maca-3.3.0/include/mcr/mc_runtime_types.h:2243:typedef const struct _mcMipmappedArray_t *mcMipmappedArray_const_t;
    /opt/maca-3.3.0/include/mcr/mc_runtime_types.h:2245:typedef struct mcExternalMemoryMipmappedArrayDesc_st {
    /opt/maca-3.3.0/include/mcr/mc_runtime_types.h:2251:} mcExternalMemoryMipmappedArrayDesc;
    /opt/maca-3.3.0/include/mcr/mc_runtime_types.h:2588:    mcResourceTypeMipmappedArray = 0x01, /**< Mipmapped array resource */
    /opt/maca-3.3.0/include/mcr/mc_runtime_types.h:2645:            mcMipmappedArray_t mipmap; /**< MACA mipmapped array */
    
  • See post chevron_right
    jasongideon
    Members
    cudaMipmappedArray相关能力有吗? 已解决 2026年1月6日 15:03

    cudaArrayGetInfo
    cudaMemcpy2DToArrayAsync
    cudaMallocMipmappedArray
    cudaFreeMipmappedArray
    cudaGetMipmappedArrayLevel

  • 沐曦开发者论坛
powered by misago