Posts | jasongideon | 沐曦开发者论坛

Members

沐曦的__ballot_sync应该是有问题已解决 2026年1月23日 16:21

/*
 * 测试用例：演示沐曦GPU __ballot_sync bug及workaround验证
 * 
 * 问题：在warp 1中，线程59-63(lane_id 27-31)执行__ballot_sync时，
 *       即使predicate为真，返回0x00000000而不是正确的bitmask
 * 
 */

#include <mc_runtime.h>
#include <stdio.h>

#define USE_METAX  // 启用workaround


// 原始的__ballot_sync实现（NVIDIA GPU正常，沐曦GPU有bug）
__device__ uint32_t ballot_original(bool pred) {
    uint32_t FULL_WARP_ACTIVE_BMSK = 0xFFFFFFFF;
    return __ballot_sync(FULL_WARP_ACTIVE_BMSK, pred);
}

// Workaround：使用共享内存实现ballot（沐曦GPU专用）
__device__ uint32_t ballot_workaround(bool pred) {
    uint32_t lane_id = threadIdx.x % 32;
    uint32_t thrdIdxInBlk = threadIdx.x + threadIdx.y * blockDim.x;
    uint32_t warp_id = thrdIdxInBlk / 32;

    __shared__ uint32_t s_warp_pred[512];

    uint32_t idx = warp_id * 32 + lane_id;
    s_warp_pred[idx] = pred ? 1 : 0;
    __syncthreads();

    uint32_t ballotResult = 0;
    for (int i = 0; i < 32; i++) {
        if (s_warp_pred[warp_id * 32 + i]) {
            ballotResult |= (1U << i);
        }
    }
    __syncthreads();

    return ballotResult;
}

// 测试内核：重现生产环境中的bug场景
__global__ void test_ballot_bug(
    uint32_t* d_original_results,
    uint32_t* d_workaround_results,
    uint32_t nInfoBits
) {
    uint32_t thrdIdxInBlk = threadIdx.x + threadIdx.y * blockDim.x;
    uint32_t lane_id = threadIdx.x % 32;
    uint32_t warp_id = thrdIdxInBlk / 32;

    // 生产环境中的精确predicate逻辑
    uint32_t N_MAX_INFO_BITS = 128;
    int16_t interleaverTblStartIdx = N_MAX_INFO_BITS - nInfoBits;
    bool pred = ((thrdIdxInBlk < N_MAX_INFO_BITS) && (thrdIdxInBlk >= interleaverTblStartIdx)) ? true : false;

    // 测试原始实现
    uint32_t original_ballot = ballot_original(pred);

    // 测试workaround
    uint32_t workaround_ballot = ballot_workaround(pred);

    // 保存结果（每个warp只保存第一个线程的结果）
    if (lane_id == 0 && warp_id < 2) {  // 只保存warp 0和warp 1
        d_original_results[warp_id] = original_ballot;
        d_workaround_results[warp_id] = workaround_ballot;
    }

    // 打印调试信息
    if (pred) {
        printf("threadIdx.x=%d, threadIdx.y=%d, thrdIdxInBlk=%d, lane_id=%d, warp_id=%d, pred=%d\n",
               threadIdx.x, threadIdx.y, thrdIdxInBlk, lane_id, warp_id, pred);
        printf("  original_ballot=0x%08x, workaround_ballot=0x%08x\n",
               original_ballot, workaround_ballot);
    }
}

// 验证结果的辅助函数
void verify_results(uint32_t* original, uint32_t* workaround, int nWarps, uint32_t expected_mask) {
    printf("\n========== 验证结果 ==========\n");
    printf("预期bitmask (所有pred为true的线程): 0x%08x\n", expected_mask);
    printf("\n原始 __ballot_sync 结果:\n");
    for (int i = 0; i < nWarps; i++) {
        printf("  Warp %d: 0x%08x %s\n", i, original[i],
               (original[i] == expected_mask) ? "✓ 正确" : "✗ 错误");
    }

    printf("\nWorkaround 结果:\n");
    for (int i = 0; i < nWarps; i++) {
        printf("  Warp %d: 0x%08x %s\n", i, workaround[i],
               (workaround[i] == expected_mask) ? "✓ 正确" : "✗ 错误");
    }
}

int main() {
    // 测试参数：重现生产环境中的bug场景
    uint32_t nInfoBits = 69;  // 导致线程59-63的predicate为true
    uint32_t N_MAX_INFO_BITS = 128;

    printf("========== 测试配置 ==========\n");
    printf("nInfoBits = %d, N_MAX_INFO_BITS = %d\n", nInfoBits, N_MAX_INFO_BITS);
    printf("Block dim: 32x16x1 (512 threads)\n");
    printf("Grid dim: 1x1x1\n\n");

    // 计算预期的bitmask
    // 线程59-63 (lane 27-31 in warp 1) 应该有 pred=true
    // 预期bitmask = 0xF8000000 (lanes 27-31 set)
    uint32_t expected_mask = 0xF8000000;

    // 分配设备内存
    uint32_t *d_original, *d_workaround;
    uint32_t *h_original, *h_workaround;

    mcMalloc(&d_original, 2 * sizeof(uint32_t));
    mcMalloc(&d_workaround, 2 * sizeof(uint32_t));
    h_original = (uint32_t*)malloc(2 * sizeof(uint32_t));
    h_workaround = (uint32_t*)malloc(2 * sizeof(uint32_t));

    // 初始化
    mcMemset(d_original, 0, 2 * sizeof(uint32_t));
    mcMemset(d_workaround, 0, 2 * sizeof(uint32_t));

    // 启动kernel
    printf("========== 启动Kernel ==========\n");
    dim3 blockDim(32, 16, 1);  // 32x16x1 = 512 threads
    dim3 gridDim(1, 1, 1);

    test_ballot_bug<<<gridDim, blockDim>>>(d_original, d_workaround, nInfoBits);

    // 同步并检查错误
    mcError_t err = mcDeviceSynchronize();
    if (err != mcSuccess) {
        printf("CUDA Error: %s\n", mcGetErrorString(err));
        return -1;
    }

    // 复制结果回主机
    mcMemcpy(h_original, d_original, 2 * sizeof(uint32_t), mcMemcpyDeviceToHost);
    mcMemcpy(h_workaround, d_workaround, 2 * sizeof(uint32_t), mcMemcpyDeviceToHost);

    // 验证结果
    verify_results(h_original, h_workaround, 2, expected_mask);

    // 清理
    mcFree(d_original);
    mcFree(d_workaround);
    free(h_original);
    free(h_workaround);

    printf("\n========== 测试完成 ==========\n");
    return 0;
}

Warp 0, 2, 3的__ballot_sync工作正常
只有Warp 1的特定lane范围(27-31)受影响
Warp 1的lanes 27-31 (线程59-63)在predicate=true时，__ballot_sync返回0x00000000

See post chevron_right

jasongideon

Members

mma.h与wmma与 namespace mxmaca相关问题已解决 2026年1月23日 10:13

mx-smi
mx-smi  version: 2.1.10

=================== MetaX System Management Interface Log ===================
Timestamp                                         : Fri Jan 23 02:13:41 2026

Attached GPUs                                     : 1
+---------------------------------------------------------------------------------+
| MX-SMI 2.1.10                       Kernel Mode Driver Version: 2.11.12         |
| MACA Version: 3.3.0.15              BIOS Version: 1.20.3.0                      |
|------------------------------------+---------------------+----------------------+
| GPU         NAME                   | Bus-id              | GPU-Util             |
| Temp        Pwr:Usage/Cap          | Memory-Usage        |                      |
|====================================+=====================+======================|
| 0           MetaX C500             | 0000:19:00.0        | 0%                   |
| 54C         48W / 350W             | 858/65536 MiB       |                      |
+------------------------------------+---------------------+----------------------+

+---------------------------------------------------------------------------------+
| Process:                                                                        |
|  GPU                    PID         Process Name                 GPU Memory     |
|                                                                  Usage(MiB)     |
|=================================================================================|
|  no process found                                                               |
+---------------------------------------------------------------------------------+

End of Log

See post chevron_right

jasongideon

Members

mma.h与wmma与 namespace mxmaca相关问题已解决 2026年1月23日 10:06

#include <cmath>
#include <maca_cooperative_groups.h>
#include <maca_reduce.h>
#include <maca_fp16.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

//#include <__clang_maca_mma_functions.h>
#include "mctlass/arch/wmma.h"

using namespace mxmaca;

// WMMA kernel: 16x16x16 matrix multiplication
//__device__ void wmma_ker(half *a, half *b, float *c) {
__host__ __device__ void wmma_ker(half *a, half *b, float *c) {
    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag;
    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
    wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag;

    wmma::fill_fragment(c_frag, 0.0f);

    wmma::load_matrix_sync(a_frag, a, 16);
    wmma::load_matrix_sync(b_frag, b, 16);
    wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);

    wmma::store_matrix_sync(c, c_frag, 16, wmma::mem_row_major);
}


int main() {
}

root@master2:/opt/caosiyuan/share/gitee.com/maca-samples/0_Introduction/asyncExec# make
/opt/maca/mxgpu_llvm/bin/mxcc -x maca  -offload-arch native  asyncExec.cpp -o asyncExec --maca-path=/opt/maca
asyncExec.cpp:39:17: error: expected namespace name
   39 | using namespace mxmaca;
      |                 ^
asyncExec.cpp:44:5: error: use of undeclared identifier 'wmma'
   44 |     wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag;
      |     ^
asyncExec.cpp:44:20: error: use of undeclared identifier 'wmma'
   44 |     wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag;
      |                    ^
asyncExec.cpp:45:5: error: use of undeclared identifier 'wmma'
   45 |     wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
      |     ^
asyncExec.cpp:45:20: error: use of undeclared identifier 'wmma'
   45 |     wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
      |                    ^
asyncExec.cpp:46:5: error: use of undeclared identifier 'wmma'
   46 |     wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag;
      |     ^
asyncExec.cpp:46:20: error: use of undeclared identifier 'wmma'
   46 |     wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag;
      |                    ^
asyncExec.cpp:48:5: error: use of undeclared identifier 'wmma'
   48 |     wmma::fill_fragment(c_frag, 0.0f);
      |     ^
asyncExec.cpp:50:5: error: use of undeclared identifier 'wmma'
   50 |     wmma::load_matrix_sync(a_frag, a, 16);
      |     ^
asyncExec.cpp:51:5: error: use of undeclared identifier 'wmma'
   51 |     wmma::load_matrix_sync(b_frag, b, 16);
      |     ^
asyncExec.cpp:52:5: error: use of undeclared identifier 'wmma'
   52 |     wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
      |     ^
asyncExec.cpp:54:5: error: use of undeclared identifier 'wmma'
   54 |     wmma::store_matrix_sync(c, c_frag, 16, wmma::mem_row_major);
      |     ^
asyncExec.cpp:54:44: error: use of undeclared identifier 'wmma'
   54 |     wmma::store_matrix_sync(c, c_frag, 16, wmma::mem_row_major);
      |                                            ^
13 errors generated when compiling for host.
make: *** [Makefile:44: asyncExec] Error 1


# cat Makefile
# Location of the MACA Toolkit
MACA_PATH ?= /opt/maca

# Compiler
MXCC           = $(MACA_PATH)/mxgpu_llvm/bin/mxcc

# Internel flags
MXCCFLAGS     := -x maca
#ARCHFLAGS     := -offload-arch native
ARCHFLAGS     := -offload-arch native
#ARCHFLAGS     := -offload-arch mx1
################################################################################
# Target rules
all: asyncExec

asyncExec: asyncExec.cpp
        $(MXCC) $(MXCCFLAGS) $(ARCHFLAGS) $< -o $@ --maca-path=$(MACA_PATH)

run: asyncExec
        ./asyncExec

clean:
        rm -rf asyncExec asyncExec.o tdump.mem tdump.txt

See post chevron_right

jasongideon

Members

mma.h与wmma与 namespace mxmaca相关问题已解决 2026年1月20日 11:49

INCLUDE_DIRECTORIES(${MACA_CLANG_PATH}/../lib)
INCLUDE_DIRECTORIES(${MACA_PATH}/include/)
INCLUDE_DIRECTORIES(${MACA_PATH}/include/mcfft/)
INCLUDE_DIRECTORIES(${MACA_PATH}/include/mcr/)
INCLUDE_DIRECTORIES(${MACA_PATH}/include/common/)
INCLUDE_DIRECTORIES(${MACA_PATH}/include/mctlass/arch)
LINK_DIRECTORIES(${MACA_PATH}/lib)

指定${MACA_PATH}/include/mctlass/arch下的mma.h,编译报错

.cpp:22:17: error: expected namespace name
   22 | using namespace mxmaca;
      |                 ^
.cpp:405:13: error: use of undeclared identifier 'wmma'
  405 |             wmma::fragment<wmma::matrix_a, m, n, k, half, wmma::row_major> a_frag;
      |             ^
:405:28: error: use of undeclared identifier 'wmma'
  405 |             wmma::fragment<wmma::matrix_a, m, n, k, half, wmma::row_major> a_frag;
      |                            ^
.cpp:412:13: error: use of undeclared identifier 'wmma'
  412 |             wmma::fill_fragment(c_frag, 0);

.cpp:419:16: error: use of undeclared identifier 'wmma'
  419 |                wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
      |                ^
.cpp:433:13: error: use of undeclared identifier 'wmma'
  433 |             wmma::store_matrix_sync(store_addr, c_frag, store_ldm, wmma::mem_row_major);
      |             ^
.cpp:433:68: error: use of undeclared identifier 'wmma'
  433 |             wmma::store_matrix_sync(store_addr, c_frag, store_ldm, wmma::mem_row_major);

See post chevron_right

jasongideon

Members

mma.h与wmma与 namespace mxmaca相关问题已解决 2026年1月20日 11:44

#ifdef USE_METAX
#include <maca_cooperative_groups.h>
#include <maca_reduce.h>
#include <maca_fp16.h>
#include "math_utils.cuh"
#include <mma.h>
#include <assert.h>
// for wmma
using namespace mxmaca;
#else
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include "cuda_fp16.h"
#include "math_utils.cuh"
#include "mma.h"
#include <assert.h>
// for wmma
using namespace nvcuda;
#endif

我这边在做国产化的适配，nvidia是mma.h和 namespace nvcude。请问沐曦对应的头文件在哪个path？哪个文件？namespace是mxmaca吗？

找到一堆mma.h，有没有例子？找的太费劲了

find /opt/maca-3.3.0/ -name *mma.h
/opt/maca-3.3.0/include/mctlass/arch/mma.h
/opt/maca-3.3.0/include/mctlass/arch/wmma.h
/opt/maca-3.3.0/include/mctlass/gemm/warp/mma_tensor_op_wmma.h
/opt/maca-3.3.0/include/mctlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h
/opt/maca-3.3.0/include/mctlass/gemm/warp/mma.h
/opt/maca-3.3.0/include/mctlass/gemm/thread/mma.h
/opt/maca-3.3.0/include/mctlass/gemm/threadblock/maca_default_mma.h
/opt/maca-3.3.0/include/mctlass/gemm/threadblock/maca_default_moe_mma.h
/opt/maca-3.3.0/include/mctlass/gemm/threadblock/default_mma_core_wmma.h
/opt/maca-3.3.0/include/mctlass/gemm/threadblock/maca_default_masked_grouped_mma.h
/opt/maca-3.3.0/include/mctlass/gemm/threadblock/default_sparse_mma.h
/opt/maca-3.3.0/include/mctlass/gemm/threadblock/maca_default_contiguous_grouped_mma.h
/opt/maca-3.3.0/include/mctlass/gemm/threadblock/default_mma.h
/opt/maca-3.3.0/include/mctlass/gemm/threadblock/maca_default_group_mma.h

See post chevron_right

jasongideon
Members

cudaMipmappedArray相关能力有吗？已解决 2026年1月6日 16:10

@shuai_chen has written:

尊敬的开发者您好，请参考developer.metax-tech.com/api/client/document/preview/885/index.html

这个地址搜了下，我看是没有MipmappedArray相关的API。这个后续支持吗？我这边在做很重要的国产化工作，咱们有开发者微信群吗？
See post chevron_right

jasongideon
Members

cudaMipmappedArray相关能力有吗？已解决 2026年1月6日 15:37

是这个吗？developer.metax-tech.com/search?q=MipmappedArray
没有显示相关的信息，确定有这个能力吗？

See post chevron_right

jasongideon

Members

谢谢恢复，但是我搜索了下，没有看到相关API，请问在哪里呢？我用的images是 cr.metax-tech.com/public-library/maca-c500:2.33.0.6-ubuntu20.04-amd64

root@master2:/opt/maca-3.3.0/samples/0_Introduction/asyncExec# grep -rn Mipmapped  /opt/maca-3.3.0/include/
/opt/maca-3.3.0/include/mcr/mc_runtime_api.h:167: * - ::mcDevAttrMaxTexture1DMipmappedWidth: Maximum mipmapped 1D texture width
/opt/maca-3.3.0/include/mcr/mc_runtime_api.h:176: * - ::mcDevAttrMaxTexture2DMipmappedWidth: Maximum mipmapped 2D texture
/opt/maca-3.3.0/include/mcr/mc_runtime_api.h:178: * - ::mcDevAttrMaxTexture2DMipmappedHeight: Maximum mipmapped 2D texture
/opt/maca-3.3.0/include/mcr/mc_runtime_api.h:782: * such as ::mcStream_t, ::mcEvent_t, ::mcArray_t, ::mcMipmappedArray_t,
/opt/maca-3.3.0/include/mcr/mc_runtime_api.h:4806: * types such as mcModule_t, mcFunction_t, mcStream_t, mcEvent_t, mcArray, mcMipmappedArray,
/opt/maca-3.3.0/include/mcr/mc_runtime_types.h:698:    mcDeviceAttributeMaxTexture2DMipmappedWidth,       // not implement
/opt/maca-3.3.0/include/mcr/mc_runtime_types.h:699:    mcDeviceAttributeMaxTexture2DMipmappedHeight,      // not implement
/opt/maca-3.3.0/include/mcr/mc_runtime_types.h:702:    mcDeviceAttrMaxTexture1DMipmappedWidth,            // not implement
/opt/maca-3.3.0/include/mcr/mc_runtime_types.h:2236:typedef struct _mcMipmappedArray_t {
/opt/maca-3.3.0/include/mcr/mc_runtime_types.h:2242:} * mcMipmappedArray_t;
/opt/maca-3.3.0/include/mcr/mc_runtime_types.h:2243:typedef const struct _mcMipmappedArray_t *mcMipmappedArray_const_t;
/opt/maca-3.3.0/include/mcr/mc_runtime_types.h:2245:typedef struct mcExternalMemoryMipmappedArrayDesc_st {
/opt/maca-3.3.0/include/mcr/mc_runtime_types.h:2251:} mcExternalMemoryMipmappedArrayDesc;
/opt/maca-3.3.0/include/mcr/mc_runtime_types.h:2588:    mcResourceTypeMipmappedArray = 0x01, /**< Mipmapped array resource */
/opt/maca-3.3.0/include/mcr/mc_runtime_types.h:2645:            mcMipmappedArray_t mipmap; /**< MACA mipmapped array */

See post chevron_right

jasongideon
Members

cudaMipmappedArray相关能力有吗？已解决 2026年1月6日 15:03

cudaArrayGetInfo
cudaMemcpy2DToArrayAsync
cudaMallocMipmappedArray
cudaFreeMipmappedArray
cudaGetMipmappedArrayLevel