昇腾CANN cmake 实战:CANN CMake 构建系统——跨平台编译配置与模块化管理
8 个 CANN 仓库各需独立构建(ops-transformer/ops-nn/hccl/ge/…)→ 手写 8 套 CMakeLists.txt(CANN 路径判断、跨 NPU 型号编译、第三方库兼容)。cmake 仓库提供统一的 FindCANN.cmake + CANNConfig.cmake 模板——任何仓库只需 find_package(CANN) 一行,自动推断 toolkit 路径、编译器、依赖库。同时管理 55 个仓库的交叉编译配置。cmake 仓库不仅是一套 CMake 脚本——它是 CANN 全 55 个仓库的统一编译入口。仓库之间的交叉依赖(ops-transformer depends on opbase, hccl depends on runtime, …)全部通过 CMake 的 find_package() + target_link_libraries() 自动链入,开发者不需要关心 CANN 装在哪个目录。CANN CMake 配置体系(三层):Layer 0: FindCANN.cmake(自动查找 CANN 安装路径) ├─ 环境变量 $ASCEND_HOME_PATH ├─ 默认路径探测 (/usr/local/Ascend/...) └─ 包管理器路径 (apt/pip/conda) Layer 1: CANNConfig.cmake(导出 CANN 编译配置) ├─ CANN_INCLUDE_DIRS:头文件路径 ├─ CANN_LIBRARIES:库文件列表 └─ CANN_COMPILE_OPTIONS:编译选项(--aicore / -DASCEND910) Layer 2: can_target_* 高层构建宏 ├─ can_add_op_target():构建 Ascend C 算子 .so ├─ can_add_test_target():构建 ACL 测试程序 └─ can_add_pybind_target():构建 Python binding # cmake/modules/FindCANN.cmakeLayer 0路径查找 # # 用法find_package(CANN 8.0 REQUIRED COMPONENTS acl ge ascendc) # → CANN_FOUND TRUE # → CANN_INCLUDE_DIRS /usr/local/Ascend/ascend-toolkit/8.0/include # → CANN_LIBRARIES ${CANN_LIB_ACL} ${CANN_LIB_GE} ... # # 自动探测逻辑 # 1. $ASCEND_HOME_PATH → ascend-toolkit → latest # 2. /usr/local/Ascend/ascend-toolkit/latest # 3. /opt/ascend/ascend-toolkit/latest # 4. pip show ascend-toolkit → Location # 5. find_path(find_path → ascendc/ascendc.h) # FindCANN.cmake # ---------- Determine CANN root ---------- set(_CANN_POSSIBLE_ROOTS) # 1. Environment variable if(DEFINED ENV{ASCEND_HOME_PATH}) list(APPEND _CANN_POSSIBLE_ROOTS $ENV{ASCEND_HOME_PATH}) endif() # 2. Default install paths list(APPEND _CANN_POSSIBLE_ROOTS /usr/local/Ascend/ascend-toolkit/latest /opt/ascend/ascend-toolkit/latest $ENV{HOME}/Ascend/ascend-toolkit/latest ) # 3. 
pip-based install (Python virtual env) execute_process( COMMAND ${Python3_EXECUTABLE} -c import ascend; print(ascend.__path__[0]) RESULT_VARIABLE _pip_ret OUTPUT_VARIABLE _pip_path OUTPUT_STRIP_TRAILING_WHITESPACE ) if(_pip_ret EQUAL 0 AND _pip_path) get_filename_component(_pip_toolkit ${_pip_path}/../../../ ABSOLUTE) list(APPEND _CANN_POSSIBLE_ROOTS ${_pip_toolkit}) endif() # ---------- Find CANN header ---------- find_path(CANN_INCLUDE_DIR NAMES ascendc/ascendc.h PATHS ${_CANN_POSSIBLE_ROOTS} PATH_SUFFIXES include DOC CANN include directory ) if(NOT CANN_INCLUDE_DIR) message(FATAL_ERROR CANN not found. Set $ASCEND_HOME_PATH or install ascend-toolkit) endif() # ---------- Determine CANN version ---------- find_file(_CANN_VERSION_FILE NAMES version.cfg PATHS ${CANN_INCLUDE_DIR}/.. PATH_SUFFIXES ) if(_CANN_VERSION_FILE) file(STRINGS ${_CANN_VERSION_FILE} _version_str REGEX ^version.*$) string(REGEX REPLACE ^version CANN_VERSION ${_version_str}) else() set(CANN_VERSION unknown) endif() message(STATUS Found CANN ${CANN_VERSION} at ${CANN_INCLUDE_DIR}) # ---------- Find CANN libraries ---------- set(CANN_LIB_DIR ${CANN_INCLUDE_DIR}/../lib64) find_library(CANN_LIB_ACL NAMES ascendcl PATHS ${CANN_LIB_DIR} REQUIRED ) find_library(CANN_LIB_GE NAMES ge_runner ge PATHS ${CANN_LIB_DIR} REQUIRED ) find_library(CANN_LIB_RUNTIME NAMES runtime PATHS ${CANN_LIB_DIR} REQUIRED ) # Package components set(CANN_LIBRARIES ${CANN_LIB_ACL} ${CANN_LIB_GE} ${CANN_LIB_RUNTIME} ) # ---------- Determine NPU architecture ---------- # Ascend 910 / 910B / 950PR (detect from device or assume default) execute_process( COMMAND npu-smi info -m -q 1 RESULT_VARIABLE _smi_ret OUTPUT_VARIABLE _smi_out ) if(_smi_ret EQUAL 0) string(REGEX MATCH Ascend[0-9][A-Z]* _soc ${_smi_out}) set(CANN_SOC_VERSION ${_soc}) else() set(CANN_SOC_VERSION Ascend910B) # default endif() message(STATUS NPU: ${CANN_SOC_VERSION}) # ---------- Set compile definitions ---------- set(CANN_COMPILE_DEFINITIONS -DASCEND_DEVICE 
-D${CANN_SOC_VERSION} ) if(CANN_SOC_VERSION MATCHES 910) list(APPEND CANN_COMPILE_DEFINITIONS -DL1_CACHE_SIZE1048576) # 1MB elseif(CANN_SOC_VERSION MATCHES 950) list(APPEND CANN_COMPILE_DEFINITIONS -DL1_CACHE_SIZE524288) # 512KB endif() set(CANN_FOUND TRUE)# cmake/modules/CANNTargetFunctions.cmakeLayer 2高层宏 # # 提供 can_add_op_target / can_add_test_target / can_add_pybind_target # 封装了所有 Ascend C kernel 编译的 boilerplate include_guard(GLOBAL) # can_add_op_target()构建 Ascend C 算子 .so # # 用法 # can_add_op_target(custom_layer_norm # SOURCES src/kernel.cc op_reg/register.cc # INCLUDE_DIRS include/ # LINK_LIBS opbase hccl # DEVICE_VERSIONS Ascend910B Ascend950PR # 多 NPU 型号 # ) # → libcustom_layer_norm_Ascend910B.so, libcustom_layer_norm_Ascend950PR.so macro(can_add_op_target target_name) cmake_parse_arguments(ARG # no boolean flags # no single-value args SOURCES;INCLUDE_DIRS;LINK_LIBS;DEVICE_VERSIONS # multi-value ${ARGN} ) # 对每个 NPU 型号构建一个 .so foreach(device_version ${ARG_DEVICE_VERSIONS}) set(lib_name ${target_name}_${device_version}) add_library(${lib_name} SHARED ${ARG_SOURCES}) target_include_directories(${lib_name} PRIVATE ${ARG_INCLUDE_DIRS} ${CANN_INCLUDE_DIRS} ${CANN_INCLUDE_DIRS}/ascendc ) # 针对此 NPU 型号的编译选项 target_compile_definitions(${lib_name} PRIVATE ${CANN_COMPILE_DEFINITIONS} -DTARGET_DEVICE${device_version} ) # Ascend C kernel 必须用 Bisheng 编译器 if(CMAKE_CXX_COMPILER_ID STREQUAL GNU) set_property(TARGET ${lib_name} PROPERTY CXX_STANDARD 17 ) # 加 Bisheng 兼容标志 target_compile_options(${lib_name} PRIVATE -fno-strict-aliasing -D__aicore__ ) endif() # 链接 CANN 库 target_link_libraries(${lib_name} ${CANN_LIBRARIES} ${ARG_LINK_LIBS} ) # 安装 install(TARGETS ${lib_name} LIBRARY DESTINATION lib/${device_version} ) message(STATUS ✅ Built ${lib_name} for ${device_version}) endforeach() endmacro() # can_add_test_target()构建 ACL 测试程序 # # 用法 # can_add_test_target(test_layer_norm # SOURCES test/test_main.cc test/test_cases.cc # DEPS custom_layer_norm_Ascend910B # ← 依赖上面构建的 .so # 
DEVICE_ID 0 # ) macro(can_add_test_target test_name) cmake_parse_arguments(ARG DEVICE_ID SOURCES;DEPS ${ARGN} ) add_executable(${test_name} ${ARG_SOURCES}) target_link_libraries(${test_name} ${ARG_DEPS} ${CANN_LIB_ACL} gtest gtest_main pthread ) # 运行测试自动 set DEVICE_ID add_test( NAME ${test_name} COMMAND ${test_name} --gtest_filter* WORKING_DIRECTORY ${CMAKE_BINARY_DIR} ) set_tests_properties(${test_name} PROPERTIES ENVIRONMENT ASCEND_DEVICE_ID${ARG_DEVICE_ID} ) message(STATUS ✅ Test target ${test_name} (NPU#${ARG_DEVICE_ID})) endmacro() # can_add_pybind_target()构建 Python binding # # 用法 # can_add_pybind_target(custom_layer_norm_py # SOURCES python/bindings.cc # DEPS custom_layer_norm_Ascend910B # ) # → custom_layer_norm_py.cpython-38-x86_64-linux-gnu.so macro(can_add_pybind_target target_name) cmake_parse_arguments(ARG SOURCES;DEPS ${ARGN} ) # Find pybind11 find_package(pybind11 REQUIRED) pybind11_add_module(${target_name} ${ARG_SOURCES}) target_link_libraries(${target_name} PRIVATE ${ARG_DEPS} ${CANN_LIBRARIES} ) install(TARGETS ${target_name} LIBRARY DESTINATION python/ ) message(STATUS ✅ Python binding ${target_name}) endmacro()跨仓库构建55 个仓库的统一 CMake 流程# cmake/examples/superbuild.cmakeCANN Super Build # # 一次性构建 55 个仓库的超级构建脚本 # 自动推断仓库间的依赖关系opbase→ops-*hccl→distributedge→framework cmake_minimum_required(VERSION 3.16) project(CANN_SuperBuild LANGUAGES CXX) find_package(CANN 8.0 REQUIRED) include(cmake/modules/CANNTargetFunctions.cmake) # 仓库依赖图 # opbase (layer 0, no deps) # ├── ops-math, ops-nn, ops-blas, ops-cv, ops-fft, ops-rand, ops-tensor # ├── hccl, hcomm, hixl # └── ge, runtime, driver # # ops-transformer (depends on ops-nn, opbase) # ascend-transformer-boost (depends on ops-transformer) # torchtitan-npu (depends on ascend-transformer-boost) # Layer 0: Base dependencies set(OPBASE_DIR ${CMAKE_SOURCE_DIR}/../opbase) add_subdirectory(${OPBASE_DIR} opbase-binary) # Layer 1: Core operators (depend on opbase) foreach(op_repo ops-nn ops-math ops-blas ops-cv ops-fft 
ops-rand ops-tensor) set(repo_dir ${CMAKE_SOURCE_DIR}/../${op_repo}) if(EXISTS ${repo_dir}) add_subdirectory(${repo_dir} ${op_repo}-binary) message(STATUS ✅ ${op_repo}) endif() endforeach() # Layer 2: Communication (depend on driver) set(HCCL_DIR ${CMAKE_SOURCE_DIR}/../hccl) add_subdirectory(${HCCL_DIR} hccl-binary) set(HCOMM_DIR ${CMAKE_SOURCE_DIR}/../hcomm) add_subdirectory(${HCOMM_DIR} hcomm-binary) set(HIXL_DIR ${CMAKE_SOURCE_DIR}/../hixl) add_subdirectory(${HIXL_DIR} hixl-binary) # Layer 3: Graph engine (depend on opbase driver) set(GE_DIR ${CMAKE_SOURCE_DIR}/../ge) add_subdirectory(${GE_DIR} ge-binary) # Layer 4: Transformer operators (depend on ops-nn opbase) set(OPS_TRANSFORMER_DIR ${CMAKE_SOURCE_DIR}/../ops-transformer) add_subdirectory(${OPS_TRANSFORMER_DIR} ops-transformer-binary) # Build message(STATUS SuperBuild: ${N_REPOS} repos configured, building...)跨 NPU 型号多版本编译910 vs 950 vs 950PR# cmake/examples/build_multi_soc.sh# 同一个算子代码编译出 910 / 950PR 两个 .so# 运行时通过 soc_version 自动加载对应版本ASCEND_TOOLKIT_HOME/usr/local/Ascend/ascend-toolkit/8.0# Ascend 910B mkdir-pbuild_910Bcdbuild_910B cmake..\-DCANN_ROOT${ASCEND_TOOLKIT_HOME}\-DCANN_SOC_VERSIONAscend910B\-DCMAKE_BUILD_TYPEReleasemake-j$(nproc)# → libcustom_op_Ascend910B.socd..# Ascend 950PR mkdir-pbuild_950PRcdbuild_950PR cmake..\-DCANN_ROOT${ASCEND_TOOLKIT_HOME}\-DCANN_SOC_VERSIONAscend950PR\-DCMAKE_BUILD_TYPEReleasemake-j$(nproc)# → libcustom_op_Ascend950PR.socd..# 运行时自动选择 # CANN 根据 aclrtGetDeviceInfo() 返回的 soc_version 自动加载对应 .solslib/# lib/custom_op/Ascend910B/libcustom_op.so# lib/custom_op/Ascend950PR/libcustom_op.so# 无需手动判断ACL 的 op_loader 自动查找匹配的 soc_version 目录打包与分发RPM / Docker# cmake/modules/CANNPackaging.cmake # RPM 打包 set(CPACK_GENERATOR RPM) set(CPACK_PACKAGE_NAME cann-ops-nn) set(CPACK_PACKAGE_VERSION 8.0.1) set(CPACK_PACKAGE_VENDOR Ascend) set(CPACK_RPM_PACKAGE_REQUIRES ascend-toolkit 8.0, opbase 8.0) include(CPack) # 生成 RPM 包 # cmake --build build --target package # → cann-ops-nn-8.0.1-1.x86_64.rpm # 
DEB 打包 set(CPACK_GENERATOR DEB) set(CPACK_DEBIAN_PACKAGE_DEPENDS ascend-toolkit ( 8.0), libopbase ( 8.0))# cmake/examples/Dockerfile.cann FROM ubuntu:22.04 # Install CANN toolkit COPY ascend-toolkit-8.0.run /tmp/ RUN /tmp/ascend-toolkit-8.0.run --install --quiet # Build all operators (cmake superbuild) WORKDIR /workspace/cann COPY . . RUN mkdir build cd build \ cmake .. \ -DCANN_ROOT/usr/local/Ascend/ascend-toolkit/8.0 \ -DCMAKE_BUILD_TYPERelease \ make -j$(nproc) \ cmake --install . --prefix /opt/cann # Runtime environment ENV LD_LIBRARY_PATH/opt/cann/lib64:/usr/local/Ascend/ascend-toolkit/8.0/lib64 ENV ASCEND_HOME_PATH/usr/local/Ascend/ascend-toolkit/8.0 CMD [/opt/cann/bin/test_all_ops] 踩坑一:FindCANN 在虚拟环境 pip install 后找不到头文件——Python venv 不是系统级安装 # ❌ pip install ascend-toolkit → virtual env ~/.venv/# cmake 探测 /usr/local/Ascend → 不存在 → FATAL ERROR# 因为 find_path 默认扫描系统路径不扫描 virtual env# ✅ 用 Python3_EXECUTABLE 探测 virtual envexecute_process(COMMAND${Python3_EXECUTABLE}-cimport site; print(site.getsitepackages()[0])OUTPUT_VARIABLE _site_packages)list(APPEND _CANN_POSSIBLE_ROOTS${_site_packages}/ascend/toolkit)# 或在 cmake 命令行指定 CANN_ROOTcmake..-DCANN_ROOT~/.venv/lib/python3.8/site-packages/ascend/toolkit 踩坑二:CMake 多仓库构建中的 link 顺序——opbase.so 被多次链接导致符号表重复。55 个仓库都 target_link_libraries(... opbase) → cmake 生成 55 次 -lopbase → ld 把 opbase.so 的符号表复制 55 次 → 最终 .so 膨胀 2.8GB(正常 320MB)。 # ❌ 每个仓库独立 link opbase → 符号重复 # ops-nn/CMakeLists.txt: target_link_libraries(ops_nn opbase) # ops-transformer/CMakeLists.txt: target_link_libraries(ops_transformer ops_nn opbase) # ← opbase 又 link 一次 # ✅ 用 INTERFACE 传递不重复 link # opbase/CMakeLists.txt: add_library(opbase SHARED ...) 
target_include_directories(opbase INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) # 其他仓库只需要 INTERFACE 依赖不在 link 阶段重复 find_package(opbase CONFIG REQUIRED) target_link_libraries(my_op PRIVATE ${CANN_LIBRARIES} # ← opbase 不重复 link ) target_include_directories(my_op SYSTEM PRIVATE ${OPBASE_INCLUDE_DIRS}) # opbase 在 CANN_LIBRARIES 中已含通过 FindCANN不重复添加 踩坑三:CMake 构建时 Ascend 910 vs 950 的 Tiling 块大小不兼容——L1_CACHE_SIZE 宏在不同 NPU 间不一致 # ❌ 硬编码 1MB L1Ascend 910 add_compile_definitions(L1_CACHE_SIZE1048576) # → Ascend 950 的 L1 只有 512KB → Tiling: BM/BN/BK 溢出 L1 → kernel 错误 # ✅ 运行时或编译期判断 NPU 型号 # cmake: if(CANN_SOC_VERSION MATCHES 910) add_compile_definitions(L1_CACHE_SIZE1048576) elseif(CANN_SOC_VERSION MATCHES 950) add_compile_definitions(L1_CACHE_SIZE524288) else() message(WARNING Unknown NPU version: ${CANN_SOC_VERSION}, L1 unknown) add_compile_definitions(L1_CACHE_SIZE1048576) # fallback endif() # 或用 AutoTiling 运行时判断不用编译期宏 // In kernel code: __aicore__ void MatMulKernel(...) { // 运行时获取 L1 大小 int l1_size GetL1CacheSize(); // 910 → 1048576, 950 → 524288 auto config AutoTiling::Compute(M, N, K, l1_size); // ... } cmake 仓库是 CANN 全 55 个仓库的统一编译入口。FindCANN.cmake 自动探测 CANN 安装路径(环境变量→默认路径→pip 虚拟环境→包管理器)、CANNTargetFunctions 提供 can_add_op_target/can_add_test_target/can_add_pybind_target 三个高层次构建宏(封装 Ascend C kernel 编译 + ACL 测试运行 + Python binding)、SuperBuild 自动推断 55 个仓库的依赖图(opbase→ops-*→transformer→ATB→torchtitan)。三个踩坑:pip venv 不被 cmake 扫描(→用 Python3 探 site-packages)、55 个仓库独立 link opbase→符号重复膨胀 2.8GB(→用 INTERFACE 传递)、910/950 L1 大小不兼容(→编译期 if(CANN_SOC_VERSION) 或运行时 AutoTiling)。