CANN/cann-learning-hub:AICPU Tiling下沉编程
AICPU Tiling下沉编程【免费下载链接】cann-learning-hub：CANN 学习中心仓，支持在线互动运行、边学边练，提供教程、示例与优化方案，一站式助力昇腾开发者快速上手。项目地址: https://gitcode.com/cann/cann-learning-hub

基础知识准备：本文内容基于Ascend C算子开发衍生而来，对于算子开发还不了解的读者，可以通过以下资源进行学习：《Ascend C算子开发文档手册》《Ascend C算子开发系列课程》

背景介绍：Host Bound一直是算子调用的显著性能瓶颈，造成Host Bound的核心原因就在于算子在Kernel执行前都需要计算出TilingData，而TilingData的计算通常是在Host侧完成、再拷贝到Device侧的。针对这一问题，我们推出了AICPU Tiling下沉编程方式，使用Device侧的AICPU计算TilingData，节省了Host侧拷贝TilingData到Device侧的步骤，降低算子执行耗时。

亮点介绍：通过减少Host与Device的交互，提升算子执行性能；通过调用AICPU的方式，降低了编程成本。

AICPU Tiling下沉编程使用详解

一、开发流程

目录结构：以一个简单的abs算子的demo为例：

```
aicpu_demo/        # demo目录
├── main.cpp       # 算子入口，分别调用AICPU和AICORE
├── abs.aicpu      # 算子AICPU实现
├── abs.asc        # 算子AICORE实现
├── kernel_args.h  # 结构体定义
├── CMakeLists.txt # cmake文件
```

编写AICPU Tiling实现逻辑：定义AICPU Tiling的KernelArgs入参，对应kernel_args.h文件。当前方式调用AICPU函数可以通过传入一个结构体指针的方式进行调用，如下将算子需要的用于计算Tiling的入参和输出的TilingData地址定义在一个struct中。

```cpp
// kernel_args.h
struct TilingInfo {
    uint32_t data_size_per_block;
};
struct KernelArgs {
    uint32_t block_num;
    uint32_t data_size;
    TilingInfo *ti; // 与aicore共享的参数
};
```

AICPU Tiling的实现：将上一步定义的KernelArgs作为入参，实现AICPU Tiling的逻辑，将计算好的结果写入TilingData中。

```cpp
// abs.aicpu
template <typename T>
__global__ __aicpu__ int32_t TemplateAicpuKernel(T *args)
{
    // 计算每个核需要处理的数据量，将结果保存在tiling地址对应的device空间中
    args->ti->data_size_per_block = args->data_size / args->block_num;
    return 0;
}
```

编写AICORE实现逻辑：实现一个简单的abs算子示例，只使用一个核计算所有输入的abs结果，通过tiling地址来访问计算好的tiling数据。

```cpp
// abs.asc
template <typename T>
__aicore__ void hello_world_impl(GM_ADDR src_gm, GM_ADDR dst_gm, GM_ADDR tiling_addr)
{
    __gm__ struct TilingInfo *tiling = (__gm__ struct TilingInfo *)tiling_addr;
    uint64_t dataSize = tiling->data_size_per_block;
    AscendC::printf("aicore get dataSize %d\n", dataSize);
    AscendC::GlobalTensor<float> inputGlobal;
    AscendC::GlobalTensor<float> outputGlobal;
    inputGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(src_gm), dataSize);
    outputGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(dst_gm), dataSize);
    AscendC::TPipe pipe;
    AscendC::TBuf<AscendC::TPosition::VECCALC> calcBuf;
    pipe.InitBuffer(calcBuf, dataSize * sizeof(float));
    AscendC::LocalTensor<float> tempTensor1 = calcBuf.Get<float>();
    AscendC::DataCopy(tempTensor1, inputGlobal, dataSize);
    event_t eventID1 = static_cast<event_t>(pipe.FetchEventID(AscendC::HardEvent::MTE2_V));
    AscendC::SetFlag<AscendC::HardEvent::MTE2_V>(eventID1);
    AscendC::WaitFlag<AscendC::HardEvent::MTE2_V>(eventID1);
    AscendC::Abs(tempTensor1, tempTensor1, dataSize);
    event_t eventIdVToMte3 = static_cast<event_t>(pipe.FetchEventID(AscendC::HardEvent::V_MTE3));
    AscendC::SetFlag<AscendC::HardEvent::V_MTE3>(eventIdVToMte3);
    AscendC::WaitFlag<AscendC::HardEvent::V_MTE3>(eventIdVToMte3);
    AscendC::DataCopy(outputGlobal, tempTensor1, dataSize);
}
```

通过两条不同的流分别调用AICPU和AICORE任务：出于性能的考虑，需要使用不同的两条流来分别执行AICPU和AICORE任务，目的是在网络场景中让AICPU和AICORE的计算能够并行；同时对于单算子内部的实现，需要使用event机制来保证AICPU的计算结束后，再执行AICORE上的任务。

```cpp
// main.cpp
int32_t main(void)
{
    CHECK_ACL(aclInit(nullptr));
    int32_t deviceId = 0;
    printf("acl init ok! \n");
    CHECK_ACL(aclrtSetDevice(deviceId));
    printf("set device ok! \n");
    aclrtStream aicpu_stream = nullptr;
    aclrtStream aicore_stream = nullptr;
    CHECK_ACL(aclrtCreateStream(&aicpu_stream));
    CHECK_ACL(aclrtCreateStream(&aicore_stream));
    printf("create stream ok! \n");
    aclrtEvent event;
    CHECK_ACL(aclrtCreateEventExWithFlag(&event, ACL_EVENT_SYNC));
    void *srcDevice;
    void *dstDevice;
    void *ti;
    CHECK_ACL(aclrtMalloc((void **)&srcDevice, 4096, ACL_MEM_MALLOC_HUGE_FIRST));
    CHECK_ACL(aclrtMalloc((void **)&dstDevice, 4096, ACL_MEM_MALLOC_HUGE_FIRST));
    CHECK_ACL(aclrtMalloc((void **)&ti, 4096, ACL_MEM_MALLOC_HUGE_FIRST));
    void *zHost = malloc(4096);
    memset(zHost, 0, 4096);
    CHECK_ACL(aclrtMemcpy(srcDevice, 4096, zHost, 4096, ACL_MEMCPY_HOST_TO_DEVICE));
    CHECK_ACL(aclrtMemcpy(dstDevice, 4096, zHost, 4096, ACL_MEMCPY_HOST_TO_DEVICE));
    struct KernelArgs args = {0};
    args.block_num = 1;
    args.data_size = 10;
    args.ti = (TilingInfo *)ti;
    TemplateAicpuKernel_do(aicpu_stream, &args);
    CHECK_ACL(aclrtRecordEvent(event, aicpu_stream));
    CHECK_ACL(aclrtStreamWaitEvent(aicore_stream, event));
    hello_world_do(1, aicore_stream, (uint8_t *)srcDevice, (uint8_t *)dstDevice, (uint8_t *)ti);
    printf("launch ok! \n");
    CHECK_ACL(aclrtSynchronizeStreamWithTimeout(aicore_stream, 10000));
    printf("sync ok!\n");
    CHECK_ACL(aclrtFree(srcDevice));
    CHECK_ACL(aclrtFree(dstDevice));
    free(zHost);
    CHECK_ACL(aclrtDestroyStream(aicpu_stream));
    CHECK_ACL(aclrtDestroyStream(aicore_stream));
    CHECK_ACL(aclrtResetDevice(deviceId));
    CHECK_ACL(aclFinalize());
    return 0;
}
```

AICPU Tiling入口：

```cpp
// abs.asc
template <typename T>
extern __global__ __aicpu__ int32_t TemplateAicpuKernel(T *args);
template extern __global__ __aicpu__ int32_t TemplateAicpuKernel<KernelArgs>(KernelArgs *args);

void TemplateAicpuKernel_do(void *stream, KernelArgs *args) // aicpu entrance
{
    TemplateAicpuKernel<KernelArgs><<<1, nullptr, stream>>>(args, sizeof(KernelArgs));
}
```

AICORE入口：

```cpp
// abs.asc
template <typename T>
__global__ __aicore__ void hello_world(GM_ADDR src, GM_ADDR dst, GM_ADDR tiling)
{
    hello_world_impl<T>(src, dst, tiling);
}

extern "C" {
void hello_world_do(uint32_t blockDim, void *stream, uint8_t *src, uint8_t *dst, uint8_t *ti) // aicore entrance
{
    hello_world<int><<<1, nullptr, stream>>>(src, dst, ti);
}
}
```

CMake编译：在CMakeLists.txt文件中分别使用不同的编译配置编译AICORE和AICPU，最终将结果打包成一个静态库。

```cmake
// CMakeLists.txt
cmake_minimum_required(VERSION 3.18)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_HOME_PATH} CACHE PATH "ASCEND CANN package installation directory" FORCE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
set(CMAKE_INSTALL_PREFIX ${CMAKE_CURRENT_LIST_DIR}/out CACHE STRING "path for install()" FORCE)
set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
set(ASCEND_LIB_DIR $ENV{ASCEND_HOME_PATH}/x86_64-linux/lib64)
link_directories(${ASCEND_LIB_DIR})
find_package(ASC REQUIRED)
find_package(AICPU REQUIRED)
add_library(my_kernel SHARED
    abs.aicpu
    abs.asc
)
set_target_properties(my_kernel PROPERTIES LINKER_LANGUAGE CXX)
project(my_ops LANGUAGES ASC AICPU CXX)
target_link_libraries(my_kernel PRIVATE
    ascendc_runtime
    profapi
    ascendalog
    ascendcl
    runtime
    c_sec
    mmpa
    error_manager
    ascend_dump
    pthread
)
target_compile_options(my_kernel PRIVATE
    $<$<COMPILE_LANGUAGE:ASC>:--npu-arch=dav-2201>
)
target_include_directories(my_kernel PUBLIC
    $ENV{ASCEND_HOME_PATH}/lib64
    $ENV{ASCEND_HOME_PATH}/x86_64-linux/include
    $ENV{ASCEND_HOME_PATH}/x86_64-linux/lib64
    ${ASCEND_CANN_PACKAGE_PATH}/include/ascendc/aicpu_api
)
add_executable(main main.cpp)
target_link_libraries(main PRIVATE
    my_kernel
    ascendcl
)
```

二、代码调测

Host侧：可使用通用C语言的维测手段，包括打印、GDB等。

Device侧AICORE：可直接使用AscendC::printf或AscendC::DumpTensor打印变量调试。

AICPU：也可使用AscendC::printf打印变量调试。

```cpp
__global__ __aicpu__ int32_t TemplateAicpuKernel(T *args)
{
    int32_t var = 0;
    AscendC::printf("TemplateAicpuKernel inited! %d\n", var);
    ...
}
```

三、
性能调优：该方案中由于把Tiling计算移动到了AICPU上，因此Tilingkey无法在Host上获取，只能将原本的Tilingkey分发逻辑移动到AICORE Kernel中进行判断。在实际开发dequant_swiglu_quant算子时，初步性能测试发现这一改动导致了额外的icache miss，算子整体性能下降5%。

```cpp
__global__ __aicore__ __attribute__((aiv)) void dequant_swiglu_quant(GM_ADDR x, GM_ADDR weight_scale,
    GM_ADDR activation_scale, GM_ADDR bias, GM_ADDR quant_scale, GM_ADDR quant_offset,
    GM_ADDR y, GM_ADDR scale, GM_ADDR tiling_data)
{
    __gm__ struct DequantSwigluQuantTiling *tiling = (__gm__ struct DequantSwigluQuantTiling *)tiling_data;
    if (AscendC::GetBlockIdx() >= tiling->core_num) {
        return;
    }
    // 原本是在Host上进行判断
    if (tiling->tiling_key == 0) {
        swiglu_quant_impl<float16_t, 2>(x, weight_scale, activation_scale, bias, quant_scale, quant_offset, y, scale, tiling_data);
    } else if (tiling->tiling_key == 1) {
        swiglu_quant_impl<bfloat16_t, 2>(x, weight_scale, activation_scale, bias, quant_scale, quant_offset, y, scale, tiling_data);
    } else if (tiling->tiling_key == 2) {
        swiglu_quant_impl<float16_t, 1>(x, weight_scale, activation_scale, bias, quant_scale, quant_offset, y, scale, tiling_data);
    } else if (tiling->tiling_key == 3) {
        swiglu_quant_impl<bfloat16_t, 1>(x, weight_scale, activation_scale, bias, quant_scale, quant_offset, y, scale, tiling_data);
    } else if (tiling->tiling_key == 4) {
        dequant_swiglu_quant_impl<2>(x, weight_scale, activation_scale, bias, quant_scale, quant_offset, y, scale, tiling_data);
    } else if (tiling->tiling_key == 5) {
        dequant_swiglu_quant_impl<1>(x, weight_scale, activation_scale, bias, quant_scale, quant_offset, y, scale, tiling_data);
    }
}
```

针对这一现象，可以使用Ascend C提供的ICachePreLoad接口将代码段预加载到ICache中，使得该算子整体性能相较于原本提升了15%。

```cpp
template <int BufferNum>
__aicore__ void dequant_swiglu_quant_impl(GM_ADDR x, GM_ADDR weight_scale, GM_ADDR activation_scale,
    GM_ADDR bias, GM_ADDR quant_scale, GM_ADDR quant_offset, GM_ADDR y, GM_ADDR scale, GM_ADDR tiling_data)
{
    AscendC::ICachePreLoad(2); // 按照实际代码段长度，根据接口文档来设置参数
    AscendC::TPipe pipe;
    DequantSwigluQuantKernel<BufferNum> op(pipe);
    op.init(x, weight_scale, activation_scale, bias, quant_scale, quant_offset, y, scale, tiling_data);
    op.process();
}
```

AICPU Tiling+ICachePreLoad耗时（注：以下表格数值由原文受损排版恢复，切分或有偏差，请以原文为准）：

| 数据类型 | 大case(us) | 小case(us) |
| --- | --- | --- |
| FP16 | 60.8 | 9.6 |
| BF16 | 62.38 | 9.2 |
| INT32 | 85 | 11.36 |

原版耗时：

| 数据类型 | 大case(us) | 小case(us) |
| --- | --- | --- |
| FP16 | 69.08 | 9.46 |
| BF16 | 69.48 | 8.56 |
| INT32 | 105 | 12.96 |

总结：AICPU Tiling下沉方案优化了算子在Host侧上动态计算Tiling场景的性能，同时通过<<<>>>的方式调用AICPU，让开发者能轻松地完成方案的代码适配。此方案正在逐步应用到实际的商用业务场景中，成为解决算子Host-Bound问题的有效路径之一。

【免费下载链接】cann-learning-hub：CANN 学习中心仓，支持在线互动运行、边学边练，提供教程、示例与优化方案，一站式助力昇腾开发者快速上手。项目地址: https://gitcode.com/cann/cann-learning-hub

创作声明:本文部分内容由AI辅助生成(AIGC)，仅供参考