diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
index 8eaf31de44..47a83f654c 100644
--- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
+++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
@@ -31,7 +31,22 @@ using namespace vpux;
 
 namespace {
 
-static const SmallVector<std::string> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"convert", "softmax"};
+struct GapCandidate {
+    uint64_t lookaheadGap = 0;
+    int64_t insertionPointTaskIndex = -1;
+
+    // used to order candidates by gap size
+    bool operator>(const GapCandidate& other) const {
+        return lookaheadGap > other.lookaheadGap;
+    }
+};
+
+static const SmallVector<std::string> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {
+        "activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm",
+        "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"};
+
+static const SmallVector<std::string> SW_DUMMY_KERNELS_WITHOUT_ARGS = {
+        "convert", "eltwise_mul", "activation_cos", "activation_sin", "eltwise_equal", "eltwise_select", "rms_norm"};
 
 //
 // AddSwKernelInstructionPrefetch
@@ -66,12 +81,13 @@ class AddSwKernelInstructionPrefetch final :
                                                  size_t clusterIdx, std::string& kernelName,
                                                  mlir::SymbolRefAttr functionSymbol);
-    VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, mlir::Value updateBarrier,
-                                                               size_t clusterIdx, std::string& kernelName);
+    VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask,
+                                                               mlir::ValueRange updateBarrier, size_t clusterIdx,
+                                                               std::string& kernelName);
     mlir::Operation* getFirstSwTaskInIRWaitingForBarrier(mlir::Value waitBarrier);
     std::pair<std::string, size_t> getKernelNameAndSize(VPUIP::SwKernelOp swKernelOp);
 
-    using SwKernelPrefetchVec = std::vector<std::pair<std::string, size_t>>;
+    using SwKernelPrefetchVec = std::vector<std::tuple<std::string, size_t, size_t>>;
     std::pair<SwKernelPrefetchVec, size_t> getPrefetchCandidatesAndFirstSwTask(mlir::Operation* funcOp,
                                                                                VPURT::TaskConfigVec& allTasks);
     std::tuple<mlir::Operation*, mlir::Value, size_t> getFirstSwTaskInIRAndBestUpdateBarrier(
@@ -79,6 +95,12 @@
     std::vector<VPUIP::SwKernelOp> insertPrefetchTasks(mlir::Operation* funcOp, SwKernelPrefetchVec& kernelsToPrefetch,
                                                        mlir::Operation* firstShaveTaskInIR, mlir::Value bestUpdateBarrier);
+    std::optional<GapCandidate> findBestInsertionGapDuringExec(const std::string& kernelName,
+                                                               uint64_t targetKernelGroupStartTime,
+                                                               VPURT::TaskConfigVec& allTasks, size_t numClusters);
+    std::vector<VPUIP::SwKernelOp> insertPrefetchTasksDuringExec(
+            mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch,
+            VPURT::TaskConfigVec& allTasks);
 
     bool hasVPUSWModule(mlir::Operation* funcOp);
     size_t getOffsetReservedMem(const mlir::ModuleOp module);
@@ -94,6 +116,13 @@
     bool _minFreeCyclesHasValue = false;
     size_t _minimumFreeCyclesForPrefetch = 250000;
     bool _useDummyKernelForInstructionPrefetch = false;
+    size_t _dynamicPrefetchTileCounter = 0;
+    // Use tile 1 as the insertion target so that prefetching is enabled only when more than one
+    // tile is available.
+    int64_t _targetInsertTileDuringExec = 1;
+    // The threshold of 50,000 cycles is empirically chosen to ensure there is a sufficient gap
+    // to perform instruction prefetching without causing stalls.
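+    // As a rough sanity check (assuming cycles here are counted at the 1.7 GHz NCE clock used in
+    // the lit tests below), 50,000 cycles / 1.7 GHz is about 29 microseconds of schedule slack.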
+    uint64_t _prefetchGapThresholdDuringExec = 50000;
 };
 
 bool AddSwKernelInstructionPrefetch::hasVPUSWModule(mlir::Operation* funcOp) {
@@ -186,21 +215,26 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertPrefetchOpBeforeFirstKer
 }
 
 // For LNL, Shave kernel instruction prefetch needs to insert a dummy kernel instead of a prefetch kernel
-VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask,
-                                                                                           mlir::Value updateBarrier,
-                                                                                           size_t clusterIdx,
-                                                                                           std::string& kernelName) {
+VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirstKernelTask(
+        mlir::Operation* firstSwTask, mlir::ValueRange updateBarrier, size_t clusterIdx, std::string& kernelName) {
     mlir::OpBuilder builder(firstSwTask);
-    auto moduleOp = firstSwTask->getParentOfType<mlir::ModuleOp>();
+    auto kernelOp = kernelNameToOps[kernelName];
+    auto moduleOp = kernelOp->getParentOfType<mlir::ModuleOp>();
     auto reservedMemOffset = getOffsetReservedMem(moduleOp);
     auto offsetAttr = getIntAttr(moduleOp->getContext(), reservedMemOffset);
-    auto kernelOp = kernelNameToOps[kernelName];
+    auto tileIndexAttr = kernelOp.getTileIndexAttr();
+    VPUX_THROW_UNLESS(tileIndexAttr, "SwKernelOp '{0}' does not have a tileIndex attribute", kernelOp->getLoc());
+    const int64_t tileIndex = static_cast<int64_t>(clusterIdx);
 
     auto createBuffer = [&](mlir::Value io, StringRef suffix, mlir::SmallVector<mlir::Value>& buffers) {
         if (auto bufOp = io.getDefiningOp<VPURT::DeclareBufferOp>()) {
-            auto newType = mlir::cast<vpux::NDTypeInterface>(io.getType()).changeShape({1, 1, 1, 1});
+            auto origType = mlir::cast<vpux::NDTypeInterface>(io.getType());
+            auto newMemSpaceAttr = vpux::IndexedSymbolAttr::get(moduleOp->getContext(),
+                                                                stringifyEnum(VPU::MemoryKind::CMX_NN), tileIndex);
+            auto newSectionIndexAttr = builder.getI64ArrayAttr({tileIndex});
+            auto newType = origType.changeShape({1, 1, 1, 1}).changeMemSpace(newMemSpaceAttr);
             auto newBuff = builder.create<VPURT::DeclareBufferOp>(appendLoc(bufOp->getLoc(), suffix), newType,
-                                                                  bufOp.getSectionAttr(), bufOp.getSectionIndexAttr(),
+                                                                  bufOp.getSectionAttr(), newSectionIndexAttr,
                                                                   offsetAttr, bufOp.getSwizzlingKeyAttr());
             buffers.push_back(newBuff);
             return true;
@@ -230,14 +264,16 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirst
     auto cachePrefetchSwKernel = vpux::VPURT::wrapIntoTaskOp<VPUIP::SwKernelOp>(
             builder, mlir::ValueRange(), updateBarrier, newLoc, mlir::ValueRange(srcBuffers),
-            mlir::ValueRange(dstBuffers), nullptr, kernelNameToSymbol[kernelName], kernelOp.getTileIndexAttr(),
+            mlir::ValueRange(dstBuffers), nullptr, kernelNameToSymbol[kernelName], builder.getI64IntegerAttr(tileIndex),
             kernelOp.getInputStridesAttr(), kernelOp.getOutputStridesAttr());
 
     // The dummy kernels here are generated after ActShaveProfilingPass,
     // so we need to add skipProfiling as an attribute to avoid capturing their metadata
     cachePrefetchSwKernel->setAttr("skipProfiling", mlir::UnitAttr::get(firstSwTask->getContext()));
 
-    auto args =
-        (kernelName == "convert") ? mlir::ArrayAttr::get(moduleOp->getContext(), {}) : kernelNameToArgs[kernelName];
+    auto args = llvm::is_contained(SW_DUMMY_KERNELS_WITHOUT_ARGS, kernelName)
+                        ? mlir::ArrayAttr::get(moduleOp->getContext(), {})
+                        : kernelNameToArgs[kernelName];
+
     vpux::VPUIP::initSwKernel(cachePrefetchSwKernel, mlir::ValueRange(srcBuffers), mlir::ValueRange(dstBuffers), args,
                               _log.nest(),
                               /*swKernelRunOp=*/nullptr);
@@ -316,7 +352,7 @@ AddSwKernelInstructionPrefetch::getPrefetchCandidatesAndFirstSwTask(mlir::Operat
         }
 
         if (!cache.isLoaded(kernelName)) {
-            kernelsToPrefetch.push_back(std::move(kernelNameAndSize));
+            kernelsToPrefetch.push_back(std::make_tuple(kernelName, kernelSize, shvTaskIndex));
         }
 
         cache.loadKernel(kernelName, kernelSize);
@@ -355,7 +391,8 @@ AddSwKernelInstructionPrefetch::getFirstSwTaskInIRAndBestUpdateBarrier(VPURT::In
     _log.trace("First SW kernel start time {0}, best barrier release time {1}", firstKernelTask.cycleStart,
                bestReleaseCycle);
     if (bestReleaseCycle < _minimumFreeCyclesForPrefetch) {
-        _log.info("bestReleaseCycle: {0} is smaller than _minimumFreeCyclesForPrefetch {1}, skipping prefetching",
+        _log.info("bestReleaseCycle: {0} is smaller than _minimumFreeCyclesForPrefetch {1}, trying to prefetch "
+                  "during execution",
                   bestReleaseCycle, _minimumFreeCyclesForPrefetch);
         return std::make_tuple(nullptr, nullptr, 0);
     }
@@ -394,7 +431,7 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
     for (size_t shaveIdx = 0; (shaveIdx < numClusters * noOfShavesPerCluster) && (shaveIdx < kernelsToPrefetch.size());
          shaveIdx++) {
         auto clusterIdx = shaveIdx / noOfShavesPerCluster;
-        auto [kernelName, kernelSize] = kernelsToPrefetch[shaveIdx];
+        auto [kernelName, kernelSize, shvTaskIndex] = kernelsToPrefetch[shaveIdx];
         _log.trace("Prefetching kernel {0} on cluster {1}", kernelName, clusterIdx);
 
         auto newPrefetchKernel = _useDummyKernelForInstructionPrefetch
@@ -410,6 +447,169 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
     return prefetchedKernels;
 }
 
+size_t getSwKernelCountAtTime(uint64_t startTime, VPURT::TaskConfigVec& allTasks) {
+    size_t count = 0;
+    for (auto& taskConfig : allTasks) {
+        if (static_cast<uint64_t>(taskConfig.cycleStart) == startTime) {
+            if (mlir::isa<VPUIP::SwKernelOp>(taskConfig.taskOp.getInnerTaskOp())) {
+                count++;
+            }
+        }
+        if (static_cast<uint64_t>(taskConfig.cycleStart) > startTime) {
+            break;
+        }
+    }
+    return count;
+}
+
+uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec& allTasks, size_t numClusters,
+                                 std::map<uint64_t, size_t>& swKernelCountsCache) {
+    // Saturation is defined as 2x the number of clusters (e.g., 4 clusters -> 8 SW kernels)
+    const size_t saturationThreshold = numClusters * 2;
+
+    // Iterate through tasks strictly AFTER the startIndex
+    for (size_t i = startIndex + 1; i < allTasks.size(); ++i) {
+        uint64_t currentStartTime = static_cast<uint64_t>(allTasks[i].cycleStart);
+
+        if (swKernelCountsCache.find(currentStartTime) == swKernelCountsCache.end()) {
+            swKernelCountsCache[currentStartTime] = getSwKernelCountAtTime(currentStartTime, allTasks);
+        }
+
+        if (swKernelCountsCache[currentStartTime] >= saturationThreshold) {
+            return currentStartTime;
+        }
+    }
+
+    return std::numeric_limits<uint64_t>::max();
+}
+
+std::optional<GapCandidate> AddSwKernelInstructionPrefetch::findBestInsertionGapDuringExec(
+        const std::string& kernelName, uint64_t targetKernelGroupStartTime, VPURT::TaskConfigVec& allTasks,
+        size_t numClusters) {
+    const size_t saturationThreshold = numClusters * 2;
+
+    // Candidates keyed by gap size in descending order, so begin() yields the largest gap
+    std::map<uint64_t, GapCandidate, std::greater<uint64_t>> validGaps;
+    std::map<uint64_t, size_t> swKernelCountsCache;  // local cache
+
+    int64_t prevTargetTileTaskIndex = -1;
+    uint64_t prevTargetTileTaskStartTime = 0;
+
+    // Find the largest gap between a non-saturated SW task on the target tile and the next
+    // saturation point (or the kernel to be prefetched, whichever comes first).
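+    //
+    // Illustrative example (hypothetical cycle counts; with 2 clusters the saturation threshold
+    // is 4 concurrent SW kernels):
+    //
+    //   cycle:          0         60k        150k
+    //   tile-1 task:    A         B          target kernel group
+    //   SW kernels:     4         2
+    //
+    // A is rejected because the shaves are saturated at its start time (4 >= 4). When the next
+    // tile-1 task is reached, B becomes a candidate: gap = min(nextSaturation, 150k) - 60k = 90k,
+    // which clears the 50k threshold, so the dummy kernel would be inserted right before B.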
+    for (size_t i = 0; i < allTasks.size(); ++i) {
+        auto& currentTaskConfig = allTasks[i];
+        uint64_t currentTaskStartTime = static_cast<uint64_t>(currentTaskConfig.cycleStart);
+        if (currentTaskStartTime > targetKernelGroupStartTime) {
+            break;
+        }
+
+        bool isTargetTileTask = false;
+        if (auto swOp = mlir::dyn_cast<VPUIP::SwKernelOp>(currentTaskConfig.taskOp.getInnerTaskOp()); swOp != nullptr) {
+            isTargetTileTask = (swOp.getTileIndexAttr().getInt() == _targetInsertTileDuringExec);
+        }
+
+        if (prevTargetTileTaskIndex != -1 && isTargetTileTask) {
+            auto& insertionPointTask = allTasks[prevTargetTileTaskIndex];
+            auto insertionPointStartTime = static_cast<uint64_t>(insertionPointTask.cycleStart);
+
+            size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks);
+
+            if (simultaneousSwKernels < saturationThreshold) {
+                uint64_t nextSaturationStart =
+                        findNextSaturationStart(prevTargetTileTaskIndex, allTasks, numClusters, swKernelCountsCache);
+                uint64_t gapEnd = std::min(nextSaturationStart, targetKernelGroupStartTime);
+                uint64_t lookaheadGap = 0;
+                if (gapEnd > prevTargetTileTaskStartTime) {
+                    lookaheadGap = gapEnd - prevTargetTileTaskStartTime;
+                }
+
+                if (lookaheadGap >= _prefetchGapThresholdDuringExec) {
+                    GapCandidate gap;
+                    gap.lookaheadGap = lookaheadGap;
+                    gap.insertionPointTaskIndex = prevTargetTileTaskIndex;
+                    validGaps[lookaheadGap] = gap;
+                }
+            }
+        }
+
+        if (isTargetTileTask) {
+            prevTargetTileTaskIndex = static_cast<int64_t>(i);
+            prevTargetTileTaskStartTime = currentTaskStartTime;
+        }
+    }
+
+    if (validGaps.empty()) {
+        _log.trace("Kernel '{0}': No suitable insertion point found.", kernelName);
+        return std::nullopt;
+    }
+
+    return validGaps.begin()->second;
+}
+
+std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTasksDuringExec(
+        mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch,
+        VPURT::TaskConfigVec& allTasks) {
+    auto moduleOp = funcOp->getParentOfType<mlir::ModuleOp>();
+    const auto numClusters = getNumTiles(moduleOp);
+    VPUX_THROW_WHEN(numClusters == 0, "Number of tiles is zero.");
+
+    std::vector<VPUIP::SwKernelOp> prefetchedKernels{};
+
+    for (auto& kernelInfo : kernelsToPrefetch) {
+        std::string kernelName = std::get<0>(kernelInfo);
+        size_t firstAppearanceIndex = std::get<2>(kernelInfo);
+
+        if (firstAppearanceIndex >= allTasks.size()) {
+            _log.trace("Skipping kernel '{0}': Invalid firstAppearanceIndex {1}", kernelName, firstAppearanceIndex);
+            continue;
+        }
+        if (kernelNameToOps.count(kernelName) == 0) {
+            _log.trace("Skipping kernel '{0}': Missing dependencies (kernelNameToOps)", kernelName);
+            continue;
+        }
+
+        auto targetKernelGroupStartTime = static_cast<uint64_t>(allTasks[firstAppearanceIndex].cycleStart);
+
+        // Finds the best insertion point for prefetch by identifying non-saturated execution windows.
+        // Scans for tasks on the target tile to serve as prefetch anchors. A valid "gap" is the
+        // duration from an anchor task to the next saturation event or the target kernel start.
+        //
+        // Logic:
+        // 1. Find a candidate task on the target tile.
+        // 2. Ensure the NPU is not saturated at that time.
+        // 3. Calculate gap = (next saturation or target start) - insertion time.
+        // 4. Return the candidate with the largest gap >= _prefetchGapThresholdDuringExec.
+        auto bestGapOpt = findBestInsertionGapDuringExec(kernelName, targetKernelGroupStartTime, allTasks, numClusters);
+
+        if (!bestGapOpt.has_value()) {
+            _log.trace("Kernel '{0}': No valid gap found.", kernelName);
+            continue;
+        }
+
+        GapCandidate bestGap = bestGapOpt.value();
+        _log.trace("Kernel '{0}': Found best gap of {1} cycles. Inserting relative to task {2}.", kernelName,
+                   bestGap.lookaheadGap, bestGap.insertionPointTaskIndex);
+
+        if (bestGap.insertionPointTaskIndex < 0 ||
+            static_cast<size_t>(bestGap.insertionPointTaskIndex) >= allTasks.size()) {
+            _log.error("Kernel '{0}': Invalid insertionPointTaskIndex {1}. Skipping insertion.", kernelName,
+                       bestGap.insertionPointTaskIndex);
+            continue;
+        }
+
+        auto insertBeforeOp = allTasks[bestGap.insertionPointTaskIndex].taskOp;
+        size_t dynamicExecTile = _dynamicPrefetchTileCounter % numClusters;
+        _dynamicPrefetchTileCounter++;
+
+        auto newPrefetchKernel = insertDummyKernelOpBeforeFirstKernelTask(insertBeforeOp, mlir::ValueRange(),
+                                                                          dynamicExecTile, kernelName);
+
+        prefetchedKernels.push_back(newPrefetchKernel);
+    }
+
+    return prefetchedKernels;
+}
+
 void AddSwKernelInstructionPrefetch::safeRunOnFunc() {
     auto funcOp = getOperation();
     if (!hasVPUSWModule(funcOp)) {
@@ -444,10 +644,6 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() {
     auto [kernelsToPrefetch, firstShvTaskIndex] = getPrefetchCandidatesAndFirstSwTask(funcOp, allTasks);
     auto [firstShaveTaskInIR, bestUpdateBarrier, bestReleaseCycle] =
             getFirstSwTaskInIRAndBestUpdateBarrier(infSim, allTasks, firstShvTaskIndex);
-    if (firstShaveTaskInIR == nullptr || kernelsToPrefetch.empty()) {
-        return;
-    }
-
-    _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle);
     if (_useDummyKernelForInstructionPrefetch) {
         auto memSpaceAttr = mlir::SymbolRefAttr::get(module->getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN));
@@ -455,7 +651,17 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() {
         VPUX_THROW_WHEN(dummyKernelResMem == nullptr,
                         "Cannot find DummySWKernelsForInstructionPrefetchReservedMemory!");
     }
-    auto newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier);
+    if (kernelsToPrefetch.empty()) {
+        return;
+    }
+
+    std::vector<VPUIP::SwKernelOp> newPrefetchKernels;
+    if (firstShaveTaskInIR) {
+        _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle);
+        newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier);
+    } else {
+        newPrefetchKernels = insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks);
+    }
 
     // Update dependencies for cache handling operations to meet requirements of control graph split.
     auto& barrierInfo = getAnalysis<BarrierInfo>();
diff --git a/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir
new file mode 100644
index 0000000000..7c75b60da1
--- /dev/null
+++ b/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir
@@ -0,0 +1,180 @@
+//
+// Copyright (C) 2025 Intel Corporation.
+// SPDX-License-Identifier: Apache-2.0
+//
+
+// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true enable-sw-kernel-fifo-per-shave-engine=false" --add-sw-kernel-instruction-prefetch %s | FileCheck %s
+// REQUIRES: arch-NPU40XX
+
+!DummyDDRT = memref<32000x1x1x1xf16, @DDR>
+!DummyCMX0T = memref<32000x1x1x1xf16, [@CMX_NN, 0]>
+!DummyCMX1T = memref<32000x1x1x1xf16, [@CMX_NN, 1]>
+!DummyCMX0Convert = memref<32000x1x1x1xf32, [@CMX_NN, 0]>
+!DummyCMX1Convert = memref<32000x1x1x1xf32, [@CMX_NN, 1]>
+
+// This test checks the following schedule
+// Barriers :  0         1         2            3             4         5
+// Cluster 0: | [ DMA ] | [ DMA ] | [ Softmax ] | [ Convert ] | [ DMA ] | [ Softmax ]
+// Cluster 1: | [ DMA ] | [ Softmax ] | [ Convert ]
+// Other    : [ SyncDMA ] |
+//
+
+module @subgraph attributes {config.arch = #config.arch_kind<NPU40XX>, config.compilationMode = #config.compilation_mode<DefaultHW>} {
+    VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096, 4096, 4096]
+    module @VPU.SW {
+        func.func private @builtin_SoftMax(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i64, i64) attributes {VPU.kernel_code = "softmax.cpp", VPU.kernel_entry = "softmax", VPU.task_type = @COMPUTE}
+        func.func private @builtin_Convert(memref<*xf16, @CMX_NN>, memref<*xf32, @CMX_NN>) attributes {VPU.kernel_code = "convert.cpp", VPU.kernel_entry = "convert", VPU.kernel_name = "convert", VPU.task_type = @COMPUTE}
+        func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"}
+    }
+    config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz {
+        builtin.module @ReservedMemory {
+            module @DummySWKernelsForInstructionPrefetchReservedMemory {
+                config.MemoryResource 8 bytes of @CMX_NN offset 1474552
+            }
+        }
+        config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware
+        config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64}
+        config.ExecutorResource 2 of @SHAVE_ACT
+        config.ExecutorResource 1 of @DPU
+    }
+    config.ExecutorResource 1 of @M2I
+    config.ExecutorResource 1 of @DMA_NN
+    config.MemoryResource 2306867200 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64}
+    net.NetworkInfo {inferenceTiming = 369464 : i64} entryPoint : @main inputsInfo : {
+        DataInfo "data" : tensor<1x3x62x62xui8>
+    } outputsInfo : {
+        DataInfo "out" : tensor<1x3x62x62xui8>
+    }
+    func.func @main(%arg0: memref<1x3x62x62xui8, @DDR>) -> memref<1x3x62x62xui8, @DDR> {
+        %0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+        %1 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+        %2 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+        %3 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+        %4 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+        %5 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+        %6 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+        %7 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+
+        // CHECK: [[BARRIER_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+        // CHECK: [[BARRIER_1:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+        // CHECK: [[BARRIER_2:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+        // CHECK: [[BARRIER_3:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+        // CHECK: [[BARRIER_4:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+        // CHECK: [[BARRIER_5:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+        // CHECK: [[BARRIER_6:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+        // CHECK: [[BARRIER_7:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+
+        %28 = VPURT.DeclareBuffer <DDR> <0> -> memref<0x0x0x0xi32, @DDR>
+        %ddr_buf = VPURT.DeclareBuffer <DDR> <0> -> !DummyDDRT
+        %cmx_0 = VPURT.DeclareBuffer <CMX_NN> [0] <0> -> !DummyCMX0T
+        %cmx_1 = VPURT.DeclareBuffer <CMX_NN> [1] <0> -> !DummyCMX1T
+
+        VPURT.Task updates(%0 : !VPURT.Barrier) {
+            %241 = VPUIP.SyncDMA {port = 0 : i64} inputs(%28 : memref<0x0x0x0xi32, @DDR>) outputs(%28 : memref<0x0x0x0xi32, @DDR>) -> memref<0x0x0x0xi32, @DDR>
+        }
+
+        VPURT.Task waits(%0: !VPURT.Barrier) updates(%1 : !VPURT.Barrier) {
+            %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf : !DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T
+        }
+
+        VPURT.Task waits(%1: !VPURT.Barrier) updates(%2 : !VPURT.Barrier) {
+            %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf : !DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T
+        }
+
+        VPURT.Task waits(%2: !VPURT.Barrier) updates(%3 : !VPURT.Barrier) {
+            %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf : !DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T
+        }
+
+        VPURT.Task waits(%3: !VPURT.Barrier) updates(%4 : !VPURT.Barrier) {
+            %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf : !DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T
+        }
+
+        VPURT.Task waits(%3: !VPURT.Barrier) updates(%4 : !VPURT.Barrier) {
+            %241 = VPUIP.NNDMA {port = 1 : i64} inputs(%ddr_buf : !DummyDDRT) outputs(%cmx_1 : !DummyCMX1T) -> !DummyCMX1T
+        }
+
+        VPURT.Task waits(%4: !VPURT.Barrier) updates(%5 : !VPURT.Barrier) {
+            %results = VPUIP.SW.Kernel {resultSegmentSizes = array<i32: 1, 0, 0>} @VPU.SW::@builtin_SoftMax inputs(%cmx_0 as %arg3: !DummyCMX0T) outputs(%cmx_0 as %arg4: !DummyCMX0T) on tile 0 -> !DummyCMX0T {
+                VPUIP.SW.Kernel.run {attrs = [0, 0]}(%arg3, %arg4) : !DummyCMX0T, !DummyCMX0T
+            }
+        }
+
+        VPURT.Task waits(%4: !VPURT.Barrier) updates(%5 : !VPURT.Barrier) {
+            %results = VPUIP.SW.Kernel {resultSegmentSizes = array<i32: 1, 0, 0>} @VPU.SW::@builtin_SoftMax inputs(%cmx_1 as %arg3: !DummyCMX1T) outputs(%cmx_1 as %arg4: !DummyCMX1T) on tile 1 -> !DummyCMX1T {
+                VPUIP.SW.Kernel.run {attrs = [0, 0]}(%arg3, %arg4) : !DummyCMX1T, !DummyCMX1T
+            }
+        }
+
+        %cmx0_convert = VPURT.DeclareBuffer <CMX_NN> [0] <0> -> !DummyCMX0Convert
+        VPURT.Task waits(%5: !VPURT.Barrier) updates(%6 : !VPURT.Barrier) {
+            %results = VPUIP.SW.Kernel {resultSegmentSizes = array<i32: 1, 0, 0>} @VPU.SW::@builtin_Convert inputs(%cmx_0 as %arg3: !DummyCMX0T) outputs(%cmx0_convert as %arg4: !DummyCMX0Convert) on tile 0 -> (!DummyCMX0Convert) {
+                VPUIP.SW.Kernel.run {attrs = [[]]}(%arg3, %arg4) : !DummyCMX0T, !DummyCMX0Convert
+            }
+        }
+
+        %cmx1_convert = VPURT.DeclareBuffer <CMX_NN> [1] <0> -> !DummyCMX1Convert
+        VPURT.Task waits(%5: !VPURT.Barrier) updates(%6 : !VPURT.Barrier) {
+            %results = VPUIP.SW.Kernel {resultSegmentSizes = array<i32: 1, 0, 0>} @VPU.SW::@builtin_Convert inputs(%cmx_1 as %arg3: !DummyCMX1T) outputs(%cmx1_convert as %arg4: !DummyCMX1Convert) on tile 1 -> (!DummyCMX1Convert) {
+                VPUIP.SW.Kernel.run {attrs = [[]]}(%arg3, %arg4) : !DummyCMX1T, !DummyCMX1Convert
+            }
+        }
+
+        VPURT.Task waits(%6: !VPURT.Barrier) updates(%7 : !VPURT.Barrier) {
+            %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf : !DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T
+        }
+
+        VPURT.Task waits(%7: !VPURT.Barrier) {
+            %results = VPUIP.SW.Kernel {resultSegmentSizes = array<i32: 1, 0, 0>} @VPU.SW::@builtin_SoftMax inputs(%cmx_0 as %arg3: !DummyCMX0T) outputs(%cmx_0 as %arg4: !DummyCMX0T) on tile 0 -> !DummyCMX0T {
+                VPUIP.SW.Kernel.run {attrs = [0, 0]}(%arg3, %arg4) : !DummyCMX0T, !DummyCMX0T
+            }
+        }
+
+        // CHECK: VPURT.Task updates([[BARRIER_0]] : !VPURT.Barrier) {
+        // CHECK-NEXT: VPUIP.SyncDMA
+
+        // CHECK: VPURT.Task waits([[BARRIER_0]] : !VPURT.Barrier) updates([[BARRIER_1]] : !VPURT.Barrier) {
+        // CHECK-NEXT: VPUIP.NNDMA
+
+        // CHECK: VPURT.Task waits([[BARRIER_1]] : !VPURT.Barrier) updates([[BARRIER_2]] : !VPURT.Barrier) {
+        // CHECK-NEXT: VPUIP.NNDMA
+
+        // CHECK: VPURT.Task waits([[BARRIER_2]] : !VPURT.Barrier) updates([[BARRIER_3]] : !VPURT.Barrier) {
+        // CHECK-NEXT: VPUIP.NNDMA
+
+        // CHECK: VPURT.Task waits([[BARRIER_3]] : !VPURT.Barrier) updates([[BARRIER_4]] : !VPURT.Barrier) {
+        // CHECK-NEXT: VPUIP.NNDMA
+
+        // CHECK: VPURT.Task waits([[BARRIER_3]] : !VPURT.Barrier) updates([[BARRIER_4]] : !VPURT.Barrier) {
+        // CHECK-NEXT: VPUIP.NNDMA
+
+        // CHECK: VPURT.Task waits([[BARRIER_4]] : !VPURT.Barrier) updates([[BARRIER_5]] : !VPURT.Barrier) {
+        // CHECK: VPUIP.SW.Kernel
+        // CHECK-SAME: @VPU.SW::@builtin_SoftMax
+
+        // CHECK: VPURT.Task waits([[BARRIER_4]] : !VPURT.Barrier) updates([[BARRIER_5]] : !VPURT.Barrier) {
+        // CHECK: VPUIP.SW.Kernel
+        // CHECK-SAME: @VPU.SW::@builtin_SoftMax
+
+        // CHECK: VPURT.Task {
+        // CHECK-NEXT: VPUIP.SW.Kernel
+        // CHECK-SAME: skipProfiling
+        // CHECK-SAME: @VPU.SW::@builtin_Convert
+
+        // CHECK: VPURT.Task waits([[BARRIER_5]] : !VPURT.Barrier) updates([[BARRIER_6]] : !VPURT.Barrier) {
+        // CHECK: VPUIP.SW.Kernel
+        // CHECK-SAME: @VPU.SW::@builtin_Convert
+
+        // CHECK: VPURT.Task waits([[BARRIER_5]] : !VPURT.Barrier) updates([[BARRIER_6]] : !VPURT.Barrier) {
+        // CHECK: VPUIP.SW.Kernel
+        // CHECK-SAME: @VPU.SW::@builtin_Convert
+
+        // CHECK: VPURT.Task waits([[BARRIER_6]] : !VPURT.Barrier) updates([[BARRIER_7]] : !VPURT.Barrier) {
+        // CHECK-NEXT: VPUIP.NNDMA
+
+        // CHECK: VPURT.Task waits([[BARRIER_7]] : !VPURT.Barrier) {
+        // CHECK: VPUIP.SW.Kernel
+        // CHECK-SAME: @VPU.SW::@builtin_SoftMax
+
+        return %arg0 : memref<1x3x62x62xui8, @DDR>
+    }
+}
diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir
index a2ae982802..63fcf82282 100644
--- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir
+++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir
@@ -20,6 +20,14 @@
 // CHECK-LABEL: @SoftMax
 module @SoftMax attributes {config.arch = #config.arch_kind<NPU40XX>, config.compilationMode = #config.compilation_mode<DefaultHW>} {
+    config.Resources {activity_factor = 0.078934384661980161 : f64} 6 of @NCE at 1.700000e+03 MHz {
+        builtin.module @ReservedMemory {
+            module @DummySWKernelsForInstructionPrefetchReservedMemory {
+                config.MemoryResource 8 bytes of @CMX_NN offset 1473528
+            }
+        }
+    }
+
     VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096]
     module @VPU.SW {
         func.func private @builtin_SoftMax(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i64, i64) attributes {VPU.kernel_code = "softmax.cpp", VPU.kernel_entry = "softmax", VPU.task_type = @COMPUTE}
@@ -166,6 +174,14 @@
 // CHECK-LABEL: @TwoFunctions
 module @TwoFunctions attributes {config.arch = #config.arch_kind<NPU40XX>, config.compilationMode = #config.compilation_mode<DefaultHW>} {
+    config.Resources {activity_factor = 0.078934384661980161 : f64} 6 of @NCE at 1.700000e+03 MHz {
+        builtin.module @ReservedMemory {
+            module @DummySWKernelsForInstructionPrefetchReservedMemory {
+                config.MemoryResource 8 bytes of @CMX_NN offset 1473528
+            }
+        }
+    }
+
     // CHECK-DAG: {{ }}config.Resources
     VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096]
diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir
index 12e3e9d702..24593a29e1 100644
--- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir
+++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir
@@ -9,6 +9,14 @@
 !MemRef = memref<1x3x62x62xf16>
 
 module @ChainCalls {
+    config.Resources 2 of @NCE at 1.300000e+03 MHz {
+        builtin.module @ReservedMemory {
+            module @DummySWKernelsForInstructionPrefetchReservedMemory {
+                config.MemoryResource 8 bytes of @CMX_NN offset 0
+            }
+        }
+    }
+
     net.NetworkInfo entryPoint : @main inputsInfo : {
         DataInfo "input" : tensor<1x3x62x62xf16>
     } outputsInfo : {
@@ -61,6 +69,14 @@ module @ChainCalls {
 !MemRef = memref<1x1x2x64xf16>
 
 module @SwKernelsChainCalls {
+    config.Resources 2 of @NCE at 1.300000e+03 MHz {
+        builtin.module @ReservedMemory {
+            module @DummySWKernelsForInstructionPrefetchReservedMemory {
+                config.MemoryResource 8 bytes of @CMX_NN offset 0
+            }
+        }
+    }
+
     net.NetworkInfo entryPoint : @main inputsInfo : {
         DataInfo "input" : tensor<1x1x2x64xf16>
     } outputsInfo : {
@@ -146,6 +162,14 @@ module @SwKernelsChainCalls {
 !MemRef = memref<1x1x2x64xf16>
 
 module @SwKernelsIndependentCalls {
+    config.Resources 2 of @NCE at 1.300000e+03 MHz {
+        builtin.module @ReservedMemory {
+            module @DummySWKernelsForInstructionPrefetchReservedMemory {
+                config.MemoryResource 8 bytes of @CMX_NN offset 0
+            }
+        }
+    }
+
     net.NetworkInfo entryPoint : @main inputsInfo : {
         DataInfo "input" : tensor<1x1x2x64xf16>
     } outputsInfo : {
diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir
index 5406c523ac..e4e470ff78 100644
--- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir
+++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir
@@ -9,6 +9,14 @@
 // CHECK-LABEL: @Gather
 module @Gather attributes {config.arch = #config.arch_kind<NPU40XX>, config.compilationMode = #config.compilation_mode<DefaultHW>} {
+    config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz {
+        builtin.module @ReservedMemory {
+            module @DummySWKernelsForInstructionPrefetchReservedMemory {
+                config.MemoryResource 8 bytes of @CMX_NN offset 1473528
+            }
+        }
+    }
+
     VPURT.SW.Runtime entryPoint: @VPU.SW::@runtime stack_configuration: [4096, 4096, 4096, 4096]
diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir
index d1ed426361..7aff433937 100644
--- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir
+++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir
@@ -14,6 +14,13 @@ module @VerticalFusionOutlining attributes {config.compilationMode = #config.com
         func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"}
     }
 
+    config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz {
+        builtin.module @ReservedMemory {
+            module @DummySWKernelsForInstructionPrefetchReservedMemory {
+                config.MemoryResource 8 bytes of @CMX_NN offset 1473528
+            }
+        }
+    }
     net.NetworkInfo entryPoint : @main inputsInfo : {
         DataInfo "input" : tensor<1x16x128x128xf16, {order = #NHWC}>
     } outputsInfo : {