@@ -31,10 +31,23 @@ using namespace vpux;
 
 namespace {
 
+struct GapCandidate {
+    uint64_t lookaheadGap = 0;
+    int64_t insertionPointTaskIndex = -1;
+
+    // Used for sorting candidates by lookahead gap.
+    bool operator>(const GapCandidate& other) const {
+        return lookaheadGap > other.lookaheadGap;
+    }
+};
+
 static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {
         "activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm",
         "activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"};
 
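+// Dummy kernels that are instantiated with an empty argument list when used purely
+// for instruction prefetch.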
+static const SmallVector<StringRef> SW_DUMMY_KERNELS_WITHOUT_ARGS = {
+        "convert", "eltwise_mul", "activation_cos", "activation_sin", "eltwise_equal", "eltwise_select", "rms_norm"};
+
 //
 // AddSwKernelInstructionPrefetch
 //
@@ -82,6 +95,9 @@ class AddSwKernelInstructionPrefetch final :
     std::vector<VPUIP::SwKernelOp> insertPrefetchTasks(mlir::Operation* funcOp, SwKernelPrefetchVec& kernelsToPrefetch,
                                                        mlir::Operation* firstShaveTaskInIR,
                                                        mlir::Value bestUpdateBarrier);
+    std::optional<GapCandidate> findBestInsertionGapDuringExec(const std::string& kernelName,
+                                                               uint64_t targetKernelGroupStartTime,
+                                                               VPURT::TaskConfigVec& allTasks, size_t numClusters);
     std::vector<VPUIP::SwKernelOp> insertPrefetchTasksDuringExec(
             mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch,
             VPURT::TaskConfigVec& allTasks);
@@ -101,6 +117,12 @@ class AddSwKernelInstructionPrefetch final :
     size_t _minimumFreeCyclesForPrefetch = 250000;
     bool _useDummyKernelForInstructionPrefetch = false;
     size_t _dynamicPrefetchTileCounter = 0;
+    // Insert on tile 1 so that prefetching is enabled only when more than one tile
+    // is available.
+    int64_t _targetInsertTileDuringExec = 1;
+    // The 50,000-cycle threshold is chosen empirically: it guarantees a gap large enough
+    // to perform instruction prefetching without causing stalls.
+    uint64_t _prefetchGapThresholdDuringExec = 50000;
 };
 
 bool AddSwKernelInstructionPrefetch::hasVPUSWModule(mlir::Operation* funcOp) {
@@ -248,9 +270,7 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirst
     // so we need to add skipProfiling as attribute to avoid capturing their metadata
     cachePrefetchSwKernel->setAttr("skipProfiling", mlir::UnitAttr::get(firstSwTask->getContext()));
 
-    auto args = (kernelName == "convert" || kernelName == "eltwise_mul" || kernelName == "activation_cos" ||
-                 kernelName == "activation_sin" || kernelName == "eltwise_equal" || kernelName == "eltwise_select" ||
-                 kernelName == "rms_norm")
+    auto args = llvm::is_contained(SW_DUMMY_KERNELS_WITHOUT_ARGS, kernelName)
                         ? mlir::ArrayAttr::get(moduleOp->getContext(), {})
                         : kernelNameToArgs[kernelName];
@@ -427,6 +447,21 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
     return prefetchedKernels;
 }
 
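+// Counts the SW kernels scheduled to start exactly at 'startTime'. The early break
+// assumes 'allTasks' is sorted by cycleStart.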
+size_t getSwKernelCountAtTime(uint64_t startTime, VPURT::TaskConfigVec& allTasks) {
+    size_t count = 0;
+    for (auto& taskConfig : allTasks) {
+        if (static_cast<uint64_t>(taskConfig.cycleStart) == startTime) {
+            if (mlir::isa<VPUIP::SwKernelOp>(taskConfig.taskOp.getInnerTaskOp())) {
+                count++;
+            }
+        }
+        if (static_cast<uint64_t>(taskConfig.cycleStart) > startTime) {
+            break;
+        }
+    }
+    return count;
+}
+
 uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec& allTasks, size_t numClusters,
                                  std::map<uint64_t, size_t>& swKernelCountsCache) {
     // Saturation is defined as 2x the number of clusters (e.g., 4 clusters -> 8 SW kernels)
@@ -437,19 +472,7 @@ uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec&
         uint64_t currentStartTime = static_cast<uint64_t>(allTasks[i].cycleStart);
 
         if (swKernelCountsCache.find(currentStartTime) == swKernelCountsCache.end()) {
-            size_t swKernelCount = 0;
-            // Count all SW Kernels that start at this specific time
-            for (auto& task : allTasks) {
-                if (static_cast<uint64_t>(task.cycleStart) == currentStartTime) {
-                    if (mlir::isa<VPUIP::SwKernelOp>(task.taskOp.getInnerTaskOp())) {
-                        swKernelCount++;
-                    }
-                }
-                if (static_cast<uint64_t>(task.cycleStart) > currentStartTime) {
-                    break;
-                }
-            }
-            swKernelCountsCache[currentStartTime] = swKernelCount;
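+            // Reuse the shared helper; counts are cached per start time so the task
+            // list is scanned at most once for each distinct cycleStart.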
+            swKernelCountsCache[currentStartTime] = getSwKernelCountAtTime(currentStartTime, allTasks);
         }
 
         if (swKernelCountsCache[currentStartTime] >= saturationThreshold) {
@@ -460,43 +483,17 @@ uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec&
     return std::numeric_limits<uint64_t>::max();
 }
 
-struct GapCandidate {
-    uint64_t lookaheadGap = 0;
-    int64_t insertionPointTaskIndex = -1;
-
-    // used for sort
-    bool operator>(const GapCandidate& other) const {
-        return lookaheadGap > other.lookaheadGap;
-    }
-};
-
-size_t getSwKernelCountAtTime(uint64_t startTime, VPURT::TaskConfigVec& allTasks) {
-    size_t count = 0;
-    for (auto& taskConfig : allTasks) {
-        if (static_cast<uint64_t>(taskConfig.cycleStart) == startTime) {
-            if (mlir::isa<VPUIP::SwKernelOp>(taskConfig.taskOp.getInnerTaskOp())) {
-                count++;
-            }
-        }
-        if (static_cast<uint64_t>(taskConfig.cycleStart) > startTime) {
-            break;
-        }
-    }
-    return count;
-}
-
-std::optional<GapCandidate> findBestInsertionGap(const std::string& kernelName, uint64_t targetKernelGroupStartTime,
-                                                 VPURT::TaskConfigVec& allTasks, size_t numClusters, Logger& log) {
-    const int64_t targetInsertTile = 1;
-    const uint64_t GAP_THRESHOLD = 50000;
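+// Scans the schedule for the largest idle window on the target tile before the target
+// kernel group starts; returns std::nullopt when no window reaches
+// _prefetchGapThresholdDuringExec.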
+std::optional<GapCandidate> AddSwKernelInstructionPrefetch::findBestInsertionGapDuringExec(
+        const std::string& kernelName, uint64_t targetKernelGroupStartTime, VPURT::TaskConfigVec& allTasks,
+        size_t numClusters) {
     const size_t saturationThreshold = numClusters * 2;
 
     // <LookaheadGapSize, GapCandidate>
     std::map<uint64_t, GapCandidate, std::greater<uint64_t>> validGaps;
     std::map<uint64_t, size_t> swKernelCountsCache;  // local cache
 
-    int64_t previousT1TaskIndex = -1;
-    uint64_t previousT1TaskStartTime = 0;
+    int64_t prevTargetTileTaskIndex = -1;
+    uint64_t prevTargetTileTaskStartTime = 0;
 
     // find the largest gap between a non-saturated SW task and a saturated SW task / the kernel to be prefetched
     for (size_t i = 0; i < allTasks.size(); ++i) {
@@ -506,43 +503,43 @@ std::optional<GapCandidate> findBestInsertionGap(const std::string& kernelName,
             break;
         }
 
-        bool isT1Task = false;
+        bool isTargetTileTask = false;
         if (auto swOp = mlir::dyn_cast<VPUIP::SwKernelOp>(currentTaskConfig.taskOp.getInnerTaskOp()); swOp != nullptr) {
-            isT1Task = (swOp.getTileIndexAttr().getInt() == targetInsertTile);
+            isTargetTileTask = (swOp.getTileIndexAttr().getInt() == _targetInsertTileDuringExec);
         }
 
-        if (previousT1TaskIndex != -1 && isT1Task) {
-            auto& insertionPointTask = allTasks[previousT1TaskIndex];
+        if (prevTargetTileTaskIndex != -1 && isTargetTileTask) {
+            auto& insertionPointTask = allTasks[prevTargetTileTaskIndex];
             auto insertionPointStartTime = static_cast<uint64_t>(insertionPointTask.cycleStart);
 
             size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks);
 
             if (simultaneousSwKernels < saturationThreshold) {
                 uint64_t nextSaturationStart =
-                        findNextSaturationStart(previousT1TaskIndex, allTasks, numClusters, swKernelCountsCache);
+                        findNextSaturationStart(prevTargetTileTaskIndex, allTasks, numClusters, swKernelCountsCache);
                 uint64_t gapEnd = std::min(nextSaturationStart, targetKernelGroupStartTime);
                 uint64_t lookaheadGap = 0;
-                if (gapEnd > previousT1TaskStartTime) {
-                    lookaheadGap = gapEnd - previousT1TaskStartTime;
+                if (gapEnd > prevTargetTileTaskStartTime) {
+                    lookaheadGap = gapEnd - prevTargetTileTaskStartTime;
                 }
 
-                if (lookaheadGap >= GAP_THRESHOLD) {
+                if (lookaheadGap >= _prefetchGapThresholdDuringExec) {
                     GapCandidate gap;
                     gap.lookaheadGap = lookaheadGap;
-                    gap.insertionPointTaskIndex = previousT1TaskIndex;
+                    gap.insertionPointTaskIndex = prevTargetTileTaskIndex;
                     validGaps[lookaheadGap] = gap;
                 }
             }
         }
 
-        if (isT1Task) {
-            previousT1TaskIndex = static_cast<int64_t>(i);
-            previousT1TaskStartTime = currentTaskStartTime;
+        if (isTargetTileTask) {
+            prevTargetTileTaskIndex = static_cast<int64_t>(i);
+            prevTargetTileTaskStartTime = currentTaskStartTime;
         }
     }
 
     if (validGaps.empty()) {
-        log.trace("Kernel '{0}': No suitable insertion point found.", kernelName);
+        _log.trace("Kernel '{0}': No suitable insertion point found.", kernelName);
         return std::nullopt;
     }
 
@@ -573,7 +570,16 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
 
     auto targetKernelGroupStartTime = static_cast<uint64_t>(allTasks[firstAppearanceIndex].cycleStart);
 
-    auto bestGapOpt = findBestInsertionGap(kernelName, targetKernelGroupStartTime, allTasks, numClusters, _log);
+    // Find the best insertion point for the prefetch by identifying non-saturated execution
+    // windows. Tasks on the target tile serve as prefetch anchors; a valid gap is the
+    // duration from an anchor task to the next saturation event or the target kernel start.
+    //
+    // Logic:
+    // 1. Find a candidate task on the target tile.
+    // 2. Ensure the NPU is not saturated at that time.
+    // 3. Compute gap = (next saturation or target start) - insertion time.
+    // 4. Return the candidate with the largest gap >= _prefetchGapThresholdDuringExec.
+    auto bestGapOpt = findBestInsertionGapDuringExec(kernelName, targetKernelGroupStartTime, allTasks, numClusters);
 
     if (!bestGapOpt.has_value()) {
         _log.trace("Kernel '{0}': No valid gap found.", kernelName);