Skip to content

Commit c8f614f

Browse files
committed
Refactor code, rename variables and add comments
1 parent 1b364a5 commit c8f614f

File tree

3 files changed

+108
-62
lines changed

3 files changed

+108
-62
lines changed

src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp

Lines changed: 67 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,23 @@ using namespace vpux;
3131

3232
namespace {
3333

34+
struct GapCandidate {
35+
uint64_t lookaheadGap = 0;
36+
int64_t insertionPointTaskIndex = -1;
37+
38+
// Comparison used for sorting: a candidate with a larger lookaheadGap compares greater.
39+
bool operator>(const GapCandidate& other) const {
40+
return lookaheadGap > other.lookaheadGap;
41+
}
42+
};
43+
3444
static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {
3545
"activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm",
3646
"activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"};
3747

48+
static const SmallVector<StringRef> SW_DUMMY_KERNELS_WITHOUT_ARGS = {
49+
"convert", "eltwise_mul", "activation_cos", "activation_sin", "eltwise_equal", "eltwise_select", "rms_norm"};
50+
3851
//
3952
// AddSwKernelInstructionPrefetch
4053
//
@@ -82,6 +95,9 @@ class AddSwKernelInstructionPrefetch final :
8295
std::vector<VPUIP::SwKernelOp> insertPrefetchTasks(mlir::Operation* funcOp, SwKernelPrefetchVec& kernelsToPrefetch,
8396
mlir::Operation* firstShaveTaskInIR,
8497
mlir::Value bestUpdateBarrier);
98+
std::optional<GapCandidate> findBestInsertionGapDuringExec(const std::string& kernelName,
99+
uint64_t targetKernelGroupStartTime,
100+
VPURT::TaskConfigVec& allTasks, size_t numClusters);
85101
std::vector<VPUIP::SwKernelOp> insertPrefetchTasksDuringExec(
86102
mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch,
87103
VPURT::TaskConfigVec& allTasks);
@@ -101,6 +117,12 @@ class AddSwKernelInstructionPrefetch final :
101117
size_t _minimumFreeCyclesForPrefetch = 250000;
102118
bool _useDummyKernelForInstructionPrefetch = false;
103119
size_t _dynamicPrefetchTileCounter = 0;
120+
// Tile 1 is the target tile for insertion, so prefetching is enabled only when more than one tile
121+
// is available.
122+
int64_t _targetInsertTileDuringExec = 1;
123+
// The threshold of 50,000 cycles is empirically chosen to ensure there is a sufficient gap
124+
// to perform instruction prefetching without causing stalls.
125+
uint64_t _prefetchGapThresholdDuringExec = 50000;
104126
};
105127

106128
bool AddSwKernelInstructionPrefetch::hasVPUSWModule(mlir::Operation* funcOp) {
@@ -248,9 +270,7 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirst
248270
// so we need to add skipProfiling as attribute to avoid capturing their metadata
249271
cachePrefetchSwKernel->setAttr("skipProfiling", mlir::UnitAttr::get(firstSwTask->getContext()));
250272

251-
auto args = (kernelName == "convert" || kernelName == "eltwise_mul" || kernelName == "activation_cos" ||
252-
kernelName == "activation_sin" || kernelName == "eltwise_equal" || kernelName == "eltwise_select" ||
253-
kernelName == "rms_norm")
273+
auto args = llvm::is_contained(SW_DUMMY_KERNELS_WITHOUT_ARGS, kernelName)
254274
? mlir::ArrayAttr::get(moduleOp->getContext(), {})
255275
: kernelNameToArgs[kernelName];
256276

@@ -427,6 +447,21 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
427447
return prefetchedKernels;
428448
}
429449

450+
size_t getSwKernelCountAtTime(uint64_t startTime, VPURT::TaskConfigVec& allTasks) {
451+
size_t count = 0;
452+
for (auto& taskConfig : allTasks) {
453+
if (static_cast<uint64_t>(taskConfig.cycleStart) == startTime) {
454+
if (mlir::isa<VPUIP::SwKernelOp>(taskConfig.taskOp.getInnerTaskOp())) {
455+
count++;
456+
}
457+
}
458+
if (static_cast<uint64_t>(taskConfig.cycleStart) > startTime) {
459+
break;
460+
}
461+
}
462+
return count;
463+
}
464+
430465
uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec& allTasks, size_t numClusters,
431466
std::map<uint64_t, size_t>& swKernelCountsCache) {
432467
// Saturation is defined as 2x the number of clusters (e.g., 4 clusters -> 8 SW kernels)
@@ -437,19 +472,7 @@ uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec&
437472
uint64_t currentStartTime = static_cast<uint64_t>(allTasks[i].cycleStart);
438473

439474
if (swKernelCountsCache.find(currentStartTime) == swKernelCountsCache.end()) {
440-
size_t swKernelCount = 0;
441-
// Count all SW Kernels that start at this specific time
442-
for (auto& task : allTasks) {
443-
if (static_cast<uint64_t>(task.cycleStart) == currentStartTime) {
444-
if (mlir::isa<VPUIP::SwKernelOp>(task.taskOp.getInnerTaskOp())) {
445-
swKernelCount++;
446-
}
447-
}
448-
if (static_cast<uint64_t>(task.cycleStart) > currentStartTime) {
449-
break;
450-
}
451-
}
452-
swKernelCountsCache[currentStartTime] = swKernelCount;
475+
swKernelCountsCache[currentStartTime] = getSwKernelCountAtTime(currentStartTime, allTasks);
453476
}
454477

455478
if (swKernelCountsCache[currentStartTime] >= saturationThreshold) {
@@ -460,43 +483,17 @@ uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec&
460483
return std::numeric_limits<uint64_t>::max();
461484
}
462485

463-
struct GapCandidate {
464-
uint64_t lookaheadGap = 0;
465-
int64_t insertionPointTaskIndex = -1;
466-
467-
// used for sort
468-
bool operator>(const GapCandidate& other) const {
469-
return lookaheadGap > other.lookaheadGap;
470-
}
471-
};
472-
473-
size_t getSwKernelCountAtTime(uint64_t startTime, VPURT::TaskConfigVec& allTasks) {
474-
size_t count = 0;
475-
for (auto& taskConfig : allTasks) {
476-
if (static_cast<uint64_t>(taskConfig.cycleStart) == startTime) {
477-
if (mlir::isa<VPUIP::SwKernelOp>(taskConfig.taskOp.getInnerTaskOp())) {
478-
count++;
479-
}
480-
}
481-
if (static_cast<uint64_t>(taskConfig.cycleStart) > startTime) {
482-
break;
483-
}
484-
}
485-
return count;
486-
}
487-
488-
std::optional<GapCandidate> findBestInsertionGap(const std::string& kernelName, uint64_t targetKernelGroupStartTime,
489-
VPURT::TaskConfigVec& allTasks, size_t numClusters, Logger& log) {
490-
const int64_t targetInsertTile = 1;
491-
const uint64_t GAP_THRESHOLD = 50000;
486+
std::optional<GapCandidate> AddSwKernelInstructionPrefetch::findBestInsertionGapDuringExec(
487+
const std::string& kernelName, uint64_t targetKernelGroupStartTime, VPURT::TaskConfigVec& allTasks,
488+
size_t numClusters) {
492489
const size_t saturationThreshold = numClusters * 2;
493490

494491
// <LookaheadGapSize, GapCandidate>
495492
std::map<uint64_t, GapCandidate, std::greater<uint64_t>> validGaps;
496493
std::map<uint64_t, size_t> swKernelCountsCache; // local cache
497494

498-
int64_t previousT1TaskIndex = -1;
499-
uint64_t previousT1TaskStartTime = 0;
495+
int64_t prevTargetTileTaskIndex = -1;
496+
uint64_t prevTargetTileTaskStartTime = 0;
500497

501498
// Find the largest gap between a non-saturated SW task and either the next saturated SW task or the kernel to be prefetched
502499
for (size_t i = 0; i < allTasks.size(); ++i) {
@@ -506,43 +503,43 @@ std::optional<GapCandidate> findBestInsertionGap(const std::string& kernelName,
506503
break;
507504
}
508505

509-
bool isT1Task = false;
506+
bool isTargetTileTask = false;
510507
if (auto swOp = mlir::dyn_cast<VPUIP::SwKernelOp>(currentTaskConfig.taskOp.getInnerTaskOp()); swOp != nullptr) {
511-
isT1Task = (swOp.getTileIndexAttr().getInt() == targetInsertTile);
508+
isTargetTileTask = (swOp.getTileIndexAttr().getInt() == _targetInsertTileDuringExec);
512509
}
513510

514-
if (previousT1TaskIndex != -1 && isT1Task) {
515-
auto& insertionPointTask = allTasks[previousT1TaskIndex];
511+
if (prevTargetTileTaskIndex != -1 && isTargetTileTask) {
512+
auto& insertionPointTask = allTasks[prevTargetTileTaskIndex];
516513
auto insertionPointStartTime = static_cast<uint64_t>(insertionPointTask.cycleStart);
517514

518515
size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks);
519516

520517
if (simultaneousSwKernels < saturationThreshold) {
521518
uint64_t nextSaturationStart =
522-
findNextSaturationStart(previousT1TaskIndex, allTasks, numClusters, swKernelCountsCache);
519+
findNextSaturationStart(prevTargetTileTaskIndex, allTasks, numClusters, swKernelCountsCache);
523520
uint64_t gapEnd = std::min(nextSaturationStart, targetKernelGroupStartTime);
524521
uint64_t lookaheadGap = 0;
525-
if (gapEnd > previousT1TaskStartTime) {
526-
lookaheadGap = gapEnd - previousT1TaskStartTime;
522+
if (gapEnd > prevTargetTileTaskStartTime) {
523+
lookaheadGap = gapEnd - prevTargetTileTaskStartTime;
527524
}
528525

529-
if (lookaheadGap >= GAP_THRESHOLD) {
526+
if (lookaheadGap >= _prefetchGapThresholdDuringExec) {
530527
GapCandidate gap;
531528
gap.lookaheadGap = lookaheadGap;
532-
gap.insertionPointTaskIndex = previousT1TaskIndex;
529+
gap.insertionPointTaskIndex = prevTargetTileTaskIndex;
533530
validGaps[lookaheadGap] = gap;
534531
}
535532
}
536533
}
537534

538-
if (isT1Task) {
539-
previousT1TaskIndex = static_cast<int64_t>(i);
540-
previousT1TaskStartTime = currentTaskStartTime;
535+
if (isTargetTileTask) {
536+
prevTargetTileTaskIndex = static_cast<int64_t>(i);
537+
prevTargetTileTaskStartTime = currentTaskStartTime;
541538
}
542539
}
543540

544541
if (validGaps.empty()) {
545-
log.trace("Kernel '{0}': No suitable insertion point found.", kernelName);
542+
_log.trace("Kernel '{0}': No suitable insertion point found.", kernelName);
546543
return std::nullopt;
547544
}
548545

@@ -573,7 +570,16 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
573570

574571
auto targetKernelGroupStartTime = static_cast<uint64_t>(allTasks[firstAppearanceIndex].cycleStart);
575572

576-
auto bestGapOpt = findBestInsertionGap(kernelName, targetKernelGroupStartTime, allTasks, numClusters, _log);
573+
// Finds the best insertion point for prefetch by identifying non-saturated execution windows.
574+
// Scans for tasks on the target tile to serve as prefetch anchors. A valid "Gap" is the
575+
// duration from an anchor task to the next saturation event or the target kernel start.
576+
//
577+
// Logic:
578+
// 1. Find a candidate task on the target tile.
579+
// 2. Ensure NPU is not saturated at that time.
580+
// 3. Calculate Gap = (Next Saturation or Target Start) - Insertion Time.
581+
// 4. Return the candidate with the largest Gap >= _prefetchGapThresholdDuringExec.
582+
auto bestGapOpt = findBestInsertionGapDuringExec(kernelName, targetKernelGroupStartTime, allTasks, numClusters);
577583

578584
if (!bestGapOpt.has_value()) {
579585
_log.trace("Kernel '{0}': No valid gap found.", kernelName);

tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
//
2-
// Copyright (C) 2024-2025 Intel Corporation.
2+
// Copyright (C) 2025 Intel Corporation.
33
// SPDX-License-Identifier: Apache-2.0
44
//
55

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
{"traceEvents":[
2+
{"name": "process_name", "ph": "M", "pid":0, "args": {"name" : "DMA"}},
3+
{"name": "process_sort_index", "ph": "M", "pid":0, "args": {"sort_index" : "0"}},
4+
{"name": "thread_name", "ph": "M", "pid":0, "tid":0, "args": {"name" : "DMA"}},
5+
{"name": "thread_name", "ph": "M", "pid":0, "tid":1, "args": {"name" : "DMA"}},
6+
{"name": "process_name", "ph": "M", "pid":1, "args": {"name" : "Cluster (0)"}},
7+
{"name": "process_sort_index", "ph": "M", "pid":1, "args": {"sort_index" : "1"}},
8+
{"name": "thread_name", "ph": "M", "pid":1, "tid":2, "args": {"name" : "Shave"}},
9+
{"name": "process_name", "ph": "M", "pid":2, "args": {"name" : "Layers"}},
10+
{"name": "process_sort_index", "ph": "M", "pid":2, "args": {"sort_index" : "2"}},
11+
{"name": "thread_name", "ph": "M", "pid":2, "tid":3, "args": {"name" : "Layers"}},
12+
{"name":"", "cat":"DMA", "ph":"X", "ts":0.000, "dur":0.957, "pid":0, "tid":0},
13+
{"name":"start_barrier_sync_dma", "cat":"DMA", "ph":"X", "ts":0.000, "dur":0.000, "pid":0, "tid":1},
14+
{"name":"", "cat":"DMA", "ph":"X", "ts":5.777, "dur":1.218, "pid":0, "tid":0},
15+
{"name":"?_cache_handling_op/cluster_0", "cat":"Shave", "ph":"X", "ts":0.957, "dur":0.000, "pid":1, "tid":2},
16+
{"name":"?cluster_0", "cat":"Shave", "ph":"X", "ts":0.958, "dur":4.818, "pid":1, "tid":2},
17+
{"name":"start_barrier_sync_dma", "cat":"Layer", "ph":"X", "ts":0.000, "dur":0.000, "pid":2, "tid":3, "args":{"Layer type": "<Unknown>"}},
18+
{"name":"", "cat":"Layer", "ph":"X", "ts":0.000, "dur":6.995, "pid":2, "tid":3, "args":{"Layer type": "<Unknown>", "Shave time:": "4us 818ns", "DMA time:": "2us 175ns"}}
19+
],
20+
"taskStatistics": {
21+
"total duration":6.995,
22+
"DMA duration":2.175,
23+
"DPU duration":0.000,
24+
"SW duration":4.818,
25+
"M2I duration":0.000,
26+
"DMA-DPU overlap":0.000,
27+
"DMA-SW overlap":0.000,
28+
"SW-DPU overlap":0.000,
29+
"all tasks union":6.993,
30+
"total idle":0.002,
31+
"SW duration without DPU overlap":4.818,
32+
"DMA duration without overlaps":2.175,
33+
"Sum of DMA task durations":2.175,
34+
"Sum of DPU task durations":0.000,
35+
"Sum of SW task durations":4.818,
36+
"Sum of M2I task durations":0.000
37+
},
38+
"workpoint": { "freq": 1700.0, "status": "SIM" },
39+
"displayTimeUnit": "ns"
40+
}

0 commit comments

Comments
 (0)