@@ -31,10 +31,23 @@ using namespace vpux;
 
 namespace {
 
+struct GapCandidate {
+    uint64_t lookaheadGap = 0;
+    int64_t insertionPointTaskIndex = -1;
+
+    // Used for sorting candidates by lookahead gap.
+    bool operator>(const GapCandidate& other) const {
+        return lookaheadGap > other.lookaheadGap;
+    }
+};
+
 static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {
         "activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm",
         "activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"};
 
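+// Dummy kernels that are instantiated with an empty argument list when used purely
+// for instruction prefetch.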
+static const SmallVector<StringRef> SW_DUMMY_KERNELS_WITHOUT_ARGS = {
+        "convert", "eltwise_mul", "activation_cos", "activation_sin", "eltwise_equal", "eltwise_select", "rms_norm"};
+
 //
 // AddSwKernelInstructionPrefetch
 //
@@ -82,6 +95,9 @@ class AddSwKernelInstructionPrefetch final :
     std::vector<VPUIP::SwKernelOp> insertPrefetchTasks(mlir::Operation* funcOp, SwKernelPrefetchVec& kernelsToPrefetch,
                                                        mlir::Operation* firstShaveTaskInIR,
                                                        mlir::Value bestUpdateBarrier);
+    std::optional<GapCandidate> findBestInsertionGapDuringExec(const std::string& kernelName,
+                                                               uint64_t targetKernelGroupStartTime,
+                                                               VPURT::TaskConfigVec& allTasks, size_t numClusters);
     std::vector<VPUIP::SwKernelOp> insertPrefetchTasksDuringExec(
             mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch,
             VPURT::TaskConfigVec& allTasks);
@@ -101,6 +117,12 @@ class AddSwKernelInstructionPrefetch final :
     size_t _minimumFreeCyclesForPrefetch = 250000;
     bool _useDummyKernelForInstructionPrefetch = false;
     size_t _dynamicPrefetchTileCounter = 0;
+    // Insert on tile 1 so that prefetching is enabled only when more than one tile
+    // is available.
+    int64_t _targetInsertTileDuringExec = 1;
+    // The 50,000-cycle threshold is chosen empirically: it guarantees a gap large enough
+    // to perform instruction prefetching without causing stalls.
+    uint64_t _prefetchGapThresholdDuringExec = 50000;
 };
 
 bool AddSwKernelInstructionPrefetch::hasVPUSWModule(mlir::Operation* funcOp) {
@@ -248,9 +270,7 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirst
     // so we need to add skipProfiling as attribute to avoid capturing their metadata
     cachePrefetchSwKernel->setAttr("skipProfiling", mlir::UnitAttr::get(firstSwTask->getContext()));
 
-    auto args = (kernelName == "convert" || kernelName == "eltwise_mul" || kernelName == "activation_cos" ||
-                 kernelName == "activation_sin" || kernelName == "eltwise_equal" || kernelName == "eltwise_select" ||
-                 kernelName == "rms_norm")
+    auto args = llvm::is_contained(SW_DUMMY_KERNELS_WITHOUT_ARGS, kernelName)
                         ? mlir::ArrayAttr::get(moduleOp->getContext(), {})
                         : kernelNameToArgs[kernelName];
@@ -427,6 +447,21 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
     return prefetchedKernels;
 }
 
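+// Counts the SW kernels scheduled to start exactly at 'startTime'. The early break
+// assumes 'allTasks' is sorted by cycleStart.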
+size_t getSwKernelCountAtTime(uint64_t startTime, VPURT::TaskConfigVec& allTasks) {
+    size_t count = 0;
+    for (auto& taskConfig : allTasks) {
+        if (static_cast<uint64_t>(taskConfig.cycleStart) == startTime) {
+            if (mlir::isa<VPUIP::SwKernelOp>(taskConfig.taskOp.getInnerTaskOp())) {
+                count++;
+            }
+        }
+        if (static_cast<uint64_t>(taskConfig.cycleStart) > startTime) {
+            break;
+        }
+    }
+    return count;
+}
+
 uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec& allTasks, size_t numClusters,
                                  std::map<uint64_t, size_t>& swKernelCountsCache) {
     // Saturation is defined as 2x the number of clusters (e.g., 4 clusters -> 8 SW kernels)
@@ -437,19 +472,7 @@ uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec&
         uint64_t currentStartTime = static_cast<uint64_t>(allTasks[i].cycleStart);
 
         if (swKernelCountsCache.find(currentStartTime) == swKernelCountsCache.end()) {
-            size_t swKernelCount = 0;
-            // Count all SW Kernels that start at this specific time
-            for (auto& task : allTasks) {
-                if (static_cast<uint64_t>(task.cycleStart) == currentStartTime) {
-                    if (mlir::isa<VPUIP::SwKernelOp>(task.taskOp.getInnerTaskOp())) {
-                        swKernelCount++;
-                    }
-                }
-                if (static_cast<uint64_t>(task.cycleStart) > currentStartTime) {
-                    break;
-                }
-            }
-            swKernelCountsCache[currentStartTime] = swKernelCount;
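+            // Reuse the shared helper; counts are cached per start time so the task
+            // list is scanned at most once for each distinct cycleStart.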
+            swKernelCountsCache[currentStartTime] = getSwKernelCountAtTime(currentStartTime, allTasks);
         }
 
         if (swKernelCountsCache[currentStartTime] >= saturationThreshold) {
@@ -460,43 +483,17 @@ uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec&
     return std::numeric_limits<uint64_t>::max();
 }
 
-struct GapCandidate {
-    uint64_t lookaheadGap = 0;
-    int64_t insertionPointTaskIndex = -1;
-
-    // used for sort
-    bool operator>(const GapCandidate& other) const {
-        return lookaheadGap > other.lookaheadGap;
-    }
-};
-
-size_t getSwKernelCountAtTime(uint64_t startTime, VPURT::TaskConfigVec& allTasks) {
-    size_t count = 0;
-    for (auto& taskConfig : allTasks) {
-        if (static_cast<uint64_t>(taskConfig.cycleStart) == startTime) {
-            if (mlir::isa<VPUIP::SwKernelOp>(taskConfig.taskOp.getInnerTaskOp())) {
-                count++;
-            }
-        }
-        if (static_cast<uint64_t>(taskConfig.cycleStart) > startTime) {
-            break;
-        }
-    }
-    return count;
-}
-
-std::optional<GapCandidate> findBestInsertionGap(const std::string& kernelName, uint64_t targetKernelGroupStartTime,
-                                                 VPURT::TaskConfigVec& allTasks, size_t numClusters, Logger& log) {
-    const int64_t targetInsertTile = 1;
-    const uint64_t GAP_THRESHOLD = 50000;
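+// Scans the schedule for the largest idle window on the target tile before the target
+// kernel group starts; returns std::nullopt when no window reaches
+// _prefetchGapThresholdDuringExec.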
+std::optional<GapCandidate> AddSwKernelInstructionPrefetch::findBestInsertionGapDuringExec(
+        const std::string& kernelName, uint64_t targetKernelGroupStartTime, VPURT::TaskConfigVec& allTasks,
+        size_t numClusters) {
     const size_t saturationThreshold = numClusters * 2;
 
     // <LookaheadGapSize, GapCandidate>
     std::map<uint64_t, GapCandidate, std::greater<uint64_t>> validGaps;
     std::map<uint64_t, size_t> swKernelCountsCache;  // local cache
 
-    int64_t previousT1TaskIndex = -1;
-    uint64_t previousT1TaskStartTime = 0;
+    int64_t prevTargetTileTaskIndex = -1;
+    uint64_t prevTargetTileTaskStartTime = 0;
 
     // find the largest gap between a non-saturated SW task and a saturated SW task / the kernel to be prefetched
     for (size_t i = 0; i < allTasks.size(); ++i) {
@@ -506,43 +503,43 @@ std::optional<GapCandidate> findBestInsertionGap(const std::string& kernelName,
             break;
         }
 
-        bool isT1Task = false;
+        bool isTargetTileTask = false;
         if (auto swOp = mlir::dyn_cast<VPUIP::SwKernelOp>(currentTaskConfig.taskOp.getInnerTaskOp()); swOp != nullptr) {
-            isT1Task = (swOp.getTileIndexAttr().getInt() == targetInsertTile);
+            isTargetTileTask = (swOp.getTileIndexAttr().getInt() == _targetInsertTileDuringExec);
         }
 
-        if (previousT1TaskIndex != -1 && isT1Task) {
-            auto& insertionPointTask = allTasks[previousT1TaskIndex];
+        if (prevTargetTileTaskIndex != -1 && isTargetTileTask) {
+            auto& insertionPointTask = allTasks[prevTargetTileTaskIndex];
             auto insertionPointStartTime = static_cast<uint64_t>(insertionPointTask.cycleStart);
 
             size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks);
 
             if (simultaneousSwKernels < saturationThreshold) {
                 uint64_t nextSaturationStart =
-                        findNextSaturationStart(previousT1TaskIndex, allTasks, numClusters, swKernelCountsCache);
+                        findNextSaturationStart(prevTargetTileTaskIndex, allTasks, numClusters, swKernelCountsCache);
                 uint64_t gapEnd = std::min(nextSaturationStart, targetKernelGroupStartTime);
                 uint64_t lookaheadGap = 0;
-                if (gapEnd > previousT1TaskStartTime) {
-                    lookaheadGap = gapEnd - previousT1TaskStartTime;
+                if (gapEnd > prevTargetTileTaskStartTime) {
+                    lookaheadGap = gapEnd - prevTargetTileTaskStartTime;
                 }
 
-                if (lookaheadGap >= GAP_THRESHOLD) {
+                if (lookaheadGap >= _prefetchGapThresholdDuringExec) {
                     GapCandidate gap;
                     gap.lookaheadGap = lookaheadGap;
-                    gap.insertionPointTaskIndex = previousT1TaskIndex;
+                    gap.insertionPointTaskIndex = prevTargetTileTaskIndex;
                     validGaps[lookaheadGap] = gap;
                 }
             }
         }
 
-        if (isT1Task) {
-            previousT1TaskIndex = static_cast<int64_t>(i);
-            previousT1TaskStartTime = currentTaskStartTime;
+        if (isTargetTileTask) {
+            prevTargetTileTaskIndex = static_cast<int64_t>(i);
+            prevTargetTileTaskStartTime = currentTaskStartTime;
         }
     }
 
     if (validGaps.empty()) {
-        log.trace("Kernel '{0}': No suitable insertion point found.", kernelName);
+        _log.trace("Kernel '{0}': No suitable insertion point found.", kernelName);
         return std::nullopt;
     }
 
@@ -573,7 +570,16 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
 
     auto targetKernelGroupStartTime = static_cast<uint64_t>(allTasks[firstAppearanceIndex].cycleStart);
 
-    auto bestGapOpt = findBestInsertionGap(kernelName, targetKernelGroupStartTime, allTasks, numClusters, _log);
+    // Find the best insertion point for the prefetch by identifying non-saturated execution
+    // windows. Tasks on the target tile serve as prefetch anchors; a valid gap is the
+    // duration from an anchor task to the next saturation event or the target kernel start.
+    //
+    // Logic:
+    // 1. Find a candidate task on the target tile.
+    // 2. Ensure the NPU is not saturated at that time.
+    // 3. Compute gap = (next saturation or target start) - insertion time.
+    // 4. Return the candidate with the largest gap >= _prefetchGapThresholdDuringExec.
+    auto bestGapOpt = findBestInsertionGapDuringExec(kernelName, targetKernelGroupStartTime, allTasks, numClusters);
 
     if (!bestGapOpt.has_value()) {
         _log.trace("Kernel '{0}': No valid gap found.", kernelName);