@@ -31,7 +31,22 @@ using namespace vpux;

namespace {

static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"convert", "softmax"};
struct GapCandidate {
uint64_t lookaheadGap = 0;
int64_t insertionPointTaskIndex = -1;

// Used for sorting candidates by descending lookahead gap
bool operator>(const GapCandidate& other) const {
return lookaheadGap > other.lookaheadGap;
}
};
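
As an aside (illustrative only, not part of this change): the comparison operator above allows candidates to be ordered by descending lookahead gap, for example with std::sort; the code below instead keys a std::map on the gap size with std::greater<uint64_t>, which gives the same ordering. A minimal sketch, assuming the GapCandidate definition above:

#include <algorithm>
#include <functional>
#include <vector>

void sortCandidatesByGapDescending(std::vector<GapCandidate>& candidates) {
    // std::greater<GapCandidate> forwards to operator>, so the largest gap ends up first.
    std::sort(candidates.begin(), candidates.end(), std::greater<GapCandidate>());
}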

static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {
"activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm",
"activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"};

static const SmallVector<StringRef> SW_DUMMY_KERNELS_WITHOUT_ARGS = {
"convert", "eltwise_mul", "activation_cos", "activation_sin", "eltwise_equal", "eltwise_select", "rms_norm"};

//
// AddSwKernelInstructionPrefetch
@@ -66,19 +81,26 @@ class AddSwKernelInstructionPrefetch final :
size_t clusterIdx, std::string& kernelName,
mlir::SymbolRefAttr functionSymbol);

VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, mlir::Value updateBarrier,
size_t clusterIdx, std::string& kernelName);
VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask,
mlir::ValueRange updateBarrier, size_t clusterIdx,
std::string& kernelName);
mlir::Operation* getFirstSwTaskInIRWaitingForBarrier(mlir::Value waitBarrier);
std::pair<std::string, size_t> getKernelNameAndSize(VPUIP::SwKernelOp swKernelOp);

using SwKernelPrefetchVec = std::vector<std::pair<std::string, size_t>>;
using SwKernelPrefetchVec = std::vector<std::tuple<std::string, size_t, size_t>>;
std::pair<SwKernelPrefetchVec, size_t> getPrefetchCandidatesAndFirstSwTask(mlir::Operation* funcOp,
VPURT::TaskConfigVec& allTasks);
std::tuple<mlir::Operation*, mlir::Value, size_t> getFirstSwTaskInIRAndBestUpdateBarrier(
VPURT::InferenceExecutionSimulator& infSim, VPURT::TaskConfigVec& allTasks, size_t firstShvTaskIndex);
std::vector<VPUIP::SwKernelOp> insertPrefetchTasks(mlir::Operation* funcOp, SwKernelPrefetchVec& kernelsToPrefetch,
mlir::Operation* firstShaveTaskInIR,
mlir::Value bestUpdateBarrier);
std::optional<GapCandidate> findBestInsertionGapDuringExec(const std::string& kernelName,
uint64_t targetKernelGroupStartTime,
VPURT::TaskConfigVec& allTasks, size_t numClusters);
std::vector<VPUIP::SwKernelOp> insertPrefetchTasksDuringExec(
mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch,
VPURT::TaskConfigVec& allTasks);

bool hasVPUSWModule(mlir::Operation* funcOp);
size_t getOffsetReservedMem(const mlir::ModuleOp module);
@@ -94,6 +116,13 @@ class AddSwKernelInstructionPrefetch final :
bool _minFreeCyclesHasValue = false;
size_t _minimumFreeCyclesForPrefetch = 250000;
bool _useDummyKernelForInstructionPrefetch = false;
size_t _dynamicPrefetchTileCounter = 0;
// Use tile 1 as the insertion target so that prefetching during execution is only enabled when more than one
// tile is available.
int64_t _targetInsertTileDuringExec = 1;
// The threshold of 50,000 cycles is empirically chosen to ensure there is a sufficient gap
// to perform instruction prefetching without causing stalls.
uint64_t _prefetchGapThresholdDuringExec = 50000;
};

bool AddSwKernelInstructionPrefetch::hasVPUSWModule(mlir::Operation* funcOp) {
@@ -186,21 +215,26 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertPrefetchOpBeforeFirstKer
}

// For LNL, Shave kernel instruction prefetch needs to insert a dummy kernel instead of a prefetch kernel
VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask,
mlir::Value updateBarrier,
size_t clusterIdx,
std::string& kernelName) {
VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirstKernelTask(
mlir::Operation* firstSwTask, mlir::ValueRange updateBarrier, size_t clusterIdx, std::string& kernelName) {
mlir::OpBuilder builder(firstSwTask);
auto moduleOp = firstSwTask->getParentOfType<mlir::ModuleOp>();
auto kernelOp = kernelNameToOps[kernelName];
auto moduleOp = kernelOp->getParentOfType<mlir::ModuleOp>();
auto reservedMemOffset = getOffsetReservedMem(moduleOp);
auto offsetAttr = getIntAttr(moduleOp->getContext(), reservedMemOffset);
auto kernelOp = kernelNameToOps[kernelName];
auto tileIndexAttr = kernelOp.getTileIndexAttr();
VPUX_THROW_UNLESS(tileIndexAttr, "SwKernelOp '{0}' does not have a tileIndex attribute", kernelOp->getLoc());
const int64_t tileIndex = static_cast<int64_t>(clusterIdx);

auto createBuffer = [&](mlir::Value io, StringRef suffix, mlir::SmallVector<mlir::Value>& buffers) {
if (auto bufOp = io.getDefiningOp<VPURT::DeclareBufferOp>()) {
auto newType = mlir::cast<NDTypeInterface>(io.getType()).changeShape({1, 1, 1, 1});
auto origType = mlir::cast<NDTypeInterface>(io.getType());
auto newMemSpaceAttr = vpux::IndexedSymbolAttr::get(moduleOp->getContext(),
stringifyEnum(VPU::MemoryKind::CMX_NN), tileIndex);
auto newSectionIndexAttr = builder.getI64ArrayAttr({tileIndex});
auto newType = origType.changeShape({1, 1, 1, 1}).changeMemSpace(newMemSpaceAttr);
auto newBuff = builder.create<VPURT::DeclareBufferOp>(appendLoc(bufOp->getLoc(), suffix), newType,
bufOp.getSectionAttr(), bufOp.getSectionIndexAttr(),
bufOp.getSectionAttr(), newSectionIndexAttr,
offsetAttr, bufOp.getSwizzlingKeyAttr());
buffers.push_back(newBuff);
return true;
@@ -230,14 +264,16 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirst

auto cachePrefetchSwKernel = vpux::VPURT::wrapIntoTaskOp<VPUIP::SwKernelOp>(
builder, mlir::ValueRange(), updateBarrier, newLoc, mlir::ValueRange(srcBuffers),
mlir::ValueRange(dstBuffers), nullptr, kernelNameToSymbol[kernelName], kernelOp.getTileIndexAttr(),
mlir::ValueRange(dstBuffers), nullptr, kernelNameToSymbol[kernelName], builder.getI64IntegerAttr(tileIndex),
kernelOp.getInputStridesAttr(), kernelOp.getOutputStridesAttr());
// The dummy kernels here are generated after ActShaveProfilingPass,
// so we need to add the skipProfiling attribute to avoid capturing their metadata
cachePrefetchSwKernel->setAttr("skipProfiling", mlir::UnitAttr::get(firstSwTask->getContext()));

auto args =
(kernelName == "convert") ? mlir::ArrayAttr::get(moduleOp->getContext(), {}) : kernelNameToArgs[kernelName];
auto args = llvm::is_contained(SW_DUMMY_KERNELS_WITHOUT_ARGS, kernelName)
? mlir::ArrayAttr::get(moduleOp->getContext(), {})
: kernelNameToArgs[kernelName];

vpux::VPUIP::initSwKernel(cachePrefetchSwKernel, mlir::ValueRange(srcBuffers), mlir::ValueRange(dstBuffers), args,
_log.nest(), /*swKernelRunOp=*/nullptr);

@@ -316,7 +352,7 @@ AddSwKernelInstructionPrefetch::getPrefetchCandidatesAndFirstSwTask(mlir::Operat
}

if (!cache.isLoaded(kernelName)) {
kernelsToPrefetch.push_back(std::move(kernelNameAndSize));
kernelsToPrefetch.push_back(std::make_tuple(kernelName, kernelSize, shvTaskIndex));
}
cache.loadKernel(kernelName, kernelSize);

@@ -355,7 +391,8 @@ AddSwKernelInstructionPrefetch::getFirstSwTaskInIRAndBestUpdateBarrier(VPURT::In
_log.trace("First SW kernel start time {0}, best barrier release time {1}", firstKernelTask.cycleStart,
bestReleaseCycle);
if (bestReleaseCycle < _minimumFreeCyclesForPrefetch) {
_log.info("bestReleaseCycle: {0} is smaller than _minimumFreeCyclesForPrefetch {1}, skipping prefetching",
_log.info("bestReleaseCycle: {0} is smaller than _minimumFreeCyclesForPrefetch {1}, try prefetching during "
"execution",
bestReleaseCycle, _minimumFreeCyclesForPrefetch);
return std::make_tuple(nullptr, nullptr, 0);
}
@@ -394,7 +431,7 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
for (size_t shaveIdx = 0; (shaveIdx < numClusters * noOfShavesPerCluster) && (shaveIdx < kernelsToPrefetch.size());
shaveIdx++) {
auto clusterIdx = shaveIdx / noOfShavesPerCluster;
auto [kernelName, kernelSize] = kernelsToPrefetch[shaveIdx];
auto [kernelName, kernelSize, shvTaskIndex] = kernelsToPrefetch[shaveIdx];
_log.trace("Prefetching kernel {0} on cluster {1}", kernelName, clusterIdx);
auto newPrefetchKernel =
_useDummyKernelForInstructionPrefetch
@@ -410,6 +447,169 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
return prefetchedKernels;
}

size_t getSwKernelCountAtTime(uint64_t startTime, VPURT::TaskConfigVec& allTasks) {
size_t count = 0;
for (auto& taskConfig : allTasks) {
if (static_cast<uint64_t>(taskConfig.cycleStart) == startTime) {
if (mlir::isa<VPUIP::SwKernelOp>(taskConfig.taskOp.getInnerTaskOp())) {
count++;
}
}
if (static_cast<uint64_t>(taskConfig.cycleStart) > startTime) {
break;
}
}
return count;
}
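
Note that the early break above relies on allTasks being ordered by cycleStart. As an illustration only (not part of this change), the same counting pattern over a sorted toy timeline in plain standard C++:

#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Toy stand-in for the task list: (cycleStart, isSwKernel) pairs, sorted by cycleStart.
size_t countSwKernelsStartingAt(uint64_t startTime, const std::vector<std::pair<uint64_t, bool>>& tasks) {
    size_t count = 0;
    for (const auto& [cycleStart, isSwKernel] : tasks) {
        if (cycleStart == startTime && isSwKernel) {
            ++count;
        }
        if (cycleStart > startTime) {
            break;  // sorted input: no later entry can start at startTime
        }
    }
    return count;
}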

uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec& allTasks, size_t numClusters,
std::map<uint64_t, size_t>& swKernelCountsCache) {
// Saturation is defined as 2x the number of clusters (e.g., 4 clusters -> 8 SW kernels)
const size_t saturationThreshold = numClusters * 2;

// Iterate through tasks strictly AFTER the startIndex
for (size_t i = startIndex + 1; i < allTasks.size(); ++i) {
uint64_t currentStartTime = static_cast<uint64_t>(allTasks[i].cycleStart);

if (swKernelCountsCache.find(currentStartTime) == swKernelCountsCache.end()) {
swKernelCountsCache[currentStartTime] = getSwKernelCountAtTime(currentStartTime, allTasks);
}

if (swKernelCountsCache[currentStartTime] >= saturationThreshold) {
return currentStartTime;
}
}

return std::numeric_limits<uint64_t>::max();
}
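
To make the saturation heuristic concrete, here is a small self-contained sketch (illustrative only, using plain standard C++ instead of the VPURT task types): with 2 clusters the threshold is 4, so the first cycle at which 4 or more SW kernels start is reported as saturated.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>

int main() {
    // Toy timeline: cycleStart -> number of SW kernels starting at that cycle.
    const std::map<uint64_t, size_t> swKernelCounts = {{0, 1}, {1000, 2}, {5000, 4}, {9000, 1}};
    const size_t numClusters = 2;
    const size_t saturationThreshold = numClusters * 2;  // 2x the number of clusters

    for (const auto& [cycleStart, count] : swKernelCounts) {
        if (count >= saturationThreshold) {
            std::cout << "saturated from cycle " << cycleStart << "\n";  // prints 5000
            break;
        }
    }
    return 0;
}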

std::optional<GapCandidate> AddSwKernelInstructionPrefetch::findBestInsertionGapDuringExec(
const std::string& kernelName, uint64_t targetKernelGroupStartTime, VPURT::TaskConfigVec& allTasks,
size_t numClusters) {
const size_t saturationThreshold = numClusters * 2;

// <LookaheadGapSize, GapCandidate>
std::map<uint64_t, GapCandidate, std::greater<uint64_t>> validGaps;
std::map<uint64_t, size_t> swKernelCountsCache; // local cache

int64_t prevTargetTileTaskIndex = -1;
uint64_t prevTargetTileTaskStartTime = 0;

// Find the largest gap between a non-saturated task on the target tile and either the next saturation event
// or the start of the kernel to be prefetched
for (size_t i = 0; i < allTasks.size(); ++i) {
auto& currentTaskConfig = allTasks[i];
uint64_t currentTaskStartTime = static_cast<uint64_t>(currentTaskConfig.cycleStart);
if (currentTaskStartTime > targetKernelGroupStartTime) {
break;
}

bool isTargetTileTask = false;
if (auto swOp = mlir::dyn_cast<VPUIP::SwKernelOp>(currentTaskConfig.taskOp.getInnerTaskOp()); swOp != nullptr) {
isTargetTileTask = (swOp.getTileIndexAttr().getInt() == _targetInsertTileDuringExec);
}

if (prevTargetTileTaskIndex != -1 && isTargetTileTask) {
auto& insertionPointTask = allTasks[prevTargetTileTaskIndex];
auto insertionPointStartTime = static_cast<uint64_t>(insertionPointTask.cycleStart);

size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks);

if (simultaneousSwKernels < saturationThreshold) {
uint64_t nextSaturationStart =
findNextSaturationStart(prevTargetTileTaskIndex, allTasks, numClusters, swKernelCountsCache);
uint64_t gapEnd = std::min(nextSaturationStart, targetKernelGroupStartTime);
uint64_t lookaheadGap = 0;
if (gapEnd > prevTargetTileTaskStartTime) {
lookaheadGap = gapEnd - prevTargetTileTaskStartTime;
}

if (lookaheadGap >= _prefetchGapThresholdDuringExec) {
GapCandidate gap;
gap.lookaheadGap = lookaheadGap;
gap.insertionPointTaskIndex = prevTargetTileTaskIndex;
validGaps[lookaheadGap] = gap;
}
}
}

if (isTargetTileTask) {
prevTargetTileTaskIndex = static_cast<int64_t>(i);
prevTargetTileTaskStartTime = currentTaskStartTime;
}
}

if (validGaps.empty()) {
_log.trace("Kernel '{0}': No suitable insertion point found.", kernelName);
return std::nullopt;
}

return validGaps.begin()->second;
}
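
A worked example of the selection above, with made-up cycle numbers (illustrative only): the target kernel group starts at cycle 300,000, two anchor tasks on the target tile start at 10,000 and 120,000, the next saturation after them begins at 90,000 and 400,000 respectively, and the gap threshold is 50,000 cycles.

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
    const uint64_t threshold = 50000;     // stands in for _prefetchGapThresholdDuringExec
    const uint64_t targetStart = 300000;  // start of the kernel group to prefetch for

    // Anchor candidates on the target tile: {anchor start, next saturation start}.
    const uint64_t anchors[2][2] = {{10000, 90000}, {120000, 400000}};

    uint64_t bestGap = 0;
    uint64_t bestAnchor = 0;
    for (const auto& anchor : anchors) {
        const uint64_t gapEnd = std::min(anchor[1], targetStart);
        const uint64_t gap = gapEnd > anchor[0] ? gapEnd - anchor[0] : 0;
        if (gap >= threshold && gap > bestGap) {
            bestGap = gap;
            bestAnchor = anchor[0];
        }
    }
    // Anchor at 10,000:  gap = min(90,000, 300,000)  - 10,000  = 80,000  (valid)
    // Anchor at 120,000: gap = min(400,000, 300,000) - 120,000 = 180,000 (valid and larger)
    std::cout << "best anchor " << bestAnchor << ", gap " << bestGap << "\n";  // 120000, 180000
    return 0;
}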

std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTasksDuringExec(
mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch,
VPURT::TaskConfigVec& allTasks) {
auto moduleOp = funcOp->getParentOfType<mlir::ModuleOp>();
const auto numClusters = getNumTiles(moduleOp);
VPUX_THROW_WHEN(numClusters == 0, "Number of tiles is zero.");

std::vector<VPUIP::SwKernelOp> prefetchedKernels{};

for (auto& kernelInfo : kernelsToPrefetch) {
std::string kernelName = std::get<0>(kernelInfo);
size_t firstAppearanceIndex = std::get<2>(kernelInfo);

if (firstAppearanceIndex >= allTasks.size()) {
_log.trace("Skipping kernel '{0}': Invalid firstAppearanceIndex {1}", kernelName, firstAppearanceIndex);
continue;
}
if (kernelNameToOps.count(kernelName) == 0) {
_log.trace("Skipping kernel '{0}': Missing dependencies (kernelNameToOps)", kernelName);
continue;
}

auto targetKernelGroupStartTime = static_cast<uint64_t>(allTasks[firstAppearanceIndex].cycleStart);

// Finds the best insertion point for prefetch by identifying non-saturated execution windows.
// Scans for tasks on the target tile to serve as prefetch anchors. A valid "Gap" is the
// duration from an anchor task to the next saturation event or the target kernel start.
//
// Logic:
// 1. Find a candidate task on the target tile.
// 2. Ensure NPU is not saturated at that time.
// 3. Calculate Gap = (Next Saturation or Target Start) - Insertion Time.
// 4. Return the candidate with the largest Gap >= _prefetchGapThresholdDuringExec.
auto bestGapOpt = findBestInsertionGapDuringExec(kernelName, targetKernelGroupStartTime, allTasks, numClusters);

if (!bestGapOpt.has_value()) {
_log.trace("Kernel '{0}': No valid gap found.", kernelName);
continue;
}

GapCandidate bestGap = bestGapOpt.value();
_log.trace("Kernel '{0}': Found best gap of {1} cycles. Inserting relative to task {2}.", kernelName,
bestGap.lookaheadGap, bestGap.insertionPointTaskIndex);

if (bestGap.insertionPointTaskIndex < 0 ||
static_cast<size_t>(bestGap.insertionPointTaskIndex) >= allTasks.size()) {
_log.error("Kernel '{0}': Invalid insertionPointTaskIndex {1}. Skipping insertion.", kernelName,
bestGap.insertionPointTaskIndex);
continue;
}

auto insertBeforeOp = allTasks[bestGap.insertionPointTaskIndex].taskOp;
size_t dynamicExecTile = _dynamicPrefetchTileCounter % numClusters;
_dynamicPrefetchTileCounter++;

auto newPrefetchKernel = insertDummyKernelOpBeforeFirstKernelTask(insertBeforeOp, mlir::ValueRange(),
dynamicExecTile, kernelName);

prefetchedKernels.push_back(newPrefetchKernel);
}

return prefetchedKernels;
}
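
The tile for each dynamically inserted dummy kernel is picked round-robin through _dynamicPrefetchTileCounter; a minimal standalone sketch of that scheme (hypothetical names, illustrative only):

#include <cstddef>
#include <iostream>

// Returns the tile for the next prefetch insertion and advances the counter.
size_t nextPrefetchTile(size_t& counter, size_t numClusters) {
    const size_t tile = counter % numClusters;
    ++counter;
    return tile;
}

int main() {
    size_t counter = 0;
    const size_t numClusters = 4;
    for (int i = 0; i < 6; ++i) {
        std::cout << nextPrefetchTile(counter, numClusters) << " ";  // 0 1 2 3 0 1
    }
    std::cout << "\n";
    return 0;
}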

void AddSwKernelInstructionPrefetch::safeRunOnFunc() {
auto funcOp = getOperation();
if (!hasVPUSWModule(funcOp)) {
@@ -444,18 +644,24 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() {
auto [kernelsToPrefetch, firstShvTaskIndex] = getPrefetchCandidatesAndFirstSwTask(funcOp, allTasks);
auto [firstShaveTaskInIR, bestUpdateBarrier, bestReleaseCycle] =
getFirstSwTaskInIRAndBestUpdateBarrier(infSim, allTasks, firstShvTaskIndex);
if (firstShaveTaskInIR == nullptr || kernelsToPrefetch.empty()) {
return;
}
_log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle);

if (_useDummyKernelForInstructionPrefetch) {
auto memSpaceAttr = mlir::SymbolRefAttr::get(module->getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN));
auto dummyKernelResMem = config::getDummySwKernelsForInstructionPrefetchReservedMemory(module, memSpaceAttr);
VPUX_THROW_WHEN(dummyKernelResMem == nullptr,
"Cannot find DummySWKernelsForInstructionPrefetchReservedMemory!");
}
auto newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier);
if (kernelsToPrefetch.empty()) {
return;
}

std::vector<VPUIP::SwKernelOp> newPrefetchKernels;
if (firstShaveTaskInIR) {
_log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle);
newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier);
} else {
newPrefetchKernels = insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks);
}

// Update dependencies for cache handling operations to meet requirements of control graph split.
auto& barrierInfo = getAnalysis<BarrierInfo>();