Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/6498.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add resource isolation options for multi-agent setup
51 changes: 30 additions & 21 deletions configs/agent/sample.toml
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,14 @@
# If agents field is populated, this field indicates the default values for all
# agents.
[resource]
# Hard CPU allocation for this agent (e.g., 8 cores).
# Only used when allocation-mode is "manual", in which case every agent must
# specify this value.
## allocated-cpu = 8
# Hard memory allocation for this agent (e.g., "32G").
# Only used when allocation-mode is "manual", in which case every agent must
# specify this value.
## allocated-mem = "32G"
# The number of CPU cores reserved for the operating system and the agent
# service.
reserved-cpu = 1
Expand All @@ -160,6 +168,12 @@
# Currently this value is unused. In future releases, it may be used to preserve
# the minimum disk space from the scratch disk allocation via loopback files.
reserved-disk = "8G"
# Resource allocation mode for multi-agent scenarios.
# - `shared`: All agents share the full resource pool (default, backward
# compatible).
# - `auto-split`: Automatically divide resources equally (1/N) among all agents.
# - `manual`: Manually specify per-agent resource allocations via config.
allocation-mode = "shared"
# The alignment of the reported main memory size to absorb tiny deviations from
# per-node firmware/hardware settings. Recommended to be a multiple of the
# page/hugepage size (e.g., 2 MiB).
Expand All @@ -169,6 +183,10 @@
# Affinity policy
affinity-policy = "INTERLEAVED"

# Device-specific per-slot resource allocations.
# Only used when allocation-mode is "manual".
[resource.allocated-devices]

# Pyroscope configuration
[pyroscope]
# Whether to enable Pyroscope profiling
Expand Down Expand Up @@ -415,24 +433,15 @@

# Resource config overrides for the individual agent
[agents.resource]
# The number of CPU cores reserved for the operating system and the agent
# service.
reserved-cpu = 1
# The memory space reserved for the operating system and the agent service. It
# is subtracted from the reported main memory size and not available for user
# workload allocation. Depending on the memory-align-size option and system
# configuration, the effective value may be slightly less or more than the
# configured one, within the memory-align-size.
reserved-mem = 1073741824
# The disk space reserved for the operating system and the agent service.
# Currently this value is unused. In future releases, it may be used to preserve
# the minimum disk space from the scratch disk allocation via loopback files.
reserved-disk = 8589934592
# The alignment of the reported main memory size to absorb tiny deviations from
# per-node firmware/hardware settings. Recommended to be a multiple of the
# page/hugepage size (e.g., 2 MiB).
memory-align-size = 16777216
# Resource allocation order
allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
# Affinity policy
affinity-policy = 1
# Hard CPU allocation for this agent (e.g., 8 cores).
# Only used when allocation-mode is "manual", in which case every agent must
# specify this value.
## allocated-cpu = 8
# Hard memory allocation for this agent (e.g., "32G").
# Only used when allocation-mode is "manual", in which case every agent must
# specify this value.
## allocated-mem = "32G"

# Device-specific per-slot resource allocations.
# Only used when allocation-mode is "manual".
[agents.resource.allocated-devices]
6 changes: 6 additions & 0 deletions src/ai/backend/accelerator/cuda_open/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,9 @@ async def gather_node_measures(
MetricTypes.GAUGE,
unit_hint="bytes",
stats_filter=frozenset({"max"}),
measurement_scale_factor=ctx.agent.resource_partitioner.get_resource_scaling_factor(
SlotName("cuda.device")
),
per_node=Measurement(Decimal(mem_used_total), Decimal(mem_avail_total)),
per_device=mem_stats,
),
Expand All @@ -259,6 +262,9 @@ async def gather_node_measures(
MetricTypes.UTILIZATION,
unit_hint="percent",
stats_filter=frozenset({"avg", "max"}),
measurement_scale_factor=ctx.agent.resource_partitioner.get_resource_scaling_factor(
SlotName("cuda.device")
),
per_node=Measurement(Decimal(util_total), Decimal(dev_count * 100)),
per_device=util_stats,
),
Expand Down
52 changes: 35 additions & 17 deletions src/ai/backend/agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from dataclasses import dataclass
from decimal import Decimal
from io import SEEK_END, BytesIO
from itertools import chain
from pathlib import Path
from types import TracebackType
from typing import (
Expand Down Expand Up @@ -174,7 +175,6 @@
from ai.backend.common.types import (
MODEL_SERVICE_RUNTIME_PROFILES,
AbuseReportValue,
AcceleratorMetadata,
AgentId,
AutoPullBehavior,
BinarySize,
Expand Down Expand Up @@ -233,11 +233,11 @@
from .observer.heartbeat import HeartbeatObserver
from .observer.host_port import HostPortObserver
from .resources import (
AbstractComputeDevice,
AbstractComputePlugin,
ComputerContext,
KernelResourceSpec,
Mount,
ResourcePartitioner,
align_memory,
allocate,
known_slot_types,
Expand Down Expand Up @@ -765,7 +765,10 @@ class AbstractAgent(
etcd: AsyncEtcd
local_instance_id: str
kernel_registry: MutableMapping[KernelId, AbstractKernel]
resource_partitioner: ResourcePartitioner
computers: MutableMapping[DeviceName, ComputerContext]
total_slots: Mapping[SlotName, Decimal]
reserved_slots: Mapping[SlotName, Decimal]
images: Mapping[ImageCanonical, ScannedImage]
port_pool: set[int]

Expand Down Expand Up @@ -836,6 +839,7 @@ def __init__(
error_monitor: ErrorPluginContext,
skip_initial_scan: bool = False,
agent_public_key: Optional[PublicKey],
resource_partitioner: ResourcePartitioner,
) -> None:
self._skip_initial_scan = skip_initial_scan
self.loop = current_loop()
Expand All @@ -845,7 +849,10 @@ def __init__(
self.local_instance_id = generate_local_instance_id(__file__)
self.agent_public_key = agent_public_key
self.kernel_registry = {}
self.resource_partitioner = resource_partitioner
self.computers = {}
self.total_slots = {}
self.reserved_slots = {}
self.images = {}
self.restarting_kernels = {}
self.stat_ctx = StatContext(
Expand Down Expand Up @@ -932,28 +939,34 @@ async def __ainit__(self) -> None:
alloc_map_mod.log_alloc_map = self.local_config.debug.log_alloc_map
computers = await self.load_resources()

all_devices: list[AbstractComputeDevice] = []
metadatas: list[AcceleratorMetadata] = []
for name, computer in computers.items():
devices = await computer.list_devices()
all_devices.extend(devices)
alloc_map = await computer.create_alloc_map()
self.computers[name] = ComputerContext(computer, devices, alloc_map)
metadatas.append(computer.get_metadata())

self.total_slots = self.resource_partitioner.calculate_total_slots(
self.computers, self.local_config.resource_common
)
self.reserved_slots = self.resource_partitioner.restrict_computer_resources(
self.computers, self.total_slots
)
self.slots = await self.update_slots()
log.info("Resource slots: {!r}", self.slots)
log.info("Slot types: {!r}", known_slot_types)
self.timer_tasks.append(aiotools.create_timer(self.update_slots_periodically, 30.0))

# Use ValkeyStatClient batch operations for better performance
field_value_map = {}
for metadata in metadatas:
for computer_ctx in self.computers.values():
metadata = computer_ctx.instance.get_metadata()
field_value_map[metadata["slot_name"]] = dump_json_str(metadata).encode()

if field_value_map:
await self.valkey_stat_client.store_computer_metadata(field_value_map)

all_devices = list(
chain.from_iterable(computer.devices for computer in self.computers.values())
)
self.affinity_map = AffinityMap.build(all_devices)

if not self._skip_initial_scan:
Expand Down Expand Up @@ -1947,6 +1960,7 @@ async def load_resources(
"""
Detect available resources attached on the system and load corresponding device plugin.
"""
raise NotImplementedError

@abstractmethod
async def scan_available_resources(
Expand All @@ -1955,6 +1969,7 @@ async def scan_available_resources(
"""
Scan and define the amount of available resource slots in this node.
"""
raise NotImplementedError

async def update_slots(
self,
Expand All @@ -1965,14 +1980,9 @@ async def update_slots(
"""
scanned_slots = await self.scan_available_resources()
usable_slots: dict[SlotName, Decimal] = {}
reserved_slots = {
SlotName("cpu"): Decimal(self.local_config.resource.reserved_cpu),
SlotName("mem"): Decimal(self.local_config.resource.reserved_mem),
SlotName("disk"): Decimal(self.local_config.resource.reserved_disk),
}
for slot_name, slot_capacity in scanned_slots.items():
if slot_name == SlotName("mem"):
mem_reserved = int(reserved_slots.get(slot_name, 0))
mem_reserved = int(self.reserved_slots.get(slot_name, 0))
mem_align = int(self.local_config.resource.memory_align_size)
mem_usable, mem_reserved = align_memory(
int(slot_capacity), mem_reserved, align=mem_align
Expand All @@ -1986,7 +1996,7 @@ async def update_slots(
)
else:
usable_capacity = max(
Decimal(0), slot_capacity - reserved_slots.get(slot_name, Decimal(0))
Decimal(0), slot_capacity - self.reserved_slots.get(slot_name, Decimal(0))
)
usable_slots[slot_name] = usable_capacity
return usable_slots
Expand Down Expand Up @@ -2098,6 +2108,7 @@ async def scan_images(self) -> ScanImagesResult:
This is called periodically to keep the image list up-to-date and allow
manual image addition and deletions by admins.
"""
raise NotImplementedError

async def _scan_images_wrapper(self, interval: float) -> None:
result = await self.scan_images()
Expand All @@ -2118,6 +2129,7 @@ async def push_image(
"""
Push the given image to the given registry.
"""
raise NotImplementedError

@abstractmethod
async def pull_image(
Expand All @@ -2130,12 +2142,14 @@ async def pull_image(
"""
Pull the given image from the given registry.
"""
raise NotImplementedError

@abstractmethod
async def purge_images(self, request: PurgeImagesReq) -> PurgeImagesResp:
    """
    Purge the given images from the agent.

    This is an abstract hook: the base implementation only raises
    :exc:`NotImplementedError`, so concrete agent backends must override it.

    :param request: The purge request (presumably identifying the images to
        remove; see ``PurgeImagesReq`` for the exact fields).
    :return: A ``PurgeImagesResp`` produced by the overriding implementation.
    :raises NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError

async def check_and_pull(
self,
Expand Down Expand Up @@ -2267,7 +2281,7 @@ async def check_image(
Check the availability of the image and return a boolean flag that indicates whether
the agent should try pulling the image from a registry.
"""
return False
raise NotImplementedError

async def scan_running_kernels(self) -> None:
"""
Expand Down Expand Up @@ -3489,6 +3503,7 @@ async def destroy_kernel(
* Send SIGTERM to the kernel's main process.
* Send SIGKILL if it's not terminated within a few seconds.
"""
raise NotImplementedError

@abstractmethod
async def clean_kernel(
Expand All @@ -3512,6 +3527,7 @@ async def clean_kernel(
The ``container_id`` may be ``None`` if the container has already gone away.
In such cases, skip container-specific cleanups.
"""
raise NotImplementedError

@abstractmethod
async def create_local_network(self, network_name: str) -> None:
Expand All @@ -3523,6 +3539,7 @@ async def create_local_network(self, network_name: str) -> None:
It may raise :exc:`NotImplementedError` and then the manager
will cancel creation of the session.
"""
raise NotImplementedError

@abstractmethod
async def destroy_local_network(self, network_name: str) -> None:
Expand All @@ -3531,6 +3548,7 @@ async def destroy_local_network(self, network_name: str) -> None:

This is called by the manager after kernel destruction.
"""
raise NotImplementedError

@abstractmethod
async def restart_kernel__load_config(
Expand All @@ -3541,7 +3559,7 @@ async def restart_kernel__load_config(
"""
Restore the cluster config from a previous launch of the kernel.
"""
pass
raise NotImplementedError

@abstractmethod
async def restart_kernel__store_config(
Expand All @@ -3554,7 +3572,7 @@ async def restart_kernel__store_config(
Store the cluster config to a kernel-related storage (e.g., scratch space),
so that restarts of this kernel can reuse the configuration.
"""
pass
raise NotImplementedError

async def restart_kernel(
self,
Expand Down
11 changes: 11 additions & 0 deletions src/ai/backend/agent/alloc_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,17 @@ def update_affinity_hint(
hint_for_next_allocation.append(dev)
affinity_hint.devices = hint_for_next_allocation

@final
def update_device_slot_amounts(self, slot_amounts: Mapping[SlotName, Decimal]) -> None:
    """
    Rebuild ``self.device_slots`` with amounts taken from *slot_amounts*.

    Every device keeps its current slot type and slot name; only the amount is
    replaced by the value stored in *slot_amounts* under the device's slot name.

    :param slot_amounts: The new amount for each slot name.

    Any lookup error raised by *slot_amounts* for a missing slot name
    propagates to the caller, and ``self.device_slots`` is left unchanged.
    """
    refreshed = {}
    for dev_id, current in self.device_slots.items():
        refreshed[dev_id] = DeviceSlotInfo(
            slot_type=current.slot_type,
            slot_name=current.slot_name,
            amount=slot_amounts[current.slot_name],
        )
    # Swap in the new mapping only after every entry has been built, so a
    # failed lookup never leaves a half-updated map behind.
    self.device_slots = refreshed

@abstractmethod
def allocate(
self,
Expand Down
Loading
Loading