lablup · hhoikoo · Oct 21, 2025
diff --git a/changes/6498.feature.md b/changes/6498.feature.md
@@ -0,0 +1 @@
+Add resource isolation options for multi-agent setup
diff --git a/configs/agent/sample.toml b/configs/agent/sample.toml
@@ -147,6 +147,14 @@
 # If agents field is populated, this field indicates the default values for all
 # agents.
 [resource]
+  # Hard CPU allocation for this agent (e.g., 8 cores).
+  # Only used when allocation-mode is `manual`.
+  # All agents must specify this value when allocation-mode is `manual`.
+  ## allocated-cpu = 8
+  # Hard memory allocation for this agent (e.g., "32G").
+  # Only used when allocation-mode is `manual`.
+  # All agents must specify this value when allocation-mode is `manual`.
+  ## allocated-mem = "32G"
   # The number of CPU cores reserved for the operating system and the agent
   # service.
   reserved-cpu = 1
@@ -160,6 +168,12 @@
   # Currently this value is unused. In future releases, it may be used to preserve
   # the minimum disk space from the scratch disk allocation via loopback files.
   reserved-disk = "8G"
+  # Resource allocation mode for multi-agent scenarios.
+  # - `shared`: All agents share the full resource pool (default, backward
+  # compatible).
+  # - `auto-split`: Automatically divide resources equally (1/N) among all agents.
+  # - `manual`: Manually specify per-agent resource allocations via config.
+  allocation-mode = "shared"
   # The alignment of the reported main memory size to absorb tiny deviations from
   # per-node firmware/hardware settings. Recommended to be multiple of the
   # page/hugepage size (e.g., 2 MiB).
@@ -169,6 +183,10 @@
   # Affinity policy
   affinity-policy = "INTERLEAVED"
 
+  # Device-specific per-slot resource allocations.
+  # Only used when allocation-mode is `manual`.
+  [resource.allocated-devices]
+
 # Pyroscope configuration
 [pyroscope]
   # Whether to enable Pyroscope profiling
@@ -415,24 +433,15 @@
 
   # Resource config overrides for the individual agent
   [agents.resource]
-    # The number of CPU cores reserved for the operating system and the agent
-    # service.
-    reserved-cpu = 1
-    # The memory space reserved for the operating system and the agent service. It
-    # is subtracted from the reported main memory size and not available for user
-    # workload allocation. Depending on the memory-align-size option and system
-    # configuration, this may not be the exact value but have slightly less or more
-    # values within the memory-align-size.
-    reserved-mem = 1073741824
-    # The disk space reserved for the operating system and the agent service.
-    # Currently this value is unused. In future releases, it may be used to preserve
-    # the minimum disk space from the scratch disk allocation via loopback files.
-    reserved-disk = 8589934592
-    # The alignment of the reported main memory size to absorb tiny deviations from
-    # per-node firmware/hardware settings. Recommended to be multiple of the
-    # page/hugepage size (e.g., 2 MiB).
-    memory-align-size = 16777216
-    # Resource allocation order
-    allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
-    # Affinity policy
-    affinity-policy = 1
+    # Hard CPU allocation for this agent (e.g., 8 cores).
+    # Only used when allocation-mode is `manual`.
+    # All agents must specify this value when allocation-mode is `manual`.
+    ## allocated-cpu = 8
+    # Hard memory allocation for this agent (e.g., "32G").
+    # Only used when allocation-mode is `manual`.
+    # All agents must specify this value when allocation-mode is `manual`.
+    ## allocated-mem = "32G"
+
+    # Device-specific per-slot resource allocations.
+    # Only used when allocation-mode is `manual`.
+    [agents.resource.allocated-devices]
diff --git a/src/ai/backend/accelerator/cuda_open/plugin.py b/src/ai/backend/accelerator/cuda_open/plugin.py
@@ -251,6 +251,9 @@ async def gather_node_measures(
                 MetricTypes.GAUGE,
                 unit_hint="bytes",
                 stats_filter=frozenset({"max"}),
+                measurement_scale_factor=ctx.agent.resource_partitioner.get_resource_scaling_factor(
+                    SlotName("cuda.device")
+                ),
                 per_node=Measurement(Decimal(mem_used_total), Decimal(mem_avail_total)),
                 per_device=mem_stats,
             ),
@@ -259,6 +262,9 @@ async def gather_node_measures(
                 MetricTypes.UTILIZATION,
                 unit_hint="percent",
                 stats_filter=frozenset({"avg", "max"}),
+                measurement_scale_factor=ctx.agent.resource_partitioner.get_resource_scaling_factor(
+                    SlotName("cuda.device")
+                ),
                 per_node=Measurement(Decimal(util_total), Decimal(dev_count * 100)),
                 per_device=util_stats,
             ),

diff --git a/src/ai/backend/agent/agent.py b/src/ai/backend/agent/agent.py
@@ -30,6 +30,7 @@
 from dataclasses import dataclass
 from decimal import Decimal
 from io import SEEK_END, BytesIO
+from itertools import chain
 from pathlib import Path
 from types import TracebackType
 from typing import (
@@ -174,7 +175,6 @@
 from ai.backend.common.types import (
     MODEL_SERVICE_RUNTIME_PROFILES,
     AbuseReportValue,
-    AcceleratorMetadata,
     AgentId,
     AutoPullBehavior,
     BinarySize,
@@ -233,11 +233,11 @@
 from .observer.heartbeat import HeartbeatObserver
 from .observer.host_port import HostPortObserver
 from .resources import (
-    AbstractComputeDevice,
     AbstractComputePlugin,
     ComputerContext,
     KernelResourceSpec,
     Mount,
+    ResourcePartitioner,
     align_memory,
     allocate,
     known_slot_types,
@@ -765,7 +765,10 @@ class AbstractAgent(
     etcd: AsyncEtcd
     local_instance_id: str
     kernel_registry: MutableMapping[KernelId, AbstractKernel]
+    resource_partitioner: ResourcePartitioner
     computers: MutableMapping[DeviceName, ComputerContext]
+    total_slots: Mapping[SlotName, Decimal]
+    reserved_slots: Mapping[SlotName, Decimal]
     images: Mapping[ImageCanonical, ScannedImage]
     port_pool: set[int]
 
@@ -836,6 +839,7 @@ def __init__(
         error_monitor: ErrorPluginContext,
         skip_initial_scan: bool = False,
         agent_public_key: Optional[PublicKey],
+        resource_partitioner: ResourcePartitioner,
     ) -> None:
         self._skip_initial_scan = skip_initial_scan
         self.loop = current_loop()
@@ -845,7 +849,10 @@ def __init__(
         self.local_instance_id = generate_local_instance_id(__file__)
         self.agent_public_key = agent_public_key
         self.kernel_registry = {}
+        self.resource_partitioner = resource_partitioner
         self.computers = {}
+        self.total_slots = {}
+        self.reserved_slots = {}
         self.images = {}
         self.restarting_kernels = {}
         self.stat_ctx = StatContext(
@@ -932,28 +939,34 @@ async def __ainit__(self) -> None:
         alloc_map_mod.log_alloc_map = self.local_config.debug.log_alloc_map
         computers = await self.load_resources()
 
-        all_devices: list[AbstractComputeDevice] = []
-        metadatas: list[AcceleratorMetadata] = []
         for name, computer in computers.items():
             devices = await computer.list_devices()
-            all_devices.extend(devices)
             alloc_map = await computer.create_alloc_map()
             self.computers[name] = ComputerContext(computer, devices, alloc_map)
-            metadatas.append(computer.get_metadata())
 
+        self.total_slots = self.resource_partitioner.calculate_total_slots(
+            self.computers, self.local_config.resource_common
+        )
+        self.reserved_slots = self.resource_partitioner.restrict_computer_resources(
+            self.computers, self.total_slots
+        )
         self.slots = await self.update_slots()
         log.info("Resource slots: {!r}", self.slots)
         log.info("Slot types: {!r}", known_slot_types)
         self.timer_tasks.append(aiotools.create_timer(self.update_slots_periodically, 30.0))
 
         # Use ValkeyStatClient batch operations for better performance
         field_value_map = {}
-        for metadata in metadatas:
+        for computer_ctx in self.computers.values():
+            metadata = computer_ctx.instance.get_metadata()
             field_value_map[metadata["slot_name"]] = dump_json_str(metadata).encode()
 
         if field_value_map:
             await self.valkey_stat_client.store_computer_metadata(field_value_map)
 
+        all_devices = list(
+            chain.from_iterable(computer.devices for computer in self.computers.values())
+        )
         self.affinity_map = AffinityMap.build(all_devices)
 
         if not self._skip_initial_scan:
@@ -1947,6 +1960,7 @@ async def load_resources(
         """
         Detect available resources attached on the system and load corresponding device plugin.
         """
+        raise NotImplementedError
 
     @abstractmethod
     async def scan_available_resources(
@@ -1955,6 +1969,7 @@ async def scan_available_resources(
         """
         Scan and define the amount of available resource slots in this node.
         """
+        raise NotImplementedError
 
     async def update_slots(
         self,
@@ -1965,14 +1980,9 @@ async def update_slots(
         """
         scanned_slots = await self.scan_available_resources()
         usable_slots: dict[SlotName, Decimal] = {}
-        reserved_slots = {
-            SlotName("cpu"): Decimal(self.local_config.resource.reserved_cpu),
-            SlotName("mem"): Decimal(self.local_config.resource.reserved_mem),
-            SlotName("disk"): Decimal(self.local_config.resource.reserved_disk),
-        }
         for slot_name, slot_capacity in scanned_slots.items():
             if slot_name == SlotName("mem"):
-                mem_reserved = int(reserved_slots.get(slot_name, 0))
+                mem_reserved = int(self.reserved_slots.get(slot_name, 0))
                 mem_align = int(self.local_config.resource.memory_align_size)
                 mem_usable, mem_reserved = align_memory(
                     int(slot_capacity), mem_reserved, align=mem_align
@@ -1986,7 +1996,7 @@ async def update_slots(
                 )
             else:
                 usable_capacity = max(
-                    Decimal(0), slot_capacity - reserved_slots.get(slot_name, Decimal(0))
+                    Decimal(0), slot_capacity - self.reserved_slots.get(slot_name, Decimal(0))
                 )
             usable_slots[slot_name] = usable_capacity
         return usable_slots
@@ -2098,6 +2108,7 @@ async def scan_images(self) -> ScanImagesResult:
         This is called periodically to keep the image list up-to-date and allow
         manual image addition and deletions by admins.
         """
+        raise NotImplementedError
 
     async def _scan_images_wrapper(self, interval: float) -> None:
         result = await self.scan_images()
@@ -2118,6 +2129,7 @@ async def push_image(
         """
         Push the given image to the given registry.
         """
+        raise NotImplementedError
 
     @abstractmethod
     async def pull_image(
@@ -2130,12 +2142,14 @@ async def pull_image(
         """
         Pull the given image from the given registry.
         """
+        raise NotImplementedError
 
     @abstractmethod
     async def purge_images(self, request: PurgeImagesReq) -> PurgeImagesResp:
         """
         Purge the given images from the agent.
         """
+        raise NotImplementedError
 
     async def check_and_pull(
         self,
@@ -2267,7 +2281,7 @@ async def check_image(
         Check the availability of the image and return a boolean flag that indicates whether
         the agent should try pulling the image from a registry.
         """
-        return False
+        raise NotImplementedError
 
     async def scan_running_kernels(self) -> None:
         """
@@ -3489,6 +3503,7 @@ async def destroy_kernel(
         * Send SIGTERM to the kernel's main process.
         * Send SIGKILL if it's not terminated within a few seconds.
         """
+        raise NotImplementedError
 
     @abstractmethod
     async def clean_kernel(
@@ -3512,6 +3527,7 @@ async def clean_kernel(
         The ``container_id`` may be ``None`` if the container has already gone away.
         In such cases, skip container-specific cleanups.
         """
+        raise NotImplementedError
 
     @abstractmethod
     async def create_local_network(self, network_name: str) -> None:
@@ -3523,6 +3539,7 @@ async def create_local_network(self, network_name: str) -> None:
         It may raise :exc:`NotImplementedError` and then the manager
         will cancel creation of the session.
         """
+        raise NotImplementedError
 
     @abstractmethod
     async def destroy_local_network(self, network_name: str) -> None:
@@ -3531,6 +3548,7 @@ async def destroy_local_network(self, network_name: str) -> None:
 
         This is called by the manager after kernel destruction.
         """
+        raise NotImplementedError
 
     @abstractmethod
     async def restart_kernel__load_config(
@@ -3541,7 +3559,7 @@ async def restart_kernel__load_config(
         """
         Restore the cluster config from a previous launch of the kernel.
         """
-        pass
+        raise NotImplementedError
 
     @abstractmethod
     async def restart_kernel__store_config(
@@ -3554,7 +3572,7 @@ async def restart_kernel__store_config(
         Store the cluster config to a kernel-related storage (e.g., scratch space),
         so that restarts of this kernel can reuse the configuration.
         """
-        pass
+        raise NotImplementedError
 
     async def restart_kernel(
         self,

diff --git a/src/ai/backend/agent/alloc_map.py b/src/ai/backend/agent/alloc_map.py
@@ -236,6 +236,17 @@ def update_affinity_hint(
                     hint_for_next_allocation.append(dev)
         affinity_hint.devices = hint_for_next_allocation
 
+    @final
+    def update_device_slot_amounts(self, slot_amounts: Mapping[SlotName, Decimal]) -> None:
+        self.device_slots = {
+            device_id: DeviceSlotInfo(
+                slot_type=slot_info.slot_type,
+                slot_name=slot_info.slot_name,
+                amount=slot_amounts[slot_info.slot_name],
+            )
+            for device_id, slot_info in self.device_slots.items()
+        }
+
     @abstractmethod
     def allocate(
         self,
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Add resource isolation options for multi-agent setup