Skip to content

Commit 80a5685

Browse files
committed
feat(BA-3024): Treat resource allocation as a single block in config
1 parent 574496c commit 80a5685

File tree

5 files changed

+236
-178
lines changed

5 files changed

+236
-178
lines changed

configs/agent/sample.toml

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -143,14 +143,6 @@
143143
# If agents field is populated, this field indicates the default values for all
144144
# agents.
145145
[resource]
146-
# Hard CPU allocation for this agent (e.g., 8 cores).
147-
# Only used in MANUAL allocation mode.
148-
# All agents must specify this value when allocation-mode is MANUAL.
149-
## allocated-cpu = 8
150-
# Hard memory allocation for this agent (e.g., "32G").
151-
# Only used in MANUAL allocation mode.
152-
# All agents must specify this value when allocation-mode is MANUAL.
153-
## allocated-mem = "32G"
154146
# The number of CPU cores reserved for the operating system and the agent
155147
# service.
156148
reserved-cpu = 1
@@ -179,9 +171,21 @@
179171
# Affinity policy
180172
affinity-policy = "INTERLEAVED"
181173

182-
# Device-specific per-slot resource allocations.
174+
# Resource allocations.
183175
# Only used in MANUAL allocation mode.
184-
[resource.allocated-devices]
176+
[resource.allocations]
177+
# Hard CPU allocation for this agent (e.g., 8 cores).
178+
# Only used in MANUAL allocation mode.
179+
# All agents must specify this value when allocation-mode is MANUAL.
180+
cpu = 8
181+
# Hard memory allocation for this agent (e.g., "32G").
182+
# Only used in MANUAL allocation mode.
183+
# All agents must specify this value when allocation-mode is MANUAL.
184+
mem = "32G"
185+
186+
# Device-specific per-slot resource allocations.
187+
# Only used in MANUAL allocation mode.
188+
[resource.allocations.devices]
185189

186190
# Pyroscope configuration
187191
[pyroscope]
@@ -401,7 +405,7 @@
401405
# late into the agent's runtime.
402406
port-range = [ 30000, 31000,]
403407
# Statistics type
404-
## stats-type = "cgroup"
408+
## stats-type = "docker"
405409
# Sandbox type
406410
sandbox-type = "docker"
407411
# Jail arguments
@@ -411,7 +415,7 @@
411415
# Scratch root directory
412416
scratch-root = "scratches"
413417
# Scratch size
414-
scratch-size = 0
418+
scratch-size = "0"
415419
# Scratch NFS address
416420
## scratch-nfs-address = "192.168.1.100:/export"
417421
# Scratch NFS options
@@ -430,12 +434,12 @@
430434
# Hard CPU allocation for this agent (e.g., 8 cores).
431435
# Only used in MANUAL allocation mode.
432436
# All agents must specify this value when allocation-mode is MANUAL.
433-
## allocated-cpu = 8
437+
cpu = 8
434438
# Hard memory allocation for this agent (e.g., "32G").
435439
# Only used in MANUAL allocation mode.
436440
# All agents must specify this value when allocation-mode is MANUAL.
437-
## allocated-mem = "32G"
441+
mem = "32G"
438442

439443
# Device-specific per-slot resource allocations.
440444
# Only used in MANUAL allocation mode.
441-
[agents.resource.allocated-devices]
445+
[agents.resource.devices]

src/ai/backend/agent/config/unified.py

Lines changed: 70 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
ResourceGroupType,
5252
ServiceDiscoveryType,
5353
SlotName,
54+
SlotNameField,
5455
)
5556
from ai.backend.logging import BraceStyleAdapter
5657
from ai.backend.logging.config import LoggingConfig
@@ -875,7 +876,49 @@ class ContainerConfig(CommonContainerConfig, OverridableContainerConfig):
875876
pass
876877

877878

878-
class CommonResourceConfig(BaseConfigSchema):
879+
class ResourceAllocationConfig(BaseConfigSchema):
    """Hard resource allocations for an agent, expressed as a single block.

    ``cpu`` and ``mem`` are required whenever this block is present, while
    ``devices`` defaults to an empty mapping.  Unknown extra keys are accepted
    (``extra="allow"``).
    """

    cpu: int = Field(
        description=textwrap.dedent("""
        Hard CPU allocation for this agent (e.g., 8 cores).
        Only used in MANUAL allocation mode.
        All agents must specify this value when allocation-mode is MANUAL.
        """),
        examples=[8, 16],
    )
    mem: BinarySizeField = Field(
        description=textwrap.dedent("""
        Hard memory allocation for this agent (e.g., "32G").
        Only used in MANUAL allocation mode.
        All agents must specify this value when allocation-mode is MANUAL.
        """),
        examples=["32G", "64G"],
    )
    devices: Mapping[SlotNameField, Decimal] = Field(
        default_factory=dict,
        description=textwrap.dedent("""
        Device-specific per-slot resource allocations.
        Only used in MANUAL allocation mode.
        """),
        examples=[{"cuda.mem": "0.3", "cuda.shares": "0.5"}],
    )

    model_config = ConfigDict(
        extra="allow",
        arbitrary_types_allowed=True,
    )

    @model_validator(mode="after")
    def validate_values_are_positive(self) -> Self:
        """Reject negative allocation values.

        Despite the method name, zero is accepted; only values strictly below
        zero are refused.

        Returns:
            The validated model instance itself.

        Raises:
            ValueError: If ``cpu``, ``mem``, or any value in ``devices`` is
                negative.
        """
        # ``cpu`` and ``mem`` are required, non-Optional fields, so by the time
        # an "after" validator runs they always hold a value; no None guard is
        # needed before comparing.
        if self.cpu < 0:
            raise ValueError(f"Allocated cpu must not be a negative value, but given {self.cpu}")
        if self.mem < 0:
            raise ValueError(f"Allocated mem must not be a negative value, but given {self.mem}")
        if any(value < 0 for value in self.devices.values()):
            raise ValueError("All allocated device resource values must not be a negative value")
        return self
919+
920+
921+
class ResourceConfig(BaseConfigSchema):
879922
reserved_cpu: int = Field(
880923
default=1,
881924
description="The number of CPU cores reserved for the operating system and the agent service.",
@@ -919,6 +962,13 @@ class CommonResourceConfig(BaseConfigSchema):
919962
validation_alias=AliasChoices("allocation-mode", "allocation_mode"),
920963
serialization_alias="allocation-mode",
921964
)
965+
allocations: Optional[ResourceAllocationConfig] = Field(
966+
default=None,
967+
description=textwrap.dedent("""
968+
Resource allocations.
969+
Only used in MANUAL allocation mode.
970+
"""),
971+
)
922972
memory_align_size: BinarySizeField = Field(
923973
default=BinarySize.finite_from_str("16M"),
924974
description=(
@@ -961,64 +1011,6 @@ def _parse_affinity_policy(cls, v: Any) -> AffinityPolicy:
9611011
return v
9621012

9631013

964-
class OverridableResourceConfig(BaseConfigSchema):
965-
allocated_cpu: Optional[int] = Field(
966-
default=None,
967-
description=textwrap.dedent("""
968-
Hard CPU allocation for this agent (e.g., 8 cores).
969-
Only used in MANUAL allocation mode.
970-
All agents must specify this value when allocation-mode is MANUAL.
971-
"""),
972-
examples=[8, 16],
973-
validation_alias=AliasChoices("allocated-cpu", "allocated_cpu"),
974-
serialization_alias="allocated-cpu",
975-
)
976-
allocated_mem: Optional[BinarySizeField] = Field(
977-
default=None,
978-
description=textwrap.dedent("""
979-
Hard memory allocation for this agent (e.g., "32G").
980-
Only used in MANUAL allocation mode.
981-
All agents must specify this value when allocation-mode is MANUAL.
982-
"""),
983-
examples=["32G", "64G"],
984-
validation_alias=AliasChoices("allocated-mem", "allocated_mem"),
985-
serialization_alias="allocated-mem",
986-
)
987-
allocated_devices: Mapping[SlotName, Decimal] = Field(
988-
default_factory=dict,
989-
description=textwrap.dedent("""
990-
Device-specific per-slot resource allocations.
991-
Only used in MANUAL allocation mode.
992-
"""),
993-
examples=[{"cuda.mem": "0.3", "cuda.shares": "0.5"}],
994-
validation_alias=AliasChoices("allocated-devices", "allocated_devices"),
995-
serialization_alias="allocated-devices",
996-
)
997-
998-
model_config = ConfigDict(
999-
extra="allow",
1000-
arbitrary_types_allowed=True,
1001-
)
1002-
1003-
@model_validator(mode="after")
1004-
def validate_values_are_positive(self) -> Self:
1005-
if self.allocated_cpu is not None and self.allocated_cpu < 0:
1006-
raise ValueError(
1007-
f"Allocated cpu must not be a negative value, but given {self.allocated_cpu}"
1008-
)
1009-
if self.allocated_mem is not None and self.allocated_mem < 0:
1010-
raise ValueError(
1011-
f"Allocated mem must not be a negative value, but given {self.allocated_mem}"
1012-
)
1013-
if any(value < 0 for value in self.allocated_devices.values()):
1014-
raise ValueError("All allocated device resource values must not be a negative value")
1015-
return self
1016-
1017-
1018-
class ResourceConfig(CommonResourceConfig, OverridableResourceConfig):
1019-
pass
1020-
1021-
10221014
class EtcdConfig(BaseConfigSchema):
10231015
namespace: str = Field(
10241016
description="Etcd namespace",
@@ -1261,11 +1253,11 @@ class AgentOverrideConfig(BaseConfigSchema):
12611253
Only override fields if necessary.
12621254
"""),
12631255
)
1264-
container: OverridableContainerConfig | None = Field(
1256+
container: Optional[OverridableContainerConfig] = Field(
12651257
default=None,
12661258
description="Container config overrides for the individual agent",
12671259
)
1268-
resource: OverridableResourceConfig | None = Field(
1260+
resource: Optional[ResourceAllocationConfig] = Field(
12691261
default=None,
12701262
description="Resource config overrides for the individual agent",
12711263
)
@@ -1287,10 +1279,19 @@ def construct_unified_config(self, *, default: AgentUnifiedConfig) -> AgentUnifi
12871279
update=container_override_fields
12881280
)
12891281
if self.resource is not None:
1290-
resource_override_fields = self.resource.model_dump(
1291-
include=self.resource.model_fields_set
1282+
default_allocations = default.resource.allocations
1283+
override_allocations = self.resource
1284+
if default_allocations is None:
1285+
merged_allocations = override_allocations
1286+
else:
1287+
merged_allocations = default_allocations.model_copy(
1288+
update=override_allocations.model_dump(
1289+
include=override_allocations.model_fields_set
1290+
)
1291+
)
1292+
agent_updates["resource"] = default.resource.model_copy(
1293+
update={"allocations": merged_allocations}
12921294
)
1293-
agent_updates["resource"] = default.resource.model_copy(update=resource_override_fields)
12941295
return default.model_copy(update=agent_updates)
12951296

12961297

@@ -1326,10 +1327,6 @@ def agent_default(self) -> OverridableAgentConfig:
13261327
def agent_ids(self) -> Sequence[AgentId]:
13271328
return [AgentId(agent_config.agent.id) for agent_config in self.get_agent_configs()]
13281329

1329-
@property
1330-
def resource_common(self) -> CommonResourceConfig:
1331-
return self.resource
1332-
13331330
def get_agent_configs(self) -> Sequence[AgentUnifiedConfig]:
13341331
agent_configs = [agent.construct_unified_config(default=self) for agent in self.agents]
13351332
if not agent_configs:
@@ -1404,27 +1401,21 @@ def _validate_resource_allocation_mode(self) -> Self:
14041401
match self.resource.allocation_mode:
14051402
case ResourceAllocationMode.SHARED | ResourceAllocationMode.AUTO_SPLIT:
14061403
for config in agent_configs:
1407-
resource = config.resource
1408-
if any([
1409-
resource.allocated_cpu is not None,
1410-
resource.allocated_mem is not None,
1411-
resource.allocated_devices,
1412-
]):
1404+
if config.resource.allocations is not None:
14131405
raise ValueError(
14141406
"On non-MANUAL mode, config must not specify manual resource allocations"
14151407
)
14161408

14171409
case ResourceAllocationMode.MANUAL:
1410+
slot_names: list[set[SlotName]] = []
14181411
for config in agent_configs:
1419-
resource = config.resource
1420-
if any([resource.allocated_cpu is None, resource.allocated_mem is None]):
1412+
if config.resource.allocations is None:
14211413
raise ValueError(
14221414
"On MANUAL mode, config must specify cpu and mem resource allocations"
14231415
)
14241416

1425-
slot_names = [
1426-
set(config.resource.allocated_devices.keys()) for config in agent_configs
1427-
]
1417+
slot_names.append(set(config.resource.allocations.devices.keys()))
1418+
14281419
if not all(slot_name == slot_names[0] for slot_name in slot_names):
14291420
raise ValueError("All agents must have the same slots defined in the devices!")
14301421

src/ai/backend/common/configs/sample_generator.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,10 @@ def _dump_toml_scalar(
124124
if ctx is not None:
125125
match ctx.hint:
126126
case "BinarySize":
127-
value = f"{BinarySize(value):s}".upper()
127+
if isinstance(value, BinarySize):
128+
value = f"{value:s}".upper()
129+
else:
130+
value = f"{BinarySize.from_str(str(value)):s}".upper()
128131
case "HostPortPair":
129132
value = {"host": value["host"], "port": value["port"]}
130133
case "EnumByValue":

src/ai/backend/common/types.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@
101101
"ResourceSlot",
102102
"ResourceGroupType",
103103
"SlotName",
104+
"SlotNameField",
104105
"SlotTypes",
105106
"IntrinsicSlotNames",
106107
"DefaultForUnspecified",
@@ -365,6 +366,17 @@ def is_accelerator(self) -> bool:
365366
return False
366367

367368

369+
def _validate_slot_name(v: Any) -> SlotName:
    """Coerce *v* into a ``SlotName``.

    An existing ``SlotName`` instance is returned unchanged; any other value is
    passed to the ``SlotName`` constructor.
    """
    return v if isinstance(v, SlotName) else SlotName(v)
374+
375+
376+
# Annotated SlotName type for pydantic fields: raw values are coerced via _validate_slot_name.
377+
SlotNameField = Annotated[SlotName, PlainValidator(_validate_slot_name)]
378+
379+
368380
MetricKey = NewType("MetricKey", str)
369381

370382
AccessKey = NewType("AccessKey", str)

0 commit comments

Comments
 (0)