5151 ResourceGroupType ,
5252 ServiceDiscoveryType ,
5353 SlotName ,
54+ SlotNameField ,
5455)
5556from ai .backend .logging import BraceStyleAdapter
5657from ai .backend .logging .config import LoggingConfig
@@ -875,7 +876,49 @@ class ContainerConfig(CommonContainerConfig, OverridableContainerConfig):
875876 pass
876877
877878
878- class CommonResourceConfig (BaseConfigSchema ):
class ResourceAllocationConfig(BaseConfigSchema):
    # Explicit per-agent resource allocations (CPU, memory and device slots).
    # According to the field descriptions below, these values are only
    # consulted when the resource allocation mode is MANUAL.
    # NOTE: kept as comments rather than a class docstring so that the
    # generated pydantic schema description stays unchanged.

    # Number of CPU cores allocated to the agent; required, no default.
    cpu: int = Field(
        description=textwrap.dedent("""
        Hard CPU allocation for this agent (e.g., 8 cores).
        Only used in MANUAL allocation mode.
        All agents must specify this value when allocation-mode is MANUAL.
        """),
        examples=[8, 16],
    )
    # Amount of memory allocated to the agent; required, no default.
    mem: BinarySizeField = Field(
        description=textwrap.dedent("""
        Hard memory allocation for this agent (e.g., "32G").
        Only used in MANUAL allocation mode.
        All agents must specify this value when allocation-mode is MANUAL.
        """),
        examples=["32G", "64G"],
    )
    # Per-slot device allocations keyed by slot name; defaults to an empty mapping.
    devices: Mapping[SlotNameField, Decimal] = Field(
        default_factory=dict,
        description=textwrap.dedent("""
        Device-specific per-slot resource allocations.
        Only used in MANUAL allocation mode.
        """),
        examples=[{"cuda.mem": "0.3", "cuda.shares": "0.5"}],
    )

    model_config = ConfigDict(
        extra="allow",
        arbitrary_types_allowed=True,
    )

    @model_validator(mode="after")
    def validate_values_are_positive(self) -> Self:
        """Reject negative allocation values.

        Despite the method name, zero is accepted: only strictly negative
        values are rejected.

        Returns:
            The validated model instance itself.

        Raises:
            ValueError: If ``cpu``, ``mem`` or any value in ``devices`` is
                negative.
        """
        # ``cpu`` and ``mem`` are required, non-Optional fields, so they are
        # always populated once field validation has succeeded; no ``None``
        # guard is needed in an "after" validator.
        if self.cpu < 0:
            raise ValueError(f"Allocated cpu must not be a negative value, but given {self.cpu}")
        if self.mem < 0:
            raise ValueError(f"Allocated mem must not be a negative value, but given {self.mem}")
        if any(value < 0 for value in self.devices.values()):
            raise ValueError("All allocated device resource values must not be a negative value")
        return self
919+
920+
921+ class ResourceConfig (BaseConfigSchema ):
879922 reserved_cpu : int = Field (
880923 default = 1 ,
881924 description = "The number of CPU cores reserved for the operating system and the agent service." ,
@@ -919,6 +962,13 @@ class CommonResourceConfig(BaseConfigSchema):
919962 validation_alias = AliasChoices ("allocation-mode" , "allocation_mode" ),
920963 serialization_alias = "allocation-mode" ,
921964 )
965+ allocations : Optional [ResourceAllocationConfig ] = Field (
966+ default = None ,
967+ description = textwrap .dedent ("""
968+ Resource allocations.
969+ Only used in MANUAL allocation mode.
970+ """ ),
971+ )
922972 memory_align_size : BinarySizeField = Field (
923973 default = BinarySize .finite_from_str ("16M" ),
924974 description = (
@@ -961,64 +1011,6 @@ def _parse_affinity_policy(cls, v: Any) -> AffinityPolicy:
9611011 return v
9621012
9631013
964- class OverridableResourceConfig (BaseConfigSchema ):
965- allocated_cpu : Optional [int ] = Field (
966- default = None ,
967- description = textwrap .dedent ("""
968- Hard CPU allocation for this agent (e.g., 8 cores).
969- Only used in MANUAL allocation mode.
970- All agents must specify this value when allocation-mode is MANUAL.
971- """ ),
972- examples = [8 , 16 ],
973- validation_alias = AliasChoices ("allocated-cpu" , "allocated_cpu" ),
974- serialization_alias = "allocated-cpu" ,
975- )
976- allocated_mem : Optional [BinarySizeField ] = Field (
977- default = None ,
978- description = textwrap .dedent ("""
979- Hard memory allocation for this agent (e.g., "32G").
980- Only used in MANUAL allocation mode.
981- All agents must specify this value when allocation-mode is MANUAL.
982- """ ),
983- examples = ["32G" , "64G" ],
984- validation_alias = AliasChoices ("allocated-mem" , "allocated_mem" ),
985- serialization_alias = "allocated-mem" ,
986- )
987- allocated_devices : Mapping [SlotName , Decimal ] = Field (
988- default_factory = dict ,
989- description = textwrap .dedent ("""
990- Device-specific per-slot resource allocations.
991- Only used in MANUAL allocation mode.
992- """ ),
993- examples = [{"cuda.mem" : "0.3" , "cuda.shares" : "0.5" }],
994- validation_alias = AliasChoices ("allocated-devices" , "allocated_devices" ),
995- serialization_alias = "allocated-devices" ,
996- )
997-
998- model_config = ConfigDict (
999- extra = "allow" ,
1000- arbitrary_types_allowed = True ,
1001- )
1002-
1003- @model_validator (mode = "after" )
1004- def validate_values_are_positive (self ) -> Self :
1005- if self .allocated_cpu is not None and self .allocated_cpu < 0 :
1006- raise ValueError (
1007- f"Allocated cpu must not be a negative value, but given { self .allocated_cpu } "
1008- )
1009- if self .allocated_mem is not None and self .allocated_mem < 0 :
1010- raise ValueError (
1011- f"Allocated mem must not be a negative value, but given { self .allocated_mem } "
1012- )
1013- if any (value < 0 for value in self .allocated_devices .values ()):
1014- raise ValueError ("All allocated device resource values must not be a negative value" )
1015- return self
1016-
1017-
1018- class ResourceConfig (CommonResourceConfig , OverridableResourceConfig ):
1019- pass
1020-
1021-
10221014class EtcdConfig (BaseConfigSchema ):
10231015 namespace : str = Field (
10241016 description = "Etcd namespace" ,
@@ -1261,11 +1253,11 @@ class AgentOverrideConfig(BaseConfigSchema):
12611253 Only override fields if necessary.
12621254 """ ),
12631255 )
1264- container : OverridableContainerConfig | None = Field (
1256+ container : Optional [ OverridableContainerConfig ] = Field (
12651257 default = None ,
12661258 description = "Container config overrides for the individual agent" ,
12671259 )
1268- resource : OverridableResourceConfig | None = Field (
1260+ resource : Optional [ ResourceAllocationConfig ] = Field (
12691261 default = None ,
12701262 description = "Resource config overrides for the individual agent" ,
12711263 )
@@ -1287,10 +1279,19 @@ def construct_unified_config(self, *, default: AgentUnifiedConfig) -> AgentUnifi
12871279 update = container_override_fields
12881280 )
12891281 if self .resource is not None :
1290- resource_override_fields = self .resource .model_dump (
1291- include = self .resource .model_fields_set
1282+ default_allocations = default .resource .allocations
1283+ override_allocations = self .resource
1284+ if default_allocations is None :
1285+ merged_allocations = override_allocations
1286+ else :
1287+ merged_allocations = default_allocations .model_copy (
1288+ update = override_allocations .model_dump (
1289+ include = override_allocations .model_fields_set
1290+ )
1291+ )
1292+ agent_updates ["resource" ] = default .resource .model_copy (
1293+ update = {"allocations" : merged_allocations }
12921294 )
1293- agent_updates ["resource" ] = default .resource .model_copy (update = resource_override_fields )
12941295 return default .model_copy (update = agent_updates )
12951296
12961297
@@ -1326,10 +1327,6 @@ def agent_default(self) -> OverridableAgentConfig:
13261327 def agent_ids (self ) -> Sequence [AgentId ]:
13271328 return [AgentId (agent_config .agent .id ) for agent_config in self .get_agent_configs ()]
13281329
1329- @property
1330- def resource_common (self ) -> CommonResourceConfig :
1331- return self .resource
1332-
13331330 def get_agent_configs (self ) -> Sequence [AgentUnifiedConfig ]:
13341331 agent_configs = [agent .construct_unified_config (default = self ) for agent in self .agents ]
13351332 if not agent_configs :
@@ -1404,27 +1401,21 @@ def _validate_resource_allocation_mode(self) -> Self:
14041401 match self .resource .allocation_mode :
14051402 case ResourceAllocationMode .SHARED | ResourceAllocationMode .AUTO_SPLIT :
14061403 for config in agent_configs :
1407- resource = config .resource
1408- if any ([
1409- resource .allocated_cpu is not None ,
1410- resource .allocated_mem is not None ,
1411- resource .allocated_devices ,
1412- ]):
1404+ if config .resource .allocations is not None :
14131405 raise ValueError (
14141406 "On non-MANUAL mode, config must not specify manual resource allocations"
14151407 )
14161408
14171409 case ResourceAllocationMode .MANUAL :
1410+ slot_names : list [set [SlotName ]] = []
14181411 for config in agent_configs :
1419- resource = config .resource
1420- if any ([resource .allocated_cpu is None , resource .allocated_mem is None ]):
1412+ if config .resource .allocations is None :
14211413 raise ValueError (
14221414 "On MANUAL mode, config must specify cpu and mem resource allocations"
14231415 )
14241416
1425- slot_names = [
1426- set (config .resource .allocated_devices .keys ()) for config in agent_configs
1427- ]
1417+ slot_names .append (set (config .resource .allocations .devices .keys ()))
1418+
14281419 if not all (slot_name == slot_names [0 ] for slot_name in slot_names ):
14291420 raise ValueError ("All agents must have the same slots defined in the devices!" )
14301421
0 commit comments