Add simplified Training Options for TrainJob labels and annotations

abhijeet-dhumal · abhijeet-dhumal · commit 2d7a8e6c2064 · 2025-09-15T11:04:39.000+05:30
Signed-off-by: Abhijeet Dhumal <abdhumal@redhat.com>
diff --git a/kubeflow/trainer/api/trainer_client.py b/kubeflow/trainer/api/trainer_client.py
@@ -13,10 +13,10 @@
 # limitations under the License.
 
 import logging
-from typing import Optional, Union, Iterator
-
+from typing import List, Optional, Union, Iterator
 from kubeflow.trainer.constants import constants
 from kubeflow.trainer.types import types
+from kubeflow.trainer.options.options import Option
 from kubeflow.trainer.backends.kubernetes.backend import KubernetesBackend
 from kubeflow.trainer.backends.kubernetes.types import KubernetesBackendConfig
 from kubeflow.trainer.backends.localprocess.backend import LocalProcessBackend
@@ -93,6 +93,7 @@ def train(
         runtime: Optional[types.Runtime] = None,
         initializer: Optional[types.Initializer] = None,
         trainer: Optional[Union[types.CustomTrainer, types.BuiltinTrainer]] = None,
+        options: Optional[List[Option]] = None,
     ) -> str:
         """Create a TrainJob. You can configure the TrainJob using one of these trainers:
 
@@ -107,6 +108,8 @@ def train(
             initializer: Optional configuration for the dataset and model initializers.
             trainer: Optional configuration for a CustomTrainer or BuiltinTrainer. If not specified,
                 the TrainJob will use the runtime's default values.
+            options: Optional list of configuration options to apply to the TrainJob. Use
+                WithLabels and WithAnnotations for basic metadata configuration.
 
         Returns:
             The unique name of the TrainJob that has been generated.
@@ -116,7 +119,24 @@ def train(
             TimeoutError: Timeout to create TrainJobs.
             RuntimeError: Failed to create TrainJobs.
         """
-        return self.backend.train(runtime=runtime, initializer=initializer, trainer=trainer)
+        job_spec = {}
+
+        if options:
+            for option in options:
+                option.apply(job_spec)
+
+        metadata_section = job_spec.get("metadata", {})
+
+        labels = metadata_section.get("labels") or None
+        annotations = metadata_section.get("annotations") or None
+
+        return self.backend.train(
+            runtime=runtime,
+            initializer=initializer,
+            trainer=trainer,
+            labels=labels,
+            annotations=annotations,
+        )
 
     def list_jobs(self, runtime: Optional[types.Runtime] = None) -> list[types.TrainJob]:
         """List of the created TrainJobs. If a runtime is specified, only TrainJobs associated with
diff --git a/kubeflow/trainer/backends/kubernetes/backend.py b/kubeflow/trainer/backends/kubernetes/backend.py
@@ -19,7 +19,7 @@
 import string
 import time
 import uuid
-from typing import Optional, Union, Iterator
+from typing import Dict, Optional, Union, Iterator
 import re
 
 from kubeflow.trainer.constants import constants
@@ -181,6 +181,8 @@ def train(
         runtime: Optional[types.Runtime] = None,
         initializer: Optional[types.Initializer] = None,
         trainer: Optional[Union[types.CustomTrainer, types.BuiltinTrainer]] = None,
+        labels: Optional[Dict[str, str]] = None,
+        annotations: Optional[Dict[str, str]] = None,
     ) -> str:
         if runtime is None:
             runtime = self.get_runtime(constants.TORCH_RUNTIME)
@@ -216,7 +218,11 @@ def train(
         train_job = models.TrainerV1alpha1TrainJob(
             apiVersion=constants.API_VERSION,
             kind=constants.TRAINJOB_KIND,
-            metadata=models.IoK8sApimachineryPkgApisMetaV1ObjectMeta(name=train_job_name),
+            metadata=models.IoK8sApimachineryPkgApisMetaV1ObjectMeta(
+                name=train_job_name,
+                labels=labels,
+                annotations=annotations
+            ),
             spec=models.TrainerV1alpha1TrainJobSpec(
                 runtimeRef=models.TrainerV1alpha1RuntimeRef(name=runtime.name),
                 trainer=(trainer_crd if trainer_crd != models.TrainerV1alpha1Trainer() else None),
diff --git a/kubeflow/trainer/backends/kubernetes/backend_test.py b/kubeflow/trainer/backends/kubernetes/backend_test.py
@@ -25,14 +25,15 @@
 import string
 import uuid
 from dataclasses import asdict
-from typing import Optional
+from typing import Dict, Optional
 from unittest.mock import Mock, patch
 
 import pytest
 from kubeflow_trainer_api import models
 
 from kubeflow.trainer.constants import constants
 from kubeflow.trainer.types import types
+from kubeflow.trainer.options import WithLabels, WithAnnotations
 from kubeflow.trainer.utils import utils
 from kubeflow.trainer.backends.kubernetes.backend import KubernetesBackend
 from kubeflow.trainer.backends.kubernetes.types import KubernetesBackendConfig
@@ -253,14 +254,20 @@ def get_train_job(
     runtime_name: str,
     train_job_name: str = BASIC_TRAIN_JOB_NAME,
     train_job_trainer: Optional[models.TrainerV1alpha1Trainer] = None,
+    labels: Optional[Dict[str, str]] = None,
+    annotations: Optional[Dict[str, str]] = None,
 ) -> models.TrainerV1alpha1TrainJob:
     """
     Create a mock TrainJob object with optional trainer configurations.
     """
     train_job = models.TrainerV1alpha1TrainJob(
         apiVersion=constants.API_VERSION,
         kind=constants.TRAINJOB_KIND,
-        metadata=models.IoK8sApimachineryPkgApisMetaV1ObjectMeta(name=train_job_name),
+        metadata=models.IoK8sApimachineryPkgApisMetaV1ObjectMeta(
+            name=train_job_name,
+            labels=labels,
+            annotations=annotations
+        ),
         spec=models.TrainerV1alpha1TrainJobSpec(
             runtimeRef=models.TrainerV1alpha1RuntimeRef(name=runtime_name),
             trainer=train_job_trainer,
@@ -788,7 +795,73 @@ def test_get_runtime_packages(kubernetes_backend, test_case):
             },
             expected_error=ValueError,
         ),
-
+        TestCase(
+            name="valid flow with labels and annotations",
+            expected_status=SUCCESS,
+            config={
+                "labels": {"kueue.x-k8s.io/queue-name": "ml-queue", "team": "ml-engineering"},
+                "annotations": {"experiment.id": "exp-001", "description": "Test training job"},
+            },
+            expected_output=get_train_job(
+                runtime_name=TORCH_RUNTIME,
+                train_job_name=BASIC_TRAIN_JOB_NAME,
+                labels={"kueue.x-k8s.io/queue-name": "ml-queue", "team": "ml-engineering"},
+                annotations={"experiment.id": "exp-001", "description": "Test training job"},
+            ),
+        ),
+        TestCase(
+            name="valid flow with only labels",
+            expected_status=SUCCESS,
+            config={
+                "labels": {"priority": "high"},
+            },
+            expected_output=get_train_job(
+                runtime_name=TORCH_RUNTIME,
+                train_job_name=BASIC_TRAIN_JOB_NAME,
+                labels={"priority": "high"},
+            ),
+        ),
+        TestCase(
+            name="valid flow with only annotations",
+            expected_status=SUCCESS,
+            config={
+                "annotations": {"created-by": "training-pipeline"},
+            },
+            expected_output=get_train_job(
+                runtime_name=TORCH_RUNTIME,
+                train_job_name=BASIC_TRAIN_JOB_NAME,
+                annotations={"created-by": "training-pipeline"},
+            ),
+        ),
+        # Test cases using the new Options pattern
+        TestCase(
+            name="valid flow with WithLabels option",
+            expected_status=SUCCESS,
+            config={
+                "options": [WithLabels({"team": "ml-platform", "project": "training"})],
+            },
+            expected_output=get_train_job(
+                runtime_name=TORCH_RUNTIME,
+                train_job_name=BASIC_TRAIN_JOB_NAME,
+                labels={"team": "ml-platform", "project": "training"},
+            ),
+        ),
+        TestCase(
+            name="valid flow with multiple options",
+            expected_status=SUCCESS,
+            config={
+                "options": [
+                    WithLabels({"team": "ml-platform"}),
+                    WithAnnotations({"created-by": "sdk"}),
+                ],
+            },
+            expected_output=get_train_job(
+                runtime_name=TORCH_RUNTIME,
+                train_job_name=BASIC_TRAIN_JOB_NAME,
+                labels={"team": "ml-platform"},
+                annotations={"created-by": "sdk"},
+            ),
+        ),
     ],
 )
 def test_train(kubernetes_backend, test_case):
@@ -798,8 +871,38 @@ def test_train(kubernetes_backend, test_case):
         kubernetes_backend.namespace = test_case.config.get("namespace", DEFAULT_NAMESPACE)
         runtime = kubernetes_backend.get_runtime(test_case.config.get("runtime", TORCH_RUNTIME))
 
+        job_spec = {}
+
+        options = test_case.config.get("options", None)
+        if options:
+            for option in options:
+                option.apply(job_spec)
+
+        metadata_section = job_spec.get("metadata", {})
+
+        labels = metadata_section.get("labels") or None
+        annotations = metadata_section.get("annotations") or None
+
+        # Merge individual parameters with options
+        individual_labels = test_case.config.get("labels", None)
+        individual_annotations = test_case.config.get("annotations", None)
+
+        if individual_labels:
+            if labels:
+                labels.update(individual_labels)
+            else:
+                labels = individual_labels
+        if individual_annotations:
+            if annotations:
+                annotations.update(individual_annotations)
+            else:
+                annotations = individual_annotations
+
         train_job_name = kubernetes_backend.train(
-            runtime=runtime, trainer=test_case.config.get("trainer", None)
+            runtime=runtime,
+            trainer=test_case.config.get("trainer", None),
+            labels=labels,
+            annotations=annotations,
         )
 
         assert test_case.expected_status == SUCCESS
diff --git a/kubeflow/trainer/options/__init__.py b/kubeflow/trainer/options/__init__.py
@@ -0,0 +1,25 @@
+# Copyright 2025 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from kubeflow.trainer.options.options import (
+    Option,
+    WithAnnotations,
+    WithLabels,
+)
+
+__all__ = [
+    "Option",
+    "WithAnnotations",
+    "WithLabels",
+]
diff --git a/kubeflow/trainer/options/options.py b/kubeflow/trainer/options/options.py
@@ -0,0 +1,76 @@
+# Copyright 2025 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Dict
+
+
+class Option(ABC):
+    """Abstract base class for TrainJob configuration options.
+
+    Each concrete option implements apply(), which mutates the job specification
+    dictionary it is given in place; several options can target the same dictionary.
+    """
+
+    @abstractmethod
+    def apply(self, job_spec: dict) -> None:
+        """Apply this option to the TrainJob specification, mutating it in place.
+
+        Args:
+            job_spec: The TrainJob specification dictionary to modify.
+        """
+        pass
+
+
+@dataclass
+class WithLabels(Option):
+    """Add labels to the TrainJob resource metadata (.metadata.labels).
+
+    apply() merges these labels into job_spec["metadata"]["labels"], creating the
+    nested dictionaries when absent; on a key clash the value from this option wins.
+
+    Args:
+        labels: Dictionary of labels to apply to the TrainJob metadata.
+    """
+
+    labels: Dict[str, str]
+
+    def apply(self, job_spec: dict) -> None:
+        """Merge self.labels into job_spec["metadata"]["labels"] in place."""
+        metadata = job_spec.setdefault("metadata", {})
+        existing_labels = metadata.setdefault("labels", {})
+        existing_labels.update(self.labels)
+
+
+@dataclass
+class WithAnnotations(Option):
+    """Add annotations to the TrainJob resource metadata (.metadata.annotations).
+
+    apply() merges these annotations into job_spec["metadata"]["annotations"], creating
+    the nested dictionaries when absent; on a key clash this option's value wins.
+
+    Args:
+        annotations: Dictionary of annotations to apply to the TrainJob metadata.
+    """
+
+    annotations: Dict[str, str]
+
+    def apply(self, job_spec: dict) -> None:
+        """Merge self.annotations into job_spec["metadata"]["annotations"] in place."""
+        metadata = job_spec.setdefault("metadata", {})
+        existing_annotations = metadata.setdefault("annotations", {})
+        existing_annotations.update(self.annotations)
+
+
diff --git a/kubeflow/trainer/types/__init__.py b/kubeflow/trainer/types/__init__.py
@@ -0,0 +1,49 @@
+# Copyright 2024 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from kubeflow.trainer.types.types import (
+    BuiltinTrainer,
+    CustomTrainer,
+    DataFormat,
+    DataType,
+    HuggingFaceDatasetInitializer,
+    HuggingFaceModelInitializer,
+    Initializer,
+    Loss,
+    Runtime,
+    RuntimeTrainer,
+    Step,
+    TorchTuneConfig,
+    TorchTuneInstructDataset,
+    TrainJob,
+    TrainerType,
+)
+
+__all__ = [
+    "BuiltinTrainer",
+    "CustomTrainer",
+    "DataFormat",
+    "DataType",
+    "HuggingFaceDatasetInitializer",
+    "HuggingFaceModelInitializer",
+    "Initializer",
+    "Loss",
+    "Runtime",
+    "RuntimeTrainer",
+    "Step",
+    "TorchTuneConfig",
+    "TorchTuneInstructDataset",
+    "TrainJob",
+    "TrainerType",
+]
diff --git a/uv.lock b/uv.lock