2 changes: 1 addition & 1 deletion apiserver/pkg/server/ray_job_submission_service_server.go
@@ -41,7 +41,7 @@ type RayJobSubmissionServiceServer struct {
// Create RayJobSubmissionServiceServer
func NewRayJobSubmissionServiceServer(clusterServer *ClusterServer, options *RayJobSubmissionServiceServerOptions) *RayJobSubmissionServiceServer {
zl := zerolog.New(os.Stdout).Level(zerolog.DebugLevel)
-return &RayJobSubmissionServiceServer{clusterServer: clusterServer, options: options, log: zerologr.New(&zl).WithName("jobsubmissionservice"), dashboardClientFunc: utils.GetRayDashboardClientFunc(nil, false)}
+return &RayJobSubmissionServiceServer{clusterServer: clusterServer, options: options, log: zerologr.New(&zl).WithName("jobsubmissionservice"), dashboardClientFunc: utils.GetRayDashboardClientFunc(context.Background(), nil, false, false)}
}

// Submit Ray job
1 change: 1 addition & 0 deletions go.mod
@@ -67,6 +67,7 @@ require (
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect
github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect
+github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
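The new golang-lru/v2 entry above is presumably pulled in for the job-info cache that the background-goroutine path writes into. A minimal, self-contained sketch of that pattern in Go; the JobInfo type and the cache-key format are placeholders rather than KubeRay's actual types:

package main

import (
	"fmt"

	lru "github.com/hashicorp/golang-lru/v2"
)

// JobInfo is a placeholder for the dashboard client's job-info struct.
type JobInfo struct {
	JobStatus string
}

func main() {
	// A bounded LRU keeps memory flat even if many RayJobs come and go.
	cache, err := lru.New[string, *JobInfo](128)
	if err != nil {
		panic(err)
	}

	// The background fetcher would Add entries keyed by something like namespace/cluster/jobId.
	cache.Add("default/raycluster-sample/rayjob-abc123", &JobInfo{JobStatus: "RUNNING"})

	// The reconciler side would Get from the cache instead of calling the dashboard inline.
	if info, ok := cache.Get("default/raycluster-sample/rayjob-abc123"); ok {
		fmt.Println("cached status:", info.JobStatus)
	}
}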
2 changes: 2 additions & 0 deletions go.sum

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions helm-chart/kuberay-operator/README.md
@@ -192,6 +192,7 @@ spec:
| rbacEnable | bool | `true` | If rbacEnable is set to false, no RBAC resources will be created, including the Role for leader election, the Role for Pods and Services, and so on. |
| crNamespacedRbacEnable | bool | `true` | When crNamespacedRbacEnable is set to true, the KubeRay operator will create a Role for RayCluster preparation (e.g., Pods, Services) and a corresponding RoleBinding for each namespace listed in the "watchNamespace" parameter. Please note that even if crNamespacedRbacEnable is set to false, the Role and RoleBinding for leader election will still be created. Note: (1) This variable is only effective when rbacEnable and singleNamespaceInstall are both set to true. (2) In most cases, it should be set to true, unless you are using a Kubernetes cluster managed by GitOps tools such as ArgoCD. |
| singleNamespaceInstall | bool | `false` | When singleNamespaceInstall is true: - Install namespaced RBAC resources such as Role and RoleBinding instead of cluster-scoped ones like ClusterRole and ClusterRoleBinding so that the chart can be installed by users with permissions restricted to a single namespace. (Please note that this excludes the CRDs, which can only be installed at the cluster scope.) - If "watchNamespace" is not set, the KubeRay operator will, by default, only listen to resource events within its own namespace. |
+| useBackgroundGoroutine | bool | `false` | When useBackgroundGoroutine is set to true, the KubeRay operator uses a background goroutine to fetch job info from Ray clusters for RayJob status updates. |
| env | string | `nil` | Environment variables. |
| resources | object | `{"limits":{"cpu":"100m","memory":"512Mi"}}` | Resource requests and limits for containers. |
| livenessProbe.initialDelaySeconds | int | `10` | |
3 changes: 3 additions & 0 deletions helm-chart/kuberay-operator/templates/deployment.yaml
@@ -152,6 +152,9 @@ spec:
{{- $argList = append $argList (printf "--burst=%v" .Values.kubeClient.burst) -}}
{{- end -}}
{{- end -}}
+{{- if hasKey .Values "useBackgroundGoroutine" -}}
+{{- $argList = append $argList (printf "--use-background-goroutine=%t" .Values.useBackgroundGoroutine) -}}
+{{- end -}}
{{- (printf "\n") -}}
{{- $argList | toYaml | indent 12 }}
ports:
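On the operator side, the --use-background-goroutine argument appended by this template is presumably parsed in main and copied into the Configuration field added later in this diff. A sketch of that wiring with Go's standard flag package, illustrative only rather than KubeRay's actual main.go:

package main

import (
	"flag"
	"fmt"
)

// Configuration stands in for apis/config/v1alpha1.Configuration, reduced to the new field.
type Configuration struct {
	UseBackgroundGoroutine bool
}

func main() {
	config := Configuration{}
	flag.BoolVar(&config.UseBackgroundGoroutine, "use-background-goroutine", false,
		"Fetch RayJob job info with a background goroutine instead of inline dashboard calls.")
	flag.Parse()

	// With the Helm value set to true, the container args include --use-background-goroutine=true.
	fmt.Printf("useBackgroundGoroutine=%t\n", config.UseBackgroundGoroutine)
}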
4 changes: 4 additions & 0 deletions helm-chart/kuberay-operator/values.yaml
@@ -188,6 +188,10 @@ crNamespacedRbacEnable: true
# to resource events within its own namespace.
singleNamespaceInstall: false

+# -- When useBackgroundGoroutine is set to true, the KubeRay operator uses a background
+# goroutine to fetch job info from Ray clusters for RayJob status updates.
+useBackgroundGoroutine: false

# The KubeRay operator will watch the custom resources in the namespaces listed in the "watchNamespace" parameter.
# watchNamespace:
# - n1
10 changes: 8 additions & 2 deletions ray-operator/apis/config/v1alpha1/configuration_types.go
@@ -1,6 +1,8 @@
package v1alpha1

import (
"context"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/manager"
@@ -86,10 +88,14 @@ type Configuration struct {

// EnableMetrics indicates whether KubeRay operator should emit control plane metrics.
EnableMetrics bool `json:"enableMetrics,omitempty"`

+// UseBackgroundGoroutine indicates that the operator will use a background goroutine to fetch
+// the job info from the Ray dashboard and store it in a cache.
+UseBackgroundGoroutine bool `json:"useBackgroundGoroutine,omitempty"`
}

-func (config Configuration) GetDashboardClient(mgr manager.Manager) func(rayCluster *rayv1.RayCluster, url string) (dashboardclient.RayDashboardClientInterface, error) {
-return utils.GetRayDashboardClientFunc(mgr, config.UseKubernetesProxy)
+func (config Configuration) GetDashboardClient(ctx context.Context, mgr manager.Manager) func(rayCluster *rayv1.RayCluster, url string) (dashboardclient.RayDashboardClientInterface, error) {
+return utils.GetRayDashboardClientFunc(ctx, mgr, config.UseKubernetesProxy, config.UseBackgroundGoroutine)
}

func (config Configuration) GetHttpProxyClient(mgr manager.Manager) func(hostIp, podNamespace, podName string, port int) utils.RayHttpProxyClientInterface {
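Threading a context into GetDashboardClient makes sense if the background path spawns a long-lived fetch goroutine that must stop when the operator shuts down. A sketch of that shape; the jobInfoFetcher type and its fields are hypothetical, not KubeRay's implementation:

package main

import (
	"context"
	"fmt"
	"time"
)

// jobInfoFetcher periodically pulls job info from the Ray dashboard and stores it in a cache.
type jobInfoFetcher struct {
	interval time.Duration
	fetch    func(ctx context.Context, jobID string) (string, error) // e.g. an HTTP call to the dashboard
	store    func(jobID, status string)                              // e.g. an LRU cache Add
}

// run polls until ctx is cancelled, which ties the goroutine's lifetime to the operator's.
func (f *jobInfoFetcher) run(ctx context.Context, jobID string) {
	ticker := time.NewTicker(f.interval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			if status, err := f.fetch(ctx, jobID); err == nil {
				f.store(jobID, status)
			}
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer cancel()

	f := &jobInfoFetcher{
		interval: time.Second,
		fetch:    func(context.Context, string) (string, error) { return "RUNNING", nil },
		store:    func(id, status string) { fmt.Println(id, "->", status) },
	}
	go f.run(ctx, "rayjob-abc123")
	<-ctx.Done()
}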
16 changes: 10 additions & 6 deletions ray-operator/controllers/ray/rayjob_controller.go
@@ -2,6 +2,7 @@ package ray

import (
"context"
errs "errors"
"fmt"
"os"
"strconv"
@@ -42,11 +43,10 @@ const (
// RayJobReconciler reconciles a RayJob object
type RayJobReconciler struct {
client.Client
-Scheme *runtime.Scheme
-Recorder record.EventRecorder
-
-dashboardClientFunc func(rayCluster *rayv1.RayCluster, url string) (dashboardclient.RayDashboardClientInterface, error)
+Recorder record.EventRecorder
options RayJobReconcilerOptions
+Scheme *runtime.Scheme
+dashboardClientFunc func(rayCluster *rayv1.RayCluster, url string) (dashboardclient.RayDashboardClientInterface, error)
}

type RayJobReconcilerOptions struct {
@@ -55,8 +55,8 @@ type RayJobReconcilerOptions struct {
}

// NewRayJobReconciler returns a new reconcile.Reconciler
-func NewRayJobReconciler(_ context.Context, mgr manager.Manager, options RayJobReconcilerOptions, provider utils.ClientProvider) *RayJobReconciler {
-dashboardClientFunc := provider.GetDashboardClient(mgr)
+func NewRayJobReconciler(ctx context.Context, mgr manager.Manager, options RayJobReconcilerOptions, provider utils.ClientProvider) *RayJobReconciler {
+dashboardClientFunc := provider.GetDashboardClient(ctx, mgr)
return &RayJobReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
@@ -288,6 +288,10 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)

jobInfo, err := rayDashboardClient.GetJobInfo(ctx, rayJobInstance.Status.JobId)
if err != nil {
+if errs.Is(err, dashboardclient.ErrAgain) {
+logger.Info("The Ray job info was not ready. Trying again on the next reconciliation.", "JobId", rayJobInstance.Status.JobId)
+return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil
+}
// If the Ray job was not found, GetJobInfo returns a BadRequest error.
if errors.IsBadRequest(err) {
if rayJobInstance.Spec.SubmissionMode == rayv1.HTTPMode {
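The new branch above relies on a sentinel error: when the job info has not been fetched into the cache yet, GetJobInfo returns dashboardclient.ErrAgain and the reconciler simply requeues. A reduced sketch of that contract; the cachedClient type is hypothetical, and only the errors.Is-style check mirrors the real code:

package main

import (
	"errors"
	"fmt"
)

// ErrAgain stands in for dashboardclient.ErrAgain: "not fetched yet", not a hard failure.
var ErrAgain = errors.New("job info not cached yet, try again")

type cachedClient struct {
	cache map[string]string // submission ID -> status, filled by a background goroutine
}

func (c *cachedClient) GetJobInfo(jobID string) (string, error) {
	status, ok := c.cache[jobID]
	if !ok {
		return "", ErrAgain
	}
	return status, nil
}

func main() {
	client := &cachedClient{cache: map[string]string{}}
	if _, err := client.GetJobInfo("rayjob-abc123"); errors.Is(err, ErrAgain) {
		fmt.Println("info not ready yet; requeue and retry on the next reconciliation")
	}
}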
4 changes: 2 additions & 2 deletions ray-operator/controllers/ray/rayservice_controller.go
@@ -64,8 +64,8 @@ type RayServiceReconciler struct {
}

// NewRayServiceReconciler returns a new reconcile.Reconciler
-func NewRayServiceReconciler(_ context.Context, mgr manager.Manager, provider utils.ClientProvider) *RayServiceReconciler {
-dashboardClientFunc := provider.GetDashboardClient(mgr)
+func NewRayServiceReconciler(ctx context.Context, mgr manager.Manager, provider utils.ClientProvider) *RayServiceReconciler {
+dashboardClientFunc := provider.GetDashboardClient(ctx, mgr)
httpProxyClientFunc := provider.GetHttpProxyClient(mgr)
return &RayServiceReconciler{
Client: mgr.GetClient(),
7 changes: 6 additions & 1 deletion ray-operator/controllers/ray/suite_test.go
@@ -16,6 +16,7 @@ limitations under the License.
package ray

import (
"context"
"os"
"path/filepath"
"testing"
@@ -52,7 +53,7 @@ var (

type TestClientProvider struct{}

func (testProvider TestClientProvider) GetDashboardClient(_ manager.Manager) func(rayCluster *rayv1.RayCluster, url string) (dashboardclient.RayDashboardClientInterface, error) {
func (testProvider TestClientProvider) GetDashboardClient(_ context.Context, _ manager.Manager) func(rayCluster *rayv1.RayCluster, url string) (dashboardclient.RayDashboardClientInterface, error) {
return func(_ *rayv1.RayCluster, _ string) (dashboardclient.RayDashboardClientInterface, error) {
return fakeRayDashboardClient, nil
}
@@ -64,6 +65,10 @@ func (testProvider TestClientProvider) GetHttpProxyClient(_ manager.Manager) fun
}
}

+func (testProvider TestClientProvider) DoesUseBackgroundGoroutine() bool {
+return false
+}

func TestAPIs(t *testing.T) {
RegisterFailHandler(Fail)

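The new DoesUseBackgroundGoroutine method suggests the utils.ClientProvider seam now reports whether the background path is enabled, so tests can keep everything synchronous. A self-contained illustration of that seam with simplified stand-in types, not KubeRay's real interfaces:

package main

import (
	"context"
	"fmt"
)

type DashboardClient interface {
	GetJobInfo(ctx context.Context, jobID string) (string, error)
}

// ClientProvider mirrors the methods TestClientProvider implements above, simplified.
type ClientProvider interface {
	GetDashboardClient(ctx context.Context) func(url string) (DashboardClient, error)
	DoesUseBackgroundGoroutine() bool
}

type fakeClient struct{}

func (fakeClient) GetJobInfo(context.Context, string) (string, error) { return "SUCCEEDED", nil }

// testProvider always hands back the fake and disables the background-goroutine path.
type testProvider struct{}

func (testProvider) GetDashboardClient(context.Context) func(string) (DashboardClient, error) {
	return func(string) (DashboardClient, error) { return fakeClient{}, nil }
}

func (testProvider) DoesUseBackgroundGoroutine() bool { return false }

func main() {
	var p ClientProvider = testProvider{}
	client, _ := p.GetDashboardClient(context.Background())("http://head-svc:8265")
	status, _ := client.GetJobInfo(context.Background(), "rayjob-abc123")
	fmt.Println(status, p.DoesUseBackgroundGoroutine())
}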