diff --git a/bundle/manifests/k8s-nim-operator.clusterserviceversion.yaml b/bundle/manifests/k8s-nim-operator.clusterserviceversion.yaml index 765d64fac..b2ce8cb25 100644 --- a/bundle/manifests/k8s-nim-operator.clusterserviceversion.yaml +++ b/bundle/manifests/k8s-nim-operator.clusterserviceversion.yaml @@ -1345,6 +1345,18 @@ spec: - watch - create - delete + - apiGroups: + - admissionregistration.k8s.io + resources: + - validatingwebhookconfigurations + verbs: + - get + - list + - watch + - create + - update + - patch + - delete deployments: - name: k8s-nim-operator spec: @@ -1401,6 +1413,8 @@ spec: fieldPath: metadata.namespace - name: ENABLE_WEBHOOKS value: "true" + - name: OPERATOR_NAME_PREFIX + value: "k8s-nim-operator" image: 'ghcr.io/nvidia/k8s-nim-operator:main' imagePullPolicy: Always livenessProbe: @@ -1414,6 +1428,10 @@ spec: successThreshold: 1 timeoutSeconds: 1 name: manager + volumeMounts: + - name: cert + mountPath: /tmp/k8s-webhook-server/serving-certs + readOnly: true readinessProbe: failureThreshold: 3 httpGet: @@ -1435,6 +1453,11 @@ spec: allowPrivilegeEscalation: false terminationMessagePath: /dev/termination-log terminationMessagePolicy: File + volumes: + - name: cert + secret: + secretName: k8s-nim-operator-webhook-server-cert + defaultMode: 420 dnsPolicy: ClusterFirst imagePullSecrets: [] restartPolicy: Always @@ -1458,45 +1481,3 @@ spec: supported: false - type: AllNamespaces supported: true - webhookdefinitions: - - type: ValidatingAdmissionWebhook - admissionReviewVersions: - - v1 - containerPort: 9443 - targetPort: 9443 - deploymentName: k8s-nim-operator - failurePolicy: Fail - generateName: vnimcache-v1alpha1.kb.io - rules: - - apiGroups: - - apps.nvidia.com - apiVersions: - - v1alpha1 - operations: - - CREATE - - UPDATE - resources: - - nimcaches - sideEffects: None - webhookPath: /validate-apps-nvidia-com-v1alpha1-nimcache - - type: ValidatingAdmissionWebhook - admissionReviewVersions: - - v1 - containerPort: 9443 - targetPort: 9443 - 
deploymentName: k8s-nim-operator - failurePolicy: Fail - generateName: vnimservice-v1alpha1.kb.io - rules: - - apiGroups: - - apps.nvidia.com - apiVersions: - - v1alpha1 - operations: - - CREATE - - UPDATE - resources: - - nimservices - sideEffects: None - webhookPath: /validate-apps-nvidia-com-v1alpha1-nimservice - diff --git a/bundle/manifests/k8s-nim-operator.webhookservice.yaml b/bundle/manifests/k8s-nim-operator.webhookservice.yaml new file mode 100644 index 000000000..84c2e8b6e --- /dev/null +++ b/bundle/manifests/k8s-nim-operator.webhookservice.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Service +metadata: + name: k8s-nim-operator-webhook-service + labels: + app.kubernetes.io/name: k8s-nim-operator + app.kubernetes.io/instance: nim-operator + control-plane: controller-manager + annotations: + service.beta.openshift.io/serving-cert-secret-name: k8s-nim-operator-webhook-server-cert +spec: + selector: + app.kubernetes.io/name: k8s-nim-operator + app.kubernetes.io/instance: nim-operator + control-plane: controller-manager + ports: + - port: 443 + targetPort: 9443 + protocol: TCP diff --git a/cmd/main.go b/cmd/main.go index 7c87f4824..9f69f1c9b 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -17,6 +17,7 @@ limitations under the License. 
package main import ( + "context" "crypto/tls" "flag" "os" @@ -34,6 +35,7 @@ import ( clientgoscheme "k8s.io/client-go/kubernetes/scheme" _ "k8s.io/client-go/plugin/pkg/client/auth" ctrl "sigs.k8s.io/controller-runtime" + crclient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/metrics/filters" @@ -43,7 +45,9 @@ import ( appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1" "github.com/NVIDIA/k8s-nim-operator/internal/conditions" + "github.com/NVIDIA/k8s-nim-operator/internal/config" "github.com/NVIDIA/k8s-nim-operator/internal/controller" + "github.com/NVIDIA/k8s-nim-operator/internal/k8sutil" "github.com/NVIDIA/k8s-nim-operator/internal/render" webhookappsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/internal/webhook/apps/v1alpha1" // +kubebuilder:scaffold:imports @@ -256,27 +260,33 @@ func main() { // nolint:goconst // Parse ENABLE_WEBHOOKS environment variable once as a boolean. 
- var enableWebhooks bool if val, ok := os.LookupEnv("ENABLE_WEBHOOKS"); ok { var err error - enableWebhooks, err = strconv.ParseBool(val) + config.EnableWebhooks, err = strconv.ParseBool(val) if err != nil { setupLog.Error(err, "invalid value for ENABLE_WEBHOOKS, expected boolean") os.Exit(1) } } - - if enableWebhooks { - if err := webhookappsv1alpha1.SetupNIMCacheWebhookWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create webhook", "webhook", "NIMCache") - os.Exit(1) - } - - if err := webhookappsv1alpha1.SetupNIMServiceWebhookWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create webhook", "webhook", "NIMService") - os.Exit(1) + if val, ok := os.LookupEnv("TLS_MODE"); ok { + if val != "cert-manager" && val != "secret" { + setupLog.Error(err, "invalid value for TLS_MODE, expected 'cert-manager' or 'secret") } + config.TLSMode = val } + if val, ok := os.LookupEnv("TLS_SECRET"); ok { + config.TLSSecret = val + } + if val, ok := os.LookupEnv("TLS_CA"); ok { + config.TLSCA = []byte(val) + } + if val, ok := os.LookupEnv("OPERATOR_NAME_PREFIX"); ok { + config.OperatorNamePrefix = val + } + if val, ok := os.LookupEnv("OPERATOR_NAMESPACE"); ok { + config.OperatorNamespace = val + } + // +kubebuilder:scaffold:builder if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { @@ -288,6 +298,44 @@ func main() { os.Exit(1) } + cfg := ctrl.GetConfigOrDie() + liveClient, err := crclient.New(cfg, crclient.Options{Scheme: scheme}) + if err != nil { + setupLog.Error(err, "unable to construct live client") + os.Exit(1) + } + ctx := context.Background() + orch, err := k8sutil.GetOrchestratorType(ctx, liveClient) // uses direct REST calls + if err != nil { + setupLog.Error(err, "failed to detect orchestrator type") + os.Exit(1) + } + config.OrchestratorType = orch + setupLog.Info("detected orchestrator", "type", orch) + + if config.EnableWebhooks { + if err := webhookappsv1alpha1.SetupNIMCacheWebhookWithManager(mgr); err != nil { + 
setupLog.Error(err, "unable to create webhook", "webhook", "NIMCache") + os.Exit(1) + } + + if err := webhookappsv1alpha1.SetupNIMServiceWebhookWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create webhook", "webhook", "NIMService") + os.Exit(1) + } + // Set up cluster-level ValidatingWebhookConfiguration. + if err := webhookappsv1alpha1.EnsureValidatingWebhook( + context.TODO(), + mgr.GetAPIReader(), // uncached reads + mgr.GetClient(), // writes go through the normal client + config.OperatorNamespace, + config.OperatorNamePrefix, + ); err != nil { + setupLog.Error(err, "unable to ensure ValidatingWebhookConfiguration") + os.Exit(1) + } + } + setupLog.Info("starting manager") if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { setupLog.Error(err, "problem running manager") diff --git a/deployments/helm/k8s-nim-operator/templates/admission-controller.yaml b/deployments/helm/k8s-nim-operator/templates/admission-controller.yaml index 606cf08e2..dac10c73d 100644 --- a/deployments/helm/k8s-nim-operator/templates/admission-controller.yaml +++ b/deployments/helm/k8s-nim-operator/templates/admission-controller.yaml @@ -19,7 +19,9 @@ spec: app.kubernetes.io/instance: {{ .Release.Name }} {{- end }} --- -{{- if .Values.operator.admissionController.enabled }} +{{ if and .Values.operator.admissionController.enabled (eq .Values.operator.admissionController.tls.mode "cert-manager") }} +{{ $issuerType := default "selfsigned" .Values.operator.admissionController.tls.certManager.issuerType -}} +{{ $issuerName := .Values.operator.admissionController.tls.certManager.issuerName -}} apiVersion: cert-manager.io/v1 kind: Certificate metadata: @@ -32,13 +34,16 @@ spec: dnsNames: - {{ include "k8s-nim-operator.fullname" . }}-webhook-service.{{ .Release.Namespace }}.svc - {{ include "k8s-nim-operator.fullname" . }}-webhook-service.{{ .Release.Namespace }}.svc.cluster.local + {{- range .Values.operator.admissionController.tls.certManager.dnsNames }} + - {{ . 
}} + {{- end }} issuerRef: - kind: Issuer - name: {{ include "k8s-nim-operator.fullname" . }}-selfsigned-issuer + kind: {{- if eq (lower $issuerType) "clusterissuer" }} ClusterIssuer {{- else }} Issuer {{- end }} + name: {{- if eq (lower $issuerType) "selfsigned" }} {{ include "k8s-nim-operator.fullname" . }}-selfsigned-issuer {{- else }} {{ required "operator.admissionController.tls.certManager.issuerName is required when issuerType is 'issuer' or 'clusterissuer'" $issuerName }} {{- end }} secretName: {{ include "k8s-nim-operator.fullname" . }}-webhook-server-cert {{- end }} --- -{{- if .Values.operator.admissionController.enabled }} +{{ if and .Values.operator.admissionController.enabled (eq .Values.operator.admissionController.tls.mode "cert-manager") (eq (lower (default "selfsigned" .Values.operator.admissionController.tls.certManager.issuerType)) "selfsigned") }} apiVersion: cert-manager.io/v1 kind: Issuer metadata: @@ -49,45 +54,4 @@ metadata: app.kubernetes.io/managed-by: helm spec: selfSigned: {} -{{- end}} ---- -{{- if .Values.operator.admissionController.enabled }} -apiVersion: admissionregistration.k8s.io/v1 -kind: ValidatingWebhookConfiguration -metadata: - name: {{ include "k8s-nim-operator.fullname" . }}-validating-webhook-configuration - annotations: - cert-manager.io/inject-ca-from: {{ .Release.Namespace }}/{{ include "k8s-nim-operator.fullname" . }}-serving-cert - labels: - app.kubernetes.io/name: k8s-nim-operator - app.kubernetes.io/managed-by: helm -webhooks: - - name: vnimcache-v1alpha1.kb.io - admissionReviewVersions: ["v1"] - clientConfig: - service: - name: {{ include "k8s-nim-operator.fullname" . 
}}-webhook-service - namespace: {{ .Release.Namespace }} - path: /validate-apps-nvidia-com-v1alpha1-nimcache - failurePolicy: Fail - rules: - - apiGroups: ["apps.nvidia.com"] - apiVersions: ["v1alpha1"] - operations: ["CREATE", "UPDATE"] - resources: ["nimcaches"] - sideEffects: None - - name: vnimservice-v1alpha1.kb.io - admissionReviewVersions: ["v1"] - clientConfig: - service: - name: {{ include "k8s-nim-operator.fullname" . }}-webhook-service - namespace: {{ .Release.Namespace }} - path: /validate-apps-nvidia-com-v1alpha1-nimservice - failurePolicy: Fail - rules: - - apiGroups: ["apps.nvidia.com"] - apiVersions: ["v1alpha1"] - operations: ["CREATE", "UPDATE"] - resources: ["nimservices"] - sideEffects: None -{{- end }} \ No newline at end of file +{{- end}} \ No newline at end of file diff --git a/deployments/helm/k8s-nim-operator/templates/deployment.yaml b/deployments/helm/k8s-nim-operator/templates/deployment.yaml index d0147f648..ca2b3932e 100644 --- a/deployments/helm/k8s-nim-operator/templates/deployment.yaml +++ b/deployments/helm/k8s-nim-operator/templates/deployment.yaml @@ -51,8 +51,21 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: OPERATOR_NAME_PREFIX + value: {{ include "k8s-nim-operator.fullname" . }} - name: ENABLE_WEBHOOKS value: "{{ .Values.operator.admissionController.enabled }}" + - name: TLS_MODE + value: "{{ .Values.operator.admissionController.tls.mode }}" + - name: TLS_SECRET + value: "{{ .Values.operator.admissionController.tls.secret.name }}" + - name: TLS_CA + {{- if and (eq .Values.operator.admissionController.tls.mode "secret") (.Values.operator.admissionController.tls.secret.caBundle) }} + value: |- +{{ .Values.operator.admissionController.tls.secret.caBundle | nindent 14 }} + {{- else }} + value: "" + {{- end }} livenessProbe: httpGet: path: /healthz @@ -79,8 +92,12 @@ spec: volumes: - name: cert secret: - secretName: {{ include "k8s-nim-operator.fullname" . 
}}-webhook-server-cert - defaultMode: 420 + secretName: {{- if eq .Values.operator.admissionController.tls.mode "secret" }} + {{ .Values.operator.admissionController.tls.secret.name }} + {{- else }} + {{ include "k8s-nim-operator.fullname" . }}-webhook-server-cert + {{- end }} + defaultMode: 420 {{- end }} {{- with .Values.operator.nodeSelector }} nodeSelector: diff --git a/deployments/helm/k8s-nim-operator/templates/manager-rbac.yaml b/deployments/helm/k8s-nim-operator/templates/manager-rbac.yaml index fe47b661a..6a7a03bbf 100644 --- a/deployments/helm/k8s-nim-operator/templates/manager-rbac.yaml +++ b/deployments/helm/k8s-nim-operator/templates/manager-rbac.yaml @@ -582,8 +582,10 @@ rules: - get - list - watch - - patch + - create - update + - patch + - delete --- apiVersion: rbac.authorization.k8s.io/v1 diff --git a/deployments/helm/k8s-nim-operator/values.yaml b/deployments/helm/k8s-nim-operator/values.yaml index 025fdcb64..e2271a0df 100644 --- a/deployments/helm/k8s-nim-operator/values.yaml +++ b/deployments/helm/k8s-nim-operator/values.yaml @@ -56,8 +56,27 @@ operator: admissionController: # Enable the admission controller. # Note: cert-manager must be installed beforehand, as it is required to generate the TLS certificates. 
- enabled: false - + enabled: true + # TLS certificate configuration + tls: + # Certificate management mode: "cert-manager" or "secret" + # - "cert-manager": Use cert-manager to automatically generate and manage certificates + # - "secret": Use a user-provided secret containing tls.crt and tls.key + mode: "cert-manager" + certManager: + # Issuer type: "selfsigned", "clusterissuer", or "issuer" + issuerType: "selfsigned" + # Issuer name (required when issuerType is "clusterissuer" or "issuer") + issuerName: "" + # Additional DNS names for the certificate + dnsNames: [] + secret: + # Name of the secret containing tls.crt and tls.key + name: "" + # Base64-encoded CA certificate bundle for validating the webhook's TLS certificate + # Required when using secret mode. + # Note: Only include intermediate CA certificates, not root CA certificates + caBundle: metricsService: ports: - name: metrics diff --git a/internal/config/config.go b/internal/config/config.go new file mode 100644 index 000000000..84c2eef65 --- /dev/null +++ b/internal/config/config.go @@ -0,0 +1,13 @@ +package config + +import "github.com/NVIDIA/k8s-nim-operator/internal/k8sutil" + +var ( + TLSMode string + TLSSecret string + TLSCA []byte + EnableWebhooks bool + OperatorNamePrefix string + OperatorNamespace string + OrchestratorType k8sutil.OrchestratorType +) diff --git a/internal/webhook/apps/v1alpha1/configuration.go b/internal/webhook/apps/v1alpha1/configuration.go new file mode 100644 index 000000000..e96dc6ef2 --- /dev/null +++ b/internal/webhook/apps/v1alpha1/configuration.go @@ -0,0 +1,170 @@ +package v1alpha1 + +import ( + "context" + "encoding/json" + + admissionv1 "k8s.io/api/admissionregistration/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/NVIDIA/k8s-nim-operator/internal/config" + "github.com/NVIDIA/k8s-nim-operator/internal/k8sutil" 
+) + +// EnsureValidatingWebhook creates or updates the ValidatingWebhookConfiguration +// that used to be templated by Helm. It reads the current object through apiReader, creates the desired one via writer when none is found, and +// otherwise issues an Update only when the JSON-encoded Webhooks differ; errors from Get (other than NotFound), Create and Update are returned unchanged. +func EnsureValidatingWebhook( + ctx context.Context, + apiReader client.Reader, + writer client.Client, + namespace string, + fullNamePrefix string, +) error { + // Desired ValidatingWebhookConfiguration built from the namespace and name prefix. + desired := buildConfigurationSpec(namespace, fullNamePrefix) + + // Look up an existing configuration with the desired name; any error other than NotFound aborts. + existing := &admissionv1.ValidatingWebhookConfiguration{} + err := apiReader.Get(ctx, types.NamespacedName{Name: desired.Name}, existing) + if err != nil && !errors.IsNotFound(err) { + return err + } + + if errors.IsNotFound(err) { + return writer.Create(ctx, desired) + } + + // Compare the JSON encodings of the Webhooks only (json.Marshal errors are discarded); on a difference, Webhooks and Annotations are overwritten and the object is updated. NOTE(review): label or annotation drift alone is not detected by this comparison, and a caBundle injected into the live object would presumably make it differ on every start - confirm. + cur, _ := json.Marshal(existing.Webhooks) + want, _ := json.Marshal(desired.Webhooks) + + if string(cur) == string(want) { + return nil + } + + existing.Webhooks = desired.Webhooks + existing.Annotations = desired.Annotations + return writer.Update(ctx, existing) +} + +// buildConfigurationSpec reproduces the ValidatingWebhookConfiguration that used to be in Helm, choosing annotations, labels and CA bundle from the package-level config values. +func buildConfigurationSpec(namespace, namePrefix string) *admissionv1.ValidatingWebhookConfiguration { + pathCache := "/validate-apps-nvidia-com-v1alpha1-nimcache" + pathService := "/validate-apps-nvidia-com-v1alpha1-nimservice" + + // Annotations, labels and client configs vary with config.OrchestratorType and config.TLSMode; the defaults below carry no CA bundle. 
+ var annotations map[string]string + var labels map[string]string + var clientconfignimcache admissionv1.WebhookClientConfig + var clientconfignimservice admissionv1.WebhookClientConfig + + clientconfignimcache = admissionv1.WebhookClientConfig{ + Service: &admissionv1.ServiceReference{ + Namespace: namespace, + Name: namePrefix + "-webhook-service", + Path: &pathCache, + }, + } + clientconfignimservice = admissionv1.WebhookClientConfig{ + Service: &admissionv1.ServiceReference{ + Namespace: namespace, + Name: namePrefix + "-webhook-service", + Path: &pathService, + }, + } + + // Deployment specific values: on k8sutil.K8s, TLS mode "cert-manager" sets the inject-ca-from annotation while any other mode sets CABundle from config.TLSCA; every other orchestrator type gets the OpenShift inject-cabundle annotation. + if config.OrchestratorType == k8sutil.K8s { + if config.TLSMode == "cert-manager" { + annotations = map[string]string{"cert-manager.io/inject-ca-from": namespace + "/" + namePrefix + "-serving-cert"} + } else { + annotations = map[string]string{} + clientconfignimcache = admissionv1.WebhookClientConfig{ + Service: &admissionv1.ServiceReference{ + Namespace: namespace, + Name: namePrefix + "-webhook-service", + Path: &pathCache, + }, + CABundle: config.TLSCA, + } + clientconfignimservice = admissionv1.WebhookClientConfig{ + Service: &admissionv1.ServiceReference{ + Namespace: namespace, + Name: namePrefix + "-webhook-service", + Path: &pathService, + }, + CABundle: config.TLSCA, + } + } + labels = map[string]string{ + "app.kubernetes.io/name": "k8s-nim-operator", + "app.kubernetes.io/managed-by": "helm", + } + } else { + annotations = map[string]string{"service.beta.openshift.io/inject-cabundle": "true"} + labels = map[string]string{ + "app.kubernetes.io/name": "k8s-nim-operator", + "app.kubernetes.io/managed-by": "openshift", + } + } + + return &admissionv1.ValidatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{ + Name: namePrefix + "-validating-webhook-configuration", + Annotations: annotations, + Labels: labels, + }, + Webhooks: []admissionv1.ValidatingWebhook{ + { + Name: "vnimcache-v1alpha1.kb.io", + AdmissionReviewVersions: []string{"v1"}, 
+ ClientConfig: clientconfignimcache, + FailurePolicy: func() *admissionv1.FailurePolicyType { + fp := admissionv1.Fail + return &fp + }(), + SideEffects: func() *admissionv1.SideEffectClass { + s := admissionv1.SideEffectClassNone + return &s + }(), + Rules: []admissionv1.RuleWithOperations{{ + Operations: []admissionv1.OperationType{ + admissionv1.Create, admissionv1.Update, + }, + Rule: admissionv1.Rule{ + APIGroups: []string{"apps.nvidia.com"}, + APIVersions: []string{"v1alpha1"}, + Resources: []string{"nimcaches"}, + }, + }}, + }, + { + Name: "vnimservice-v1alpha1.kb.io", + AdmissionReviewVersions: []string{"v1"}, + ClientConfig: clientconfignimservice, + FailurePolicy: func() *admissionv1.FailurePolicyType { + fp := admissionv1.Fail + return &fp + }(), + SideEffects: func() *admissionv1.SideEffectClass { + s := admissionv1.SideEffectClassNone + return &s + }(), + Rules: []admissionv1.RuleWithOperations{{ + Operations: []admissionv1.OperationType{ + admissionv1.Create, admissionv1.Update, + }, + Rule: admissionv1.Rule{ + APIGroups: []string{"apps.nvidia.com"}, + APIVersions: []string{"v1alpha1"}, + Resources: []string{"nimservices"}, + }, + }}, + }, + }, + } +} diff --git a/internal/webhook/apps/v1alpha1/nimcache_webhook_validation_helper.go b/internal/webhook/apps/v1alpha1/nimcache_webhook_validation_helper.go index bfc861b3d..d8fd74edb 100644 --- a/internal/webhook/apps/v1alpha1/nimcache_webhook_validation_helper.go +++ b/internal/webhook/apps/v1alpha1/nimcache_webhook_validation_helper.go @@ -74,6 +74,10 @@ func validateNGCSource(ngcSource *appsv1alpha1.NGCSource, fldPath *field.Path) f func validateModel(model *appsv1alpha1.ModelSpec, fldPath *field.Path) field.ErrorList { errList := field.ErrorList{} + if model == nil { + return errList + } + // If Model.Profiles is not empty, ensure all other Model fields are empty. If Model.Profiles contains "all", length must be 1 if len(model.Profiles) > 0 {