diff --git a/Makefile b/Makefile index e49482c0..96802576 100644 --- a/Makefile +++ b/Makefile @@ -59,9 +59,9 @@ GITHUB_REGISTRY := ghcr.io/oracle OPERATOR_IMAGE_NAME := coherence-operator OPERATOR_IMAGE_REGISTRY ?= $(ORACLE_REGISTRY) OPERATOR_IMAGE_TAG_SUFFIX ?= -OPERATOR_IMAGE_TAG := $(VERSION)$(OPERATOR_IMAGE_TAG_SUFFIX) -OPERATOR_IMAGE_TAG_ARM := $(VERSION)-arm64$(OPERATOR_IMAGE_TAG_SUFFIX) -OPERATOR_IMAGE_TAG_AMD := $(VERSION)-amd64$(OPERATOR_IMAGE_TAG_SUFFIX) +OPERATOR_IMAGE_TAG ?= $(VERSION)$(OPERATOR_IMAGE_TAG_SUFFIX) +OPERATOR_IMAGE_TAG_ARM ?= $(VERSION)-arm64$(OPERATOR_IMAGE_TAG_SUFFIX) +OPERATOR_IMAGE_TAG_AMD ?= $(VERSION)-amd64$(OPERATOR_IMAGE_TAG_SUFFIX) OPERATOR_IMAGE := $(OPERATOR_IMAGE_REGISTRY)/$(OPERATOR_IMAGE_NAME):$(OPERATOR_IMAGE_TAG) OPERATOR_IMAGE_ARM := $(OPERATOR_IMAGE_REGISTRY)/$(OPERATOR_IMAGE_NAME):$(OPERATOR_IMAGE_TAG_ARM) OPERATOR_IMAGE_AMD := $(OPERATOR_IMAGE_REGISTRY)/$(OPERATOR_IMAGE_NAME):$(OPERATOR_IMAGE_TAG_AMD) @@ -1945,7 +1945,7 @@ endif .PHONY: just-deploy -just-deploy: ensure-pull-secret ## Deploy the Coherence Operator without rebuilding anything +just-deploy: $(TOOLS_BIN)/kustomize ensure-pull-secret ## Deploy the Coherence Operator without rebuilding anything $(call prepare_deploy,$(OPERATOR_IMAGE),$(OPERATOR_NAMESPACE)) ifeq ("$(OPERATOR_IMAGE_REGISTRY)","$(ORACLE_REGISTRY)") $(KUSTOMIZE) build $(BUILD_DEPLOY)/default | $(KUBECTL_CMD) apply -f - @@ -2709,7 +2709,10 @@ test-examples: build-examples PUSH_ARGS ?= .PHONY: push-operator-image -push-operator-image: $(BUILD_TARGETS)/build-operator +push-operator-image: $(BUILD_TARGETS)/build-operator just-push-operator-image + +.PHONY: just-push-operator-image +just-push-operator-image: ifneq ("$(OPERATOR_RELEASE_REGISTRY)","$(OPERATOR_IMAGE_REGISTRY)") $(DOCKER_CMD) tag $(OPERATOR_IMAGE_ARM) $(OPERATOR_RELEASE_ARM) $(DOCKER_CMD) tag $(OPERATOR_IMAGE_AMD) $(OPERATOR_RELEASE_AMD) diff --git a/docs/troubleshooting/01_trouble-shooting.adoc 
b/docs/troubleshooting/01_trouble-shooting.adoc index 2e29cc2d..ade1a520 100644 --- a/docs/troubleshooting/01_trouble-shooting.adoc +++ b/docs/troubleshooting/01_trouble-shooting.adoc @@ -39,6 +39,23 @@ This page will be updated and maintained over time to include common issues we s == Issues +[#start-timeout] +=== The Operator fails to start with a timeout error + +One of the first things the operator does on start-up is to log the Kubernetes server version. +This is done by making a request to the API server with a default timeout of one minute. +If the network policies, firewall rules, or RBAC are blocking the operator from connecting to the API server this +request will time out with an error like the one below: + +[source] +---- +2025-10-16T09:05:29Z INFO setup ERROR: failed to get Kubernetes server version {"Host": "https://10.96.0.1:443", "Error": "Get \"https://10.96.0.1:443/version?timeout=32s\": dial tcp 10.96.0.1:443: i/o timeout"} +Error: unable to get kubernetes server version: Get "https://10.96.0.1:443/version?timeout=32s": dial tcp 10.96.0.1:443: i/o timeout +---- + +The error message in the operator's logs will include the host name and port that the operator attempted to use, +so this can then be used to ensure that egress is allowed to the correct host and port. + [#no-operator] === I Uninstalled the Operator and Cannot Delete the Coherence Clusters @@ -97,6 +114,9 @@ not cleanly shut down and will then not be able to be restarted using the persis The readiness/liveness probe used by the Operator in the Coherence Pods checks a number of things to determine whether the Pods is ready, one of these is whether the JVM is a cluster member. If your application uses a custom main class and is not properly bootstrapping Coherence then the Pod will not be ready until your application code actually touches a Coherence resource causing Coherence to start and join the cluster. 
+If you have overridden the configuration for the readiness or liveness probes for the Coherence cluster with custom endpoints, +then you need to debug your own code. + When running in clusters with the Operator using custom main classes it is advisable to properly bootstrap Coherence from within your `main` method. This can be done using the new Coherence bootstrap API available from CE release 20.12 or by calling `com.tangosol.net.DefaultCacheServer.startServerDaemon().waitForServiceStart();` diff --git a/examples/095_network_policies/README.adoc b/examples/095_network_policies/README.adoc index 4b51f267..a122c60e 100644 --- a/examples/095_network_policies/README.adoc +++ b/examples/095_network_policies/README.adoc @@ -148,6 +148,17 @@ Configuring access to the API server is not as straight forward as other network The reason for this is that there is no Pod available with labels that can be used in the configuration, instead, the IP address of the API server itself must be used. +[NOTE] +==== +If the operator cannot connect to the API server it will fail to start. +One of the first things the operator does on start-up is to log the Kubernetes version that the server is running. +This is done by making a request to the API server with a default timeout of one minute. +If the network policies are blocking the operator from connecting to the API server this request will time out. + +The error message in the operator's logs will include the host name and port that the operator attempted to use, +so this can then be used to ensure that egress is allowed to the correct host and port. +==== + There are various methods to find the IP address of the API server. The exact method required may vary depending on the type of Kubernetes cluster being used, for example a simple development cluster running in KinD on a laptop may differ from a cluster running in a cloud provider's infrastructure. 
diff --git a/hack/buildah/run-buildah.sh b/hack/buildah/run-buildah.sh index 5d3a3aef..a27ff10e 100755 --- a/hack/buildah/run-buildah.sh +++ b/hack/buildah/run-buildah.sh @@ -137,6 +137,6 @@ else -e HTTP_PROXY="${HTTP_PROXY}" -e HTTPS_PROXY="${HTTPS_PROXY}" -e NO_PROXY="${NO_PROXY}" \ -e http_proxy="${http_proxy}" -e https_proxy="${https_proxy}" -e no_proxy="${no_proxy}" \ --name buildah \ - quay.io/buildah/stable:v1.37.1 "${SCRIPT_NAME}" + quay.io/buildah/stable:v1.41.5 "${SCRIPT_NAME}" fi diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go index d81a9f61..a6323c05 100644 --- a/pkg/operator/operator.go +++ b/pkg/operator/operator.go @@ -65,6 +65,7 @@ const ( FlagOperatorImage = "operator-image" FlagEnvVar = "env" FlagJvmArg = "jvm" + FlagKubernetesCheckTimeout = "kubernetes-check-timeout" // EnvVarWatchNamespace is the environment variable to use to set the watch namespace(s) EnvVarWatchNamespace = "WATCH_NAMESPACE" @@ -85,6 +86,11 @@ const ( LabelTestHostName = "coherence.oracle.com/test_hostname" // LabelTestHealthPort is a label applied to Pods to set a testing health check port LabelTestHealthPort = "coherence.oracle.com/test_health_port" + + // DefaultKubernetesCheckTimeout is the default timeout applied to the initial Kubernetes API connection check. + DefaultKubernetesCheckTimeout = time.Minute + // MinKubernetesCheckTimeout is the minimum timeout applied to the initial Kubernetes API connection check; smaller configured values are raised to this value. + MinKubernetesCheckTimeout = 10 * time.Second ) var setupLog = ctrl.Log.WithName("setup") @@ -245,6 +251,11 @@ func SetupFlags(cmd *cobra.Command, v *viper.Viper) { time.Second*20, "The duration the Operator uses for the leadership lease renewal timeout. "+ "If the value entered is less than 10s, then 10s will be used") + cmd.Flags().Duration( + FlagKubernetesCheckTimeout, + DefaultKubernetesCheckTimeout, + "The duration the Operator uses for the initial Kubernetes API connection check timeout. 
"+ + "If the value entered is less than 10s, then 10s will be used") // enable using dashed notation in flags and underscores in env v.SetEnvKeyReplacer(strings.NewReplacer("-", "_")) diff --git a/pkg/runner/cmd_operator.go b/pkg/runner/cmd_operator.go index 891c4ca6..31b6cc8d 100644 --- a/pkg/runner/cmd_operator.go +++ b/pkg/runner/cmd_operator.go @@ -9,6 +9,7 @@ package runner import ( "context" "crypto/tls" + "encoding/json" "fmt" "net/http" "os" @@ -25,6 +26,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" apiruntime "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/version" clientgoscheme "k8s.io/client-go/kubernetes/scheme" rest2 "k8s.io/client-go/rest" "k8s.io/utils/ptr" @@ -79,8 +81,8 @@ func operatorCommand(v *viper.Viper) *cobra.Command { func execute(v *viper.Viper) error { ctrl.SetLogger(zap.New(zap.UseDevMode(true))) - setupLog.Info(fmt.Sprintf("Operator Coherence Image: %s", operator.GetDefaultCoherenceImage())) setupLog.Info(fmt.Sprintf("Operator Image: %s", operator.GetDefaultOperatorImage())) + setupLog.Info(fmt.Sprintf("Default Coherence Image (only used when no image is specified for a Coherence resource): %s", operator.GetDefaultCoherenceImage())) // if the enable-http2 flag is false (the default), http/2 should be disabled // due to its vulnerabilities. More specifically, disabling http/2 will @@ -118,12 +120,18 @@ func execute(v *viper.Viper) error { if err != nil { return errors.Wrap(err, "unable to create client set") } + setupLog.Info("Successfully created kubernetes client", "Host", cfg.Host) - version, err := cs.DiscoveryClient.ServerVersion() + // Get and display the k8s version of the server. + // This will also verify that we can actually talk to the k8s API server. + // For example, incorrectly configured network policies or RBAC rules can prevent us from talking to the server. 
+ sv, err := getServerVersion(cs, v) if err != nil { + setupLog.Info("ERROR: failed to get the Kubernetes server version. This could be caused by misconfigured network policies, RBAC rules or firewalls, etc.", + "Host", cfg.Host, "Error", err.Error()) return errors.Wrap(err, "unable to get kubernetes server version") } - setupLog.Info("Kubernetes server version", "Major", version.Major, "Minor", version.Minor, "Platform", version.Platform) + setupLog.Info("Kubernetes server version", "Major", sv.Major, "Minor", sv.Minor, "Platform", sv.Platform, "Host", cfg.Host) // The Operator web-hook server has been removed, so we need to delete any existing web-hooks setupLog.Info("Ensuring any existing webhook configurations are removed") @@ -286,3 +294,26 @@ func execute(v *viper.Viper) error { return nil } + +// getServerVersion fetches the Kubernetes server version using the provided ClientSet and returns it as a version.Info struct. +// It uses the discovery client's REST client to send a GET request to the "/version" endpoint and unmarshals the JSON response into version.Info. +// The request timeout is read from the kubernetes-check-timeout setting (default 1 minute, which can be overridden by setting the environment variable KUBERNETES_CHECK_TIMEOUT); values below operator.MinKubernetesCheckTimeout are raised to that minimum. +// Returns an error if the request fails or if the JSON response cannot be parsed; the parse error is wrapped so callers can still inspect it with errors.Is/errors.As. +func getServerVersion(cs clients.ClientSet, v *viper.Viper) (*version.Info, error) { + timeout := v.GetDuration(operator.FlagKubernetesCheckTimeout) + if timeout < operator.MinKubernetesCheckTimeout { + timeout = operator.MinKubernetesCheckTimeout + } + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + body, err := cs.DiscoveryClient.RESTClient().Get().AbsPath("/version").Do(ctx).Raw() + if err != nil { + return nil, err + } + var info version.Info + err = json.Unmarshal(body, &info) + if err != nil { + return nil, fmt.Errorf("unable to parse the server version: %w", err) + } + return &info, nil +}