Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -304,9 +304,7 @@ func electMaster(mgr manager.Manager, nsxClient *nsx.Client) {

func main() {
log.Info("Starting NSX Operator")
cfg := ctrl.GetConfigOrDie()
cfg.Timeout = pkgutil.TCPReadTimeout
mgr, err := ctrl.NewManager(cfg, ctrl.Options{
mgr, err := ctrl.NewManager(pkgutil.GetConfig(), ctrl.Options{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So the API server address switch only can occur in the startup stage, right? Then if the eth1 down during the NSX operator runtime, what will happen?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes it only occurs in the startup stage.

The current case is when wcp enabled between backup and restore, cpvm eth1 will be down after NSX restore and we rely on NSX Operator to recover it. In this case, NSX Operator will always restarts as NSX connection will be down due to restore. In other cases eth1 may be down, shall we always expect NSX or WCP side to bring it back, and it might be fine NSX Operator does not work during that time?

If there is use case that NSX Operator should switch from cluster ip to localhost at runtime, maybe we can leverage the liveness probe to force the nsx operator restarting. Actually we need to refactor the liveness probe in a following up PR as currently it will try to check the eth1, i.e. get api like http://172.26.0.3:8384/healthz

Copy link
Contributor Author

@yanjunz97 yanjunz97 May 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've checked this in HA mode, and found operator will restart after eht1 down automatically because the lease renewal failed.
Updated: But in non-HA mode, operator will not restart, but the api server call will fail with errors like {"error": "Put \"https://172.24.0.1:443/apis/crd.nsx.vmware.com/v1alpha1/namespaces/ns-1/subnetsets/pod-default/status\": http2: client connection lost"}

Scheme: scheme,
HealthProbeBindAddress: config.ProbeAddr,
Metrics: metricsserver.Options{BindAddress: config.MetricsAddr},
Expand Down
5 changes: 1 addition & 4 deletions pkg/util/cert.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import (
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/util/retry"
ctrl "sigs.k8s.io/controller-runtime"

"github.com/vmware-tanzu/nsx-operator/pkg/config"
)
Expand Down Expand Up @@ -146,9 +145,7 @@ func GenerateWebhookCerts() error {
Bytes: x509.MarshalPKCS1PrivateKey(serverKey),
})

cfg := ctrl.GetConfigOrDie()
cfg.Timeout = TCPReadTimeout
kubeClient := kubernetes.NewForConfigOrDie(cfg)
kubeClient := kubernetes.NewForConfigOrDie(GetConfig())
certSecret := &corev1.Secret{
TypeMeta: v1.TypeMeta{},
ObjectMeta: v1.ObjectMeta{
Expand Down
55 changes: 55 additions & 0 deletions pkg/util/kubernetes.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package util

import (
"fmt"
"net/http"
"strings"

"k8s.io/client-go/rest"
ctrl "sigs.k8s.io/controller-runtime"
)

const (
localhostIP = "127.0.0.1"
localhostIPv6 = "::1"
localhostPort = "6443"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you know if we can get the K8s config like the port from ENV parameter?

)

func GetConfig() *rest.Config {
cfg := ctrl.GetConfigOrDie()
cfg.Timeout = TCPReadTimeout
if !getHealth(cfg) {
hosts := strings.Split(cfg.Host, ":")
// cfg.Host is in the form of https://host:port
if len(hosts) > 3 {
cfg.Host = fmt.Sprintf("https://[%s]:%s", localhostIPv6, localhostPort)
} else {
cfg.Host = fmt.Sprintf("https://%s:%s", localhostIP, localhostPort)
}
log.Info("Failed to connect to configured kubernetes API server, set to loopback address", "host", cfg.Host)
}
return cfg
}

func getHealth(cfg *rest.Config) bool {
client, err := rest.HTTPClientFor(cfg)
if err != nil {
log.Error(err, "Failed to create client for config", "config", cfg)
return false
}

healthUrl := cfg.Host + "/healthz"
resp, err := client.Get(healthUrl)
if err != nil {
log.Error(err, "Failed to connect to Kubernetes API Server", "url", healthUrl)
return false
}
defer resp.Body.Close()

if resp.StatusCode != http.StatusOK {
log.Error(nil, "Kubernetes healthz check failed", "status", resp.Status)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a way to distinguish TCP connection error, TLS handshake validation error?
I mean we should only switch to localhost for TCP connection error, right?

return false
}
log.Debug("Connection is health", "url", healthUrl)
return true
}
78 changes: 78 additions & 0 deletions pkg/util/kubernetes_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
package util

import (
"bytes"
"errors"
"io"
"net/http"
"testing"

"github.com/agiledragon/gomonkey/v2"
"github.com/stretchr/testify/assert"
"k8s.io/client-go/rest"
ctrl "sigs.k8s.io/controller-runtime"
)

func TestGetConfig(t *testing.T) {
patches := gomonkey.ApplyFunc(ctrl.GetConfigOrDie, func() *rest.Config {
return &rest.Config{
Host: "https://10.0.0.1:443",
}
})
defer patches.Reset()

tests := []struct {
name string
preparedFunc func() *gomonkey.Patches
expectedHost string
}{
{
name: "healthyTraffic",
preparedFunc: func() *gomonkey.Patches {
return gomonkey.ApplyFunc((*http.Client).Get, func(c *http.Client, url string) (resp *http.Response, err error) {
return &http.Response{
StatusCode: http.StatusOK,
Body: io.NopCloser(bytes.NewBufferString(`{"ok": true}`)),
}, nil
})
},
expectedHost: "https://10.0.0.1:443",
},
{
name: "unhealthyTrafficAndIPv4",
preparedFunc: func() *gomonkey.Patches {
return gomonkey.ApplyFunc((*http.Client).Get, func(c *http.Client, url string) (resp *http.Response, err error) {
return &http.Response{
StatusCode: http.StatusNotFound,
Body: io.NopCloser(bytes.NewBufferString(`{"ok": false}`)),
}, nil
})
},
expectedHost: "https://127.0.0.1:6443",
},
{
name: "errorTrafficAndIPv6",
preparedFunc: func() *gomonkey.Patches {
patches := gomonkey.ApplyFunc(ctrl.GetConfigOrDie, func() *rest.Config {
return &rest.Config{
Host: "https://aa:bb:cc:dd:ee:ff:443",
}
})
patches.ApplyFunc((*http.Client).Get, func(c *http.Client, url string) (resp *http.Response, err error) {
return nil, errors.New("mock get failure")
})
return patches
},
expectedHost: "https://[::1]:6443",
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
patches := tt.preparedFunc()
defer patches.Reset()
cfg := GetConfig()
assert.Equal(t, tt.expectedHost, cfg.Host)
})
}
}
Loading