@@ -12,6 +12,35 @@ distributed under the License is distributed on an "AS IS" BASIS,
1212WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313See the License for the specific language governing permissions and
1414limitations under the License.
15+
16+ RESOURCE CONFLICT RESOLUTION ENHANCEMENTS:
17+ This file has been enhanced to resolve resource conflict issues that occurred when
18+ multiple controllers or processes attempted to update the same NonAdminBackupStorageLocationRequest
19+ objects simultaneously. The following changes were made:
20+
21+ 1. RETRY LOGIC FRAMEWORK (updateStatusWithRetry function):
22+ - Uses standard Kubernetes client-go retry.RetryOnConflict with DefaultRetry settings
23+ - Handles "object has been modified" errors gracefully
24+ - Fetches fresh object copies to avoid stale ResourceVersion conflicts
25+ - Leverages proven Kubernetes retry patterns (5 attempts, 10ms+jitter)
26+
27+ 2. NIL SAFETY CHECKS (ensureNonAdminRequest function):
28+ - Prevents panic when SourceNonAdminBSL is nil during initialization
29+ - Converts terminal errors to requeue conditions for uninitialized status
30+ - Allows proper status initialization timing in high-concurrency environments
31+
32+ 3. OPTIMIZED STATUS UPDATES (createNonAdminRequest function):
33+ - Uses fast-path direct updates for new objects
34+ - Falls back to retry logic only when conflicts are detected
35+ - Preserves computed status values while ensuring conflict resilience
36+
37+ 4. TEST ENVIRONMENT ADAPTATIONS:
38+ - Increased timeouts to accommodate retry logic execution time
39+ - Reduced polling frequency to handle Kubernetes client rate limiting
40+ - Added delays to prevent overwhelming API server during test runs
41+
42+ These enhancements ensure that OADP non-admin backup operations complete successfully
43+ even under high concurrency or when multiple reconciliation events occur simultaneously.
1544*/
1645
1746package controller
@@ -33,6 +62,7 @@ import (
3362 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3463 "k8s.io/apimachinery/pkg/runtime"
3564 "k8s.io/apimachinery/pkg/types"
65+ "k8s.io/client-go/util/retry" // ADDED: For standard Kubernetes retry logic
3666 ctrl "sigs.k8s.io/controller-runtime"
3767 "sigs.k8s.io/controller-runtime/pkg/client"
3868 "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
@@ -68,6 +98,58 @@ type NonAdminBackupStorageLocationReconciler struct {
6898
6999type naBSLReconcileStepFunction func (ctx context.Context , logger logr.Logger , nabsl * nacv1alpha1.NonAdminBackupStorageLocation ) (bool , error )
70100
101+ // updateStatusWithRetry attempts to update an object's status using standard Kubernetes retry logic.
102+ // This uses the recommended client-go retry.RetryOnConflict with retry.DefaultRetry configuration
103+ // to handle resource conflicts that occur when multiple controllers update the same object.
104+ //
105+ // The retry logic addresses the error:
106+ // "Operation cannot be fulfilled on <resource>: the object has been modified;
107+ //
108+ // please apply your changes to the latest version and try again"
109+ //
110+ // Uses Kubernetes defaults:
111+ // - 5 retry attempts
112+ // - 10ms initial delay with linear backoff
113+ // - 10% jitter to prevent thundering herd
114+ //
115+ // Parameters:
116+ // - ctx: Context for cancellation and timeouts
117+ // - logger: Logger for debugging retry attempts
118+ // - obj: The object to update (used for key extraction)
119+ // - updateFn: Function that applies changes to the fresh object copy
120+ //
121+ // Returns:
122+ // - error: nil on success, error on failure or timeout
123+ func (r * NonAdminBackupStorageLocationReconciler ) updateStatusWithRetry (ctx context.Context , logger logr.Logger , obj client.Object , updateFn func (client.Object ) bool ) error {
124+ return retry .RetryOnConflict (retry .DefaultRetry , func () error {
125+ // Get the latest version of the object from the API server to ensure we have
126+ // the most recent ResourceVersion and avoid stale object conflicts
127+ key := client .ObjectKeyFromObject (obj )
128+ fresh := obj .DeepCopyObject ().(client.Object )
129+ if err := r .Get (ctx , key , fresh ); err != nil {
130+ return err // RetryOnConflict will handle conflict vs non-conflict errors
131+ }
132+
133+ // Apply the update function to the fresh object copy
134+ // The update function should modify the object and return true if changes were made
135+ if ! updateFn (fresh ) {
136+ // No update needed - the object is already in the desired state
137+ return nil
138+ }
139+
140+ // Attempt the status update with the fresh object that has the latest ResourceVersion
141+ // RetryOnConflict will automatically retry if this returns a conflict error
142+ if err := r .Status ().Update (ctx , fresh ); err != nil {
143+ return err // Return the raw error so RetryOnConflict can identify conflicts
144+ }
145+
146+ // Success - copy the updated ResourceVersion back to the original object
147+ obj .SetResourceVersion (fresh .GetResourceVersion ())
148+ logger .V (1 ).Info ("Status update successful" )
149+ return nil
150+ })
151+ }
152+
71153// +kubebuilder:rbac:groups=velero.io,resources=backupstoragelocations,verbs=get;list;watch;create;update;patch;delete
72154// +kubebuilder:rbac:groups=velero.io,resources=backupstoragelocations/status,verbs=get;update;patch
73155// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;create;update;patch;delete
@@ -448,7 +530,10 @@ func (r *NonAdminBackupStorageLocationReconciler) ensureNonAdminRequest(
448530 updatedRejectedCondition := false
449531 updatedApprovedCondition := false
450532
451- if ! reflect .DeepEqual (nabslRequest .Status .SourceNonAdminBSL .DeepCopy ().RequestedSpec , nabsl .Spec .BackupStorageLocationSpec ) {
533+ // Check if the NonAdminBackupStorageLocationRequest has a properly initialized status
534+ // Note: We check for nil first to prevent panic when accessing SourceNonAdminBSL fields
535+ if nabslRequest .Status .SourceNonAdminBSL != nil && ! reflect .DeepEqual (nabslRequest .Status .SourceNonAdminBSL .DeepCopy ().RequestedSpec , nabsl .Spec .BackupStorageLocationSpec ) {
536+ // The spec in the request doesn't match the current spec - this indicates an invalid spec update attempt
452537 message = "NaBSL Spec update not allowed. Changes will not be applied. Delete NaBSL and create new one with updated spec"
453538 updatedRejectedCondition = meta .SetStatusCondition (& nabsl .Status .Conditions , metav1.Condition {
454539 Type : string (nacv1alpha1 .NonAdminBSLConditionSpecUpdateApproved ),
@@ -460,6 +545,17 @@ func (r *NonAdminBackupStorageLocationReconciler) ensureNonAdminRequest(
460545 // Ensure the phase is not changed from the current nabsl phase
461546 expectedPhase = nabsl .Status .Phase
462547 terminalErr = reconcile .TerminalError (errors .New (message ))
548+ } else if nabslRequest .Status .SourceNonAdminBSL == nil {
549+ // CRITICAL FIX: Handle the case where SourceNonAdminBSL is nil
550+ // This can happen during the initialization phase when:
551+ // 1. The NonAdminBackupStorageLocationRequest object has been created
552+ // 2. But its status hasn't been updated yet due to timing or retry logic
553+ // 3. Our retry mechanism re-fetches the object before status initialization completes
554+ //
555+ // Instead of treating this as a terminal error (which would prevent progress),
556+ // we requeue the reconciliation to allow the status to be properly initialized
557+ logger .V (1 ).Info ("NonAdminBackupStorageLocationRequest status not yet initialized, requeuing..." )
558+ return true , nil // Requeue instead of terminal error - allows initialization to complete
463559 } else if nabslRequest .Status .SourceNonAdminBSL .NACUUID == constant .EmptyString || nabslRequest .Status .SourceNonAdminBSL .NACUUID != nabsl .Status .VeleroBackupStorageLocation .NACUUID {
464560 message = "NonAdminBackupStorageLocationRequest does not contain valid NAC UUID and can not be approved"
465561 updatedRejectedCondition = meta .SetStatusCondition (& nabsl .Status .Conditions , metav1.Condition {
@@ -537,14 +633,21 @@ func (r *NonAdminBackupStorageLocationReconciler) createNonAdminRequest(ctx cont
537633 }
538634
539635 if nabslRequest != nil {
540- // We allow only to update the phase of the NonAdminBackupStorageLocationRequest
541- // and not the spec
636+ // EXISTING REQUEST UPDATE WITH RETRY LOGIC:
637+ // The NonAdminBackupStorageLocationRequest already exists, we only need to update its phase
638+ // based on the current approval decision. We don't allow spec updates on existing requests.
639+ //
640+ // This is where resource conflicts commonly occurred before our fix:
641+ // - Multiple reconcile loops trying to update the same request status
642+ // - Admin approval processes modifying the request while controller is updating it
643+ // - Event-driven reconciliation causing concurrent status updates
542644 logger .V (1 ).Info ("NonAdminBackupStorageLocationRequest already exists" )
543- if updatePhaseIfNeeded (& nabslRequest .Status .Phase , nabslRequest .Spec .ApprovalDecision ) {
544- if updateErr := r .Status ().Update (ctx , nabslRequest ); updateErr != nil {
545- logger .Error (updateErr , failedUpdateStatusError )
546- return false , updateErr
547- }
645+ if updateErr := r .updateStatusWithRetry (ctx , logger , nabslRequest , func (obj client.Object ) bool {
646+ req := obj .(* nacv1alpha1.NonAdminBackupStorageLocationRequest )
647+ return updatePhaseIfNeeded (& req .Status .Phase , req .Spec .ApprovalDecision )
648+ }); updateErr != nil {
649+ logger .Error (updateErr , failedUpdateStatusError )
650+ return false , updateErr
548651 }
549652
550653 if ! r .RequireApprovalForBSL && nabslRequest .Spec .ApprovalDecision != nacv1alpha1 .NonAdminBSLRequestApproved {
@@ -585,10 +688,42 @@ func (r *NonAdminBackupStorageLocationReconciler) createNonAdminRequest(ctx cont
585688 return false , err
586689 }
587690
691+ // NEW REQUEST STATUS UPDATE WITH OPTIMIZED RETRY STRATEGY:
692+ // For newly created NonAdminBackupStorageLocationRequest objects, we use a two-phase approach:
693+ // 1. Try direct status update first (fast path for normal cases)
694+ // 2. Fall back to retry logic only if we encounter resource conflicts
695+ //
696+ // This optimization is important because:
697+ // - Most new object updates succeed on first try
698+ // - Retry logic with object re-fetching can lose local state
699+ // - We want to preserve the status we just computed in updateNonAdminRequestStatus
700+ //
701+ // The hybrid approach gives us:
702+ // - Performance: Fast path for the common case
703+ // - Resilience: Retry logic for conflict scenarios
704+ // - Correctness: Proper status initialization even under load
588705 if updated := updateNonAdminRequestStatus (& nonAdminBslRequest .Status , nabsl , approvalDecision ); updated {
589706 if updateErr := r .Status ().Update (ctx , & nonAdminBslRequest ); updateErr != nil {
590- logger .Error (updateErr , failedUpdateStatusError )
591- return false , updateErr
707+ if apierrors .IsConflict (updateErr ) {
708+ // CONFLICT DETECTED: Another process modified the request between create and status update
709+ // This can happen when:
710+ // - Admin approves/rejects the request immediately after creation
711+ // - Multiple reconcile loops are triggered by related events
712+ // - High concurrency in the test environment
713+ logger .V (1 ).Info ("Conflict on initial status update, retrying with fresh object..." )
714+ if retryErr := r .updateStatusWithRetry (ctx , logger , & nonAdminBslRequest , func (obj client.Object ) bool {
715+ req := obj .(* nacv1alpha1.NonAdminBackupStorageLocationRequest )
716+ return updateNonAdminRequestStatus (& req .Status , nabsl , approvalDecision )
717+ }); retryErr != nil {
718+ logger .Error (retryErr , failedUpdateStatusError )
719+ return false , retryErr
720+ }
721+ } else {
722+ // NON-CONFLICT ERROR: Validation, permission, or other API server issue
723+ // Don't retry these as they're likely to persist
724+ logger .Error (updateErr , failedUpdateStatusError )
725+ return false , updateErr
726+ }
592727 }
593728 }
594729
0 commit comments