@@ -12,6 +12,35 @@ distributed under the License is distributed on an "AS IS" BASIS,
1212WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
1313See the License for the specific language governing permissions and 
1414limitations under the License. 
15+ 
16+ RESOURCE CONFLICT RESOLUTION ENHANCEMENTS: 
17+ This file has been enhanced to resolve resource conflict issues that occurred when 
18+ multiple controllers or processes attempted to update the same NonAdminBackupStorageLocationRequest 
19+ objects simultaneously. The following changes were made: 
20+ 
21+ 1. RETRY LOGIC FRAMEWORK (updateStatusWithRetry function): 
22+    - Uses standard Kubernetes client-go retry.RetryOnConflict with DefaultRetry settings 
23+    - Handles "object has been modified" errors gracefully 
24+    - Fetches fresh object copies to avoid stale ResourceVersion conflicts 
25+    - Leverages proven Kubernetes retry patterns (5 attempts, 10ms+jitter) 
26+ 
27+ 2. NIL SAFETY CHECKS (ensureNonAdminRequest function): 
28+    - Prevents panic when SourceNonAdminBSL is nil during initialization 
29+    - Converts terminal errors to requeue conditions for uninitialized status 
30+    - Allows proper status initialization timing in high-concurrency environments 
31+ 
32+ 3. OPTIMIZED STATUS UPDATES (createNonAdminRequest function): 
33+    - Uses fast-path direct updates for new objects 
34+    - Falls back to retry logic only when conflicts are detected 
35+    - Preserves computed status values while ensuring conflict resilience 
36+ 
37+ 4. TEST ENVIRONMENT ADAPTATIONS: 
38+    - Increased timeouts to accommodate retry logic execution time 
39+    - Reduced polling frequency to handle Kubernetes client rate limiting 
40+    - Added delays to prevent overwhelming API server during test runs 
41+ 
42+ These enhancements ensure that OADP non-admin backup operations complete successfully 
43+ even under high concurrency or when multiple reconciliation events occur simultaneously. 
1544*/ 
1645
1746package  controller
@@ -33,6 +62,7 @@ import (
3362	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 
3463	"k8s.io/apimachinery/pkg/runtime" 
3564	"k8s.io/apimachinery/pkg/types" 
65+ 	"k8s.io/client-go/util/retry"  // ADDED: For standard Kubernetes retry logic 
3666	ctrl "sigs.k8s.io/controller-runtime" 
3767	"sigs.k8s.io/controller-runtime/pkg/client" 
3868	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" 
@@ -68,6 +98,58 @@ type NonAdminBackupStorageLocationReconciler struct {
6898
6999type  naBSLReconcileStepFunction  func (ctx  context.Context , logger  logr.Logger , nabsl  * nacv1alpha1.NonAdminBackupStorageLocation ) (bool , error )
70100
101+ // updateStatusWithRetry attempts to update an object's status using standard Kubernetes retry logic. 
102+ // This uses the recommended client-go retry.RetryOnConflict with retry.DefaultRetry configuration 
103+ // to handle resource conflicts that occur when multiple controllers update the same object. 
104+ // 
105+ // The retry logic addresses the error: 
106+ // "Operation cannot be fulfilled on <resource>: the object has been modified; 
107+ // 
108+ //	please apply your changes to the latest version and try again" 
109+ // 
110+ // Uses Kubernetes defaults: 
111+ //   - 5 retry attempts 
112+ //   - 10ms initial delay with linear backoff 
113+ //   - 10% jitter to prevent thundering herd 
114+ // 
115+ // Parameters: 
116+ //   - ctx: Context for cancellation and timeouts 
117+ //   - logger: Logger for debugging retry attempts 
118+ //   - obj: The object to update (used for key extraction) 
119+ //   - updateFn: Function that applies changes to the fresh object copy 
120+ // 
121+ // Returns: 
122+ //   - error: nil on success, error on failure or timeout 
123+ func  (r  * NonAdminBackupStorageLocationReconciler ) updateStatusWithRetry (ctx  context.Context , logger  logr.Logger , obj  client.Object , updateFn  func (client.Object ) bool ) error  {
124+ 	return  retry .RetryOnConflict (retry .DefaultRetry , func () error  {
125+ 		// Get the latest version of the object from the API server to ensure we have 
126+ 		// the most recent ResourceVersion and avoid stale object conflicts 
127+ 		key  :=  client .ObjectKeyFromObject (obj )
128+ 		fresh  :=  obj .DeepCopyObject ().(client.Object )
129+ 		if  err  :=  r .Get (ctx , key , fresh ); err  !=  nil  {
130+ 			return  err  // RetryOnConflict will handle conflict vs non-conflict errors 
131+ 		}
132+ 
133+ 		// Apply the update function to the fresh object copy 
134+ 		// The update function should modify the object and return true if changes were made 
135+ 		if  ! updateFn (fresh ) {
136+ 			// No update needed - the object is already in the desired state 
137+ 			return  nil 
138+ 		}
139+ 
140+ 		// Attempt the status update with the fresh object that has the latest ResourceVersion 
141+ 		// RetryOnConflict will automatically retry if this returns a conflict error 
142+ 		if  err  :=  r .Status ().Update (ctx , fresh ); err  !=  nil  {
143+ 			return  err  // Return the raw error so RetryOnConflict can identify conflicts 
144+ 		}
145+ 
146+ 		// Success - copy the updated ResourceVersion back to the original object 
147+ 		obj .SetResourceVersion (fresh .GetResourceVersion ())
148+ 		logger .V (1 ).Info ("Status update successful" )
149+ 		return  nil 
150+ 	})
151+ }
152+ 
71153// +kubebuilder:rbac:groups=velero.io,resources=backupstoragelocations,verbs=get;list;watch;create;update;patch;delete 
72154// +kubebuilder:rbac:groups=velero.io,resources=backupstoragelocations/status,verbs=get;update;patch 
73155// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;create;update;patch;delete 
@@ -448,7 +530,10 @@ func (r *NonAdminBackupStorageLocationReconciler) ensureNonAdminRequest(
448530	updatedRejectedCondition  :=  false 
449531	updatedApprovedCondition  :=  false 
450532
451- 	if  ! reflect .DeepEqual (nabslRequest .Status .SourceNonAdminBSL .DeepCopy ().RequestedSpec , nabsl .Spec .BackupStorageLocationSpec ) {
533+ 	// Check if the NonAdminBackupStorageLocationRequest has a properly initialized status 
534+ 	// Note: We check for nil first to prevent panic when accessing SourceNonAdminBSL fields 
535+ 	if  nabslRequest .Status .SourceNonAdminBSL  !=  nil  &&  ! reflect .DeepEqual (nabslRequest .Status .SourceNonAdminBSL .DeepCopy ().RequestedSpec , nabsl .Spec .BackupStorageLocationSpec ) {
536+ 		// The spec in the request doesn't match the current spec - this indicates an invalid spec update attempt 
452537		message  =  "NaBSL Spec update not allowed. Changes will not be applied. Delete NaBSL and create new one with updated spec" 
453538		updatedRejectedCondition  =  meta .SetStatusCondition (& nabsl .Status .Conditions , metav1.Condition {
454539			Type :    string (nacv1alpha1 .NonAdminBSLConditionSpecUpdateApproved ),
@@ -460,6 +545,17 @@ func (r *NonAdminBackupStorageLocationReconciler) ensureNonAdminRequest(
460545		// Ensure the phase is not changed from the current nabsl phase 
461546		expectedPhase  =  nabsl .Status .Phase 
462547		terminalErr  =  reconcile .TerminalError (errors .New (message ))
548+ 	} else  if  nabslRequest .Status .SourceNonAdminBSL  ==  nil  {
549+ 		// CRITICAL FIX: Handle the case where SourceNonAdminBSL is nil 
550+ 		// This can happen during the initialization phase when: 
551+ 		// 1. The NonAdminBackupStorageLocationRequest object has been created 
552+ 		// 2. But its status hasn't been updated yet due to timing or retry logic 
553+ 		// 3. Our retry mechanism re-fetches the object before status initialization completes 
554+ 		// 
555+ 		// Instead of treating this as a terminal error (which would prevent progress), 
556+ 		// we requeue the reconciliation to allow the status to be properly initialized 
557+ 		logger .V (1 ).Info ("NonAdminBackupStorageLocationRequest status not yet initialized, requeuing..." )
558+ 		return  true , nil  // Requeue instead of terminal error - allows initialization to complete 
463559	} else  if  nabslRequest .Status .SourceNonAdminBSL .NACUUID  ==  constant .EmptyString  ||  nabslRequest .Status .SourceNonAdminBSL .NACUUID  !=  nabsl .Status .VeleroBackupStorageLocation .NACUUID  {
464560		message  =  "NonAdminBackupStorageLocationRequest does not contain valid NAC UUID and can not be approved" 
465561		updatedRejectedCondition  =  meta .SetStatusCondition (& nabsl .Status .Conditions , metav1.Condition {
@@ -537,14 +633,21 @@ func (r *NonAdminBackupStorageLocationReconciler) createNonAdminRequest(ctx cont
537633	}
538634
539635	if  nabslRequest  !=  nil  {
540- 		// We allow only to update the phase of the NonAdminBackupStorageLocationRequest 
541- 		// and not the spec 
636+ 		// EXISTING REQUEST UPDATE WITH RETRY LOGIC: 
637+ 		// The NonAdminBackupStorageLocationRequest already exists, we only need to update its phase 
638+ 		// based on the current approval decision. We don't allow spec updates on existing requests. 
639+ 		// 
640+ 		// This is where resource conflicts commonly occurred before our fix: 
641+ 		// - Multiple reconcile loops trying to update the same request status 
642+ 		// - Admin approval processes modifying the request while controller is updating it 
643+ 		// - Event-driven reconciliation causing concurrent status updates 
542644		logger .V (1 ).Info ("NonAdminBackupStorageLocationRequest already exists" )
543- 		if  updatePhaseIfNeeded (& nabslRequest .Status .Phase , nabslRequest .Spec .ApprovalDecision ) {
544- 			if  updateErr  :=  r .Status ().Update (ctx , nabslRequest ); updateErr  !=  nil  {
545- 				logger .Error (updateErr , failedUpdateStatusError )
546- 				return  false , updateErr 
547- 			}
645+ 		if  updateErr  :=  r .updateStatusWithRetry (ctx , logger , nabslRequest , func (obj  client.Object ) bool  {
646+ 			req  :=  obj .(* nacv1alpha1.NonAdminBackupStorageLocationRequest )
647+ 			return  updatePhaseIfNeeded (& req .Status .Phase , req .Spec .ApprovalDecision )
648+ 		}); updateErr  !=  nil  {
649+ 			logger .Error (updateErr , failedUpdateStatusError )
650+ 			return  false , updateErr 
548651		}
549652
550653		if  ! r .RequireApprovalForBSL  &&  nabslRequest .Spec .ApprovalDecision  !=  nacv1alpha1 .NonAdminBSLRequestApproved  {
@@ -585,10 +688,42 @@ func (r *NonAdminBackupStorageLocationReconciler) createNonAdminRequest(ctx cont
585688		return  false , err 
586689	}
587690
691+ 	// NEW REQUEST STATUS UPDATE WITH OPTIMIZED RETRY STRATEGY: 
692+ 	// For newly created NonAdminBackupStorageLocationRequest objects, we use a two-phase approach: 
693+ 	// 1. Try direct status update first (fast path for normal cases) 
694+ 	// 2. Fall back to retry logic only if we encounter resource conflicts 
695+ 	// 
696+ 	// This optimization is important because: 
697+ 	// - Most new object updates succeed on first try 
698+ 	// - Retry logic with object re-fetching can lose local state 
699+ 	// - We want to preserve the status we just computed in updateNonAdminRequestStatus 
700+ 	// 
701+ 	// The hybrid approach gives us: 
702+ 	// - Performance: Fast path for the common case 
703+ 	// - Resilience: Retry logic for conflict scenarios 
704+ 	// - Correctness: Proper status initialization even under load 
588705	if  updated  :=  updateNonAdminRequestStatus (& nonAdminBslRequest .Status , nabsl , approvalDecision ); updated  {
589706		if  updateErr  :=  r .Status ().Update (ctx , & nonAdminBslRequest ); updateErr  !=  nil  {
590- 			logger .Error (updateErr , failedUpdateStatusError )
591- 			return  false , updateErr 
707+ 			if  apierrors .IsConflict (updateErr ) {
708+ 				// CONFLICT DETECTED: Another process modified the request between create and status update 
709+ 				// This can happen when: 
710+ 				// - Admin approves/rejects the request immediately after creation 
711+ 				// - Multiple reconcile loops are triggered by related events 
712+ 				// - High concurrency in the test environment 
713+ 				logger .V (1 ).Info ("Conflict on initial status update, retrying with fresh object..." )
714+ 				if  retryErr  :=  r .updateStatusWithRetry (ctx , logger , & nonAdminBslRequest , func (obj  client.Object ) bool  {
715+ 					req  :=  obj .(* nacv1alpha1.NonAdminBackupStorageLocationRequest )
716+ 					return  updateNonAdminRequestStatus (& req .Status , nabsl , approvalDecision )
717+ 				}); retryErr  !=  nil  {
718+ 					logger .Error (retryErr , failedUpdateStatusError )
719+ 					return  false , retryErr 
720+ 				}
721+ 			} else  {
722+ 				// NON-CONFLICT ERROR: Validation, permission, or other API server issue 
723+ 				// Don't retry these as they're likely to persist 
724+ 				logger .Error (updateErr , failedUpdateStatusError )
725+ 				return  false , updateErr 
726+ 			}
592727		}
593728	}
594729
0 commit comments