@@ -15,16 +15,11 @@ function release_locked_host {
network_spoke_mac_address=$(cat $SHARED_DIR/hosts.yaml|grep 'mac:'|awk -F'mac:' '{print $2}'|tr -d '[:blank:]')
local spoke_lock_filename="/var/run/lock/ztp-baremetal-pool/spoke-baremetal-${network_spoke_mac_address//:/-}.lock"

echo "************ telcov10n Releasing Lock for the host used by this Spoke cluster deployemnt ************"
echo "************ telcov10n Releasing Lock for the host used by this Spoke cluster deployment ************"

set -x
timeout -s 9 10m ssh "${SSHOPTS[@]}" "root@${AUX_HOST}" bash -s -- \
"${spoke_lock_filename}" << 'EOF'
set -o nounset
set -o errexit
set -o pipefail
sudo rm -fv ${1}
EOF
timeout -s 9 10m ssh "${SSHOPTS[@]}" "root@${AUX_HOST}" \
"sudo rm -fv ${spoke_lock_filename} && echo 'Lock released successfully.'"
set +x
}

@@ -48,17 +43,44 @@ function server_poweroff {

}

function cleanup_waiting_file {
# Always clean up our waiting file, regardless of lock status

if [ ! -f "${SHARED_DIR}/own_waiting_file.txt" ]; then
echo "[INFO] No waiting file to clean up."
return 0
fi

local own_waiting_file
own_waiting_file=$(cat "${SHARED_DIR}/own_waiting_file.txt")

if [ -z "${own_waiting_file}" ]; then
echo "[INFO] Waiting file path is empty, nothing to clean up."
return 0
fi

echo "************ telcov10n Cleaning up waiting file ************"
echo "[INFO] Removing waiting file: ${own_waiting_file}"

timeout -s 9 2m ssh "${SSHOPTS[@]}" "root@${AUX_HOST}" \
"rm -fv ${own_waiting_file} 2>/dev/null || true"
}

function main {

# Setup SSH access once for all operations
setup_aux_host_ssh_access

# Always clean up our waiting file first (handles both timeout and normal cases)
cleanup_waiting_file

# Only do lock-related cleanup if we hold the lock
local does_the_current_job_hold_a_lock_to_use_a_baremetal_server
does_the_current_job_hold_a_lock_to_use_a_baremetal_server=$( \
cat ${SHARED_DIR}/do_you_hold_the_lock_for_the_sno_spoke_cluster_server.txt || echo "no")

if [ "${does_the_current_job_hold_a_lock_to_use_a_baremetal_server}" == "yes" ]; then

setup_aux_host_ssh_access
server_poweroff

# This must run last since it releases the server lock
release_locked_host
fi
@@ -9,6 +9,27 @@ echo "************ telcov10n Fix user IDs in a container ************"

source ${SHARED_DIR}/common-telcov10n-bash-functions.sh

function extract_and_set_ocp_version {

echo "************ telcov10n Extracting OCP version from JOB_NAME ************"

echo "[INFO] JOB_NAME: ${JOB_NAME:-not set}"

OCP_VERSION=$(extract_ocp_version)

if [ -z "${OCP_VERSION}" ]; then
echo "[ERROR] Could not extract OCP version from JOB_NAME"
exit 1
fi

echo "[INFO] OCP Version: ${OCP_VERSION}"

# Store OCP version for other steps
echo -n "${OCP_VERSION}" >| ${SHARED_DIR}/ocp_version.txt

export OCP_VERSION
}
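
# NOTE: the extract_ocp_version helper is provided by
# common-telcov10n-bash-functions.sh and is not part of this change. A minimal,
# hypothetical sketch of what such a helper might look like, assuming the
# version appears in JOB_NAME as a "major.minor" token (e.g. "...-4.17-..."):
function extract_ocp_version {
  # Print the first "major.minor" token found in JOB_NAME, if any.
  echo "${JOB_NAME:-}" | grep -oE '[0-9]+\.[0-9]+' | head -n 1
}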

function define_spoke_cluster_name {

#### Spoke cluster
@@ -36,6 +57,83 @@ function set_spoke_cluster_kubeconfig {
export KUBECONFIG="${SHARED_DIR}/spoke-${secret_kubeconfig}.yaml"
}

# Track if we've already created the waiting request file (stored path for cleanup)
WAITING_FILE_PATH=""

function create_waiting_request_on_bastion {

local spoke_lock_filename="${1}"

# Only create the waiting file once per session
# Each job gets a unique file (with timestamp) that only it will delete
if [ -n "${WAITING_FILE_PATH}" ]; then
return 0
fi

echo
echo "************ telcov10n Registering wait request before lock attempt ************"
echo

local waiting_file
waiting_file=$(create_waiting_request_file "${AUX_HOST}" "${spoke_lock_filename}" "${OCP_VERSION}")

if [ -n "${waiting_file}" ]; then
echo "[INFO] Created waiting request file: ${waiting_file}"
echo " This signals that a job with OCP version ${OCP_VERSION} is waiting."
WAITING_FILE_PATH="${waiting_file}"
# Store the path in SHARED_DIR for cleanup step
echo -n "${waiting_file}" >| ${SHARED_DIR}/own_waiting_file.txt
else
echo "[WARNING] Failed to create waiting request file."
fi

echo
}
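
# NOTE: create_waiting_request_file is an assumed helper from
# common-telcov10n-bash-functions.sh and is not shown in this diff. A
# hypothetical sketch, assuming the marker file sits next to the lock file and
# encodes the OCP version plus a timestamp/PID so each job owns a unique file:
function create_waiting_request_file {
  local aux_host="${1}" lock_file="${2}" ocp_version="${3}"
  # e.g. .../spoke-baremetal-<mac>.waiting.4.17.<epoch>.<pid>
  local waiting_file="${lock_file%.lock}.waiting.${ocp_version}.$(date +%s).$$"
  timeout -s 9 2m ssh "${SSHOPTS[@]}" "root@${aux_host}" "touch ${waiting_file}" \
    && echo -n "${waiting_file}"
}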

function validate_lock_for_higher_priority {

local spoke_lock_filename="${1}"

echo
echo "************ telcov10n Validating lock acquisition for priority ************"
echo

# Check if there's a higher priority job waiting BEFORE removing our waiting file
# This way, if we need to release the lock, our waiting file stays intact
local check_result
check_result=$(check_for_higher_priority_waiter "${AUX_HOST}" "${spoke_lock_filename}" "${OCP_VERSION}")

if [[ "${check_result}" == quit:* ]]; then
local higher_version=${check_result#quit:}
echo
echo "[WARNING] Lock acquired but a higher priority job is waiting!"
echo " Current job version: ${OCP_VERSION}"
echo " Higher version waiting: ${higher_version}"
echo " Releasing lock to allow higher priority job to proceed..."
echo " (Keeping own waiting file for next attempt)"
echo
# Release the lock to let the higher priority job acquire it
# Keep our waiting file - we're still waiting!
timeout -s 9 10m ssh "${SSHOPTS[@]}" "root@${AUX_HOST}" "rm -fv ${spoke_lock_filename}"
return 1
fi

echo "[INFO] No higher priority jobs waiting. Proceeding with lock."

# NOW remove our waiting file since we're proceeding
if [ -n "${WAITING_FILE_PATH}" ]; then
echo "[INFO] Removing own waiting file: ${WAITING_FILE_PATH}"
remove_own_waiting_file "${AUX_HOST}" "${WAITING_FILE_PATH}"
WAITING_FILE_PATH=""
rm -f ${SHARED_DIR}/own_waiting_file.txt 2>/dev/null || true
fi

# Store lock filename for later use by other steps
echo -n "${spoke_lock_filename}" >| ${SHARED_DIR}/spoke_lock_filename.txt

return 0
}
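
# NOTE: check_for_higher_priority_waiter is an assumed helper (not in this
# diff). A hypothetical sketch that lists the waiting markers next to the lock
# file on the bastion and prints "quit:<version>" when any waiter advertises a
# newer OCP version than ours; the "quit:*" prefix is what the caller above
# pattern-matches. The ".waiting.*" naming follows the sketch of
# create_waiting_request_file above and is an assumption, not the real layout.
function check_for_higher_priority_waiter {
  local aux_host="${1}" lock_file="${2}" own_version="${3}"
  local highest
  highest=$(timeout -s 9 2m ssh "${SSHOPTS[@]}" "root@${aux_host}" \
      "ls ${lock_file%.lock}.waiting.* 2>/dev/null || true" \
    | awk -F'.waiting.' 'NF>1 {print $2}' | cut -d'.' -f1,2 | sort -V | tail -n 1)
  # A waiter with our own version (including our own marker) does not trigger a quit.
  if [ -n "${highest}" ] && [ "${highest}" != "${own_version}" ] && \
     [ "$(printf '%s\n%s\n' "${own_version}" "${highest}" | sort -V | tail -n 1)" == "${highest}" ]; then
    echo "quit:${highest}"
  else
    echo "ok"
  fi
}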

function select_baremetal_host_from_pool {

echo "************ telcov10n select a baremetal host from the pool ************"
@@ -60,13 +158,24 @@ function select_baremetal_host_from_pool {
local network_spoke_mac_address
network_spoke_mac_address="$(cat ${baremetal_host_path}/network_spoke_mac_address)"
local spoke_lock_filename="/var/run/lock/ztp-baremetal-pool/spoke-baremetal-${network_spoke_mac_address//:/-}.lock"

# Create waiting request file BEFORE trying to acquire lock (only once)
# This ensures our presence is visible even if we immediately get the lock
create_waiting_request_on_bastion "${spoke_lock_filename}"

try_to_lock_host "${AUX_HOST}" "${spoke_lock_filename}" "${host_lock_timestamp}" "${LOCK_TIMEOUT}"
[[ "$(check_the_host_was_locked "${AUX_HOST}" "${spoke_lock_filename}" "${host_lock_timestamp}")" == "locked" ]] &&
{
update_host_and_master_yaml_files "$(dirname ${host})" ;
echo -n "yes" >| ${SHARED_DIR}/do_you_hold_the_lock_for_the_sno_spoke_cluster_server.txt
return 0 ;
}
if [[ "$(check_the_host_was_locked "${AUX_HOST}" "${spoke_lock_filename}" "${host_lock_timestamp}")" == "locked" ]]; then
# Validate that no higher priority job is waiting
if validate_lock_for_higher_priority "${spoke_lock_filename}"; then
update_host_and_master_yaml_files "$(dirname ${host})"
echo -n "yes" >| ${SHARED_DIR}/do_you_hold_the_lock_for_the_sno_spoke_cluster_server.txt
return 0
else
# Higher priority job is waiting, lock was released
# Our waiting file is still intact (not removed until validation passes)
echo "[INFO] Will retry acquiring lock..."
fi
fi
fi
done

@@ -218,6 +327,7 @@ function hack_spoke_deployment {
function main {

setup_aux_host_ssh_access
extract_and_set_ocp_version
define_spoke_cluster_name
set_spoke_cluster_kubeconfig
hack_spoke_deployment
@@ -46,4 +46,6 @@ ref:
If cluster endpoints are reachable through a socks5 proxy
documentation: |-
This step allows to adapt the SNO Spoke cluster deployment for
the new baremetal server pool in the new lab location
the new baremetal server pool in the new lab location.
OCP version is automatically extracted from RELEASE_IMAGE_LATEST for
graceful quit priority when multiple jobs compete for the same baremetal host.
@@ -247,14 +247,21 @@ function test_kpis {

echo "************ telcov10n Run CPU Utilization Telco KPIs test ************"

# Check for graceful quit request before starting this test
check_for_quit "cpu_utils_test" "graceful"

make_up_inventory
make_up_remote_test_command
make_up_ansible_playbook
run_ansible_playbook
setup_test_result_for_component_readiness

# Mark successful completion
echo -n "completed" >| ${SHARED_DIR}/cpu_util_test_status.txt
}
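
# NOTE: check_for_quit is an assumed helper from
# common-telcov10n-bash-functions.sh and is not part of this diff. A
# hypothetical sketch, assuming a higher priority job signals its request
# through a ".quit" file placed next to the host lock (file name is an
# assumption) and that AUX_HOST/SSHOPTS/SHARED_DIR are set by the calling step:
# "graceful" mode exits 0 so later steps can still clean up and report, while
# "force" mode exits 1 because the remaining steps would be meaningless.
function check_for_quit {
  local step_name="${1}" mode="${2:-graceful}"
  local lock_file quit_request
  lock_file=$(cat "${SHARED_DIR}/spoke_lock_filename.txt" 2>/dev/null || true)
  [ -z "${lock_file}" ] && return 0
  quit_request=$(timeout -s 9 2m ssh "${SSHOPTS[@]}" "root@${AUX_HOST}" \
    "cat ${lock_file%.lock}.quit 2>/dev/null || true")
  [ -z "${quit_request}" ] && return 0
  echo "[INFO] Quit requested (${quit_request}); stopping before ${step_name} in ${mode} mode."
  [ "${mode}" == "force" ] && exit 1
  exit 0
}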

function main {
setup_ssh_and_lock_info
set_spoke_cluster_kubeconfig
test_kpis
}
@@ -53,3 +53,7 @@ ref:
[see: https://github.com/neisw/ci-test-mapping/blob/main/pkg/components/telcoperformance/component.go]
documentation: |-
This step allows to verify the SNO Spoke cluster deployed through its kubeconfig.
OCP version is loaded from SHARED_DIR/ocp_version.txt (set by deploy step).
If a graceful quit is requested by a higher version job, this test will be skipped
to release the baremetal host lock faster. The oslat test will have already completed
by this point, so PTP reporting can still collect those results.
@@ -239,14 +239,21 @@ function test_kpis {

echo "************ telcov10n Run oslat Telco KPIs test ************"

# Check for graceful quit request before starting this test
check_for_quit "oslat_test" "graceful"

make_up_inventory
make_up_remote_test_command
make_up_ansible_playbook
run_ansible_playbook
setup_test_result_for_component_readiness

# Mark successful completion
echo -n "completed" >| ${SHARED_DIR}/oslat_test_status.txt
}

function main {
setup_ssh_and_lock_info
set_spoke_cluster_kubeconfig
test_kpis
}
@@ -46,3 +46,6 @@ ref:
[see: https://github.com/neisw/ci-test-mapping/blob/main/pkg/components/telcoperformance/component.go]
documentation: |-
This step allows to verify the SNO Spoke cluster deployed through its kubeconfig.
OCP version is loaded from SHARED_DIR/ocp_version.txt (set by deploy step).
If a graceful quit is requested by a higher version job, this test will be skipped
and the job will exit gracefully to release the baremetal host lock.
@@ -160,6 +160,10 @@ function checking_installation_progress {
timeout=$(date -d "${ABORT_INSTALLATION_TIMEOUT}" +%s)
abort_installation=/tmp/abort.installation

# Counter for quit check - only check every QUIT_CHECK_INTERVAL iterations
local quit_check_counter=0
local quit_check_interval="${QUIT_CHECK_INTERVAL:-3}"

while true; do

test -f ${abort_installation} && {
@@ -203,6 +207,14 @@
echo "$ touch ${abort_installation}"
fi

# Check for quit request every N iterations (QUIT_CHECK_INTERVAL)
# Use "force" mode since if interrupted, the rest of the steps are meaningless (cluster not ready)
((quit_check_counter++))
if [ "${quit_check_counter}" -ge "${quit_check_interval}" ]; then
check_for_quit "cluster_installation_progress" "force"
quit_check_counter=0
fi

sleep ${refresh_timing:="10m"} ;
} || echo
done
@@ -238,6 +250,10 @@ function get_and_save_kubeconfig_and_creds {
}

function main {

# Setup SSH and load lock info for quit checks
setup_ssh_and_lock_info

set_hub_cluster_kubeconfig
generate_cluster_image_set
create_spoke_namespace
@@ -39,6 +39,14 @@ ref:
Set the maximum amount of time the step must wait before unconditionally forcing the workflow to continue
with the next steps, which clean up and free all resources used in the deployment of this cluster
(MIN: "REFRESH_TIME value", default: "1 hours + 45 min", MAX: "2 hours - REFRESH_TIME value").
- name: QUIT_CHECK_INTERVAL
default: "3"
documentation: |-
Number of REFRESH_TIME iterations between each quit condition check. A higher priority job (newer OCP version)
waiting for the baremetal host lock can request the current job to quit. This setting controls how often
the installation loop checks for such quit requests. Uses "force" mode (exit 1) since an interrupted
installation leaves the cluster unusable. Default is 3 (check every 3 iterations, i.e., every 9 min
with default REFRESH_TIME of 3m).
- name: BIOS_SETTINGS
default: "{}"
documentation: |-