@@ -15,16 +15,11 @@ function release_locked_host {
network_spoke_mac_address=$(cat $SHARED_DIR/hosts.yaml|grep 'mac:'|awk -F'mac:' '{print $2}'|tr -d '[:blank:]')
local spoke_lock_filename="/var/run/lock/ztp-baremetal-pool/spoke-baremetal-${network_spoke_mac_address//:/-}.lock"

echo "************ telcov10n Releasing Lock for the host used by this Spoke cluster deployemnt ************"
echo "************ telcov10n Releasing Lock for the host used by this Spoke cluster deployment ************"

set -x
timeout -s 9 10m ssh "${SSHOPTS[@]}" "root@${AUX_HOST}" bash -s -- \
"${spoke_lock_filename}" << 'EOF'
set -o nounset
set -o errexit
set -o pipefail
sudo rm -fv ${1}
EOF
timeout -s 9 10m ssh "${SSHOPTS[@]}" "root@${AUX_HOST}" \
"sudo rm -fv ${spoke_lock_filename} && echo 'Lock released successfully.'"
set +x
}

@@ -48,17 +43,44 @@ function server_poweroff {

}

function cleanup_waiting_file {
# Always clean up our waiting file, regardless of lock status

if [ ! -f "${SHARED_DIR}/own_waiting_file.txt" ]; then
echo "[INFO] No waiting file to clean up."
return 0
fi

local own_waiting_file
own_waiting_file=$(cat "${SHARED_DIR}/own_waiting_file.txt")

if [ -z "${own_waiting_file}" ]; then
echo "[INFO] Waiting file path is empty, nothing to clean up."
return 0
fi

echo "************ telcov10n Cleaning up waiting file ************"
echo "[INFO] Removing waiting file: ${own_waiting_file}"

timeout -s 9 2m ssh "${SSHOPTS[@]}" "root@${AUX_HOST}" \
"rm -fv ${own_waiting_file} 2>/dev/null || true"
}

function main {

# Setup SSH access once for all operations
setup_aux_host_ssh_access

# Always clean up our waiting file first (handles both timeout and normal cases)
cleanup_waiting_file

# Only do lock-related cleanup if we hold the lock
local does_the_current_job_hold_a_lock_to_use_a_baremetal_server
does_the_current_job_hold_a_lock_to_use_a_baremetal_server=$( \
cat ${SHARED_DIR}/do_you_hold_the_lock_for_the_sno_spoke_cluster_server.txt || echo "no")

if [ "${does_the_current_job_hold_a_lock_to_use_a_baremetal_server}" == "yes" ]; then

setup_aux_host_ssh_access
server_poweroff

# This must run last since it releases the server lock
release_locked_host
fi
@@ -9,6 +9,27 @@ echo "************ telcov10n Fix user IDs in a container ************"

source ${SHARED_DIR}/common-telcov10n-bash-functions.sh

function extract_and_set_ocp_version {

echo "************ telcov10n Extracting OCP version from JOB_NAME ************"

echo "[INFO] JOB_NAME: ${JOB_NAME:-not set}"

OCP_VERSION=$(extract_ocp_version)

if [ -z "${OCP_VERSION}" ]; then
echo "[ERROR] Could not extract OCP version from JOB_NAME"
exit 1
fi

echo "[INFO] OCP Version: ${OCP_VERSION}"

# Store OCP version for other steps
echo -n "${OCP_VERSION}" >| ${SHARED_DIR}/ocp_version.txt

export OCP_VERSION
}
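
# NOTE: the extract_ocp_version helper is provided by
# common-telcov10n-bash-functions.sh and is not part of this change. A minimal,
# hypothetical sketch of what such a helper might look like, assuming the
# version appears in JOB_NAME as a "major.minor" token (e.g. "...-4.17-..."):
function extract_ocp_version {
  # Print the first "major.minor" token found in JOB_NAME, if any.
  echo "${JOB_NAME:-}" | grep -oE '[0-9]+\.[0-9]+' | head -n 1
}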

function define_spoke_cluster_name {

#### Spoke cluster
@@ -36,6 +57,83 @@ function set_spoke_cluster_kubeconfig {
export KUBECONFIG="${SHARED_DIR}/spoke-${secret_kubeconfig}.yaml"
}

# Track if we've already created the waiting request file (stored path for cleanup)
WAITING_FILE_PATH=""

function create_waiting_request_on_bastion {

local spoke_lock_filename="${1}"

# Only create the waiting file once per session
# Each job gets a unique file (with timestamp) that only it will delete
if [ -n "${WAITING_FILE_PATH}" ]; then
return 0
fi

echo
echo "************ telcov10n Registering wait request before lock attempt ************"
echo

local waiting_file
waiting_file=$(create_waiting_request_file "${AUX_HOST}" "${spoke_lock_filename}" "${OCP_VERSION}")

if [ -n "${waiting_file}" ]; then
echo "[INFO] Created waiting request file: ${waiting_file}"
echo " This signals that a job with OCP version ${OCP_VERSION} is waiting."
WAITING_FILE_PATH="${waiting_file}"
# Store the path in SHARED_DIR for cleanup step
echo -n "${waiting_file}" >| ${SHARED_DIR}/own_waiting_file.txt
else
echo "[WARNING] Failed to create waiting request file."
fi

echo
}
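
# NOTE: create_waiting_request_file is an assumed helper from
# common-telcov10n-bash-functions.sh and is not shown in this diff. A
# hypothetical sketch, assuming the marker file sits next to the lock file and
# encodes the OCP version plus a timestamp/PID so each job owns a unique file:
function create_waiting_request_file {
  local aux_host="${1}" lock_file="${2}" ocp_version="${3}"
  # e.g. .../spoke-baremetal-<mac>.waiting.4.17.<epoch>.<pid>
  local waiting_file="${lock_file%.lock}.waiting.${ocp_version}.$(date +%s).$$"
  timeout -s 9 2m ssh "${SSHOPTS[@]}" "root@${aux_host}" "touch ${waiting_file}" \
    && echo -n "${waiting_file}"
}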

function validate_lock_for_higher_priority {

local spoke_lock_filename="${1}"

echo
echo "************ telcov10n Validating lock acquisition for priority ************"
echo

# Check if there's a higher priority job waiting BEFORE removing our waiting file
# This way, if we need to release the lock, our waiting file stays intact
local check_result
check_result=$(check_for_higher_priority_waiter "${AUX_HOST}" "${spoke_lock_filename}" "${OCP_VERSION}")

if [[ "${check_result}" == quit:* ]]; then
local higher_version=${check_result#quit:}
echo
echo "[WARNING] Lock acquired but a higher priority job is waiting!"
echo " Current job version: ${OCP_VERSION}"
echo " Higher version waiting: ${higher_version}"
echo " Releasing lock to allow higher priority job to proceed..."
echo " (Keeping own waiting file for next attempt)"
echo
# Release the lock to let the higher priority job acquire it
# Keep our waiting file - we're still waiting!
timeout -s 9 10m ssh "${SSHOPTS[@]}" "root@${AUX_HOST}" "rm -fv ${spoke_lock_filename}"
return 1
fi

echo "[INFO] No higher priority jobs waiting. Proceeding with lock."

# NOW remove our waiting file since we're proceeding
if [ -n "${WAITING_FILE_PATH}" ]; then
echo "[INFO] Removing own waiting file: ${WAITING_FILE_PATH}"
remove_own_waiting_file "${AUX_HOST}" "${WAITING_FILE_PATH}"
WAITING_FILE_PATH=""
rm -f ${SHARED_DIR}/own_waiting_file.txt 2>/dev/null || true
fi

# Store lock filename for later use by other steps
echo -n "${spoke_lock_filename}" >| ${SHARED_DIR}/spoke_lock_filename.txt

return 0
}
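
# NOTE: check_for_higher_priority_waiter is an assumed helper (not in this
# diff). A hypothetical sketch that lists the waiting markers next to the lock
# file on the bastion and prints "quit:<version>" when any waiter advertises a
# newer OCP version than ours; the "quit:*" prefix is what the caller above
# pattern-matches. The ".waiting.*" naming follows the sketch of
# create_waiting_request_file above and is an assumption, not the real layout.
function check_for_higher_priority_waiter {
  local aux_host="${1}" lock_file="${2}" own_version="${3}"
  local highest
  highest=$(timeout -s 9 2m ssh "${SSHOPTS[@]}" "root@${aux_host}" \
      "ls ${lock_file%.lock}.waiting.* 2>/dev/null || true" \
    | awk -F'.waiting.' 'NF>1 {print $2}' | cut -d'.' -f1,2 | sort -V | tail -n 1)
  # A waiter with our own version (including our own marker) does not trigger a quit.
  if [ -n "${highest}" ] && [ "${highest}" != "${own_version}" ] && \
     [ "$(printf '%s\n%s\n' "${own_version}" "${highest}" | sort -V | tail -n 1)" == "${highest}" ]; then
    echo "quit:${highest}"
  else
    echo "ok"
  fi
}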

function select_baremetal_host_from_pool {

echo "************ telcov10n select a baremetal host from the pool ************"
@@ -60,13 +158,24 @@ function select_baremetal_host_from_pool {
local network_spoke_mac_address
network_spoke_mac_address="$(cat ${baremetal_host_path}/network_spoke_mac_address)"
local spoke_lock_filename="/var/run/lock/ztp-baremetal-pool/spoke-baremetal-${network_spoke_mac_address//:/-}.lock"

# Create waiting request file BEFORE trying to acquire lock (only once)
# This ensures our presence is visible even if we immediately get the lock
create_waiting_request_on_bastion "${spoke_lock_filename}"

try_to_lock_host "${AUX_HOST}" "${spoke_lock_filename}" "${host_lock_timestamp}" "${LOCK_TIMEOUT}"
[[ "$(check_the_host_was_locked "${AUX_HOST}" "${spoke_lock_filename}" "${host_lock_timestamp}")" == "locked" ]] &&
{
update_host_and_master_yaml_files "$(dirname ${host})" ;
echo -n "yes" >| ${SHARED_DIR}/do_you_hold_the_lock_for_the_sno_spoke_cluster_server.txt
return 0 ;
}
if [[ "$(check_the_host_was_locked "${AUX_HOST}" "${spoke_lock_filename}" "${host_lock_timestamp}")" == "locked" ]]; then
# Validate that no higher priority job is waiting
if validate_lock_for_higher_priority "${spoke_lock_filename}"; then
update_host_and_master_yaml_files "$(dirname ${host})"
echo -n "yes" >| ${SHARED_DIR}/do_you_hold_the_lock_for_the_sno_spoke_cluster_server.txt
return 0
else
# Higher priority job is waiting, lock was released
# Our waiting file is still intact (not removed until validation passes)
echo "[INFO] Will retry acquiring lock..."
fi
fi
fi
done

@@ -218,6 +327,7 @@ function hack_spoke_deployment {
function main {

setup_aux_host_ssh_access
extract_and_set_ocp_version
define_spoke_cluster_name
set_spoke_cluster_kubeconfig
hack_spoke_deployment
@@ -46,4 +46,6 @@ ref:
If cluster endpoints are reachable through a socks5 proxy
documentation: |-
This step allows to adapt the SNO Spoke cluster deployment for
the new baremetal server pool in the new lab location
the new baremetal server pool in the new lab location.
OCP version is automatically extracted from RELEASE_IMAGE_LATEST for
graceful quit priority when multiple jobs compete for the same baremetal host.
@@ -247,14 +247,21 @@ function test_kpis {

echo "************ telcov10n Run CPU Utilization Telco KPIs test ************"

# Check for graceful quit request before starting this test
check_for_quit "cpu_utils_test" "graceful"

make_up_inventory
make_up_remote_test_command
make_up_ansible_playbook
run_ansible_playbook
setup_test_result_for_component_readiness

# Mark successful completion
echo -n "completed" >| ${SHARED_DIR}/cpu_util_test_status.txt
}
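
# NOTE: check_for_quit is an assumed helper from
# common-telcov10n-bash-functions.sh and is not part of this diff. A
# hypothetical sketch, assuming a higher priority job signals its request
# through a ".quit" file placed next to the host lock (file name is an
# assumption) and that AUX_HOST/SSHOPTS/SHARED_DIR are set by the calling step:
# "graceful" mode exits 0 so later steps can still clean up and report, while
# "force" mode exits 1 because the remaining steps would be meaningless.
function check_for_quit {
  local step_name="${1}" mode="${2:-graceful}"
  local lock_file quit_request
  lock_file=$(cat "${SHARED_DIR}/spoke_lock_filename.txt" 2>/dev/null || true)
  [ -z "${lock_file}" ] && return 0
  quit_request=$(timeout -s 9 2m ssh "${SSHOPTS[@]}" "root@${AUX_HOST}" \
    "cat ${lock_file%.lock}.quit 2>/dev/null || true")
  [ -z "${quit_request}" ] && return 0
  echo "[INFO] Quit requested (${quit_request}); stopping before ${step_name} in ${mode} mode."
  [ "${mode}" == "force" ] && exit 1
  exit 0
}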

function main {
setup_ssh_and_lock_info
set_spoke_cluster_kubeconfig
test_kpis
}
@@ -53,3 +53,7 @@ ref:
[see: https://github.com/neisw/ci-test-mapping/blob/main/pkg/components/telcoperformance/component.go]
documentation: |-
This step allows to verify the SNO Spoke cluster deployed through its kubeconfig.
OCP version is loaded from SHARED_DIR/ocp_version.txt (set by deploy step).
If a graceful quit is requested by a higher version job, this test will be skipped
to release the baremetal host lock faster. The oslat test will have already completed
by this point, so PTP reporting can still collect those results.
@@ -239,14 +239,21 @@ function test_kpis {

echo "************ telcov10n Run oslat Telco KPIs test ************"

# Check for graceful quit request before starting this test
check_for_quit "oslat_test" "graceful"

make_up_inventory
make_up_remote_test_command
make_up_ansible_playbook
run_ansible_playbook
setup_test_result_for_component_readiness

# Mark successful completion
echo -n "completed" >| ${SHARED_DIR}/oslat_test_status.txt
}

function main {
setup_ssh_and_lock_info
set_spoke_cluster_kubeconfig
test_kpis
}
@@ -46,3 +46,6 @@ ref:
[see: https://github.com/neisw/ci-test-mapping/blob/main/pkg/components/telcoperformance/component.go]
documentation: |-
This step allows to verify the SNO Spoke cluster deployed through its kubeconfig.
OCP version is loaded from SHARED_DIR/ocp_version.txt (set by deploy step).
If a graceful quit is requested by a higher version job, this test will be skipped
and the job will exit gracefully to release the baremetal host lock.
@@ -160,6 +160,10 @@ function checking_installation_progress {
timeout=$(date -d "${ABORT_INSTALLATION_TIMEOUT}" +%s)
abort_installation=/tmp/abort.installation

# Counter for quit check - only check every QUIT_CHECK_INTERVAL iterations
local quit_check_counter=0
local quit_check_interval="${QUIT_CHECK_INTERVAL:-3}"

while true; do

test -f ${abort_installation} && {
@@ -203,6 +207,14 @@
echo "$ touch ${abort_installation}"
fi

# Check for quit request every N iterations (QUIT_CHECK_INTERVAL)
# Use "force" mode since if interrupted, the rest of the steps are meaningless (cluster not ready)
((quit_check_counter++))
if [ "${quit_check_counter}" -ge "${quit_check_interval}" ]; then
check_for_quit "cluster_installation_progress" "force"
quit_check_counter=0
fi

sleep ${refresh_timing:="10m"} ;
} || echo
done
@@ -238,6 +250,10 @@ function get_and_save_kubeconfig_and_creds {
}

function main {

# Setup SSH and load lock info for quit checks
setup_ssh_and_lock_info

set_hub_cluster_kubeconfig
generate_cluster_image_set
create_spoke_namespace
@@ -39,6 +39,14 @@ ref:
Set the maximum amount of time the step must wait before unconditionally forcing the workflow to continue
with the next steps, which clean up and free all resources used in the deployment of this cluster
(MIN: "REFRESH_TIME value", default: "1 hours + 45 min", MAX: "2 hours - REFRESH_TIME value").
- name: QUIT_CHECK_INTERVAL
default: "3"
documentation: |-
Number of REFRESH_TIME iterations between each quit condition check. A higher priority job (newer OCP version)
waiting for the baremetal host lock can request the current job to quit. This setting controls how often
the installation loop checks for such quit requests. Uses "force" mode (exit 1) since an interrupted
installation leaves the cluster unusable. Default is 3 (check every 3 iterations, i.e., every 9 min
with default REFRESH_TIME of 3m).
- name: BIOS_SETTINGS
default: "{}"
documentation: |-