Skip to content

Commit 2dcc96d

Browse files
Add storage provisioning diagnostics for CI failures
1 parent 4e8f662 commit 2dcc96d

File tree

3 files changed

+125
-0
lines changed

3 files changed

+125
-0
lines changed
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
#!/bin/bash
# Monitor k3d cluster resources during tests.
# Run this in the background to capture resource usage over time.
#
# Usage: monitor-resources.sh [INTERVAL] [DURATION] [OUTPUT]
#   INTERVAL  seconds to sleep between samples (default: 30)
#   DURATION  total seconds to keep sampling (default: 300)
#   OUTPUT    log file, truncated at startup (default: resource-monitor.log)
#
# Environment:
#   K3D_NODE  name of the k3d node container whose disks are inspected
#             (default: k3d-k3s-default-server-0)
#
# Every probe is best-effort: a missing tool or an unreachable cluster is
# recorded in the log and never aborts the monitor.

set -euo pipefail

INTERVAL="${1:-30}"  # Check every N seconds
DURATION="${2:-300}" # Run for N seconds (5 minutes)
OUTPUT="${3:-resource-monitor.log}"
K3D_NODE="${K3D_NODE:-k3d-k3s-default-server-0}"

# DURATION feeds shell arithmetic, where a non-numeric word would be
# dereferenced as a variable name; reject bad values with a clear message.
if [[ ! "$DURATION" =~ ^[0-9]+$ ]]; then
  printf 'error: DURATION must be a whole number of seconds, got "%s"\n' \
    "$DURATION" >&2
  exit 2
fi

#######################################
# Write one timestamped sample of Docker, node, and storage state.
# Globals:   K3D_NODE (read)
# Arguments: none
# Outputs:   the sample, including probe error messages, on stdout
# Returns:   0 always; individual probe failures are tolerated
#######################################
sample() {
  echo ""
  echo "=== $(date '+%Y-%m-%d %H:%M:%S') ==="

  # Docker disk usage
  echo "--- Docker Disk Usage ---"
  docker system df 2>&1 || true

  # k3d node disk usage. The group-level 2>&1 sends docker's errors to the
  # log unfiltered instead of through grep.
  echo "--- k3d Node Disk Usage ---"
  {
    docker exec "$K3D_NODE" df -h \
      | grep -E '(Filesystem|/$|/var/lib|overlay)'
  } 2>&1 || true

  # PVC counts per phase. kubectl's stderr is kept so that a failed query
  # is visible in the log rather than producing an empty section.
  echo "--- PVC Status ---"
  {
    kubectl get pvc --all-namespaces -o json \
      | jq -r '.items | group_by(.status.phase) | map({phase: .[0].status.phase, count: length}) | .[]'
  } 2>&1 || true

  # Local path provisioner status
  echo "--- Provisioner Pods ---"
  kubectl get pods -n kube-system -l app=local-path-provisioner -o wide 2>&1 || true
}

{
  echo "Starting resource monitoring (interval=${INTERVAL}s, duration=${DURATION}s)"
  echo "================================================================"
} > "$OUTPUT"

# 10# forces base ten so a value such as "0600" is not parsed as octal.
END_TIME=$(( $(date +%s) + 10#$DURATION ))

while (( $(date +%s) < END_TIME )); do
  sample >> "$OUTPUT"
  sleep "$INTERVAL"
done

echo "Resource monitoring complete" >> "$OUTPUT"

.github/workflows/test.yaml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,12 +135,48 @@ jobs:
135135
--env 'PGO_FEATURE_GATES=TablespaceVolumes=true,OpenTelemetryLogs=true,OpenTelemetryMetrics=true' \
136136
--name 'postgres-operator' localhost/postgres-operator
137137
# Snapshot Docker, k3d node, and storage provisioner state before the tests.
# The default `run` shell stops at the first failing command, so each
# diagnostic ends with `|| true`: a missing container or pod must neither fail
# the job nor leave a log group unclosed.
- name: Check initial environment health
  run: |
    echo "::group::Docker system info"
    docker system df || true
    docker info | grep -E 'Storage Driver|Data Space' || true
    echo "::endgroup::"
    echo "::group::k3d node disk space"
    docker exec k3d-k3s-default-server-0 df -h || true
    echo "::endgroup::"
    echo "::group::Storage provisioner status"
    kubectl get storageclass -o wide || true
    kubectl get pods -n kube-system -l app=local-path-provisioner -o wide || true
    echo "::endgroup::"
151+
# Launch the resource monitor in the background (arguments: 30, 600, and the
# log file name) and record its PID in monitor.pid for a later step.
- name: Start resource monitoring
  run: |
    nohup .github/actions/k3d/monitor-resources.sh 30 600 chainsaw-resource-monitor.log &
    monitor_pid=$!
    echo "$monitor_pid" > monitor.pid
156+
# Run the Chainsaw suite. On failure, print the operator container logs in a
# collapsible group, then exit with the suite's original status.
- run: |
    failed=0
    make check-chainsaw || failed=$?
    if [ "$failed" -eq 0 ]; then
      exit 0
    fi
    echo '::group::PGO logs'
    docker logs 'postgres-operator'
    echo '::endgroup::'
    exit "$failed"
143162
# Runs even when earlier steps fail: signal the background monitor, if its
# PID file exists, and remove the PID file. A monitor that has already exited
# is not an error.
- name: Stop resource monitoring
  if: always()
  run: |
    [ -f monitor.pid ] || exit 0
    kill "$(cat monitor.pid)" 2>/dev/null || true
    rm monitor.pid
170+
# Runs even when earlier steps fail: publish the monitor log as an artifact
# named after the matrix Kubernetes version. A missing log file is ignored.
- name: Upload resource monitoring logs
  if: always()
  uses: actions/upload-artifact@v4
  with:
    path: chainsaw-resource-monitor.log
    name: "chainsaw-resource-monitor-k8s=${{ matrix.kubernetes }}"
    if-no-files-found: ignore
    retention-days: 7
179+
144180
- run: make generate-kuttl
145181
env:
146182
KUTTL_PG_UPGRADE_FROM_VERSION: '16'

testing/chainsaw/e2e/pgbackrest-restore/templates/clone-cluster.yaml

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,19 @@ spec:
3434
volume:
3535
volumeClaimSpec: ($volume)
3636

# Fixed pause before the PVC status check that follows.
- description: Wait for PVCs to be created
  sleep: { duration: 5s }
40+
# Print the cluster's PVCs and recent PVC events for early diagnosis.
# Script content is handed to the shell as-is, so the `$name` binding is
# passed in through `env`; an inline `($name)` would reach the shell
# unsubstituted, and unquoted parentheses are a shell syntax error.
# NAMESPACE is the test namespace exported by Chainsaw.
- description: Check PVC provisioning status early
  script:
    env:
      - name: NAME
        value: ($name)
    content: |
      echo "=== PVCs for ${NAME} ==="
      kubectl get pvc --namespace "${NAMESPACE}" \
        --selector "postgres-operator.crunchydata.com/cluster=${NAME}" \
        -o wide || true
      echo ""
      echo "=== All PVC Events ==="
      kubectl get events --all-namespaces \
        --field-selector involvedObject.kind=PersistentVolumeClaim \
        -o wide | tail -20 || true
49+
3750
-
3851
description: >
3952
Wait for the cluster to come online
@@ -89,3 +102,36 @@ spec:
89102
apiVersion: v1
90103
kind: PersistentVolumeClaim
91104
selector: (join('', ['postgres-operator.crunchydata.com/cluster=', $name]))
105+
# List events whose involved object is a PersistentVolumeClaim, in every
# namespace.
- description: Get all PVC events to diagnose provisioning failures
  script:
    content: >-
      kubectl get events --all-namespaces
      --field-selector involvedObject.kind=PersistentVolumeClaim
      -o wide
109+
# Show the storage classes, then the local-path-provisioner pods in
# kube-system and the last 100 lines of their logs.
- description: Check storage provisioner status
  script:
    content: |
      echo "=== Storage Classes ==="
      kubectl get storageclass --output wide
      echo ""
      echo "=== Local Path Provisioner Pods ==="
      kubectl get pods --namespace kube-system --selector app=local-path-provisioner --output wide
      echo ""
      echo "=== Local Path Provisioner Logs ==="
      kubectl logs --namespace kube-system --selector app=local-path-provisioner --tail=100 --prefix
121+
# Describe every node, then report disk usage from inside the
# k3d-k3s-default-server-0 container via docker.
- description: Check node resources and disk space
  script:
    content: |
      echo "=== Node Resources ==="
      kubectl describe nodes

      echo ""
      echo "=== Node Disk Usage (via docker) ==="
      docker exec k3d-k3s-default-server-0 df -h
130+
# List every PersistentVolume with wide output.
- description: Check all PVs and their status
  script:
    content: kubectl get persistentvolumes --output wide
134+
# List PersistentVolumeClaims in every namespace with wide output.
- description: Check for stuck PVCs across all namespaces
  script:
    content: kubectl get persistentvolumeclaims --all-namespaces --output wide

0 commit comments

Comments
 (0)