Skip to content

Commit

Permalink
Implement the new test check watchdog
Browse files Browse the repository at this point in the history
  • Loading branch information
happz authored and psss committed Mar 8, 2024
1 parent 4b3da6d commit d7a650f
Show file tree
Hide file tree
Showing 15 changed files with 823 additions and 92 deletions.
6 changes: 6 additions & 0 deletions docs/releases.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ to TPM 2.0 for now, the future release of `testcloud`__, the
library behind the ``virtual`` plugin, will extend the support to more
versions.

A new :ref:`watchdog test check<plugins/test-checks/watchdog>` has been
added. It monitors a guest running the test with either ping or SSH
connections, and may force reboot of the guest when it becomes
unresponsive. This is the first step towards helping tests handle kernel
panics and similar situations.

__ https://pagure.io/testcloud/


Expand Down
6 changes: 3 additions & 3 deletions tests/execute/reboot/basic.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ rlJournalStart
for interactive in "" "--interactive"; do
rlPhaseStartTest "Simple reboot test (interactivity: $interactive)"
rlRun -s "tmt run --scratch -i $run -dddvvva execute -h tmt $interactive"
rlAssertGrep "Reboot during test '/test' with reboot count 1" $rlRun_LOG
rlAssertGrep "Soft reboot during test '/test' with reboot count 1" $rlRun_LOG
rlAssertGrep "After first reboot" $rlRun_LOG
rlAssertGrep "Reboot during test '/test' with reboot count 2" $rlRun_LOG
rlAssertGrep "Soft reboot during test '/test' with reboot count 2" $rlRun_LOG
rlAssertGrep "After second reboot" $rlRun_LOG
rlAssertGrep "Reboot during test '/test' with reboot count 3" $rlRun_LOG
rlAssertGrep "Soft reboot during test '/test' with reboot count 3" $rlRun_LOG
rlAssertGrep "After third reboot" $rlRun_LOG
rlRun "rm $rlRun_LOG"

Expand Down
6 changes: 3 additions & 3 deletions tests/execute/reboot/reuse.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ rlJournalStart
provision="provision -h connect -g $guest -P $port -u $user -k $key"
for _ in $(seq 0 1); do
rlRun -s "tmt run --scratch -ai $run -dddvvv $provision"
rlAssertGrep "Reboot during test '/test' with reboot count 1" $rlRun_LOG
rlAssertGrep "Soft reboot during test '/test' with reboot count 1" $rlRun_LOG
rlAssertGrep "After first reboot" $rlRun_LOG
rlAssertGrep "Reboot during test '/test' with reboot count 2" $rlRun_LOG
rlAssertGrep "Soft reboot during test '/test' with reboot count 2" $rlRun_LOG
rlAssertGrep "After second reboot" $rlRun_LOG
rlAssertGrep "Reboot during test '/test' with reboot count 3" $rlRun_LOG
rlAssertGrep "Soft reboot during test '/test' with reboot count 3" $rlRun_LOG
rlAssertGrep "After third reboot" $rlRun_LOG
rlRun "rm $rlRun_LOG"

Expand Down
43 changes: 43 additions & 0 deletions tests/test/check/data/main.fmf
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,46 @@
sudo bash -c "passwd --help &> /root/passwd.log; \
ls -alZ /root/passwd.log; \
rm -f /root/passwd.log" || /bin/true

/watchdog/ping:
require:
- /usr/bin/uptime

test: |
set -x

export
uptime

if [ "$TMT_REBOOT_COUNT" = "1" ]; then exit 0; fi

# Collect a couple of successful responses
sysctl net.ipv4.icmp_echo_ignore_all
sysctl net.ipv6.icmp.echo_ignore_all

echo "test starts, will sleep for a while"
sleep 120
uptime

# Trigger kernel panic. Nothing should run after this line,
# but to be sure, sleep more.
echo c > /proc/sysrq-trigger

# Now wait to be noticed by the watchdog
sleep 300

duration: 30m

check:
- how: watchdog

interval: 5
reboot: true

# The only viable way to test this with easy-to-setup guests is `virtual` + SSH.
# Cannot use ping, we'd be pinging our own localhost, and Beaker requires nontrivial
# setup. The watchdog should detect this & disable the ping probe.
ping: true

ssh-ping: true
ssh-ping-threshold: 3
9 changes: 9 additions & 0 deletions tests/test/check/main.fmf
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,14 @@ tier: 2
/avc:
test: ./test-avc.sh
tag+:
- provision-only
- provision-local
- provision-virtual

/watchdog:
test: ./test-watchdog.sh
duration: 15m
tag+:
- provision-only
- provision-container
- provision-virtual
79 changes: 79 additions & 0 deletions tests/test/check/test-watchdog.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/bin/bash

. /usr/share/beakerlib/beakerlib.sh || exit 1

#######################################
# Assert the outcome recorded for the watchdog check at a given event.
#
# Queries the results file with yq for every check entry whose `event`
# matches, renders it as "<name>:<result>" and compares that against
# "watchdog:<expected result>" via rlAssertEquals.
#
# Globals:   results (read) - path to the results.yaml file to inspect
# Arguments: $1 - assertion message passed to rlAssertEquals
#            $2 - expected check result, e.g. "pass"
#            $3 - check event to select, e.g. "after-test"
# Returns:   exit status of rlAssertEquals
#######################################
function assert_check_result () {
    local message="$1"
    local expected="$2"
    local event="$3"
    local actual

    # The results path is quoted so a path containing whitespace is handed
    # to yq as a single argument instead of being word-split.
    actual="$(yq -r ".[] | .check | .[] | select(.event == \"$event\") | \"\\(.name):\\(.result)\"" "$results")"

    rlAssertEquals "$message" "watchdog:$expected" "$actual"
}


rlJournalStart
    rlPhaseStartSetup
        # Fall back to the virtual provisioner when the caller did not pick one.
        rlRun "PROVISION_HOW=${PROVISION_HOW:-virtual}"

        # The run directory is created under /var/tmp rather than /tmp: it has
        # to survive the guest reboot, and under /tmp it would be removed.
        rlRun 'run=$(mktemp -d -p /var/tmp)' 0 "Create run directory"

        rlRun "results=$run/plan/execute/results.yaml"

        rlRun "pushd data"
        rlRun "set -o pipefail"
    rlPhaseEnd

    rlPhaseStartTest "Test guest watchdog ping with $PROVISION_HOW provisioning"
        # Paths of the artifacts inspected below.
        rlRun "test_dir=$run/plan/execute/data/guest/default-0/watchdog/ping-1"
        rlRun "log=$run/log.txt"
        rlRun "test_log=$test_dir/output.txt"
        rlRun "watchdog_log=$test_dir/tmt-watchdog.txt"

        # Command prefix shared by all supported provision methods.
        tmt_run="tmt -c provision_method=$PROVISION_HOW run --id $run --scratch -a -vv provision -h $PROVISION_HOW"

        case "$PROVISION_HOW" in
            container)
                # The container run is expected to finish with exit code 1.
                rlRun "$tmt_run test -n /watchdog" 1
                ;;
            virtual)
                rlRun "$tmt_run --connection system test -n /watchdog"
                ;;
            *)
                rlDie "Provision method $PROVISION_HOW is not supported by the test."
                ;;
        esac

        rlRun "cat $results"
        rlRun "cat $test_log"

        case "$PROVISION_HOW" in
            container)
                # Both probes must be reported as unsupported and disabled.
                rlRun "grep -E '\\[watchdog\\][[:space:]]+warn: Ping against this guest is not supported, disabling.' $log"
                rlRun "grep -E '\\[watchdog\\][[:space:]]+warn: SSH ping against this guest is not supported, disabling.' $log"

                assert_check_result "watchdog as an after-test should pass" "pass" "after-test"

                # No reboot happens: only the initial invocation is logged and
                # the write to the sysrq trigger is rejected.
                rlAssertGrep 'TMT_REBOOT_COUNT="0"' $test_log
                rlAssertNotGrep 'TMT_REBOOT_COUNT="1"' $test_log
                rlAssertGrep "/proc/sysrq-trigger: Read-only file system" $test_log
                ;;
            virtual)
                rlRun "cat $watchdog_log"

                # Only the ping probe is expected to be disabled here.
                rlRun "grep -E '\\[watchdog\\][[:space:]]+warn: Ping against this guest is not supported, disabling.' $log"

                assert_check_result "watchdog as an after-test should pass" "pass" "after-test"

                # The test runs twice: before and after the forced reboot.
                rlAssertGrep 'TMT_REBOOT_COUNT="0"' $test_log
                rlAssertGrep 'TMT_REBOOT_COUNT="1"' $test_log
                rlAssertGrep "++ exit 0" $test_log

                # The SSH probe has to fail three times in the watchdog log ...
                rlAssertGrep "# ssh-ping reported" $watchdog_log
                rlAssertGrep "# failed 1 of 3 allowed" $watchdog_log
                rlAssertGrep "# failed 2 of 3 allowed" $watchdog_log
                rlAssertGrep "# failed 3 of 3 allowed" $watchdog_log

                # ... after which the run log must record the hard reboot.
                rlRun "grep -E '\\[watchdog\\][[:space:]]+fail: exhausted 3 SSH ping attempts' $log"
                rlAssertGrep "Hard reboot during test '/watchdog/ping' with reboot count 1." $log
                ;;
        esac
    rlPhaseEnd

    rlPhaseStartCleanup
        rlRun "popd"

        rlRun "rm -rf $run"
    rlPhaseEnd
rlJournalEnd
3 changes: 3 additions & 0 deletions tmt/checks/avc.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,9 @@ def after_test(
name='avc',
result=ResultOutcome.SKIP)]

if invocation.hard_reboot_requested:
return [CheckResult(name='dmesg', result=ResultOutcome.SKIP)]

assert invocation.phase.step.workdir is not None # narrow type

outcome, path = create_final_report(invocation, logger)
Expand Down
3 changes: 3 additions & 0 deletions tmt/checks/dmesg.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,9 @@ def after_test(
if not invocation.guest.facts.has_capability(GuestCapability.SYSLOG_ACTION_READ_ALL):
return [CheckResult(name='dmesg', result=ResultOutcome.SKIP)]

if invocation.hard_reboot_requested:
return [CheckResult(name='dmesg', result=ResultOutcome.SKIP)]

outcome, path = cls._save_dmesg(invocation, CheckEvent.AFTER_TEST, logger)

return [CheckResult(name='dmesg', result=outcome, log=[path])]
Loading

0 comments on commit d7a650f

Please sign in to comment.