From 89a69d6d3ba291992bc10b06d1aaa358327d672f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Milo=C5=A1=20Prchl=C3=ADk?= Date: Fri, 6 Oct 2023 09:32:39 +0200 Subject: [PATCH] Watchdog check --- tests/test/check/data/main.fmf | 14 ++ tests/test/check/main.fmf | 12 ++ tests/test/check/test-watchdog.sh | 43 +++++ tmt/checks/watchdog.py | 268 ++++++++++++++++++++++++++++++ tmt/log.py | 2 +- 5 files changed, 338 insertions(+), 1 deletion(-) create mode 100755 tests/test/check/test-watchdog.sh create mode 100644 tmt/checks/watchdog.py diff --git a/tests/test/check/data/main.fmf b/tests/test/check/data/main.fmf index 43f73da68f..4d064f8818 100644 --- a/tests/test/check/data/main.fmf +++ b/tests/test/check/data/main.fmf @@ -16,3 +16,17 @@ sudo bash -c "passwd --help &> /root/passwd.log; \ ls -alZ /root/passwd.log; \ rm -f /root/passwd.log" || /bin/true + +/watchdog/ping: + test: | + set -x + echo "hello!" + sleep 20 + echo "bye!" + + check: + - name: watchdog + interval: 5 + + ping: true + ping-packets: 1 diff --git a/tests/test/check/main.fmf b/tests/test/check/main.fmf index 446b435fb1..1a17ec6d7b 100644 --- a/tests/test/check/main.fmf +++ b/tests/test/check/main.fmf @@ -24,3 +24,15 @@ tier: 2 environment: PROVISION_METHODS: local virtual tag+: [additional_coverage] + +/watchdog: + test: ./test-watchdog.sh + + environment: + PROVISION_METHODS: local + + adjust: + - when: how == full + environment: + PROVISION_METHODS: local virtual + tag+: [additional_coverage] diff --git a/tests/test/check/test-watchdog.sh b/tests/test/check/test-watchdog.sh new file mode 100755 index 0000000000..0bdbc4c1ee --- /dev/null +++ b/tests/test/check/test-watchdog.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +. /usr/share/beakerlib/beakerlib.sh || exit 1 + +function assert_check_result () { + rlAssertEquals "$1" "avc:$2" "$(yq -r ".[] | .check | .[] | select(.event == \"$3\") | \"\\(.name):\\(.result)\"" $results)" +} + + +rlJournalStart + rlPhaseStartSetup + rlRun "run=\$(mktemp -d)" 0 "Create run directory" + + rlRun "results=$run/plan/execute/results.yaml" + + rlRun "pushd data" + rlRun "set -o pipefail" + rlPhaseEnd + + for method in ${PROVISION_METHODS:-local}; do + rlPhaseStartTest "Test guest watchdog ping with $method provisioning" +# rlRun "avc_log=$run/plan/execute/data/guest/default-0/avc/harmless-1/tmt-avc-after-test.txt" +# + rlRun "tmt run --id $run --scratch -a -vv provision -h $method test -n /watchdog" + rlRun "tmt run --id $run --scratch -a -vv provision -h $method provision --insert -h $method test -n /watchdog" +# +# rlRun "cat $results" +# rlRun "cat $avc_log" +# +# rlAssertExists "$avc_log" +# +# assert_check_result "avc as an after-test should pass" "pass" "after-test" +# +# rlAssertGrep "" "$avc_log" + rlPhaseEnd + done + + rlPhaseStartCleanup + rlRun "popd" + + # rlRun "rm -rf $run" + rlPhaseEnd +rlJournalEnd diff --git a/tmt/checks/watchdog.py b/tmt/checks/watchdog.py new file mode 100644 index 0000000000..2e095043fc --- /dev/null +++ b/tmt/checks/watchdog.py @@ -0,0 +1,268 @@ +import dataclasses +import datetime +import re +import threading +import time +from typing import List, Optional, Iterable, Iterator, Dict + +import tmt.log +import tmt.steps.execute +import tmt.steps.provision +import tmt.utils +from tmt.checks import Check, CheckPlugin, provides_check +from tmt.result import CheckResult, ResultOutcome +from tmt.utils import Path, field, render_run_exception_streams + +PING_OUTPUT_PATTERN = re.compile( + r'(?m)(?P\d+) packets transmitted, (?P\d+) received') + + +REPORT_FILENAME = 'tmt-watchdog.txt' + + +def render_report_path( + plugin: tmt.steps.execute.ExecutePlugin, + guest: tmt.steps.provision.Guest, + test: 'tmt.base.Test') -> Path: + """ Render path to a watchdog report file from necessary components """ + + return plugin.data_path( + test, + guest, + filename=REPORT_FILENAME, + create=True, + full=True) + + +def report_progress( + log: Path, + report: Iterable[str], + command_output: Optional[str] = None) -> None: + """ + Add new report into a report file. + + :param log: path to the report file. + :param report: iterable of report lines to add. Each line is emitted on its + own line in the file. + :param command_output: if set, the string is added to the report file once + ``report`` lines are written into it. + """ + + timestamp = tmt.steps.execute.ExecutePlugin.format_timestamp( + datetime.datetime.now(datetime.timezone.utc)) + + with open(log, mode='a') as f: + f.write(f'# Reported at {timestamp}\n') + + for line in report: + f.write(line) + f.write('\n') + + if command_output: + f.write('\n') + f.write(command_output) + + f.write('\n') + + +@dataclasses.dataclass +class GuestContext: + """ Per-guest watchdog context """ + + #: Current number of failed watchdog checks. + ping_failures: int = 0 + + #: If set, contains a daemonized thread running the watchdog checks. + thread: Optional[threading.Thread] = None + + #: As long as this field is set to ``True``, the watchdog will run its + #: internal loop and run relevant checks. It is unset when terminating + #: the watchdog check to notify the thread it's time to quit. + keep_running: bool = True + + +@dataclasses.dataclass +class WatchdogCheck(Check): + interval: int = field(default=60) + + ping: bool = field(default=False) + ping_packets: int = field(default=1) + ping_threshold: int = field(default=10) + + #: Per-guest context entries. One for each guest name. + guest_context: Dict[str, GuestContext] = field( + default_factory=dict, + internal=True, + # It is not necessary to preserve guest context. + serialize=lambda context: {}) + + def do_ping( + self, + plugin: tmt.steps.execute.ExecutePlugin, + guest: tmt.steps.provision.Guest, + test: 'tmt.base.Test', + logger: tmt.log.Logger) -> None: + """ Perform a ping check """ + + logger.verbose('pinging', level=2) + + log = render_report_path(plugin, guest, test) + + guest_context = self.guest_context[guest.name] + + def _fail_parse_error(ping_output: str) -> None: + """ Handle unparseable ``ping`` output """ + + logger.fail('failed to parse ping output') + + guest_context.ping_failures += 1 + + report_progress( + log, + [ + '# failed to parse ping output', + f'# failed {guest_context.ping_failures} of {self.ping_threshold} allowed', + ], + command_output=ping_output + ) + + def _fail_lost_packets(ping_output: str, transmitted: int, received: int) -> None: + """ Handle missing response packets """ + + logger.fail(f'not all packets returned: {transmitted=} {received=}') + + guest_context.ping_failures += 1 + + report_progress( + log, + [ + '# not all packets returned', + f'# failed {guest_context.ping_failures} of {self.ping_threshold} allowed', + ], + command_output=ping_output + ) + + def _success(ping_output: str) -> None: + """ Handle successfull response """ + + logger.verbose('successful response') + + report = [ + '# successfull response' + ] + + if guest_context.ping_failures != 0: + report.append(f'# replenished failure budget back to {self.ping_threshold}') + + guest_context.ping_failures = 0 + + report_progress( + log, + report, + command_output=ping_output + ) + + def _handle_output(ping_output: str) -> None: + """ Process ``ping`` output and decide on its outcome """ + + match = PING_OUTPUT_PATTERN.search(ping_output) + + if match is None: + _fail_parse_error(ping_output) + + else: + groups = match.groupdict() + + transmitted = int(groups['transmitted']) + received = int(groups['received']) + + if transmitted != received: + _fail_lost_packets(ping_output, transmitted, received) + + else: + _success(ping_output) + + if guest_context.ping_failures >= self.ping_threshold: + logger.fail(f'exhausted {self.ping_threshold} ping attempts') + + try: + assert guest.guest is not None # narrow type + + output = tmt.utils.Command('ping', '-c', str(self.ping_packets), guest.guest) \ + .run(cwd=Path.cwd(), logger=logger) + + _handle_output(output.stdout or '') + + except tmt.utils.RunError as exc: + if exc.returncode == 1: + _handle_output(exc.stdout or '') + + else: + _handle_output('\n'.join(render_run_exception_streams(exc.stdout, exc.stderr))) + + +@provides_check('watchdog') +class Watchdog(CheckPlugin): + _check_class = WatchdogCheck + + @classmethod + def before_test( + cls, + *, + check: WatchdogCheck, + plugin: tmt.steps.execute.ExecutePlugin, + guest: tmt.steps.provision.Guest, + test: 'tmt.base.Test', + environment: Optional[tmt.utils.EnvironmentType] = None, + logger: tmt.log.Logger) -> List[CheckResult]: + + # Setup a logger + watchdog_logger = logger.clone() + watchdog_logger.labels.append('watchdog') + + # Create a guest context for the guest we've been given + guest_context = check.guest_context[guest.name] = GuestContext() + + def watchdog() -> None: + """ Watchdog thread code """ + + while guest_context.keep_running: + if check.ping: + check.do_ping(plugin, guest, test, watchdog_logger) + + time.sleep(check.interval) + + guest_context.thread = threading.Thread( + target=watchdog, + name=f'watchdog-{guest.name}', + daemon=True) + + guest_context.thread.start() + + return [] + + @classmethod + def after_test( + cls, + *, + check: WatchdogCheck, + plugin: tmt.steps.execute.ExecutePlugin, + guest: tmt.steps.provision.Guest, + test: 'tmt.base.Test', + environment: Optional[tmt.utils.EnvironmentType] = None, + logger: tmt.log.Logger) -> List[CheckResult]: + + guest_context = check.guest_context.pop(guest.name) + + if guest_context.thread: + guest_context.keep_running = False + guest_context.thread.join() + + guest_context.thread = None + + return [ + CheckResult( + name='watchdog', + result=ResultOutcome.PASS, + log=[render_report_path(plugin, guest, test)] + )] diff --git a/tmt/log.py b/tmt/log.py index b969cbc5b3..74a17f8933 100644 --- a/tmt/log.py +++ b/tmt/log.py @@ -523,7 +523,7 @@ def clone(self) -> 'Logger': return Logger( self._logger, base_shift=self._base_shift, - labels=self.labels, + labels=self.labels[:], labels_padding=self.labels_padding, verbosity_level=self.verbosity_level, debug_level=self.debug_level,