Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

autorestart/test_container_autorestart.py failure #6825

Open
andywongarista opened this issue Nov 14, 2022 · 3 comments
Open

autorestart/test_container_autorestart.py failure #6825

andywongarista opened this issue Nov 14, 2022 · 3 comments

Comments

@andywongarista
Copy link
Contributor

Description

Failure seen in autorestart/test_container_autorestart.py

Steps to reproduce the issue:

  1. Run autorestart/test_container_autorestart.py

Describe the results you received:

    def test_containers_autorestart(duthosts, enum_rand_one_per_hwsku_hostname, enum_rand_one_asic_index,
                                    enum_dut_feature, tbinfo):
        """
        @summary: Test the auto-restart feature of each container against two scenarios: killing
                  a non-critical process to verify the container is still running; killing each
                  critical process to verify the container will be stopped and restarted
        """
        duthost = duthosts[enum_rand_one_per_hwsku_hostname]
        asic = duthost.asic_instance(enum_rand_one_asic_index)
        service_name = asic.get_service_name(enum_dut_feature)
        container_name = asic.get_docker_name(enum_dut_feature)
>       run_test_on_single_container(duthost, container_name, service_name, tbinfo)

asic       = <SonicAsic 0>
container_name = 'swss'
duthost    = <MultiAsicSonicHost pkz405>
duthosts   = [<MultiAsicSonicHost pkz405>]
enum_dut_feature = 'swss'
enum_rand_one_asic_index = None
enum_rand_one_per_hwsku_hostname = 'pkz405'
service_name = 'swss'
tbinfo     = {'comment': 'Tests Arista Arista-720DT-48S', 'conf-name': 'ardut', 'duts': ['pkz405'], 'duts_map': {'pkz405': 0}, ...}

autorestart/test_container_autorestart.py:494:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
autorestart/test_container_autorestart.py:393: in run_test_on_single_container
    feature_autorestart_states = duthost.get_container_autorestart_states()
common/devices/sonic.py:1307: in get_container_autorestart_states
    show_cmd_output = self.shell("show feature autorestart")

...

        if (res.is_failed or 'exception' in res) and not module_ignore_errors:
>           raise RunAnsibleModuleFail("run module {} failed".format(self.module_name), res)
E           RunAnsibleModuleFail: run module shell failed, Ansible Results =>
E           {"changed": true, "cmd": "show feature autorestart", "delta": "0:00:00.578098", "end": "2022-11-14 20:22:45.394982", "failed": true, "msg": "non-zero return code", "rc":
1, "start": "2022-11-14 20:22:44.816884", "stderr": "Traceback (most recent call last):\n  File \"/usr/local/bin/show\", line 8, in <module>\n    sys.exit(cli())\n  File \"/usr/local
/lib/python3.9/dist-packages/click/core.py\", line 764, in __call__\n    return self.main(*args, **kwargs)\n  File \"/usr/local/lib/python3.9/dist-packages/click/core.py\", line 717,
 in main\n    rv = self.invoke(ctx)\n  File \"/usr/local/lib/python3.9/dist-packages/click/core.py\", line 1137, in invoke\n    return _process_result(sub_ctx.command.invoke(sub_ctx)
)\n  File \"/usr/local/lib/python3.9/dist-packages/click/core.py\", line 1137, in invoke\n    return _process_result(sub_ctx.command.invoke(sub_ctx))\n  File \"/usr/local/lib/python3
.9/dist-packages/click/core.py\", line 956, in invoke\n    return ctx.invoke(self.callback, **ctx.params)\n  File \"/usr/local/lib/python3.9/dist-packages/click/core.py\", line 555,
in invoke\n    return callback(*args, **kwargs)\n  File \"/usr/local/lib/python3.9/dist-packages/click/decorators.py\", line 64, in new_func\n    return ctx.invoke(f, obj, *args, **k
wargs)\n  File \"/usr/local/lib/python3.9/dist-packages/click/core.py\", line 555, in invoke\n    return callback(*args, **kwargs)\n  File \"/usr/local/lib/python3.9/dist-packages/sh
ow/feature.py\", line 165, in feature_autorestart\n    body.append([name, feature_table[name]['auto_restart']])\nKeyError: 'auto_restart'", "stderr_lines": ["Traceback (most recent c
all last):", "  File \"/usr/local/bin/show\", line 8, in <module>", "    sys.exit(cli())", "  File \"/usr/local/lib/python3.9/dist-packages/click/core.py\", line 764, in __call__", "
    return self.main(*args, **kwargs)", "  File \"/usr/local/lib/python3.9/dist-packages/click/core.py\", line 717, in main", "    rv = self.invoke(ctx)", "  File \"/usr/local/lib/py
thon3.9/dist-packages/click/core.py\", line 1137, in invoke", "    return _process_result(sub_ctx.command.invoke(sub_ctx))", "  File \"/usr/local/lib/python3.9/dist-packages/click/co
re.py\", line 1137, in invoke", "    return _process_result(sub_ctx.command.invoke(sub_ctx))", "  File \"/usr/local/lib/python3.9/dist-packages/click/core.py\", line 956, in invoke",
 "    return ctx.invoke(self.callback, **ctx.params)", "  File \"/usr/local/lib/python3.9/dist-packages/click/core.py\", line 555, in invoke", "    return callback(*args, **kwargs)",
 "  File \"/usr/local/lib/python3.9/dist-packages/click/decorators.py\", line 64, in new_func", "    return ctx.invoke(f, obj, *args, **kwargs)", "  File \"/usr/local/lib/python3.9/d
ist-packages/click/core.py\", line 555, in invoke", "    return callback(*args, **kwargs)", "  File \"/usr/local/lib/python3.9/dist-packages/show/feature.py\", line 165, in feature_a
utorestart", "    body.append([name, feature_table[name]['auto_restart']])", "KeyError: 'auto_restart'"], "stdout": "", "stdout_lines": []}

complex_args = {}
filename   = '/data/tests/common/devices/sonic.py'
function_name = 'get_container_autorestart_states'
index      = 0
line_number = 1307
lines      = ['        show_cmd_output = self.shell("show feature autorestart")\n']
module_args = ('show feature autorestart',)
module_async = False
module_ignore_errors = False
previous_frame = <frame object at 0x7f72c4e50bf0>
res        = {'stderr_lines': [u'Traceback (most recent call last):', u'  File "/usr/local/...: [], u'start': u'2022-11-14 20:22:44.816884', u'msg': u'non-zero return code'}
self       = <SonicHost pkz405>
verbose    = True

common/devices/base.py:89: RunAnsibleModuleFail

============================================================================== short test summary info ===============================================================================
FAILED autorestart/test_container_autorestart.py::test_containers_autorestart[pkz405-None-lldp] - RunAnsibleModuleFail: run module shell failed, Ansible Results =>
FAILED autorestart/test_container_autorestart.py::test_containers_autorestart[pkz405-None-pmon] - RunAnsibleModuleFail: run module shell failed, Ansible Results =>
FAILED autorestart/test_container_autorestart.py::test_containers_autorestart[pkz405-None-sflow] - RunAnsibleModuleFail: run module shell failed, Ansible Results =>
FAILED autorestart/test_container_autorestart.py::test_containers_autorestart[pkz405-None-snmp] - RunAnsibleModuleFail: run module shell failed, Ansible Results =>
FAILED autorestart/test_container_autorestart.py::test_containers_autorestart[pkz405-None-macsec] - RunAnsibleModuleFail: run module shell failed, Ansible Results =>
FAILED autorestart/test_container_autorestart.py::test_containers_autorestart[pkz405-None-telemetry] - RunAnsibleModuleFail: run module shell failed, Ansible Results =>
FAILED autorestart/test_container_autorestart.py::test_containers_autorestart[pkz405-None-mux] - RunAnsibleModuleFail: run module shell failed, Ansible Results =>
FAILED autorestart/test_container_autorestart.py::test_containers_autorestart[pkz405-None-bgp] - RunAnsibleModuleFail: run module shell failed, Ansible Results =>
FAILED autorestart/test_container_autorestart.py::test_containers_autorestart[pkz405-None-radv] - RunAnsibleModuleFail: run module shell failed, Ansible Results =>
FAILED autorestart/test_container_autorestart.py::test_containers_autorestart[pkz405-None-nat] - RunAnsibleModuleFail: run module shell failed, Ansible Results =>
FAILED autorestart/test_container_autorestart.py::test_containers_autorestart[pkz405-None-teamd] - RunAnsibleModuleFail: run module shell failed, Ansible Results =>
FAILED autorestart/test_container_autorestart.py::test_containers_autorestart[pkz405-None-dhcp_relay] - RunAnsibleModuleFail: run module shell failed, Ansible Results =>
FAILED autorestart/test_container_autorestart.py::test_containers_autorestart[pkz405-None-swss] - RunAnsibleModuleFail: run module shell failed, Ansible Results =>
FAILED autorestart/test_container_autorestart.py::test_containers_autorestart[pkz405-None-syncd] - RunAnsibleModuleFail: run module shell failed, Ansible Results =>
============================================================================ 14 failed in 589.03 seconds =============================================================================

Describe the results you expected:

  1. autorestart/test_container_autorestart.py passed

Additional information you deem important:

**Output of `show version`:**

```
(paste your output here)
```

**Attach debug file `sudo generate_dump`:**

```
(paste your output here)
```
@azure-pipelines-wrapper
Copy link

Thanks for opening this issue!

@kartik-arista
Copy link
Contributor

Manually checking on a device:

show feature autorestart
Traceback (most recent call last):
  File "/usr/local/bin/show", line 8, in <module>
    sys.exit(cli())
  File "/usr/local/lib/python3.9/dist-packages/click/core.py", line 764, in __call__
    return self.main(*args, **kwargs)
  File "/usr/local/lib/python3.9/dist-packages/click/core.py", line 717, in main
    rv = self.invoke(ctx)
  File "/usr/local/lib/python3.9/dist-packages/click/core.py", line 1137, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/usr/local/lib/python3.9/dist-packages/click/core.py", line 1137, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/usr/local/lib/python3.9/dist-packages/click/core.py", line 956, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/usr/local/lib/python3.9/dist-packages/click/core.py", line 555, in invoke
    return callback(*args, **kwargs)
  File "/usr/local/lib/python3.9/dist-packages/click/decorators.py", line 64, in new_func
    return ctx.invoke(f, obj, *args, **kwargs)
  File "/usr/local/lib/python3.9/dist-packages/click/core.py", line 555, in invoke
    return callback(*args, **kwargs)
  File "/usr/local/lib/python3.9/dist-packages/show/feature.py", line 165, in feature_autorestart
    body.append([name, feature_table[name]['auto_restart']])
KeyError: 'auto_restart'

Looking deeper, this happens because some features do not have `auto_restart` explicitly set in CONFIG_DB. For example, on this device `macsec` has the field but `telemetry` does not:

      "macsec": {
            "has_per_asic_scope": "True",
            "high_mem_alert": "disabled",
            "auto_restart": "enabled",
            "state": "enabled",
            "has_global_scope": "False",
            "set_owner": "local",
            "has_timer": "False"
        },
        "telemetry": {
            "has_per_asic_scope": "False",
            "state": "disabled"
        },

The show command needs to be made more resilient to this condition so it does not abort this way. Should be a trivial fix. Arista will submit a PR for this.

@kartik-arista
Copy link
Contributor

Logged sonic-net/sonic-utilities#2587 to fix the show command. This issue can be closed once the show command fix is completed.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

3 participants