From 7a59431eb1c2a65cdcc4da5edfc4f7b087cce27d Mon Sep 17 00:00:00 2001 From: Scott Date: Thu, 12 Mar 2020 14:40:47 -0400 Subject: [PATCH] Panic in controller when step fails before image digest exporter The image digest exporter (part of the Image Output Resource) is configured with `"terminationMessagePolicy": "FallbackToLogsOnError",`. When a previous step has failed in a Task our entrypoint wrapping the exporter emits a log line like `2020/03/13 12:03:26 Skipping step because a previous step failed`. Because the image digest exporter is set to FallbackToLogsOnError Kubernetes slurps up this log line as the termination message. That line gets read by the Tekton controller, which is looking for JSON in the termination message. It fails to parse and stops trying to read step statuses. That in turn results in a mismatch in the length of the list of steps and the length of the list of step statuses. Finally we attempt to sort the list of step statuses alongside the list of steps. This method panics with an out of bounds error because it assumes the lengths of the two lists are the same. So, this PR does the following things: 1. The image digest exporter has the FallbackToLogsOnError policy removed. I can't think of a reason that we need this anymore. 2. The Tekton controller no longer breaks out of the loop while it's parsing step statuses and instead simply ignores non-JSON termination messages. --- pkg/pod/status.go | 65 +++++++------ pkg/pod/status_test.go | 94 ++++++++++++++++++- .../taskrun/resources/image_exporter.go | 1 - .../taskrun/resources/image_exporter_test.go | 11 +-- pkg/reconciler/taskrun/taskrun.go | 2 +- pkg/reconciler/taskrun/taskrun_test.go | 1 - test/builder/container.go | 7 -- test/builder/step.go | 7 -- 8 files changed, 132 insertions(+), 56 deletions(-) diff --git a/pkg/pod/status.go b/pkg/pod/status.go index e8223d31798..93ba0cb38f6 100644 --- a/pkg/pod/status.go +++ b/pkg/pod/status.go @@ -24,8 +24,8 @@ import ( "time" "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1alpha1" - "github.com/tektoncd/pipeline/pkg/logging" "github.com/tektoncd/pipeline/pkg/termination" + "go.uber.org/zap" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "knative.dev/pkg/apis" @@ -99,7 +99,7 @@ func SidecarsReady(podStatus corev1.PodStatus) bool { } // MakeTaskRunStatus returns a TaskRunStatus based on the Pod's status. -func MakeTaskRunStatus(tr v1alpha1.TaskRun, pod *corev1.Pod, taskSpec v1alpha1.TaskSpec) v1alpha1.TaskRunStatus { +func MakeTaskRunStatus(logger *zap.SugaredLogger, tr v1alpha1.TaskRun, pod *corev1.Pod, taskSpec v1alpha1.TaskSpec) v1alpha1.TaskRunStatus { trs := &tr.Status if trs.GetCondition(apis.ConditionSucceeded) == nil || trs.GetCondition(apis.ConditionSucceeded).Status == corev1.ConditionUnknown { // If the taskRunStatus doesn't exist yet, it's because we just started running @@ -114,39 +114,12 @@ func MakeTaskRunStatus(tr v1alpha1.TaskRun, pod *corev1.Pod, taskSpec v1alpha1.T trs.PodName = pod.Name trs.Steps = []v1alpha1.StepState{} trs.Sidecars = []v1alpha1.SidecarState{} - logger, _ := logging.NewLogger("", "status") - defer func() { - _ = logger.Sync() - }() for _, s := range pod.Status.ContainerStatuses { if IsContainerStep(s.Name) { if s.State.Terminated != nil && len(s.State.Terminated.Message) != 0 { - msg := s.State.Terminated.Message - r, err := termination.ParseMessage(msg) - if err != nil { - logger.Errorf("Could not parse json message %q because of %w", msg, err) - break - } - for index, result := range r { - if result.Key == "StartedAt" { - t, err := time.Parse(time.RFC3339, result.Value) - if err != nil { - logger.Errorf("Could not parse time: %q: %w", result.Value, err) - break - } - s.State.Terminated.StartedAt = metav1.NewTime(t) - // remove the entry for the starting time - r = append(r[:index], r[index+1:]...) - if len(r) == 0 { - s.State.Terminated.Message = "" - } else if bytes, err := json.Marshal(r); err != nil { - logger.Errorf("Error marshalling remaining results: %w", err) - } else { - s.State.Terminated.Message = string(bytes) - } - break - } + if err := updateStatusStartTime(&s); err != nil { + logger.Errorf("error setting the start time of step %q in taskrun %q: %w", s.Name, tr.Name, err) } } trs.Steps = append(trs.Steps, v1alpha1.StepState{ @@ -180,6 +153,36 @@ func MakeTaskRunStatus(tr v1alpha1.TaskRun, pod *corev1.Pod, taskSpec v1alpha1.T return *trs } +// updateStatusStartTime searches for a result called "StartedAt" in the JSON-formatted termination message +// of a step and sets the State.Terminated.StartedAt field to this time if it's found. The "StartedAt" result +// is also removed from the list of results in the container status. +func updateStatusStartTime(s *corev1.ContainerStatus) error { + r, err := termination.ParseMessage(s.State.Terminated.Message) + if err != nil { + return fmt.Errorf("termination message could not be parsed as JSON: %w", err) + } + for index, result := range r { + if result.Key == "StartedAt" { + t, err := time.Parse(time.RFC3339, result.Value) + if err != nil { + return fmt.Errorf("could not parse time value %q in StartedAt field: %w", result.Value, err) + } + s.State.Terminated.StartedAt = metav1.NewTime(t) + // remove the entry for the starting time + r = append(r[:index], r[index+1:]...) + if len(r) == 0 { + s.State.Terminated.Message = "" + } else if bytes, err := json.Marshal(r); err != nil { + return fmt.Errorf("error marshalling remaining results back into termination message: %w", err) + } else { + s.State.Terminated.Message = string(bytes) + } + break + } + } + return nil +} + func updateCompletedTaskRun(trs *v1alpha1.TaskRunStatus, pod *corev1.Pod) { if DidTaskRunFail(pod) { msg := getFailureMessage(pod) diff --git a/pkg/pod/status_test.go b/pkg/pod/status_test.go index 310867a42c6..e3bfb17ba18 100644 --- a/pkg/pod/status_test.go +++ b/pkg/pod/status_test.go @@ -22,6 +22,8 @@ import ( "github.com/google/go-cmp/cmp" "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1alpha1" + "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1beta1" + "github.com/tektoncd/pipeline/pkg/logging" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "knative.dev/pkg/apis" @@ -40,6 +42,7 @@ func TestMakeTaskRunStatus(t *testing.T) { for _, c := range []struct { desc string podStatus corev1.PodStatus + taskSpec v1alpha1.TaskSpec want v1alpha1.TaskRunStatus }{{ desc: "empty", @@ -600,6 +603,94 @@ func TestMakeTaskRunStatus(t *testing.T) { }}, }, }, + }, { + desc: "non-json-termination-message-with-steps-afterwards-shouldnt-panic", + taskSpec: v1alpha1.TaskSpec{TaskSpec: v1beta1.TaskSpec{ + Steps: []v1alpha1.Step{{Container: corev1.Container{ + Name: "non-json", + }}, {Container: corev1.Container{ + Name: "after-non-json", + }}, {Container: corev1.Container{ + Name: "this-step-might-panic", + }}, {Container: corev1.Container{ + Name: "foo", + }}}, + }}, + podStatus: corev1.PodStatus{ + Phase: corev1.PodFailed, + ContainerStatuses: []corev1.ContainerStatus{{ + Name: "step-this-step-might-panic", + ImageID: "image", + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{}, + }, + }, { + Name: "step-foo", + ImageID: "image", + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{}, + }, + }, { + Name: "step-non-json", + ImageID: "image", + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + Message: "this is a non-json termination message. dont panic!", + }, + }, + }, { + Name: "step-after-non-json", + ImageID: "image", + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{}, + }, + }}, + }, + want: v1alpha1.TaskRunStatus{ + Status: duckv1beta1.Status{ + Conditions: []apis.Condition{{ + Type: apis.ConditionSucceeded, + Status: corev1.ConditionFalse, + Reason: ReasonFailed, + Message: `"step-non-json" exited with code 1 (image: "image"); for logs run: kubectl -n foo logs pod -c step-non-json`, + }}, + }, + TaskRunStatusFields: v1alpha1.TaskRunStatusFields{ + Steps: []v1alpha1.StepState{{ + ContainerState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + Message: "this is a non-json termination message. dont panic!", + }}, + + Name: "non-json", + ContainerName: "step-non-json", + ImageID: "image", + }, { + ContainerState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{}}, + Name: "after-non-json", + ContainerName: "step-after-non-json", + ImageID: "image", + }, { + ContainerState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{}}, + Name: "this-step-might-panic", + ContainerName: "step-this-step-might-panic", + ImageID: "image", + }, { + ContainerState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{}}, + Name: "foo", + ContainerName: "step-foo", + ImageID: "image", + }}, + Sidecars: []v1alpha1.SidecarState{}, + // We don't actually care about the time, just that it's not nil + CompletionTime: &metav1.Time{Time: time.Now()}, + }, + }, }} { t.Run(c.desc, func(t *testing.T) { now := metav1.Now() @@ -624,7 +715,8 @@ func TestMakeTaskRunStatus(t *testing.T) { }, } - got := MakeTaskRunStatus(tr, pod, v1alpha1.TaskSpec{}) + logger, _ := logging.NewLogger("", "status") + got := MakeTaskRunStatus(logger, tr, pod, c.taskSpec) // Common traits, set for test case brevity. c.want.PodName = "pod" diff --git a/pkg/reconciler/taskrun/resources/image_exporter.go b/pkg/reconciler/taskrun/resources/image_exporter.go index 7b599a12b6e..acb4be6884c 100644 --- a/pkg/reconciler/taskrun/resources/image_exporter.go +++ b/pkg/reconciler/taskrun/resources/image_exporter.go @@ -98,6 +98,5 @@ func imageDigestExporterStep(imageDigestExporterImage string, imagesJSON []byte) Args: []string{ "-images", string(imagesJSON), }, - TerminationMessagePolicy: corev1.TerminationMessageFallbackToLogsOnError, }} } diff --git a/pkg/reconciler/taskrun/resources/image_exporter_test.go b/pkg/reconciler/taskrun/resources/image_exporter_test.go index 7551b411754..5ea20c61593 100644 --- a/pkg/reconciler/taskrun/resources/image_exporter_test.go +++ b/pkg/reconciler/taskrun/resources/image_exporter_test.go @@ -91,11 +91,10 @@ func TestAddOutputImageDigestExporter(t *testing.T) { wantSteps: []v1alpha1.Step{{Container: corev1.Container{ Name: "step1", }}, {Container: corev1.Container{ - Name: "image-digest-exporter-9l9zj", - Image: "override-with-imagedigest-exporter-image:latest", - Command: []string{"/ko-app/imagedigestexporter"}, - Args: []string{"-images", "[{\"name\":\"source-image-1\",\"type\":\"image\",\"url\":\"gcr.io/some-image-1\",\"digest\":\"\",\"OutputImageDir\":\"/workspace/output/source-image\"}]"}, - TerminationMessagePolicy: "FallbackToLogsOnError", + Name: "image-digest-exporter-9l9zj", + Image: "override-with-imagedigest-exporter-image:latest", + Command: []string{"/ko-app/imagedigestexporter"}, + Args: []string{"-images", "[{\"name\":\"source-image-1\",\"type\":\"image\",\"url\":\"gcr.io/some-image-1\",\"digest\":\"\",\"OutputImageDir\":\"/workspace/output/source-image\"}]"}, }}}, }, { desc: "image resource in task with multiple steps", @@ -163,8 +162,6 @@ func TestAddOutputImageDigestExporter(t *testing.T) { Image: "override-with-imagedigest-exporter-image:latest", Command: []string{"/ko-app/imagedigestexporter"}, Args: []string{"-images", "[{\"name\":\"source-image-1\",\"type\":\"image\",\"url\":\"gcr.io/some-image-1\",\"digest\":\"\",\"OutputImageDir\":\"/workspace/output/source-image\"}]"}, - - TerminationMessagePolicy: "FallbackToLogsOnError", }}}, }} { t.Run(c.desc, func(t *testing.T) { diff --git a/pkg/reconciler/taskrun/taskrun.go b/pkg/reconciler/taskrun/taskrun.go index ea6740b0b01..634c27298b3 100644 --- a/pkg/reconciler/taskrun/taskrun.go +++ b/pkg/reconciler/taskrun/taskrun.go @@ -400,7 +400,7 @@ func (c *Reconciler) reconcile(ctx context.Context, tr *v1alpha1.TaskRun) error before := tr.Status.GetCondition(apis.ConditionSucceeded) // Convert the Pod's status to the equivalent TaskRun Status. - tr.Status = podconvert.MakeTaskRunStatus(*tr, pod, *taskSpec) + tr.Status = podconvert.MakeTaskRunStatus(c.Logger, *tr, pod, *taskSpec) if err := updateTaskRunResourceResult(tr, pod.Status); err != nil { return err diff --git a/pkg/reconciler/taskrun/taskrun_test.go b/pkg/reconciler/taskrun/taskrun_test.go index 890575399f9..1bb34e7d610 100644 --- a/pkg/reconciler/taskrun/taskrun_test.go +++ b/pkg/reconciler/taskrun/taskrun_test.go @@ -709,7 +709,6 @@ func TestReconcile(t *testing.T) { tb.VolumeMount("tekton-internal-workspace", workspaceDir), tb.VolumeMount("tekton-internal-home", "/tekton/home"), tb.VolumeMount("tekton-internal-results", "/tekton/results"), - tb.TerminationMessagePolicy(corev1.TerminationMessageFallbackToLogsOnError), tb.TerminationMessagePath("/tekton/termination"), ), ), diff --git a/test/builder/container.go b/test/builder/container.go index de6d9653ff4..0165c95b1f2 100644 --- a/test/builder/container.go +++ b/test/builder/container.go @@ -132,13 +132,6 @@ func EphemeralStorage(val string) ResourceListOp { } } -// TerminationMessagePolicy sets the policy of the termination message. -func TerminationMessagePolicy(terminationMessagePolicy corev1.TerminationMessagePolicy) ContainerOp { - return func(c *corev1.Container) { - c.TerminationMessagePolicy = terminationMessagePolicy - } -} - // TerminationMessagePath sets the termination message path. func TerminationMessagePath(terminationMessagePath string) ContainerOp { return func(c *corev1.Container) { diff --git a/test/builder/step.go b/test/builder/step.go index 2905ce9de93..a0c4f7105fa 100644 --- a/test/builder/step.go +++ b/test/builder/step.go @@ -147,10 +147,3 @@ func StepTerminationMessagePath(terminationMessagePath string) StepOp { step.TerminationMessagePath = terminationMessagePath } } - -// StepTerminationMessagePolicy sets the policy of the termination message. -func StepTerminationMessagePolicy(terminationMessagePolicy corev1.TerminationMessagePolicy) StepOp { - return func(step *v1alpha1.Step) { - step.TerminationMessagePolicy = terminationMessagePolicy - } -}