tektoncd · tekton-robot · Jul 9, 2021 · Dec 21, 2020 · imjasonh · May 25, 2021
diff --git a/cmd/entrypoint/main.go b/cmd/entrypoint/main.go
@@ -34,16 +34,36 @@ import (
 )
 
 var (
- ep = flag.String("entrypoint", "", "Original specified entrypoint to execute")
- waitFiles = flag.String("wait_file", "", "Comma-separated list of paths to wait for")
- waitFileContent = flag.Bool("wait_file_content", false, "If specified, expect wait_file to have content")
- postFile = flag.String("post_file", "", "If specified, file to write upon completion")
- terminationPath = flag.String("termination_path", "/tekton/termination", "If specified, file to write upon termination")
- results = flag.String("results", "", "If specified, list of file names that might contain task results")
- timeout = flag.Duration("timeout", time.Duration(0), "If specified, sets timeout for step")
+ ep = flag.String("entrypoint", "", "Original specified entrypoint to execute")
+ waitFiles = flag.String("wait_file", "", "Comma-separated list of paths to wait for")
+ waitFileContent = flag.Bool("wait_file_content", false, "If specified, expect wait_file to have content")
+ postFile = flag.String("post_file", "", "If specified, file to write upon completion")
+ terminationPath = flag.String("termination_path", "/tekton/termination", "If specified, file to write upon termination")
+ results = flag.String("results", "", "If specified, list of file names that might contain task results")
+ timeout = flag.Duration("timeout", time.Duration(0), "If specified, sets timeout for step")
+ breakpointOnFailure = flag.Bool("breakpoint_on_failure", false, "If specified, expect steps to not skip on failure")
 )
 
-const defaultWaitPollingInterval = time.Second
+const (
+ defaultWaitPollingInterval = time.Second
+ breakpointExitSuffix = ".breakpointexit"
+)
+
+func checkForBreakpointOnFailure(e entrypoint.Entrypointer, breakpointExitPostFile string) {
+ if e.BreakpointOnFailure {
+ if waitErr := e.Waiter.Wait(breakpointExitPostFile, false, false); waitErr != nil {
+ log.Println("error occurred while waiting for " + breakpointExitPostFile + " : " + waitErr.Error())
+ }
+ // get exitcode from .breakpointexit
+ exitCode, readErr := e.BreakpointExitCode(breakpointExitPostFile)
+ // if readErr exists, the exitcode with default to 0 as we would like
+ // to encourage to continue running the next steps in the taskRun
+ if readErr != nil {
+ log.Println("error occurred while reading breakpoint exit code : " + readErr.Error())
+ }
+ os.Exit(exitCode)
+ }
+}
 
 func main() {
  // Add credential flags originally introduced with our legacy credentials helper
@@ -75,17 +95,18 @@ func main() {
  }
 
  e := entrypoint.Entrypointer{
- Entrypoint: *ep,
- WaitFiles: strings.Split(*waitFiles, ","),
- WaitFileContent: *waitFileContent,
- PostFile: *postFile,
- TerminationPath: *terminationPath,
- Args: flag.Args(),
- Waiter: &realWaiter{waitPollingInterval: defaultWaitPollingInterval},
- Runner: &realRunner{},
- PostWriter: &realPostWriter{},
- Results: strings.Split(*results, ","),
- Timeout: timeout,
+ Entrypoint: *ep,
+ WaitFiles: strings.Split(*waitFiles, ","),
+ WaitFileContent: *waitFileContent,
+ PostFile: *postFile,
+ TerminationPath: *terminationPath,
+ Args: flag.Args(),
+ Waiter: &realWaiter{waitPollingInterval: defaultWaitPollingInterval, breakpointOnFailure: *breakpointOnFailure},
+ Runner: &realRunner{},
+ PostWriter: &realPostWriter{},
+ Results: strings.Split(*results, ","),
+ Timeout: timeout,
+ BreakpointOnFailure: *breakpointOnFailure,
  }
 
  // Copy any creds injected by the controller into the $HOME directory of the current
@@ -95,6 +116,7 @@ func main() {
  }
 
  if err := e.Go(); err != nil {
+ breakpointExitPostFile := e.PostFile + breakpointExitSuffix
  switch t := err.(type) {
  case skipError:
  log.Print("Skipping step because a previous step failed")
@@ -110,10 +132,12 @@ func main() {
  // in both cases has an ExitStatus() method with the
  // same signature.
  if status, ok := t.Sys().(syscall.WaitStatus); ok {
+ checkForBreakpointOnFailure(e, breakpointExitPostFile)
  os.Exit(status.ExitStatus())
  }
  log.Fatalf("Error executing command (ExitError): %v", err)
  default:
+ checkForBreakpointOnFailure(e, breakpointExitPostFile)
  log.Fatalf("Error executing command: %v", err)
  }
  }

diff --git a/cmd/entrypoint/waiter.go b/cmd/entrypoint/waiter.go
@@ -11,6 +11,7 @@ import (
 // realWaiter actually waits for files, by polling.
 type realWaiter struct {
  waitPollingInterval time.Duration
+ breakpointOnFailure bool
 }
 
 var _ entrypoint.Waiter = (*realWaiter)(nil)
@@ -30,7 +31,7 @@ func (rw *realWaiter) setWaitPollingInterval(pollingInterval time.Duration) *rea
 //
 // If a file of the same name with a ".err" extension exists then this Wait
 // will end with a skipError.
-func (rw *realWaiter) Wait(file string, expectContent bool) error {
+func (rw *realWaiter) Wait(file string, expectContent bool, breakpointOnFailure bool) error {
  if file == "" {
  return nil
  }
@@ -42,7 +43,16 @@ func (rw *realWaiter) Wait(file string, expectContent bool) error {
  } else if !os.IsNotExist(err) {
  return fmt.Errorf("waiting for %q: %w", file, err)
  }
+ // When a .err file is read by this step, it means that a previous step has failed
+ // We wouldn't want this step to stop executing because the previous step failed during debug
+ // That is counterproductive to debugging
+ // Hence we disable skipError here so that the other steps in the failed taskRun can continue
+ // executing if breakpointOnFailure is enabled for the taskRun
+ // TLDR: Do not return skipError when breakpointOnFailure is enabled as it breaks execution of the TaskRun
  if _, err := os.Stat(file + ".err"); err == nil {
+ if breakpointOnFailure {
+ return nil
+ }
  return skipError("error file present, bail and skip the step")
  }
  }

diff --git a/cmd/entrypoint/waiter_test.go b/cmd/entrypoint/waiter_test.go
@@ -19,6 +19,7 @@ package main
 import (
  "io/ioutil"
  "os"
+ "strings"
  "testing"
  "time"
 )
@@ -37,7 +38,7 @@ func TestRealWaiterWaitMissingFile(t *testing.T) {
  rw := realWaiter{}
  doneCh := make(chan struct{})
  go func() {
- err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmp.Name(), false)
+ err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmp.Name(), false, false)
  if err != nil {
  t.Errorf("error waiting on tmp file %q", tmp.Name())
  }
@@ -60,7 +61,7 @@ func TestRealWaiterWaitWithFile(t *testing.T) {
  rw := realWaiter{}
  doneCh := make(chan struct{})
  go func() {
- err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmp.Name(), false)
+ err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmp.Name(), false, false)
  if err != nil {
  t.Errorf("error waiting on tmp file %q", tmp.Name())
  }
@@ -83,7 +84,7 @@ func TestRealWaiterWaitMissingContent(t *testing.T) {
  rw := realWaiter{}
  doneCh := make(chan struct{})
  go func() {
- err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmp.Name(), true)
+ err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmp.Name(), true, false)
  if err != nil {
  t.Errorf("error waiting on tmp file %q", tmp.Name())
  }
@@ -106,7 +107,7 @@ func TestRealWaiterWaitWithContent(t *testing.T) {
  rw := realWaiter{}
  doneCh := make(chan struct{})
  go func() {
- err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmp.Name(), true)
+ err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmp.Name(), true, false)
  if err != nil {
  t.Errorf("error waiting on tmp file %q", tmp.Name())
  }
@@ -122,3 +123,58 @@ func TestRealWaiterWaitWithContent(t *testing.T) {
  t.Errorf("expected Wait() to have detected a non-zero file size by now")
  }
 }
+
+func TestRealWaiterWaitWithErrorWaitfile(t *testing.T) {
+ tmp, err := ioutil.TempFile("", "real_waiter_test_file*.err")
+ if err != nil {
+ t.Errorf("error creating temp file: %v", err)
+ }
+ tmpFileName := strings.Replace(tmp.Name(), ".err", "", 1)
+ defer os.Remove(tmp.Name())
+ rw := realWaiter{}
+ doneCh := make(chan struct{})
+ go func() {
+ // error of type skipError is returned after encountering a error waitfile
+ err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmpFileName, false, false)
+ if err == nil {
+ t.Errorf("expected skipError upon encounter error waitfile")
+ }
+ switch typ := err.(type) {
+ case skipError:
+ close(doneCh)
+ default:
+ t.Errorf("unexpected error type %T", typ)
+ }
+ }()
+ select {
+ case <-doneCh:
+ // Success
+ case <-time.After(2 * testWaitPollingInterval):
+ t.Errorf("expected Wait() to have detected a non-zero file size by now")
+ }
+}
+
+func TestRealWaiterWaitWithBreakpointOnFailure(t *testing.T) {
+ tmp, err := ioutil.TempFile("", "real_waiter_test_file*.err")
+ if err != nil {
+ t.Errorf("error creating temp file: %v", err)
+ }
+ tmpFileName := strings.Replace(tmp.Name(), ".err", "", 1)
+ defer os.Remove(tmp.Name())
+ rw := realWaiter{}
+ doneCh := make(chan struct{})
+ go func() {
+ // When breakpoint on failure is enabled skipError shouldn't be returned for a error waitfile
+ err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmpFileName, false, true)
+ if err != nil {
+ t.Errorf("error waiting on tmp file %q", tmp.Name())
+ }
+ close(doneCh)
+ }()
+ select {
+ case <-doneCh:
+ // Success
+ case <-time.After(2 * testWaitPollingInterval):
+ t.Errorf("expected Wait() to have detected a non-zero file size by now")
+ }
+}
diff --git a/docs/debug.md b/docs/debug.md
@@ -0,0 +1,82 @@
+<!--
+---
+linkTitle: "Debug"
+weight: 11
+---
+-->
+# Debug
+
+- [Overview](#overview)
+- [Debugging TaskRuns](#debugging-taskruns)
+ - [Adding Breakpoints](#adding-breakpoints)
+ - [Breakpoint on Failure](#breakpoint-on-failure)
+ - [Failure of a Step](#failure-of-a-step)
+ - [Halting a Step on failure](#halting-a-step-on-failure)
+ - [Exiting breakpoint](#exiting-breakpoint)
+- [Debug Environment](#debug-environment)
+ - [Mounts](#mounts)
+ - [Debug Scripts](#debug-scripts)
+
+
+## Overview
+
+`Debug` spec is used for troubleshooting and breakpointing runtime resources. This doc helps understand the inner 
+workings of debug in Tekton. Currently only the `TaskRun` resource is supported. 
+
+## Debugging TaskRuns
+
+The following provides explanation on how Debugging TaskRuns is possible through Tekton. To understand how to use 
+the debug spec for TaskRuns follow the [TaskRun Debugging Documentation](taskruns.md#debugging-a-taskrun).
+
+### Breakpoint on Failure
+
+Halting a TaskRun execution on Failure of a step.
+
+#### Failure of a Step
+
+The entrypoint binary is used to manage the lifecycle of a step. Steps are aligned beforehand by the TaskRun controller
+allowing each step to run in a particular order. This is done using `-wait_file` and the `-post_file` flags. The former 
+let's the entrypoint binary know that it has to wait on creation of a particular file before starting execution of the step.
+And the latter provides information on the step number and signal the next step on completion of the step.
+
+On success of a step, the `-post-file` is written as is, signalling the next step which would have the same argument given
+for `-wait_file` to resume the entrypoint process and move ahead with the step. 
+
+On failure of a step, the `-post_file` is written with appending `.err` to it denoting that the previous step has failed with
+and error. The subsequent steps are skipped in this case as well, marking the TaskRun as a failure.
+
+#### Halting a Step on failure
+
+The failed step writes `<step-no>.err` to `/tekton/tools` and stops running completely. To be able to debug a step we would
+need it to continue running (not exit), not skip the next steps and signal health of the step. By disabling step skipping, 
+stopping write of the `<step-no>.err` file and waiting on a signal by the user to disable the halt, we would be simulating a 
+"breakpoint".
+
+In this breakpoint, which is essentially a limbo state the TaskRun finds itself in, the user can interact with the step 
+environment using a CLI or an IDE. 
+
+#### Exiting breakpoint
+
+To exit a step which has been paused upon failure, the step would wait on a file similar to `<step-no>.breakpointexit` which 
+would unpause and exit the step container. eg: Step 0 fails and is paused. Writing `0.breakpointexit` in `/tekton/tools`
+would unpause and exit the step container.
+
+## Debug Environment 
+
+Additional environment augmentations made available to the TaskRun Pod to aid in troubleshooting and managing step lifecycle.
+
+### Mounts
+
+`/tekton/debug/scripts` : Contains scripts which the user can run to mark the step as a success, failure or exit the breakpoint.
+Shared between all the containers.
+
+`/tekton/debug/info/<n>` : Contains information about the step. Single EmptyDir shared between all step containers, but renamed 
+to reflect step number. eg: Step 0 will have `/tekton/debug/info/0`, Step 1 will have `/tekton/debug/info/1` etc.
+
+### Debug Scripts
+
+`/tekton/debug/scripts/debug-continue` : Mark the step as completed with success by writing to `/tekton/tools`. eg: User wants to exit
+breakpoint for failed step 0. Running this script would create `/tekton/tools/0` and `/tekton/tools/0.breakpointexit`.
+
+`/tekton/debug/scripts/debug-fail-continue` : Mark the step as completed with failure by writing to `/tekton/tools`. eg: User wants to exit
+breakpoint for failed step 0. Running this script would create `/tekton/tools/0.err` and `/tekton/tools/0.breakpointexit`.
diff --git a/docs/taskruns.md b/docs/taskruns.md
@@ -23,13 +23,15 @@ weight: 300
  - [Monitoring `Steps`](#monitoring-steps)
  - [Monitoring `Results`](#monitoring-results)
 - [Cancelling a `TaskRun`](#cancelling-a-taskrun)
+- [Debugging a `TaskRun`](#debugging-a-taskrun)
 - [Events](events.md#taskruns)
 - [Running a TaskRun Hermetically](hermetic.md)
 - [Code examples](#code-examples)
  - [Example `TaskRun` with a referenced `Task`](#example-taskrun-with-a-referenced-task)
  - [Example `TaskRun` with an embedded `Task`](#example-taskrun-with-an-embedded-task)
  - [Reusing a `Task`](#reusing-a-task)
  - [Using custom `ServiceAccount` credentials](#using-custom-serviceaccount-credentials)
+ - [Running step containers as a non-root user](#running-step-containers-as-a-non-root-user)
 
 # Overview
 
@@ -447,6 +449,43 @@ spec:
  status: "TaskRunCancelled"
 ```
 
+
+### Debugging a `TaskRun`
+
+#### Breakpoint on Failure
+
+TaskRuns can be halted on failure for troubleshooting by providing the following spec patch as seen below.
+
+```yaml
+spec:
+ debug:
+ breakpoint: ["onFailure"]
+```
+
+Upon failure of a step, the TaskRun Pod execution is halted. If ths TaskRun Pod continues to run without any lifecycle
+change done by the user (running the debug-continue or debug-fail-continue script) the TaskRun would be subject to 
+[TaskRunTimeout](#configuring-the-failure-timeout). 
+During this time, the user/client can get remote shell access to the step container with a command such as the following.
+
+```bash
+kubectl exec -it print-date-d7tj5-pod-w5qrn -c step-print-date-human-readable 
+```
+
+#### Debug Environment
+
+After the user/client has access to the container environment, they can scour for any missing parts because of which 
+their step might have failed. 
+
+To control the lifecycle of the step to mark it as a success or a failure or close the breakpoint, there are scripts
+provided in the `/tekton/debug/scripts` directory in the container. The following are the scripts and the tasks they
+perform :-
+
+`debug-continue`: Mark the step as a success and exit the breakpoint.
+
+`debug-fail-continue`: Mark the step as a failure and exit the breakpoint.
+
+*More information on the inner workings of debug can be found in the [Debug documentation](debug.md)*
+
 ## Code examples
 
 To better understand `TaskRuns`, study the following code examples: