From 37ffd2ffa22fd45146f5382c97c4a94e932a0b62 Mon Sep 17 00:00:00 2001
From: Seth Hoenig
Date: Wed, 4 May 2022 13:51:53 -0500
Subject: [PATCH] cgroups: make sure cgroup still exists after task restart

This PR modifies raw_exec and exec to ensure the cgroup for a task they
are driving still exists during a task restart. These drivers have the
same bug, but with different root causes.

For raw_exec, we were removing the cgroup in two places: the cpuset
manager, and the unix containment implementation (the thing that uses
the freezer cgroup to clean house). During a task restart, the
containment would remove the cgroup, and the task runner hooks would
then block waiting for the cgroup to exist again - which never happens,
because the cgroup is created by the cpuset manager, and that only runs
as an alloc pre-start hook. The fix here is to simply not delete the
cgroup in the containment implementation; killing the PIDs is enough.
The removal happens in the cpuset manager later anyway.

For exec, it is the same idea, except DestroyTask is called on task
failure, which calls into libcontainer, which deletes the cgroup. In
this case we do not have control over the deletion of the cgroup, so
instead we hack the cgroup back into life after the call to
DestroyTask.

All of this applies only to cgroups v2.
---
 client/allocrunner/taskrunner/task_runner.go |  1 +
 .../taskrunner/task_runner_test.go           | 28 +++++++++++++++++--
 client/lib/cgutil/group_killer.go            |  6 ++--
 drivers/exec/driver.go                       | 22 +++++++++++++++
 4 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go
index b8c3b270c329..fb67556896e7 100644
--- a/client/allocrunner/taskrunner/task_runner.go
+++ b/client/allocrunner/taskrunner/task_runner.go
@@ -783,6 +783,7 @@ func (tr *TaskRunner) runDriver() error {
 	taskConfig := tr.buildTaskConfig()
 
 	if tr.cpusetCgroupPathGetter != nil {
+		tr.logger.Trace("waiting for cgroup to exist", "allocID", tr.allocID, "task", tr.taskName)
 		cpusetCgroupPath, err := tr.cpusetCgroupPathGetter(tr.killCtx)
 		if err != nil {
 			return err
diff --git a/client/allocrunner/taskrunner/task_runner_test.go b/client/allocrunner/taskrunner/task_runner_test.go
index b6c0fdc53c68..15b025b2d5f3 100644
--- a/client/allocrunner/taskrunner/task_runner_test.go
+++ b/client/allocrunner/taskrunner/task_runner_test.go
@@ -253,6 +253,11 @@ func TestTaskRunner_Stop_ExitCode(t *testing.T) {
 		"command": "/bin/sleep",
 		"args":    []string{"1000"},
 	}
+	task.Env = map[string]string{
+		"NOMAD_PARENT_CGROUP": "nomad.slice",
+		"NOMAD_ALLOC_ID":      alloc.ID,
+		"NOMAD_TASK_NAME":     task.Name,
+	}
 
 	conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
 	defer cleanup()
@@ -347,14 +352,17 @@ func TestTaskRunner_Restore_Running(t *testing.T) {
 // returned once it is running and waiting in pending along with a cleanup
 // func.
 func setupRestoreFailureTest(t *testing.T, alloc *structs.Allocation) (*TaskRunner, *Config, func()) {
-	ci.Parallel(t)
-
 	task := alloc.Job.TaskGroups[0].Tasks[0]
 	task.Driver = "raw_exec"
 	task.Config = map[string]interface{}{
 		"command": "sleep",
 		"args":    []string{"30"},
 	}
+	task.Env = map[string]string{
+		"NOMAD_PARENT_CGROUP": "nomad.slice",
+		"NOMAD_ALLOC_ID":      alloc.ID,
+		"NOMAD_TASK_NAME":     task.Name,
+	}
 
 	conf, cleanup1 := testTaskRunnerConfig(t, alloc, task.Name)
 	conf.StateDB = cstate.NewMemDB(conf.Logger) // "persist" state between runs
@@ -503,6 +511,11 @@ func TestTaskRunner_Restore_System(t *testing.T) {
 		"command": "sleep",
 		"args":    []string{"30"},
 	}
+	task.Env = map[string]string{
+		"NOMAD_PARENT_CGROUP": "nomad.slice",
+		"NOMAD_ALLOC_ID":      alloc.ID,
+		"NOMAD_TASK_NAME":     task.Name,
+	}
 	conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
 	defer cleanup()
 	conf.StateDB = cstate.NewMemDB(conf.Logger) // "persist" state between runs
@@ -718,7 +731,11 @@ func TestTaskRunner_TaskEnv_None(t *testing.T) {
 			"echo $PATH",
 		},
 	}
-
+	task.Env = map[string]string{
+		"NOMAD_PARENT_CGROUP": "nomad.slice",
+		"NOMAD_ALLOC_ID":      alloc.ID,
+		"NOMAD_TASK_NAME":     task.Name,
+	}
 	tr, conf, cleanup := runTestTaskRunner(t, alloc, task.Name)
 	defer cleanup()
 
@@ -1780,6 +1797,11 @@ func TestTaskRunner_Download_RawExec(t *testing.T) {
 	task.Config = map[string]interface{}{
 		"command": "noop.sh",
 	}
+	task.Env = map[string]string{
+		"NOMAD_PARENT_CGROUP": "nomad.slice",
+		"NOMAD_ALLOC_ID":      alloc.ID,
+		"NOMAD_TASK_NAME":     task.Name,
+	}
 	task.Artifacts = []*structs.TaskArtifact{
 		{
 			GetterSource: fmt.Sprintf("%s/testdata/noop.sh", ts.URL),
diff --git a/client/lib/cgutil/group_killer.go b/client/lib/cgutil/group_killer.go
index 9eeae7fefeef..bcfefe632dd4 100644
--- a/client/lib/cgutil/group_killer.go
+++ b/client/lib/cgutil/group_killer.go
@@ -143,8 +143,10 @@ func (d *killer) v2(cgroup *configs.Cgroup) error {
 		return err
 	}
 
-	// remove the cgroup from disk
-	return mgr.Destroy()
+	// note: do NOT remove the cgroup from disk; leave that to the alloc-level
+	// cpuset manager.
+
+	return nil
 }
 
 // kill is used to SIGKILL all processes in cgroup
diff --git a/drivers/exec/driver.go b/drivers/exec/driver.go
index 706c677c5199..eff9627eacd5 100644
--- a/drivers/exec/driver.go
+++ b/drivers/exec/driver.go
@@ -581,6 +581,25 @@ func (d *Driver) StopTask(taskID string, timeout time.Duration, signal string) e
 	return nil
 }
 
+// resetCgroup will re-create the v2 cgroup for the task after the task has been
+// destroyed by libcontainer. In the case of a task restart we call DestroyTask,
+// which removes the cgroup - but we still need it!
+//
+// Ideally the cgroup management would be more unified - and we could do the
+// creation in a task runner pre-start hook, eliminating the need for this hack.
+func (d *Driver) resetCgroup(handle *taskHandle) {
+	if cgutil.UseV2 {
+		if handle.taskConfig.Resources != nil &&
+			handle.taskConfig.Resources.LinuxResources != nil &&
+			handle.taskConfig.Resources.LinuxResources.CpusetCgroupPath != "" {
+			err := os.Mkdir(handle.taskConfig.Resources.LinuxResources.CpusetCgroupPath, 0755)
+			if err != nil {
+				d.logger.Trace("failed to reset cgroup", "path", handle.taskConfig.Resources.LinuxResources.CpusetCgroupPath, "error", err)
+			}
+		}
+	}
+}
+
 func (d *Driver) DestroyTask(taskID string, force bool) error {
 	handle, ok := d.tasks.Get(taskID)
 	if !ok {
@@ -599,6 +618,9 @@ func (d *Driver) DestroyTask(taskID string, force bool) error {
 		handle.pluginClient.Kill()
 	}
 
+	// workaround for the case where DestroyTask was issued on task restart
+	d.resetCgroup(handle)
+
 	d.tasks.Delete(taskID)
 	return nil
 }
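
A note for reviewers on why the raw_exec bug presented as a hang rather than an error: the task runner's cpuset cgroup path getter blocks until the cgroup directory shows up, and after the containment deleted it, nothing would ever re-create it. A minimal standalone sketch of that wait pattern (hypothetical helper and path, not the actual Nomad implementation):

```go
package main

import (
	"context"
	"fmt"
	"os"
	"time"
)

// waitForCgroup polls until dir exists or ctx is cancelled. This mirrors the
// blocking behavior of the task runner's cpuset cgroup path getter: if the
// cgroup was deleted and nothing re-creates it, the wait ends only when the
// kill context fires.
func waitForCgroup(ctx context.Context, dir string) (string, error) {
	ticker := time.NewTicker(50 * time.Millisecond)
	defer ticker.Stop()
	for {
		if _, err := os.Stat(dir); err == nil {
			return dir, nil
		}
		select {
		case <-ctx.Done():
			return "", fmt.Errorf("cgroup %q never appeared: %w", dir, ctx.Err())
		case <-ticker.C:
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	// hypothetical cpuset cgroup path for an alloc/task pair
	fmt.Println(waitForCgroup(ctx, "/sys/fs/cgroup/nomad.slice/example.scope"))
}
```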
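The group_killer.go change works because reaping the processes in a v2 cgroup never required removing the directory. Nomad's killer does the reaping through libcontainer and the freezer; purely as an illustration of the kernel-side idea (and assuming kernel 5.14+ for the cgroup.kill interface), killing members while leaving the directory intact looks like this:

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// killCgroupProcs SIGKILLs every process in a cgroups v2 directory by writing
// "1" to its cgroup.kill file (kernel 5.14+). The directory itself is left in
// place, so a restarted task can still find its cgroup.
func killCgroupProcs(dir string) error {
	kill := filepath.Join(dir, "cgroup.kill")
	if err := os.WriteFile(kill, []byte("1"), 0o644); err != nil {
		return fmt.Errorf("failed to kill processes in %q: %w", dir, err)
	}
	return nil
}

func main() {
	// hypothetical cgroup path; requires root and a cgroups v2 host
	if err := killCgroupProcs("/sys/fs/cgroup/nomad.slice/example.scope"); err != nil {
		fmt.Println(err)
	}
}
```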
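Likewise, the resetCgroup hack leans on a convenient property of the v2 unified hierarchy: an empty cgroup is just a directory, and the kernel materializes the control files (cgroup.procs and friends) on mkdir. A standalone sketch of the same idea, with a hypothetical path standing in for the task's CpusetCgroupPath:

```go
package main

import (
	"fmt"
	"os"
)

// resetCgroup re-creates a deleted cgroups v2 directory. A bare mkdir is
// sufficient: the kernel populates the cgroup's control files automatically,
// leaving a usable (if unconfigured) cgroup behind.
func resetCgroup(path string) error {
	if err := os.Mkdir(path, 0o755); err != nil && !os.IsExist(err) {
		return fmt.Errorf("failed to reset cgroup %q: %w", path, err)
	}
	return nil
}

func main() {
	// hypothetical cpuset cgroup path for an alloc/task pair
	fmt.Println(resetCgroup("/sys/fs/cgroup/nomad.slice/abc123.mytask.scope"))
}
```

The sketch tolerates an already-existing directory; the driver version instead logs any mkdir error at trace level and moves on, since DestroyTask should not fail over this workaround.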