From 0d37a62678fca526d0da59a5c51c4f42cc31d7b1 Mon Sep 17 00:00:00 2001 From: Jon Ayers Date: Wed, 21 Aug 2024 19:26:38 -0400 Subject: [PATCH] fix: add a 1m timeout to signal shutdown (#92) - It's possible for a number of reasons for the bootstrap script to not exit. This PR adds a hard timeout of 1 minute before we forcefully exit. --- README.md | 1 + cli/docker.go | 36 +++++++++++++++----- cmd/envbox/main.go | 5 +++ integration/docker_test.go | 48 ++++++++++++++++++++++++--- integration/integrationtest/docker.go | 12 +++++-- 5 files changed, 87 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 8fb02eb..8c44f62 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ The environment variables can be used to configure various aspects of the inner | `CODER_CPUS` | Dictates the number of CPUs to allocate the inner container. It is recommended to set this using the Kubernetes [Downward API](https://kubernetes.io/docs/tasks/inject-data-application/environment-variable-expose-pod-information/#use-container-fields-as-values-for-environment-variables). | false | | `CODER_MEMORY` | Dictates the max memory (in bytes) to allocate the inner container. It is recommended to set this using the Kubernetes [Downward API](https://kubernetes.io/docs/tasks/inject-data-application/environment-variable-expose-pod-information/#use-container-fields-as-values-for-environment-variables). | false | | `CODER_DISABLE_IDMAPPED_MOUNT` | Disables idmapped mounts in sysbox. For more information, see the [Sysbox Documentation](https://github.com/nestybox/sysbox/blob/master/docs/user-guide/configuration.md#disabling-id-mapped-mounts-on-sysbox). | false | +| `CODER_SHUTDOWN_TIMEOUT` | Configure a custom shutdown timeout to wait for the boostrap command to exit. Defaults to 1 minute. | false | ## Coder Template diff --git a/cli/docker.go b/cli/docker.go index 928a7c2..a080fdc 100644 --- a/cli/docker.go +++ b/cli/docker.go @@ -13,6 +13,7 @@ import ( "sort" "strconv" "strings" + "time" dockertypes "github.com/docker/docker/api/types" "github.com/docker/docker/api/types/container" @@ -39,6 +40,10 @@ const ( // EnvBoxContainerName is the name of the inner user container. EnvBoxPullImageSecretEnvVar = "CODER_IMAGE_PULL_SECRET" //nolint:gosec EnvBoxContainerName = "CODER_CVM_CONTAINER_NAME" + // We define a custom exit code to distinguish from the generic '1' when envbox exits due to a shutdown timeout. + // Docker claims exit codes 125-127 so we start at 150 to + // ensure we don't collide. + ExitCodeShutdownTimeout = 150 ) const ( @@ -77,8 +82,9 @@ const ( // with UID/GID 1000 will be mapped to `UserNamespaceOffset` + 1000 // on the host. Changing this value will result in improper mappings // on existing containers. - UserNamespaceOffset = 100000 - devDir = "/dev" + UserNamespaceOffset = 100000 + devDir = "/dev" + defaultShutdownTimeout = time.Minute ) var ( @@ -102,6 +108,7 @@ var ( EnvDockerConfig = "CODER_DOCKER_CONFIG" EnvDebug = "CODER_DEBUG" EnvDisableIDMappedMount = "CODER_DISABLE_IDMAPPED_MOUNT" + EnvShutdownTimeout = "CODER_SHUTDOWN_TIMEOUT" ) var envboxPrivateMounts = map[string]struct{}{ @@ -139,6 +146,7 @@ type flags struct { cpus int memory int disableIDMappedMount bool + shutdownTimeout time.Duration // Test flags. noStartupLogs bool @@ -348,6 +356,7 @@ func dockerCmd(ch chan func() error) *cobra.Command { cliflag.IntVarP(cmd.Flags(), &flags.cpus, "cpus", "", EnvCPUs, 0, "Number of CPUs to allocate inner container. e.g. 2") cliflag.IntVarP(cmd.Flags(), &flags.memory, "memory", "", EnvMemory, 0, "Max memory to allocate to the inner container in bytes.") cliflag.BoolVarP(cmd.Flags(), &flags.disableIDMappedMount, "disable-idmapped-mount", "", EnvDisableIDMappedMount, false, "Disable idmapped mounts in sysbox. Note that you may need an alternative (e.g. shiftfs).") + cliflag.DurationVarP(cmd.Flags(), &flags.shutdownTimeout, "shutdown-timeout", "", EnvShutdownTimeout, defaultShutdownTimeout, "Duration after which envbox will be forcefully terminated.") // Test flags. cliflag.BoolVarP(cmd.Flags(), &flags.noStartupLogs, "no-startup-log", "", "", false, "Do not log startup logs. Useful for testing.") @@ -724,18 +733,31 @@ func runDockerCVM(ctx context.Context, log slog.Logger, client dockerutil.Docker return xerrors.Errorf("exec inspect: %w", err) } - shutdownCh <- func() error { - log.Debug(ctx, "killing container", slog.F("bootstrap_pid", bootstrapPID)) + shutdownCh <- killBootstrapCmd(ctx, log, bootstrapPID, bootstrapExec.ID, client, flags.shutdownTimeout) + return nil +} + +// KillBootstrapCmd is the command we run when we receive a signal +// to kill the envbox container. +func killBootstrapCmd(ctx context.Context, log slog.Logger, pid int, execID string, client dockerutil.DockerClient, timeout time.Duration) func() error { + return func() error { + log.Debug(ctx, "killing container", + slog.F("bootstrap_pid", pid), + slog.F("timeout", timeout.String()), + ) + + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() // The PID returned is the PID _outside_ the container... //nolint:gosec - out, err := exec.Command("kill", "-TERM", strconv.Itoa(bootstrapPID)).CombinedOutput() + out, err := exec.CommandContext(ctx, "kill", "-TERM", strconv.Itoa(pid)).CombinedOutput() if err != nil { return xerrors.Errorf("kill bootstrap process (%s): %w", out, err) } log.Debug(ctx, "sent kill signal waiting for process to exit") - err = dockerutil.WaitForExit(ctx, client, bootstrapExec.ID) + err = dockerutil.WaitForExit(ctx, client, execID) if err != nil { return xerrors.Errorf("wait for exit: %w", err) } @@ -743,8 +765,6 @@ func runDockerCVM(ctx context.Context, log slog.Logger, client dockerutil.Docker log.Debug(ctx, "bootstrap process successfully exited") return nil } - - return nil } //nolint:revive diff --git a/cmd/envbox/main.go b/cmd/envbox/main.go index ebb77f9..995565d 100644 --- a/cmd/envbox/main.go +++ b/cmd/envbox/main.go @@ -8,6 +8,8 @@ import ( "runtime" "syscall" + "golang.org/x/xerrors" + "cdr.dev/slog" "cdr.dev/slog/sloggers/slogjson" "github.com/coder/envbox/cli" @@ -29,6 +31,9 @@ func main() { err := fn() if err != nil { log.Error(ctx, "shutdown function failed", slog.Error(err)) + if xerrors.Is(err, context.DeadlineExceeded) { + os.Exit(cli.ExitCodeShutdownTimeout) + } os.Exit(1) } default: diff --git a/integration/docker_test.go b/integration/docker_test.go index d043e76..5639c91 100644 --- a/integration/docker_test.go +++ b/integration/docker_test.go @@ -306,10 +306,8 @@ func TestDocker(t *testing.T) { require.Error(t, err) // Simulate a shutdown. - integrationtest.StopContainer(t, pool, resource.Container.ID, 30*time.Second) - - err = resource.Close() - require.NoError(t, err) + exitCode := integrationtest.StopContainer(t, pool, resource.Container.ID, 30*time.Second) + require.Equal(t, 0, exitCode) t.Logf("envbox %q started successfully, recreating...", resource.Container.ID) // Run the envbox container. @@ -326,6 +324,34 @@ func TestDocker(t *testing.T) { }) require.NoError(t, err) }) + + t.Run("ShutdownTimeout", func(t *testing.T) { + t.Parallel() + + pool, err := dockertest.NewPool("") + require.NoError(t, err) + + var ( + tmpdir = integrationtest.TmpDir(t) + binds = integrationtest.DefaultBinds(t, tmpdir) + ) + + envs := []string{fmt.Sprintf("%s=%s", cli.EnvShutdownTimeout, "1s")} + + // Run the envbox container. + resource := integrationtest.RunEnvbox(t, pool, &integrationtest.CreateDockerCVMConfig{ + Image: integrationtest.UbuntuImage, + Username: "root", + Envs: envs, + Binds: binds, + BootstrapScript: sigtrapForeverScript, + }) + + // Simulate a shutdown. + exitCode := integrationtest.StopContainer(t, pool, resource.Container.ID, 30*time.Second) + // We expect it to timeout which should result in a special exit code. + require.Equal(t, cli.ExitCodeShutdownTimeout, exitCode) + }) } func requireSliceNoContains(t *testing.T, ss []string, els ...string) { @@ -358,6 +384,20 @@ func bindMount(src, dest string, ro bool) string { return fmt.Sprintf("%s:%s", src, dest) } +const sigtrapForeverScript = `#!/bin/bash +cleanup() { + echo "Got a signal, going to sleep!" && sleep infinity + exit 0 +} + +trap 'cleanup' INT TERM + +while true; do + echo "Working..." + sleep 1 +done +` + const sigtrapScript = `#!/bin/bash cleanup() { echo "HANDLING A SIGNAL!" && touch /home/coder/foo && echo "touched file" diff --git a/integration/integrationtest/docker.go b/integration/integrationtest/docker.go index 088952c..0b78369 100644 --- a/integration/integrationtest/docker.go +++ b/integration/integrationtest/docker.go @@ -91,7 +91,12 @@ func RunEnvbox(t *testing.T, pool *dockertest.Pool, conf *CreateDockerCVMConfig) host.CPUQuota = int64(conf.CPUs) * int64(dockerutil.DefaultCPUPeriod) }) require.NoError(t, err) - // t.Cleanup(func() { _ = pool.Purge(resource) }) + t.Cleanup(func() { + // Only delete the container if the test passes. + if !t.Failed() { + resource.Close() + } + }) waitForCVM(t, pool, resource) @@ -264,7 +269,7 @@ func ExecEnvbox(t *testing.T, pool *dockertest.Pool, conf ExecConfig) ([]byte, e return buf.Bytes(), nil } -func StopContainer(t *testing.T, pool *dockertest.Pool, id string, to time.Duration) { +func StopContainer(t *testing.T, pool *dockertest.Pool, id string, to time.Duration) int { t.Helper() err := pool.Client.KillContainer(docker.KillContainerOptions{ @@ -283,10 +288,11 @@ func StopContainer(t *testing.T, pool *dockertest.Pool, id string, to time.Durat continue } - return + return cnt.State.ExitCode } t.Fatalf("timed out waiting for container %s to stop", id) + return 1 } // cmdLineEnvs returns args passed to the /envbox command