Skip to content

Commit

Permalink
Merge pull request #67 from syseleven/acunin/task-timeout-sigterm
Browse files Browse the repository at this point in the history
Handle task timeout to send SIGTERM
  • Loading branch information
Adri2000 authored Nov 2, 2023
2 parents f0133f2 + dd7f07a commit 5a9bea4
Show file tree
Hide file tree
Showing 8 changed files with 147 additions and 19 deletions.
14 changes: 10 additions & 4 deletions rebootmgr/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,15 @@ def run_tasks(tasktype, con, hostname, dryrun, task_timeout):
for task in sorted(os.listdir("/etc/rebootmgr/%s_tasks/" % tasktype)):
task = os.path.join("/etc/rebootmgr/%s_tasks" % tasktype, task)
LOG.info("Run task %s" % task)
p = subprocess.Popen(task, env=env)
try:
subprocess.run(task, check=True, env=env, timeout=(task_timeout * 60))
except subprocess.CalledProcessError as e:
LOG.error("Task %s failed with return code %s. Exit" % (task, e.returncode))
sys.exit(EXIT_TASK_FAILED)
ret = p.wait(timeout=(task_timeout * 60))
except subprocess.TimeoutExpired:
p.terminate()
try:
p.wait(timeout=10)
except subprocess.TimeoutExpired:
p.kill()
LOG.error("Could not finish task %s in %i minutes. Exit" % (task, task_timeout))
LOG.error("Disable rebootmgr in consul for this node")
data = get_config(con, hostname)
Expand All @@ -84,6 +87,9 @@ def run_tasks(tasktype, con, hostname, dryrun, task_timeout):
put_config(con, hostname, data)
con.kv.delete("service/rebootmgr/reboot_in_progress")
sys.exit(EXIT_TASK_FAILED)
if ret != 0:
LOG.error("Task %s failed with return code %s. Exit" % (task, ret))
sys.exit(EXIT_TASK_FAILED)
LOG.info("task %s finished" % task)


Expand Down
68 changes: 61 additions & 7 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import subprocess

from unittest.mock import DEFAULT
from unittest.mock import MagicMock

import pytest
import consul
Expand Down Expand Up @@ -40,6 +41,57 @@ def fake_gethostname():
c.agent.service.deregister(name)


@pytest.fixture
def mock_subprocess_popen(mocker):
"""
Fixture for testing with mocked `subprocess.Popen`.
Returns a configured `MagicMock` instance.
You can optionally pass a `side_effect` as a second argument
which will be used as a side_effect for Popen.wait.
`side_effect` can be an Exception and will then be raised;
see the `MagicMock.side_effect` documentation for more information.
Example:
mocked_popen = mock_subprocess_popen(["reboot"])
call_your_tested_code()
mocked_popen.assert_any_call(["reboot"])
mocked_popen.wait.assert_called()
"""
wait_results = {}

def get_wait_result(command):
if isinstance(command, str):
command = [command]
elif isinstance(command, list):
pass
else:
raise ValueError("command must be either string or list")

return wait_results[json.dumps(command)]

def get_mocked_popen(command, *args, **kwargs):
mock = MagicMock()
return_value, side_effect = get_wait_result(command)
mock.wait.return_value = return_value
mock.wait.side_effect = side_effect
return mock

mocked_popen = mocker.patch("subprocess.Popen")
mocked_popen.side_effect = get_mocked_popen

def add(command, wait_return_value=None, wait_side_effect=None):
wait_results[json.dumps(command)] = wait_return_value, wait_side_effect
return mocked_popen

return add


@pytest.fixture
def mock_subprocess_run(mocker):
"""
Expand Down Expand Up @@ -101,7 +153,7 @@ def run(*args, catch_exceptions=False, **kwargs):


@pytest.fixture
def reboot_task(mocker, mock_subprocess_run):
def reboot_task(mocker, mock_subprocess_popen):
tasks = {"pre_boot": [], "post_boot": []}

def listdir(directory):
Expand All @@ -120,15 +172,17 @@ def create_task(tasktype, filename, exit_code=0, raise_timeout_expired=False):

tasks[tasktype] += [filename]

side_effect = None
if exit_code != 0:
side_effect = subprocess.CalledProcessError(exit_code, filename)
elif raise_timeout_expired:
if raise_timeout_expired:
return_value = None
side_effect = subprocess.TimeoutExpired(filename, 1234)
else:
return_value = exit_code
side_effect = None

mock_subprocess_run(
return mock_subprocess_popen(
["/etc/rebootmgr/{}_tasks/{}".format(tasktype, filename)],
side_effect)
wait_return_value=return_value,
wait_side_effect=side_effect)

return create_task

Expand Down
14 changes: 14 additions & 0 deletions tests/test_post_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,12 @@ def test_post_reboot_consul_checks_passing(
"""
mocker.patch("time.sleep")
mocked_run = mocker.patch("subprocess.run")
mocked_popen = mocker.patch("subprocess.Popen")

result = run_cli(rebootmgr, ["-v"])

mocked_run.assert_not_called()
mocked_popen.assert_not_called()
assert result.exit_code == 0


Expand All @@ -50,10 +52,12 @@ def test_post_reboot_consul_checks_failing(

mocker.patch("time.sleep")
mocked_run = mocker.patch("subprocess.run")
mocked_popen = mocker.patch("subprocess.Popen")

result = run_cli(rebootmgr, ["-v"])

mocked_run.assert_not_called()
mocked_popen.assert_not_called()
assert result.exit_code == EXIT_CONSUL_CHECKS_FAILED


Expand All @@ -66,10 +70,12 @@ def test_post_reboot_wait_until_healthy_and_are_healthy(
"""
mocker.patch("time.sleep")
mocked_run = mocker.patch("subprocess.run")
mocked_popen = mocker.patch("subprocess.Popen")

result = run_cli(rebootmgr, ["-v", "--post-reboot-wait-until-healthy"])

mocked_run.assert_not_called()
mocked_popen.assert_not_called()
assert result.exit_code == 0


Expand Down Expand Up @@ -103,10 +109,12 @@ def fake_sleep(seconds):

mocker.patch("time.sleep", new=fake_sleep)
mocked_run = mocker.patch("subprocess.run")
mocked_popen = mocker.patch("subprocess.Popen")

result = run_cli(rebootmgr, ["-v", "--post-reboot-wait-until-healthy"])

mocked_run.assert_not_called()
mocked_popen.assert_not_called()
assert sleep_counter == 0
assert result.exit_code == 0

Expand Down Expand Up @@ -135,6 +143,7 @@ def test_post_reboot_phase_fails_with_uptime(
run_cli, forward_consul_port, default_config, reboot_in_progress,
reboot_task, mocker):
mocker.patch('rebootmgr.main.open', new=mock_open(read_data='99999999.9 99999999.9'))
mocker.patch("subprocess.run")
reboot_task("post_boot", "50_another_task.sh")

result = run_cli(rebootmgr, ["-v", "--check-uptime"])
Expand All @@ -146,6 +155,7 @@ def test_post_reboot_phase_fails_with_uptime(
def test_post_reboot_succeeds_with_current_node_in_maintenance(
run_cli, consul_cluster, reboot_in_progress, forward_consul_port,
default_config, reboot_task, mocker):
mocker.patch("subprocess.run")
consul_cluster[0].agent.service.register("A", tags=["rebootmgr"])
consul_cluster[1].agent.service.register("A", tags=["rebootmgr"])
consul_cluster[2].agent.service.register("A", tags=["rebootmgr"])
Expand All @@ -163,6 +173,7 @@ def test_post_reboot_succeeds_with_current_node_in_maintenance(
def test_post_reboot_fails_with_other_node_in_maintenance(
run_cli, consul_cluster, reboot_in_progress, forward_consul_port,
default_config, reboot_task, mocker):
mocker.patch("subprocess.run")
consul_cluster[0].agent.service.register("A", tags=["rebootmgr"])
consul_cluster[1].agent.service.register("A", tags=["rebootmgr"])
consul_cluster[2].agent.service.register("A", tags=["rebootmgr"])
Expand All @@ -181,6 +192,7 @@ def test_post_reboot_succeeds_with_other_node_in_maintenance_but_ignoring(
run_cli, consul_cluster, reboot_in_progress, forward_consul_port,
default_config, reboot_task, mocker):

mocker.patch("subprocess.run")
consul_cluster[0].agent.service.register("A", tags=["rebootmgr"])
consul_cluster[1].agent.service.register("A", tags=["rebootmgr", "ignore_maintenance"])
consul_cluster[2].agent.service.register("A", tags=["rebootmgr"])
Expand Down Expand Up @@ -228,10 +240,12 @@ def fake_sleep(seconds):

mocker.patch("time.sleep", new=fake_sleep)
mocked_run = mocker.patch("subprocess.run")
mocked_popen = mocker.patch("subprocess.Popen")

result = run_cli(rebootmgr, ["-v", "--post-reboot-wait-until-healthy"])

mocked_run.assert_not_called()
mocked_popen.assert_not_called()
assert sleep_counter == 0
assert 'There were failed consul checks' in result.output
assert '_node_maintenance on consul2' in result.output
Expand Down
17 changes: 14 additions & 3 deletions tests/test_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def test_dryrun_reboot_succeeds_with_tasks(run_cli, forward_consul_port,
reboot_task, mock_subprocess_run,
mocker):
mocked_sleep = mocker.patch("time.sleep")
reboot_task("pre_boot", "00_some_task.sh")
mocked_popen = reboot_task("pre_boot", "00_some_task.sh")
mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"])

result = run_cli(rebootmgr, ["-vv", "--dryrun"])
Expand All @@ -58,8 +58,11 @@ def test_dryrun_reboot_succeeds_with_tasks(run_cli, forward_consul_port,
assert "in key service/rebootmgr/reboot_in_progress" in result.output
assert result.exit_code == 0

assert mocked_run.call_count == 1
args, kwargs = mocked_run.call_args
# shutdown must not be called
mocked_run.assert_not_called()
# task should be called
assert mocked_popen.call_count == 1
args, kwargs = mocked_popen.call_args
assert args[0] == "/etc/rebootmgr/pre_boot_tasks/00_some_task.sh"
assert 'env' in kwargs
assert 'REBOOTMGR_DRY_RUN' in kwargs['env']
Expand All @@ -80,6 +83,7 @@ def test_reboot_fail(
mock_subprocess_run, mocker):
mocked_sleep = mocker.patch("time.sleep")

mocked_popen = mocker.patch("subprocess.Popen")
mocked_run = mock_subprocess_run(
["shutdown", "-r", "+1"],
side_effect=Exception("Failed to run reboot command"))
Expand All @@ -88,6 +92,7 @@ def test_reboot_fail(

assert result.exit_code == 1

mocked_popen.assert_not_called()
mocked_run.assert_any_call(["shutdown", "-r", "+1"], check=True)

# We want rebootmgr to sleep for 2 minutes after running the pre boot tasks,
Expand All @@ -112,10 +117,12 @@ def test_reboot_succeeds_if_this_node_is_in_maintenance(
consul_cluster[0].agent.maintenance(True)

mocker.patch("time.sleep")
mocked_popen = mocker.patch("subprocess.Popen")
mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"])

result = run_cli(rebootmgr, ["-v"])

mocked_popen.assert_not_called()
mocked_run.assert_any_call(["shutdown", "-r", "+1"], check=True)
assert result.exit_code == 0

Expand All @@ -129,10 +136,12 @@ def test_reboot_fails_if_another_node_is_in_maintenance(
consul_cluster[1].agent.maintenance(True)

mocker.patch("time.sleep")
mocked_popen = mocker.patch("subprocess.Popen")
mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"])

result = run_cli(rebootmgr, ["-v"])

mocked_popen.assert_not_called()
mocked_run.assert_not_called()
assert 'There were failed consul checks' in result.output
assert '_node_maintenance on consul2' in result.output
Expand All @@ -149,9 +158,11 @@ def test_reboot_succeeds_if_another_node_is_in_maintenance_but_ignoring(
consul_cluster[1].agent.maintenance(True)

mocker.patch("time.sleep")
mocked_popen = mocker.patch("subprocess.Popen")
mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"])

result = run_cli(rebootmgr, ["-v"])

mocked_popen.assert_not_called()
mocked_run.assert_any_call(["shutdown", "-r", "+1"], check=True)
assert result.exit_code == 0
4 changes: 4 additions & 0 deletions tests/test_stopflag.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,14 @@ def test_set_global_stop_flag(
mock_subprocess_run, mocker):
mocked_sleep = mocker.patch("time.sleep")
mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"])
mocked_popen = mocker.patch("subprocess.Popen")
datacenter = "test"

result = run_cli(rebootmgr, ["-v", "--set-global-stop-flag", datacenter])

mocked_sleep.assert_not_called()
mocked_run.assert_not_called()
mocked_popen.assert_not_called()
assert "Set "+datacenter+" global stop flag:" in result.output
idx, data = consul_cluster[0].kv.get("service/rebootmgr/stop", dc=datacenter)
assert idx is not None
Expand Down Expand Up @@ -96,12 +98,14 @@ def test_set_local_stop_flag(
mock_subprocess_run, mocker):
mocked_sleep = mocker.patch("time.sleep")
mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"])
mocked_popen = mocker.patch("subprocess.Popen")
hostname = socket.gethostname().split(".")[0]

result = run_cli(rebootmgr, ["-v", "--set-local-stop-flag"])

mocked_sleep.assert_not_called()
mocked_run.assert_not_called()
mocked_popen.assert_not_called()
assert "Set "+hostname+" local stop flag:" in result.output
idx, data = consul_cluster[0].kv.get("service/rebootmgr/nodes/{}/config".format(
hostname))
Expand Down
17 changes: 12 additions & 5 deletions tests/test_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ def test_reboot_task_timeout(run_cli, consul_cluster, forward_consul_port, defau

def test_reboot_preboot_task_fails(run_cli, consul_cluster, forward_consul_port, default_config, reboot_task, mocker):
mocker.patch("time.sleep")
reboot_task("pre_boot", "00_some_task.sh", exit_code=1)
mocked_run = mocker.patch("subprocess.run")
mocked_popen = reboot_task("pre_boot", "00_some_task.sh", exit_code=1)

result = run_cli(rebootmgr)

Expand All @@ -29,13 +30,15 @@ def test_reboot_preboot_task_fails(run_cli, consul_cluster, forward_consul_port,
assert json.loads(data["Value"].decode()) == {
"enabled": True,
}
# TODO(oseibert): check that shutdown is NOT called.
assert mocked_popen.call_count == 1
mocked_run.assert_not_called()


def test_reboot_task_timeout_with_preexisting_config(run_cli, consul_cluster, forward_consul_port, reboot_task, mocker):
consul_cluster[0].kv.put("service/rebootmgr/nodes/{}/config".format(socket.gethostname()), '{"enabled": true, "test_preserved": true}')
mocker.patch("time.sleep")
reboot_task("pre_boot", "00_some_task.sh", raise_timeout_expired=True)
mocked_run = mocker.patch("subprocess.run")
mocked_popen = reboot_task("pre_boot", "00_some_task.sh", raise_timeout_expired=True)

result = run_cli(rebootmgr)

Expand All @@ -48,11 +51,13 @@ def test_reboot_task_timeout_with_preexisting_config(run_cli, consul_cluster, fo
"enabled": False,
"message": "Could not finish task /etc/rebootmgr/pre_boot_tasks/00_some_task.sh in 120 minutes"
}
# TODO(oseibert): check that shutdown is NOT called.
assert mocked_popen.call_count == 1
mocked_run.assert_not_called()


def test_post_reboot_phase_task_timeout(run_cli, consul_cluster, forward_consul_port, default_config, reboot_task, mocker):
reboot_task("post_boot", "50_another_task.sh", raise_timeout_expired=True)
mocked_run = mocker.patch("subprocess.run")
mocked_popen = reboot_task("post_boot", "50_another_task.sh", raise_timeout_expired=True)

mocker.patch("time.sleep")
consul_cluster[0].kv.put("service/rebootmgr/reboot_in_progress", socket.gethostname())
Expand All @@ -67,3 +72,5 @@ def test_post_reboot_phase_task_timeout(run_cli, consul_cluster, forward_consul_
"enabled": False,
"message": "Could not finish task /etc/rebootmgr/post_boot_tasks/50_another_task.sh in 120 minutes"
}
assert mocked_popen.call_count == 1
mocked_run.assert_not_called()
Loading

0 comments on commit 5a9bea4

Please sign in to comment.