Skip to content

Commit

Permalink
[jobmanager] Recover from job panics
Browse files — browse the repository at this point in the history
Signed-off-by: Dmitrii Okunev <xaionaro@meta.com>
  • Loading branch information
xaionaro committed Jul 4, 2023
1 parent fa98f00 commit dfc6f87
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 1 deletion.
4 changes: 4 additions & 0 deletions pkg/jobmanager/jobmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"time"

"github.com/facebookincubator/go-belt/beltctx"
"github.com/facebookincubator/go-belt/tool/experimental/errmon"
"github.com/insomniacslk/xjson"

"github.com/linuxboot/contest/pkg/api"
Expand Down Expand Up @@ -183,6 +184,9 @@ loop:
logging.Debugf(ev.Context, "Handling event %+v", ev)
handlerWg.Add(1)
// Handle the event in its own goroutine so the event loop is not blocked;
// the goroutine is tracked by handlerWg.
go func() {
// Deferred calls run last-in-first-out: handlerWg.Done() (deferred below)
// runs first, and this recover handler runs last. recover() is called
// directly inside the deferred closure, which is required for it to
// intercept a panic raised by jm.handleEvent.
// NOTE(review): errmon.ObserveRecoverCtx presumably reports a non-nil
// recovered value through the error monitor carried by ev.Context and is
// a no-op for nil — confirm against the go-belt errmon documentation.
defer func() {
errmon.ObserveRecoverCtx(ev.Context, recover())
}()
// Always release the wait group slot, whether handleEvent returns or panics.
defer handlerWg.Done()
jm.handleEvent(ev)
}()
Expand Down
9 changes: 8 additions & 1 deletion pkg/jobmanager/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,14 @@ func (jm *JobManager) startJob(ctx context.Context, j *job.Job, resumeState *job
logging.Debugf(ctx, "cancelling job context")
jobCancel()
}}
go jm.runJob(jobCtx, j, resumeState)

// Run the job in its own goroutine, wrapped in a closure so that a panic
// inside jm.runJob can be intercepted by the deferred recover handler.
go func() {
// recover() is called directly inside the deferred closure, which is
// required for it to intercept a panic raised by jm.runJob.
// The recovered value is passed along with ctx (startJob's context),
// not jobCtx.
// NOTE(review): errmon.ObserveRecoverCtx presumably reports a non-nil
// recovered value through the error monitor carried by ctx — confirm
// against the go-belt errmon documentation.
defer func() {
errmon.ObserveRecoverCtx(ctx, recover())
}()

jm.runJob(jobCtx, j, resumeState)
}()
}

func (jm *JobManager) runJob(ctx context.Context, j *job.Job, resumeState *job.PauseEventPayload) {
Expand Down

0 comments on commit dfc6f87

Please sign in to comment.