Skip to content

Commit

Permalink
retry various network create errors (#161)
Browse files Browse the repository at this point in the history
"hardcode" a capped backoff retry for network create requests.

while the daemon does its best to find a non-conflicting cidr range to
create a network on, when multiple concurrent requests are trying to
create a network (such as when multiple harnesses are being created),
there's a relatively common case where conflicts happen.

a "dumb" retry like this seems to be commonplace in other solutions
like
[`kind`](https://github.com/kubernetes-sigs/kind/blob/0f1c56884daf22bd4d1559f9b955c26afaaeefb5/pkg/cluster/internal/providers/docker/network.go#L84),
so this approach just adds a non-configurable retry loop with a
reasonable backoff and cap. we could get more fancy and predetermine a
free cidr space during a locked op, but that feels unnecessary for now.
  • Loading branch information
joshrwolf authored Aug 13, 2024
1 parent fcaf902 commit cb76ca3
Showing 1 changed file with 49 additions and 12 deletions.
61 changes: 49 additions & 12 deletions internal/docker/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@ package docker
import (
"context"
"fmt"
"strings"
"time"

"github.com/docker/docker/api/types/network"
"github.com/google/uuid"
"k8s.io/apimachinery/pkg/util/wait"
)

type NetworkRequest struct {
Expand Down Expand Up @@ -33,26 +36,60 @@ func (d *docker) CreateNetwork(ctx context.Context, req *NetworkRequest) (*Netwo
req.Labels = make(map[string]string)
}

resp, err := d.cli.NetworkCreate(ctx, req.Name, network.CreateOptions{
Driver: "bridge",
Labels: d.withDefaultLabels(req.Labels),
IPAM: req.IPAM,
EnableIPv6: &req.EnableIPv6,
})
if err != nil {
return nil, err
}
var (
id string
lastErr error
)
if err := wait.ExponentialBackoffWithContext(ctx, wait.Backoff{
Duration: 1 * time.Second,
Factor: 2.0,
Jitter: 0.1,
Steps: 5,
Cap: 1 * time.Minute,
}, func(ctx context.Context) (bool, error) {
resp, err := d.cli.NetworkCreate(ctx, req.Name, network.CreateOptions{
Driver: "bridge",
Labels: d.withDefaultLabels(req.Labels),
IPAM: req.IPAM,
EnableIPv6: &req.EnableIPv6,
})
if err != nil {
if isRetryableNetworkCreateError(err) {
lastErr = err
return false, nil
}
return false, err
}

if resp.ID == "" {
return false, fmt.Errorf("failed o create network: network ID is empty")
}

if resp.ID == "" {
return nil, fmt.Errorf("failed o create network: network ID is empty")
id = resp.ID
return true, nil
}); err != nil {
return nil, fmt.Errorf("creating network: %w: last error: %w", err, lastErr)
}

return &NetworkAttachment{
Name: req.Name,
ID: resp.ID,
ID: id,
}, nil
}

// RemoveNetwork asks the docker client to remove the network whose ID is
// stored in nw, and returns the client's error unchanged.
func (d *docker) RemoveNetwork(ctx context.Context, nw *NetworkAttachment) error {
	networkID := nw.ID
	return d.cli.NetworkRemove(ctx, networkID)
}

// isRetryableNetworkCreateError reports whether err should trigger another
// network-create attempt. An error qualifies when its message contains one of
// the known daemon messages listed below; a nil error never qualifies.
func isRetryableNetworkCreateError(err error) bool {
	if err == nil {
		return false
	}

	// Substrings of error messages that are treated as retryable.
	retryable := [...]string{
		"Error response from daemon: could not find an available, non-overlapping IPv4 address pool among the defaults to assign to the network",
	}

	msg := err.Error()
	for i := range retryable {
		if strings.Contains(msg, retryable[i]) {
			return true
		}
	}

	// Nothing matched, so the caller should not retry.
	return false
}

0 comments on commit cb76ca3

Please sign in to comment.