Skip to content

Commit

Permalink
Merge pull request #1629 from GoogleCloudPlatform/release-candidate
Browse files Browse the repository at this point in the history
Release v1.21.0 to main branch
  • Loading branch information
tpdownes authored Jul 31, 2023
2 parents 252694a + 3b40c6b commit 6113058
Show file tree
Hide file tree
Showing 98 changed files with 2,368 additions and 742 deletions.
3 changes: 3 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,6 @@ updates:
reviewers:
- ek-nag
- mattstreet-nag
# Disable version updates, do security updates only
# See https://docs.github.com/en/code-security/dependabot/dependabot-security-updates/configuring-dependabot-security-updates#overriding-the-default-behavior-with-a-configuration-file
open-pull-requests-limit: 0
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ minutes. Please consider it only for blueprints that are quickly deployed.
The HPC Toolkit officially supports the following VM images:

* HPC CentOS 7
* HPC Rocky Linux 8
* Debian 11
* Ubuntu 20.04 LTS

For more information on these and other images, see
Expand Down
30 changes: 25 additions & 5 deletions cmd/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,22 +127,42 @@ func expandOrDie(path string) config.DeploymentConfig {
return dc
}

// findPos returns the closest known source position for path. If the YAML
// context has no position recorded for path itself, it walks up the chain of
// parents until one is found. The boolean reports whether any position was
// located before the chain ran out.
func findPos(path config.Path, ctx config.YamlCtx) (config.Pos, bool) {
	pos, ok := ctx.Pos(path)
	for !ok {
		parent := path.Parent()
		if parent == nil {
			break // reached the root without finding a recorded position
		}
		path = parent
		pos, ok = ctx.Pos(path)
	}
	return pos, ok
}

func renderError(err error, ctx config.YamlCtx) string {
var me config.Errors
if errors.As(err, &me) {
var sb strings.Builder
for _, e := range me.Errors {
sb.WriteString(renderError(e, ctx))
sb.WriteString("\n")
}
return sb.String()
}

var be config.BpError
if errors.As(err, &be) {
if pos, ok := ctx.Pos(be.Path); ok {
if pos, ok := findPos(be.Path, ctx); ok {
return renderRichError(be.Err, pos, ctx)
}
}
return err.Error()
}

func renderRichError(err error, pos config.Pos, ctx config.YamlCtx) string {
pref := fmt.Sprintf("%d: ", pos.Line)
arrow := strings.Repeat(" ", len(pref)+pos.Column-1) + "^"
return fmt.Sprintf(`
Error: %s
on line %d, column %d:
%d: %s
`, err, pos.Line, pos.Column, pos.Line, ctx.Lines[pos.Line-1])
%s%s
%s`, err, pref, ctx.Lines[pos.Line-1], arrow)
}

func setCLIVariables(bp *config.Blueprint, s []string) error {
Expand Down Expand Up @@ -197,7 +217,7 @@ func setValidationLevel(bp *config.Blueprint, s string) error {
case "IGNORE":
bp.ValidationLevel = config.ValidationIgnore
default:
return fmt.Errorf("invalid validation level (\"ERROR\", \"WARNING\", \"IGNORE\")")
return errors.New("invalid validation level (\"ERROR\", \"WARNING\", \"IGNORE\")")
}
return nil
}
Expand Down
3 changes: 1 addition & 2 deletions cmd/create_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,7 @@ vars:
got := renderError(err, ctx)
c.Check(got, Equals, `
Error: arbuz
on line 3, column 9:
3: kale: dos
`)
^`)
}
}
4 changes: 2 additions & 2 deletions cmd/deploy.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ func runDeployCmd(cmd *cobra.Command, args []string) {
cobra.CheckErr(shell.ImportInputs(groupDir, artifactsDir, expandedBlueprintFile))

var err error
switch group.Kind {
switch group.Kind() {
case config.PackerKind:
// Packer groups are enforced to have length 1
subPath, e := modulewriter.DeploymentSource(group.Modules[0])
Expand All @@ -94,7 +94,7 @@ func runDeployCmd(cmd *cobra.Command, args []string) {
case config.TerraformKind:
err = deployTerraformGroup(groupDir)
default:
err = fmt.Errorf("group %s is an unsupported kind %s", groupDir, group.Kind.String())
err = fmt.Errorf("group %s is an unsupported kind %s", groupDir, group.Kind().String())
}
cobra.CheckErr(err)
}
Expand Down
4 changes: 2 additions & 2 deletions cmd/destroy.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ func runDestroyCmd(cmd *cobra.Command, args []string) error {
groupDir := filepath.Join(deploymentRoot, string(group.Name))

var err error
switch group.Kind {
switch group.Kind() {
case config.PackerKind:
// Packer groups are enforced to have length 1
// TODO: destroyPackerGroup(moduleDir)
Expand All @@ -89,7 +89,7 @@ func runDestroyCmd(cmd *cobra.Command, args []string) error {
case config.TerraformKind:
err = destroyTerraformGroup(groupDir)
default:
err = fmt.Errorf("group %s is an unsupported kind %s", groupDir, group.Kind.String())
err = fmt.Errorf("group %s is an unsupported kind %s", groupDir, group.Kind().String())
}
if err != nil {
return err
Expand Down
5 changes: 4 additions & 1 deletion cmd/export.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,12 @@ func runExportCmd(cmd *cobra.Command, args []string) error {
if err != nil {
return err
}
if group.Kind == config.PackerKind {
if group.Kind() == config.PackerKind {
return fmt.Errorf("export command is unsupported on Packer modules because they do not have outputs")
}
if group.Kind() != config.TerraformKind {
return fmt.Errorf("export command is supported for Terraform modules only")
}

tf, err := shell.ConfigureTerraform(groupDir)
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ HPC deployments on the Google Cloud Platform.`,
log.Fatalf("cmd.Help function failed: %s", err)
}
},
Version: "v1.20.0",
Version: "v1.21.0",
Annotations: annotation,
}
)
Expand Down
8 changes: 4 additions & 4 deletions community/examples/hpc-slurm-local-ssd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,17 +42,17 @@ deployment_groups:
settings:
additional_disks:
- device_name: test-disk-1
disk_name:
disk_name: null
disk_size_gb: 375
disk_type: local-ssd
disk_labels:
disk_labels: {}
auto_delete: true
boot: false
- device_name: test-disk-2
disk_name:
disk_name: null
disk_size_gb: 375
disk_type: local-ssd
disk_labels:
disk_labels: {}
auto_delete: true
boot: false
bandwidth_tier: gvnic_enabled
Expand Down
1 change: 0 additions & 1 deletion community/examples/htc-htcondor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ deployment_groups:
- id: htcondor_cm
source: modules/compute/vm-instance
use:
- network1
- htcondor_startup_central_manager
settings:
name_prefix: cm
Expand Down
10 changes: 8 additions & 2 deletions community/examples/intel/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,10 @@ The [pfs-daos.yaml](pfs-daos.yaml) blueprint describes an environment with
- Two DAOS server instances
- Two DAOS client instances

The [pfs-daos.yaml](pfs-daos.yaml) blueprint uses a Packer template and Terraform modules from the [Google Cloud DAOS][google-cloud-daos] repository.
The [pfs-daos.yaml](pfs-daos.yaml) blueprint uses a Packer template and
Terraform modules from the [Google Cloud DAOS][google-cloud-daos] repository.
Please review the [introduction to image building](../../../docs/image-building.md)
for general information on building custom images using the Toolkit.

Identify a project to work in and substitute its unique id wherever you see
`<<PROJECT_ID>>` in the instructions below.
Expand Down Expand Up @@ -391,7 +394,10 @@ The blueprint uses modules from
- [community/modules/scheduler/SchedMD-slurm-on-gcp-login-node][SchedMD-slurm-on-gcp-login-node]
- [community/modules/compute/SchedMD-slurm-on-gcp-partition][SchedMD-slurm-on-gcp-partition]
The blueprint also uses a Packer template from the [Google Cloud DAOS][google-cloud-daos] repository.
The blueprint also uses a Packer template from the [Google Cloud
DAOS][google-cloud-daos] repository. Please review the [introduction to image
building](../../../docs/image-building.md) for general information on building
custom images using the Toolkit.
Identify a project to work in and substitute its unique id wherever you see
`<<PROJECT_ID>>` in the instructions below.
Expand Down
51 changes: 45 additions & 6 deletions community/modules/compute/gke-node-pool/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,51 @@ can be overridden using the `taints` setting. See
[docs](https://cloud.google.com/kubernetes-engine/docs/how-to/node-taints) for
more info.

### Local SSD Storage
GKE offers two options for managing locally attached SSDs.

The first, and recommended, option is for GKE to manage the ephemeral storage
space on the node, which will then be automatically attached to pods which
request an `emptyDir` volume. This can be accomplished using the
[`local_ssd_count_ephemeral_storage`] variable.

The second, more complex, option is for GCP to attach these nodes as raw block
storage. In this case, the cluster administrator is responsible for software
RAID settings, partitioning, formatting and mounting these disks on the host
OS. Still, this may be desired behavior in use cases which aren't supported
by an `emptyDir` volume (for example, a `ReadOnlyMany` or `ReadWriteMany` PV).
This can be accomplished using the [`local_ssd_count_nvme_block`] variable.

The [`local_ssd_count_ephemeral_storage`] and [`local_ssd_count_nvme_block`]
variables are mutually exclusive and cannot be mixed together.

Also, the number of SSDs which can be attached to a node depends on the
[machine type](https://cloud.google.com/compute/docs/disks#local_ssd_machine_type_restrictions).

See [docs](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/local-ssd)
for more info.

[`local_ssd_count_ephemeral_storage`]: #input\_local\_ssd\_count\_ephemeral\_storage
[`local_ssd_count_nvme_block`]: #input\_local\_ssd\_count\_nvme\_block

### Considerations with GPUs

When a GPU is attached to a node an additional taint is automatically added:
`nvidia.com/gpu=present:NoSchedule`. For jobs to get placed on these nodes, the
equivalent toleration is required. The `gke-job-template` module will
automatically apply this toleration when using a node pool with GPUs.

Nvidia GPU drivers must be installed by applying a DaemonSet to the cluster. See
Nvidia GPU drivers must be installed. The recommended approach for GKE to install
GPU drivers is by applying a DaemonSet to the cluster. See
[these instructions](https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#cos).

However, in some cases it may be desired to compile a different driver (such as
a desire to install a newer version, compatibility with the
[Nvidia GPU-operator](https://github.com/NVIDIA/gpu-operator) or other
use-cases). In this case, ensure that you turn off the
[enable_secure_boot](#input\_enable\_secure\_boot) option to allow unsigned
kernel modules to be loaded.

### GPUs Examples

There are several ways to add GPUs to a GKE node pool. See
Expand Down Expand Up @@ -141,16 +176,16 @@ limitations under the License.

| Name | Version |
|------|---------|
| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 1.0 |
| <a name="requirement_google"></a> [google](#requirement\_google) | >= 4.60.0, < 5.0 |
| <a name="requirement_google-beta"></a> [google-beta](#requirement\_google-beta) | >= 4.60.0, < 5.0 |
| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 1.2 |
| <a name="requirement_google"></a> [google](#requirement\_google) | >= 4.61.0, <= 4.74.0 |
| <a name="requirement_google-beta"></a> [google-beta](#requirement\_google-beta) | >= 4.61.0, <= 4.74.0 |

## Providers

| Name | Version |
|------|---------|
| <a name="provider_google"></a> [google](#provider\_google) | >= 4.60.0, < 5.0 |
| <a name="provider_google-beta"></a> [google-beta](#provider\_google-beta) | >= 4.60.0, < 5.0 |
| <a name="provider_google"></a> [google](#provider\_google) | >= 4.61.0, <= 4.74.0 |
| <a name="provider_google-beta"></a> [google-beta](#provider\_google-beta) | >= 4.61.0, <= 4.74.0 |

## Modules

Expand Down Expand Up @@ -181,9 +216,13 @@ No modules.
| <a name="input_disk_size_gb"></a> [disk\_size\_gb](#input\_disk\_size\_gb) | Size of disk for each node. | `number` | `100` | no |
| <a name="input_disk_type"></a> [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `"pd-standard"` | no |
| <a name="input_enable_gcfs"></a> [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no |
| <a name="input_enable_secure_boot"></a> [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no |
| <a name="input_guest_accelerator"></a> [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. | <pre>list(object({<br> type = string<br> count = number<br> gpu_partition_size = string<br> gpu_sharing_config = list(object({<br> gpu_sharing_strategy = string<br> max_shared_clients_per_gpu = number<br> }))<br> }))</pre> | `null` | no |
| <a name="input_image_type"></a> [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no |
| <a name="input_kubernetes_labels"></a> [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs. <br>(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no |
| <a name="input_labels"></a> [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes |
| <a name="input_local_ssd_count_ephemeral_storage"></a> [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.<br>Uses NVMe interfaces. Must be supported by `machine_type`.<br>[See above](#local-ssd-storage) for more info. | `number` | `0` | no |
| <a name="input_local_ssd_count_nvme_block"></a> [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.<br>Uses NVMe interfaces. Must be supported by `machine_type`.<br>[See above](#local-ssd-storage) for more info. | `number` | `0` | no |
| <a name="input_machine_type"></a> [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no |
| <a name="input_name"></a> [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no |
| <a name="input_project_id"></a> [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes |
Expand Down
17 changes: 15 additions & 2 deletions community/modules/compute/gke-node-pool/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ resource "google_container_node_pool" "node_pool" {
disk_size_gb = var.disk_size_gb
disk_type = var.disk_type
resource_labels = local.labels
labels = var.kubernetes_labels
service_account = var.service_account_email
oauth_scopes = var.service_account_scopes
machine_type = var.machine_type
Expand All @@ -84,8 +85,16 @@ resource "google_container_node_pool" "node_pool" {
image_type = var.image_type
guest_accelerator = var.guest_accelerator

ephemeral_storage_local_ssd_config {
local_ssd_count = var.local_ssd_count_ephemeral_storage
}

local_nvme_ssd_block_config {
local_ssd_count = var.local_ssd_count_nvme_block
}

shielded_instance_config {
enable_secure_boot = true
enable_secure_boot = var.enable_secure_boot
enable_integrity_monitoring = true
}

Expand All @@ -97,7 +106,7 @@ resource "google_container_node_pool" "node_pool" {
}

gvnic {
enabled = true
enabled = var.image_type == "COS_CONTAINERD"
}

dynamic "advanced_machine_features" {
Expand Down Expand Up @@ -137,6 +146,10 @@ resource "google_container_node_pool" "node_pool" {
condition = !local.static_node_set || !local.autoscale_set
error_message = "static_node_count cannot be set with either autoscaling_total_min_nodes or autoscaling_total_max_nodes."
}
precondition {
condition = !(var.local_ssd_count_ephemeral_storage > 0 && var.local_ssd_count_nvme_block > 0)
error_message = "Only one of local_ssd_count_ephemeral_storage or local_ssd_count_nvme_block can be set to a non-zero value."
}
}
}

Expand Down
37 changes: 37 additions & 0 deletions community/modules/compute/gke-node-pool/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@ variable "enable_gcfs" {
default = false
}

# Wired to shielded_instance_config.enable_secure_boot in main.tf. Must be
# disabled when unsigned kernel modules (e.g. custom GPU drivers) are loaded.
variable "enable_secure_boot" {
description = "Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info."
type = bool
default = true
}

variable "guest_accelerator" {
description = "List of the type and count of accelerator cards attached to the instance."
type = list(object({
Expand All @@ -80,6 +86,28 @@ variable "image_type" {
default = "COS_CONTAINERD"
}

# Feeds ephemeral_storage_local_ssd_config.local_ssd_count in main.tf.
# Mutually exclusive with local_ssd_count_nvme_block — a precondition in
# main.tf rejects setting both to non-zero values. 0 disables the feature.
variable "local_ssd_count_ephemeral_storage" {
description = <<-EOT
The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
[See above](#local-ssd-storage) for more info.
EOT
type = number
default = 0
}

# Feeds local_nvme_ssd_block_config.local_ssd_count in main.tf (raw block
# devices; host OS handles RAID/format/mount). Mutually exclusive with
# local_ssd_count_ephemeral_storage — enforced by a precondition in main.tf.
variable "local_ssd_count_nvme_block" {
description = <<-EOT
The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
[See above](#local-ssd-storage) for more info.
EOT
type = number
default = 0
}


variable "autoscaling_total_min_nodes" {
description = "Total minimum number of nodes in the NodePool."
type = number
Expand Down Expand Up @@ -176,6 +204,15 @@ variable "labels" {
type = map(string)
}

# Applied as Kubernetes node labels via node_config.labels in main.tf —
# distinct from var.labels, which sets GCE resource labels.
variable "kubernetes_labels" {
description = <<-EOT
Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified)
EOT
type = map(string)
default = null
}

variable "timeout_create" {
description = "Timeout for creating a node pool"
type = string
Expand Down
Loading

0 comments on commit 6113058

Please sign in to comment.