From e0e5539902b3555fad1185bbbeec0691b17d67fc Mon Sep 17 00:00:00 2001 From: Andy Bubune Amewuda Date: Fri, 7 Jun 2024 18:52:15 +0200 Subject: [PATCH 01/51] Add 'source' argument for path to prolog or epilog scripts --- .../schedmd-slurm-gcp-v6-controller/README.md | 4 ++-- .../variables.tf | 24 +++++++++++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index a41f4c732e..54bb589e2b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -229,7 +229,7 @@ limitations under the License. | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_slurm\_gcp\_plugins](#input\_enable\_slurm\_gcp\_plugins) | Enables calling hooks in scripts/slurm\_gcp\_plugins during cluster resume and suspend. | `any` | `false` | no | | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | -| [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = string
}))
| `[]` | no | +| [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = optional(string)
source = optional(string)
}))
| `[]` | no | | [extra\_logging\_flags](#input\_extra\_logging\_flags) | The list of extra flags for the logging system to use. See the logging\_flags variable in scripts/util.py to get the list of supported log flags. | `map(bool)` | `{}` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | | [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-5-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | @@ -251,7 +251,7 @@ limitations under the License. | [partitions](#input\_partitions) | Cluster partitions as a list. See module slurm\_partition. |
list(object({
partition_name = string
partition_conf = optional(map(string), {})
partition_nodeset = optional(list(string), [])
partition_nodeset_dyn = optional(list(string), [])
partition_nodeset_tpu = optional(list(string), [])
enable_job_exclusive = optional(bool, false)
}))
| n/a | yes | | [preemptible](#input\_preemptible) | Allow the instance to be preempted. | `bool` | `false` | no | | [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | -| [prolog\_scripts](#input\_prolog\_scripts) | List of scripts to be used for Prolog. Programs for the slurmd to execute
whenever it is asked to run a job step from a new job allocation.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog. |
list(object({
filename = string
content = string
}))
| `[]` | no | +| [prolog\_scripts](#input\_prolog\_scripts) | List of scripts to be used for Prolog. Programs for the slurmd to execute
whenever it is asked to run a job step from a new job allocation.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog. |
list(object({
filename = string
content = optional(string)
source = optional(string)
}))
| `[]` | no | | [region](#input\_region) | The default region to place resources in. | `string` | n/a | yes | | [service\_account](#input\_service\_account) | DEPRECATED: Use `service_account_email` and `service_account_scopes` instead. |
object({
email = string
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to attach to the controller instance. | `string` | `null` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index a91a80f387..dedf078bcd 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -519,9 +519,19 @@ See https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog. EOD type = list(object({ filename = string - content = string + content = optional(string) + source = optional(string) })) default = [] + + validation { + condition = alltrue([ + for script in var.prolog_scripts : + (script.content != null && script.source == null) || + (script.content == null && script.source != null) + ]) + error_message = "Either 'content' or 'source' must be defined, but not both." + } } variable "epilog_scripts" { @@ -532,9 +542,19 @@ See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. EOD type = list(object({ filename = string - content = string + content = optional(string) + source = optional(string) })) default = [] + + validation { + condition = alltrue([ + for script in var.epilog_scripts : + (script.content != null && script.source == null) || + (script.content == null && script.source != null) + ]) + error_message = "Either 'content' or 'source' must be defined, but not both." + } } variable "enable_external_prolog_epilog" { From 08b8ff94838dfd32ce060499f9c6d4104e7a5901 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 14 Jun 2024 19:03:06 +0000 Subject: [PATCH 02/51] Prevent mixing dynamic and non-dynamic nodesets in single partition Can not mix dynamic with non-dynamic nodesets due to Slurms inability to turn off "power management" at nodeset level (can only do it at partition or node level). 
We want to disable "power management" on dynamic nodes; We don't know names of dynamic nodes in advance; The only way is to do it at partition level. --- .../schedmd-slurm-gcp-v6-partition/outputs.tf | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf index 743cf4db7e..e75c6293f1 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf @@ -31,13 +31,11 @@ output "partitions" { } precondition { - condition = local.has_node || local.has_dyn || local.has_tpu - error_message = "Partition must contain at least one type of nodeset." - } - - precondition { - condition = ((!local.has_node || !local.has_dyn) && local.has_tpu) || ((local.has_node || local.has_dyn) && !local.has_tpu) - error_message = "Partition cannot contain TPU and non-TPU nodesets." + # Can not mix TPU with other non-TPU nodesets due to SlurmGCP specific limitations; + # Can not mix dynamic with non-dynamic nodesets due to Slurms inability to + # turn off "power management" at nodeset level (can only do it at partition or node level). + condition = sum([for b in [local.has_node, local.has_dyn, local.has_tpu] : b ? 1 : 0]) == 1 + error_message = "Partition must contain exactly one type of nodeset." 
} } From 5674a88d3d5faab5689240a497eff3b4e32ddd19 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Fri, 14 Jun 2024 09:36:17 -0700 Subject: [PATCH 03/51] Minor updates to create rc script --- tools/cloud-build/babysit/run | 2 +- tools/create-release-candidate.sh | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/cloud-build/babysit/run b/tools/cloud-build/babysit/run index 951291529a..1422718f5f 100755 --- a/tools/cloud-build/babysit/run +++ b/tools/cloud-build/babysit/run @@ -22,7 +22,7 @@ if __name__ == "__main__": Probably Guido van Rossum Usage: - $ tools/cloud-build/babysit/run --pr #### --auto --project + $ tools/cloud-build/babysit/run --pr #### --auto """ import os import sys diff --git a/tools/create-release-candidate.sh b/tools/create-release-candidate.sh index 2b7a5bfb22..1d113dbcd6 100644 --- a/tools/create-release-candidate.sh +++ b/tools/create-release-candidate.sh @@ -96,6 +96,7 @@ git switch -c "${RC_BRANCH}" develop echo "Creating new Toolkit release-candidate branch" git push -u "${REMOTE_NAME}" "${RC_BRANCH}" git switch -c "${V_BRANCH}" "${RC_BRANCH}" +echo "converting old v${OLD_MAJOR}.${OLD_MINOR}.${OLD_PATCH} to new ${NEW_TAG}" git sed "v${OLD_MAJOR}\.${OLD_MINOR}\.${OLD_PATCH}" "${NEW_TAG}" -- **/*.go **/versions.tf git add -u echo "Creating new branch with version update to ${NEW_VERSION}" @@ -113,4 +114,4 @@ echo echo echo "Consider running the test babysitter using the pull request number from above:" echo -echo "babysit.py --pr ${PR_NUM} --all -c 1" +echo "tools/cloud-build/babysit/run --pr --all -c 1" From d21ca5435599ca14038db9afbbc98317c0acdba0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Jun 2024 10:29:26 +0000 Subject: [PATCH 04/51] Bump google.golang.org/api from 0.183.0 to 0.184.0 Bumps [google.golang.org/api](https://github.com/googleapis/google-api-go-client) from 0.183.0 to 0.184.0. 
- [Release notes](https://github.com/googleapis/google-api-go-client/releases) - [Changelog](https://github.com/googleapis/google-api-go-client/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-api-go-client/compare/v0.183.0...v0.184.0) --- updated-dependencies: - dependency-name: google.golang.org/api dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 18 +++++++++--------- go.sum | 40 ++++++++++++++++++++-------------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/go.mod b/go.mod index b284491f4a..85b3fe8237 100644 --- a/go.mod +++ b/go.mod @@ -15,7 +15,7 @@ require ( github.com/spf13/cobra v1.8.0 github.com/zclconf/go-cty v1.14.4 golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa - google.golang.org/genproto v0.0.0-20240528184218-531527333157 // indirect + google.golang.org/genproto v0.0.0-20240604185151-ef581f913117 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c gopkg.in/yaml.v3 v3.0.1 ) @@ -27,7 +27,7 @@ require ( github.com/hashicorp/terraform-exec v0.21.0 github.com/mattn/go-isatty v0.0.20 github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b - google.golang.org/api v0.183.0 + google.golang.org/api v0.184.0 ) require ( @@ -50,12 +50,12 @@ require ( go.opentelemetry.io/otel v1.24.0 // indirect go.opentelemetry.io/otel/metric v1.24.0 // indirect go.opentelemetry.io/otel/trace v1.24.0 // indirect - golang.org/x/mod v0.16.0 // indirect + golang.org/x/mod v0.17.0 // indirect golang.org/x/sync v0.7.0 // indirect golang.org/x/time v0.5.0 // indirect - golang.org/x/tools v0.15.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20240521202816-d264139d666e // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240528184218-531527333157 // indirect + golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157 // 
indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240604185151-ef581f913117 // indirect ) require ( @@ -95,11 +95,11 @@ require ( github.com/ulikunitz/xz v0.5.10 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect go.opencensus.io v0.24.0 // indirect - golang.org/x/crypto v0.23.0 // indirect - golang.org/x/net v0.25.0 // indirect + golang.org/x/crypto v0.24.0 // indirect + golang.org/x/net v0.26.0 // indirect golang.org/x/oauth2 v0.21.0 // indirect golang.org/x/sys v0.21.0 - golang.org/x/text v0.15.0 // indirect + golang.org/x/text v0.16.0 // indirect google.golang.org/grpc v1.64.0 // indirect google.golang.org/protobuf v1.34.1 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect diff --git a/go.sum b/go.sum index 039c3a4062..19d4c4defb 100644 --- a/go.sum +++ b/go.sum @@ -534,8 +534,8 @@ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/crypto v0.23.0 h1:dIJU/v2J8Mdglj/8rJ6UUOM3Zc9zLZxVZwwxMooUSAI= -golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI= +golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -574,8 +574,8 @@ golang.org/x/mod v0.4.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod 
v0.4.1/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.16.0 h1:QX4fJ0Rr5cPQCF7O9lh9Se4pmwfwskqZfq5moyldzic= -golang.org/x/mod v0.16.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180811021610-c39426892332/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -625,8 +625,8 @@ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug golang.org/x/net v0.0.0-20220909164309-bea034e7d591/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= golang.org/x/net v0.0.0-20221014081412-f15817d10f9b/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= -golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= -golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= +golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -743,8 +743,8 @@ golang.org/x/sys v0.21.0/go.mod 
h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.20.0 h1:VnkxpohqXaOBYJtBmEppKUG6mXpi+4O6purfc2+sMhw= -golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= +golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= +golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -755,8 +755,8 @@ golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= -golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= +golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -814,8 +814,8 @@ golang.org/x/tools v0.1.3/go.mod 
h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.4/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.5/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.15.0 h1:zdAyfUGbYmuVokhzVmghFl2ZJh5QhcfebBgmVPFYA+8= -golang.org/x/tools v0.15.0/go.mod h1:hpksKq4dtpQWS1uQ61JkdqWM3LscIS6Slf+VVkm+wQk= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -874,8 +874,8 @@ google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ google.golang.org/api v0.97.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.98.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.100.0/go.mod h1:ZE3Z2+ZOr87Rx7dqFsdRQkRBk36kDtp/h+QpHbB7a70= -google.golang.org/api v0.183.0 h1:PNMeRDwo1pJdgNcFQ9GstuLe/noWKIc89pRWRLMvLwE= -google.golang.org/api v0.183.0/go.mod h1:q43adC5/pHoSZTx5h2mSmdF7NcyfW9JuDyIOJAgS9ZQ= +google.golang.org/api v0.184.0 h1:dmEdk6ZkJNXy1JcDhn/ou0ZUq7n9zropG2/tR4z+RDg= +google.golang.org/api v0.184.0/go.mod h1:CeDTtUEiYENAf8PPG5VZW2yNp2VM3VWbCeTioAZBTBA= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -984,12 +984,12 @@ 
google.golang.org/genproto v0.0.0-20221010155953-15ba04fc1c0e/go.mod h1:3526vdqw google.golang.org/genproto v0.0.0-20221014173430-6e2ab493f96b/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221014213838-99cd37c6964a/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221025140454-527a21cfbd71/go.mod h1:9qHF0xnpdSfF6knlcsnpzUu5y+rpwgbvsyGAZPBMg4s= -google.golang.org/genproto v0.0.0-20240528184218-531527333157 h1:u7WMYrIrVvs0TF5yaKwKNbcJyySYf+HAIFXxWltJOXE= -google.golang.org/genproto v0.0.0-20240528184218-531527333157/go.mod h1:ubQlAQnzejB8uZzszhrTCU2Fyp6Vi7ZE5nn0c3W8+qQ= -google.golang.org/genproto/googleapis/api v0.0.0-20240521202816-d264139d666e h1:SkdGTrROJl2jRGT/Fxv5QUf9jtdKCQh4KQJXbXVLAi0= -google.golang.org/genproto/googleapis/api v0.0.0-20240521202816-d264139d666e/go.mod h1:LweJcLbyVij6rCex8YunD8DYR5VDonap/jYl3ZRxcIU= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240528184218-531527333157 h1:Zy9XzmMEflZ/MAaA7vNcoebnRAld7FsPW1EeBB7V0m8= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240528184218-531527333157/go.mod h1:EfXuqaE1J41VCDicxHzUDm+8rk+7ZdXzHV0IhO/I6s0= +google.golang.org/genproto v0.0.0-20240604185151-ef581f913117 h1:HCZ6DlkKtCDAtD8ForECsY3tKuaR+p4R3grlK80uCCc= +google.golang.org/genproto v0.0.0-20240604185151-ef581f913117/go.mod h1:lesfX/+9iA+3OdqeCpoDddJaNxVB1AB6tD7EfqMmprc= +google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157 h1:7whR9kGa5LUwFtpLm2ArCEejtnxlGeLbAyjFY8sGNFw= +google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157/go.mod h1:99sLkeliLXfdj2J75X3Ho+rrVCaJze0uwN7zDDkjPVU= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240604185151-ef581f913117 h1:1GBuWVLM/KMVUv1t1En5Gs+gFZCNd360GGb4sSxtrhU= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240604185151-ef581f913117/go.mod h1:EfXuqaE1J41VCDicxHzUDm+8rk+7ZdXzHV0IhO/I6s0= google.golang.org/grpc v1.19.0/go.mod 
h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= From af4b8287e0cd61b6d185afee0d903c128b5b37f5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Jun 2024 10:29:48 +0000 Subject: [PATCH 05/51] Bump github.com/spf13/cobra from 1.8.0 to 1.8.1 Bumps [github.com/spf13/cobra](https://github.com/spf13/cobra) from 1.8.0 to 1.8.1. - [Release notes](https://github.com/spf13/cobra/releases) - [Commits](https://github.com/spf13/cobra/compare/v1.8.0...v1.8.1) --- updated-dependencies: - dependency-name: github.com/spf13/cobra dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/go.mod b/go.mod index b284491f4a..b0ad329c95 100644 --- a/go.mod +++ b/go.mod @@ -12,7 +12,7 @@ require ( github.com/otiai10/copy v1.14.0 github.com/pkg/errors v0.9.1 github.com/spf13/afero v1.11.0 - github.com/spf13/cobra v1.8.0 + github.com/spf13/cobra v1.8.1 github.com/zclconf/go-cty v1.14.4 golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa google.golang.org/genproto v0.0.0-20240528184218-531527333157 // indirect diff --git a/go.sum b/go.sum index 039c3a4062..e4962f26bb 100644 --- a/go.sum +++ b/go.sum @@ -230,7 +230,7 @@ github.com/cncf/xds/go v0.0.0-20210805033703-aa0b78936158/go.mod h1:eXthEFrGJvWH github.com/cncf/xds/go v0.0.0-20210922020428-25de7278fc84/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cncf/xds/go v0.0.0-20211001041855-01bcc9b48dfe/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cncf/xds/go v0.0.0-20211011173535-cb28da3451f1/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= -github.com/cpuguy83/go-md2man/v2 v2.0.3/go.mod 
h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/cyphar/filepath-securejoin v0.2.4 h1:Ugdm7cg7i6ZK6x3xDF1oEu1nfkyfH53EtKeQYTC3kyg= github.com/cyphar/filepath-securejoin v0.2.4/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= @@ -472,8 +472,8 @@ github.com/skeema/knownhosts v1.2.2/go.mod h1:xYbVRSPxqBZFrdmDyMmsOs+uX1UZC3nTN3 github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= github.com/spf13/afero v1.11.0 h1:WJQKhtpdm3v2IzqG8VMqrr6Rf3UYpEF239Jy9wNepM8= github.com/spf13/afero v1.11.0/go.mod h1:GH9Y3pIexgf1MTIWtNGyogA5MwRIDXGUr+hbWNoBjkY= -github.com/spf13/cobra v1.8.0 h1:7aJaZx1B85qltLMc546zn58BxxfZdR/W22ej9CFoEf0= -github.com/spf13/cobra v1.8.0/go.mod h1:WXLWApfZ71AjXPya3WOlMsY9yMs7YeiHhFVlvLyhcho= +github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= +github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= From dc09af945fb4018461b6287111cafbb07805ba61 Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Fri, 14 Jun 2024 21:20:44 +0000 Subject: [PATCH 06/51] Add GKE integration test for ml-gke blueprint --- .../daily-tests/builds/ml-gke.yaml | 62 +++++++++++++++++++ .../cloud-build/daily-tests/tests/ml-gke.yml | 22 +++++++ 2 files changed, 84 insertions(+) create mode 100644 tools/cloud-build/daily-tests/builds/ml-gke.yaml create mode 100644 tools/cloud-build/daily-tests/tests/ml-gke.yml diff --git a/tools/cloud-build/daily-tests/builds/ml-gke.yaml 
b/tools/cloud-build/daily-tests/builds/ml-gke.yaml new file mode 100644 index 0000000000..80033c5f9f --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/ml-gke.yaml @@ -0,0 +1,62 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +tags: +- m.gke-cluster +- m.gke-job-template +- m.gke-node-pool +- m.vpc +- gke + +timeout: 14400s # 4hr + +steps: +- id: gke + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + cd /workspace && make + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + SG_EXAMPLE=community/examples/ml-gke.yaml + + # adding vm to act as remote node + echo ' - id: remote-node' >> $${SG_EXAMPLE} + echo ' source: modules/compute/vm-instance' >> $${SG_EXAMPLE} + echo ' use: [network1]' >> $${SG_EXAMPLE} + echo ' settings:' >> $${SG_EXAMPLE} + echo ' machine_type: e2-standard-2' >> $${SG_EXAMPLE} + echo ' zone: asia-southeast1-b' >> $${SG_EXAMPLE} + + echo ' - id: ubuntu_pool' >> $${SG_EXAMPLE} + echo ' source: community/modules/compute/gke-node-pool' >> $${SG_EXAMPLE} + echo ' use: [gke_cluster]' >> $${SG_EXAMPLE} + echo ' settings: {name: ubuntu, image_type: UBUNTU_CONTAINERD}' >> $${SG_EXAMPLE} + + # avoids conflict with other tests + sed -i "s/gke-subnet/gke-subnet-$${BUILD_ID_SHORT}/" $${SG_EXAMPLE} + + IP=$(curl 
ifconfig.me) + sed -i "s//$${IP}/" $${SG_EXAMPLE} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/ml-gke.yml" diff --git a/tools/cloud-build/daily-tests/tests/ml-gke.yml b/tools/cloud-build/daily-tests/tests/ml-gke.yml new file mode 100644 index 0000000000..c45659a1b1 --- /dev/null +++ b/tools/cloud-build/daily-tests/tests/ml-gke.yml @@ -0,0 +1,22 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +test_name: ml-gke +deployment_name: ml-gke-{{ build }} +zone: asia-southeast1-b # for remote node +workspace: /workspace +blueprint_yaml: "{{ workspace }}/community/examples/ml-gke.yaml" +network: "{{ deployment_name }}-net" +remote_node: "{{ deployment_name }}-0" +post_deploy_tests: [] From 357fd9d8786d33660b9213e2d31797d3d74a31ed Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 18 Jun 2024 01:23:34 +0000 Subject: [PATCH 07/51] Bump urllib3 from 1.26.18 to 1.26.19 in /community/front-end/ofe Bumps [urllib3](https://github.com/urllib3/urllib3) from 1.26.18 to 1.26.19. 
- [Release notes](https://github.com/urllib3/urllib3/releases) - [Changelog](https://github.com/urllib3/urllib3/blob/1.26.19/CHANGES.rst) - [Commits](https://github.com/urllib3/urllib3/compare/1.26.18...1.26.19) --- updated-dependencies: - dependency-name: urllib3 dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- community/front-end/ofe/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index 45059632b8..d39ac99799 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -92,7 +92,7 @@ tomlkit==0.11.8 typing-inspect==0.9.0 typing_extensions==4.6.3 uritemplate==4.1.1 -urllib3==1.26.18 +urllib3==1.26.19 uvicorn==0.22.0 virtualenv==20.23.1 wrapt==1.15.0 From 0d2f2dda377a1558e5c830f008845def5632726d Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Thu, 13 Jun 2024 17:47:46 +0000 Subject: [PATCH 08/51] Added option to enable access to GKE cluster via all GCP public IP address space * updated the readme for the new variable in the module --- community/modules/scheduler/gke-cluster/README.md | 1 + community/modules/scheduler/gke-cluster/main.tf | 1 + community/modules/scheduler/gke-cluster/variables.tf | 6 ++++++ 3 files changed, 8 insertions(+) diff --git a/community/modules/scheduler/gke-cluster/README.md b/community/modules/scheduler/gke-cluster/README.md index f98ce0fcc5..2a027200fd 100644 --- a/community/modules/scheduler/gke-cluster/README.md +++ b/community/modules/scheduler/gke-cluster/README.md @@ -122,6 +122,7 @@ limitations under the License. | [enable\_private\_endpoint](#input\_enable\_private\_endpoint) | (Beta) Whether the master's internal IP address is used as the cluster endpoint. 
| `bool` | `true` | no | | [enable\_private\_ipv6\_google\_access](#input\_enable\_private\_ipv6\_google\_access) | The private IPv6 google access type for the VMs in this subnet. | `bool` | `true` | no | | [enable\_private\_nodes](#input\_enable\_private\_nodes) | (Beta) Whether nodes have internal IP addresses only. | `bool` | `true` | no | +| [gcp\_public\_cidrs\_access\_enabled](#input\_gcp\_public\_cidrs\_access\_enabled) | Whether the cluster master is accessible via all the Google Compute Engine Public IPs. To view this list of IP addresses look here https://cloud.google.com/compute/docs/faq#find_ip_range | `bool` | `false` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | | [maintenance\_exclusions](#input\_maintenance\_exclusions) | List of maintenance exclusions. A cluster can have up to three. |
list(object({
name = string
start_time = string
end_time = string
exclusion_scope = string
}))
| `[]` | no | | [maintenance\_start\_time](#input\_maintenance\_start\_time) | Start time for daily maintenance operations. Specified in GMT with `HH:MM` format. | `string` | `"09:00"` | no | diff --git a/community/modules/scheduler/gke-cluster/main.tf b/community/modules/scheduler/gke-cluster/main.tf index 8d6e048b43..59e6822a19 100644 --- a/community/modules/scheduler/gke-cluster/main.tf +++ b/community/modules/scheduler/gke-cluster/main.tf @@ -64,6 +64,7 @@ resource "google_container_cluster" "gke_cluster" { display_name = cidr_blocks.value.display_name } } + gcp_public_cidrs_access_enabled = var.gcp_public_cidrs_access_enabled } private_ipv6_google_access = var.enable_private_ipv6_google_access ? "PRIVATE_IPV6_GOOGLE_ACCESS_TO_GOOGLE" : null diff --git a/community/modules/scheduler/gke-cluster/variables.tf b/community/modules/scheduler/gke-cluster/variables.tf index 5ace7cae91..3cf7c7dc5f 100644 --- a/community/modules/scheduler/gke-cluster/variables.tf +++ b/community/modules/scheduler/gke-cluster/variables.tf @@ -214,6 +214,12 @@ variable "enable_master_global_access" { default = false } +variable "gcp_public_cidrs_access_enabled" { + description = "Whether the cluster master is accessible via all the Google Compute Engine Public IPs. To view this list of IP addresses look here https://cloud.google.com/compute/docs/faq#find_ip_range" + type = bool + default = false +} + variable "master_authorized_networks" { description = "External network that can access Kubernetes master through HTTPS. Must be specified in CIDR notation." 
type = list(object({ From 570a15197156c43d91928da2950019a253cfc06e Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Thu, 20 Jun 2024 03:21:21 +0000 Subject: [PATCH 09/51] Create 'pre-existing-gke-cluster' module --- .../pre-existing-gke-cluster/README.md | 88 +++++++++++++++++++ .../pre-existing-gke-cluster/main.tf | 21 +++++ .../pre-existing-gke-cluster/metadata.yaml | 19 ++++ .../pre-existing-gke-cluster/outputs.tf | 28 ++++++ .../pre-existing-gke-cluster/variables.tf | 30 +++++++ .../pre-existing-gke-cluster/versions.tf | 30 +++++++ modules/README.md | 3 + 7 files changed, 219 insertions(+) create mode 100644 community/modules/scheduler/pre-existing-gke-cluster/README.md create mode 100644 community/modules/scheduler/pre-existing-gke-cluster/main.tf create mode 100644 community/modules/scheduler/pre-existing-gke-cluster/metadata.yaml create mode 100644 community/modules/scheduler/pre-existing-gke-cluster/outputs.tf create mode 100644 community/modules/scheduler/pre-existing-gke-cluster/variables.tf create mode 100644 community/modules/scheduler/pre-existing-gke-cluster/versions.tf diff --git a/community/modules/scheduler/pre-existing-gke-cluster/README.md b/community/modules/scheduler/pre-existing-gke-cluster/README.md new file mode 100644 index 0000000000..50676ba8d6 --- /dev/null +++ b/community/modules/scheduler/pre-existing-gke-cluster/README.md @@ -0,0 +1,88 @@ +## Description + +This module discovers a Google Kubernetes Engine ([GKE](https://cloud.google.com/kubernetes-engine)) cluster that already exists in Google Cloud and +outputs cluster attributes that uniquely identify it for use by other modules. +The module outputs are aligned with the [gke-cluster module][gke-cluster] so that it can be used +as a drop-in substitute when a GKE cluster already exists. + +The below sample blueprint discovers the existing GKE cluster named "my-gke-cluster" in "us-central1" region. 
With the `use` keyword, the +[gke-node-pool] module accepts the `cluster_id` +input variable that uniquely identifies the existing GKE cluster in which the +GKE node pool will be created. + +[gke-cluster]: ../gke-cluster/README.md +[gke-node-pool]: ../../compute/gke-node-pool/README.md + +### Example + +```yaml +- id: existing-gke-cluster + source: community/modules/scheduler/pre-existing-gke-cluster + settings: + project_id: $(vars.project_id) + cluster_name: my-gke-cluster + region: us-central1 + +- id: compute_pool + source: community/modules/compute/gke-node-pool + use: [existing-gke-cluster] +``` + +> **_NOTE:_** The `project_id` and `region` settings would be inferred from the +> deployment variables of the same name, but they are included here for clarity. + +## License + + +Copyright 2024 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 0.14.0 | +| [google](#requirement\_google) | > 5.0 | + +## Providers + +| Name | Version | +|------|---------| +| [google](#provider\_google) | > 5.0 | + +## Modules + +No modules. 
+ +## Resources + +| Name | Type | +|------|------| +| [google_container_cluster.existing_gke_cluster](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/container_cluster) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [cluster\_name](#input\_cluster\_name) | Name of the existing cluster | `string` | n/a | yes | +| [project\_id](#input\_project\_id) | Project that hosts the existing cluster | `string` | n/a | yes | +| [region](#input\_region) | Region in which to search for the cluster | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [cluster\_id](#output\_cluster\_id) | An identifier for the gke cluster with format projects/{{project\_id}}/locations/{{region}}/clusters/{{cluster\_name}}. | +| [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster exists. | + diff --git a/community/modules/scheduler/pre-existing-gke-cluster/main.tf b/community/modules/scheduler/pre-existing-gke-cluster/main.tf new file mode 100644 index 0000000000..c59e35e8da --- /dev/null +++ b/community/modules/scheduler/pre-existing-gke-cluster/main.tf @@ -0,0 +1,21 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +data "google_container_cluster" "existing_gke_cluster" { + name = var.cluster_name + project = var.project_id + location = var.region +} diff --git a/community/modules/scheduler/pre-existing-gke-cluster/metadata.yaml b/community/modules/scheduler/pre-existing-gke-cluster/metadata.yaml new file mode 100644 index 0000000000..17bedb471b --- /dev/null +++ b/community/modules/scheduler/pre-existing-gke-cluster/metadata.yaml @@ -0,0 +1,19 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +spec: + requirements: + services: + - container.googleapis.com diff --git a/community/modules/scheduler/pre-existing-gke-cluster/outputs.tf b/community/modules/scheduler/pre-existing-gke-cluster/outputs.tf new file mode 100644 index 0000000000..6166aa6bad --- /dev/null +++ b/community/modules/scheduler/pre-existing-gke-cluster/outputs.tf @@ -0,0 +1,28 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +output "cluster_id" { + description = "An identifier for the gke cluster with format projects/{{project_id}}/locations/{{region}}/clusters/{{cluster_name}}." + value = data.google_container_cluster.existing_gke_cluster.id +} + +output "gke_cluster_exists" { + description = "A static flag that signals to downstream modules that a cluster exists." + value = true + depends_on = [ + data.google_container_cluster.existing_gke_cluster + ] +} diff --git a/community/modules/scheduler/pre-existing-gke-cluster/variables.tf b/community/modules/scheduler/pre-existing-gke-cluster/variables.tf new file mode 100644 index 0000000000..5d2121ba69 --- /dev/null +++ b/community/modules/scheduler/pre-existing-gke-cluster/variables.tf @@ -0,0 +1,30 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +variable "project_id" { + description = "Project that hosts the existing cluster" + type = string +} + +variable "cluster_name" { + description = "Name of the existing cluster" + type = string +} + +variable "region" { + description = "Region in which to search for the cluster" + type = string +} diff --git a/community/modules/scheduler/pre-existing-gke-cluster/versions.tf b/community/modules/scheduler/pre-existing-gke-cluster/versions.tf new file mode 100644 index 0000000000..9083cd5299 --- /dev/null +++ b/community/modules/scheduler/pre-existing-gke-cluster/versions.tf @@ -0,0 +1,30 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +terraform { + required_providers { + google = { + source = "hashicorp/google" + version = "> 5.0" + } + } + + provider_meta "google" { + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.35.0" + } + + required_version = ">= 0.14.0" +} diff --git a/modules/README.md b/modules/README.md index b9475195e5..bae9d14143 100644 --- a/modules/README.md +++ b/modules/README.md @@ -184,6 +184,8 @@ Pub/Sub subscription. Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca submission of Google Cloud Batch jobs. * **[gke-cluster]** ![community-badge] ![experimental-badge] : Creates a Kubernetes cluster using GKE. 
+* **[pre-existing-gke-cluster]** ![community-badge] ![experimental-badge] : Retrieves an existing + GKE cluster's attributes to be used in other modules as a substitute for creating a new cluster ([gke-cluster]). * **[schedmd-slurm-gcp-v5-controller]** ![community-badge] : Creates a Slurm controller node using [slurm-gcp-version-5]. * **[schedmd-slurm-gcp-v5-login]** ![community-badge] : @@ -210,6 +212,7 @@ Pub/Sub subscription. Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca [batch-job-template]: ../modules/scheduler/batch-job-template/README.md [batch-login-node]: ../modules/scheduler/batch-login-node/README.md [gke-cluster]: ../community/modules/scheduler/gke-cluster/README.md +[pre-existing-gke-cluster]: ../community/modules/scheduler/pre-existing-gke-cluster/README.md [htcondor-setup]: ../community/modules/scheduler/htcondor-setup/README.md [htcondor-pool-secrets]: ../community/modules/scheduler/htcondor-pool-secrets/README.md [htcondor-access-point]: ../community/modules/scheduler/htcondor-access-point/README.md From 2603699fa008fa6348ea61cda261bd445e72a27c Mon Sep 17 00:00:00 2001 From: aneo-ssam Date: Thu, 20 Jun 2024 15:37:12 +0200 Subject: [PATCH 10/51] Added disk_type for htcondor EP template --- community/modules/compute/htcondor-execute-point/main.tf | 1 + .../modules/compute/htcondor-execute-point/variables.tf | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index 35740ac547..0810cdbe50 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -151,6 +151,7 @@ module "execute_point_instance_template" { machine_type = var.machine_type disk_size_gb = var.disk_size_gb + disk_type = var.disk_type gpu = one(local.guest_accelerator) preemptible = var.spot startup_script = local.is_windows_image ? 
null : module.startup_script.startup_script diff --git a/community/modules/compute/htcondor-execute-point/variables.tf b/community/modules/compute/htcondor-execute-point/variables.tf index 75fde2b84c..2c54efe9ff 100644 --- a/community/modules/compute/htcondor-execute-point/variables.tf +++ b/community/modules/compute/htcondor-execute-point/variables.tf @@ -174,6 +174,12 @@ variable "disk_size_gb" { default = 100 } +variable "disk_type" { + description = "Disk type for template" + type = string + default = "pd-standard" +} + variable "windows_startup_ps1" { description = "Startup script to run at boot-time for Windows-based HTCondor execute points" type = list(string) From c5943a46e4ad17ab3ed7c0895c48eae36c9c44a9 Mon Sep 17 00:00:00 2001 From: aneo-ssam Date: Thu, 20 Jun 2024 16:46:46 +0200 Subject: [PATCH 11/51] Pre-commit and checks --- community/modules/compute/htcondor-execute-point/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index 181ae3a91e..dabaa58fef 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -229,6 +229,7 @@ limitations under the License. | [central\_manager\_ips](#input\_central\_manager\_ips) | List of IP addresses of HTCondor Central Managers | `list(string)` | n/a | yes | | [deployment\_name](#input\_deployment\_name) | HPC Toolkit deployment name. HTCondor cloud resource names will include this value. 
| `string` | n/a | yes | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `number` | `100` | no | +| [disk\_type](#input\_disk\_type) | Disk type for template | `string` | `"pd-standard"` | no | | [distribution\_policy\_target\_shape](#input\_distribution\_policy\_target\_shape) | Target shape across zones for instance group managing execute points | `string` | `"ANY"` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no | | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration (var.shielded\_instance\_config). | `bool` | `false` | no | From a6431f382985a05d0a8e0c97887592c229c124ad Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 20 Jun 2024 23:23:31 +0000 Subject: [PATCH 12/51] Mention `ghpc deploy` in `cmd/README.md` --- cmd/README.md | 83 ++++++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 38 deletions(-) diff --git a/cmd/README.md b/cmd/README.md index 0309a03287..2127a14735 100644 --- a/cmd/README.md +++ b/cmd/README.md @@ -7,30 +7,38 @@ clusters, also referred to as the gHPC Engine. ### Usage - ghpc -`ghpc [FLAGS]` - -`ghpc [SUBCOMMAND]` +```bash +ghpc [FLAGS] +ghpc [SUBCOMMAND] +``` ### Subcommands - ghpc -[create](#ghpc-create): Create a new deployment +* [`deploy`](#ghpc-deploy): Deploy an HPC cluster on Google Cloud +* [`create`](#ghpc-create): Create a new deployment +* [`expand`](#ghpc-expand): Expand the blueprint without creating a new deployment +* [`completion`](#ghpc-completion): Generate completion script +* [`help`](#ghpc-help): Display help information for any command -[expand](#ghpc-expand): Expand the blueprint without creating a new deployment +### Flags - ghpc -[completion](#ghpc-completion): Generate completion script +* `-h, --help`: displays detailed help for the ghpc command. 
+* `-v, --version`: displays the version of ghpc being used. -[help](#ghpc-help): Display help information for any command +### Example - ghpc -### Flags - ghpc +```bash +ghpc --version +``` -+ -h, --help: displays detailed help for the ghpc command. +## ghpc deploy -+ -v, --version: displays the version of ghpc being used. +`ghpc deploy` deploys an HPC cluster on Google Cloud using the deployment directory created by `ghpc create` or creates one from a supplied blueprint file. -### Example - ghpc +### Usage - deploy ```bash -ghpc --version +ghpc deploy (DEPLOYMENT_DIRECTORY | BLUEPRINT_FILE) [flags] ``` ## ghpc create @@ -39,38 +47,37 @@ ghpc --version ### Usage - create -`ghpc create BLUEPRINT_NAME [FLAGS]` +```sh +ghpc create BLUEPRINT_FILE [FLAGS] +``` ### Positional arguments - create -`BLUEPRINT_NAME`: the name of the blueprint file that is used for the deployment. +`BLUEPRINT_FILE`: the name of the blueprint file that is used for the deployment. ### Flags - create -+ `--backend-config strings`: Comma-separated list of name=value variables to set Terraform backend configuration. Can be used multiple times. - -+ `-h, --help`: display detailed help for the create command. - -+ `-o, --out string`: sets the output directory where the HPC deployment directory will be created. - -+ `-w, --overwrite-deployment`: If specified, an existing deployment directory is overwritten by the new deployment. - - + Terraform state IS preserved. - + Terraform workspaces are NOT supported (behavior undefined). - + Packer is NOT supported. - -+ `-l, --validation-level string`: sets validation level to one of ("ERROR", "WARNING", "IGNORE") (default "WARNING"). - -+ `--vars strings`: comma-separated list of name=value variables to override YAML configuration. Can be used multiple times. Arrays or maps containing comma-separated values must be enclosed in double quotes. The double quotes may require escaping depending on the shell used. 
Examples below have been tested using a `bash` shell: - + `--vars foo=bar,baz=2` - + `--vars bar=2 --vars baz=3.14` - + `--vars foo=true` - + `--vars "foo={bar: baz}"` - + `--vars "\"foo={bar: baz, qux: quux}\""` - + `--vars "\"foo={bar: baz}\"",\"b=[foo,3,3.14]\"` - + `--vars "\"a={foo: [bar, baz]}\"",\"b=[foo,3,3.14]\"` - + `--vars \"b=[foo,3,3.14]\"` - + `--vars \"b=[[foo,bar],3,3.14]\"` +* `--backend-config strings`: Comma-separated list of name=value variables to set Terraform backend configuration. Can be used multiple times. +* `-h, --help`: display detailed help for the create command. +* `-o, --out string`: sets the output directory where the HPC deployment directory will be created. +* `-w, --overwrite-deployment`: If specified, an existing deployment directory is overwritten by the new deployment. + + * Terraform state IS preserved. + * Terraform workspaces are NOT supported (behavior undefined). + * Packer is NOT supported. + +* `-l, --validation-level string`: sets validation level to one of ("ERROR", "WARNING", "IGNORE") (default "WARNING"). +* `--vars strings`: comma-separated list of name=value variables to override YAML configuration. Can be used multiple times. Arrays or maps containing comma-separated values must be enclosed in double quotes. The double quotes may require escaping depending on the shell used. 
Examples below have been tested using a `bash` shell: + + * `--vars foo=bar,baz=2` + * `--vars bar=2 --vars baz=3.14` + * `--vars foo=true` + * `--vars "foo={bar: baz}"` + * `--vars "\"foo={bar: baz, qux: quux}\""` + * `--vars "\"foo={bar: baz}\"",\"b=[foo,3,3.14]\"` + * `--vars "\"a={foo: [bar, baz]}\"",\"b=[foo,3,3.14]\"` + * `--vars \"b=[foo,3,3.14]\"` + * `--vars \"b=[[foo,bar],3,3.14]\"` ### Example - create From f07db38b033a3bcbc7d56819d7343b2c59394b5e Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 20 Jun 2024 23:47:02 +0000 Subject: [PATCH 13/51] Add mig module to the modules list --- modules/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/README.md b/modules/README.md index d369c65c97..9633b8c0f9 100644 --- a/modules/README.md +++ b/modules/README.md @@ -54,6 +54,7 @@ Modules that are still in development and less stable are labeled with the pool][htcondor-setup]. * **[pbspro-execution]** ![community-badge] ![experimental-badge] : Creates execution hosts for use in a PBS Professional cluster. +* **[mig]** ![community-badge] ![experimental-badge] : Creates a Managed Instance Group. * **[notebook]** ![community-badge] ![experimental-badge] : Creates a Vertex AI Notebook. Primarily used for [FSI - MonteCarlo Tutorial][fsi-montecarlo-on-batch-tutorial]. 
@@ -67,6 +68,7 @@ Modules that are still in development and less stable are labeled with the [schedmd-slurm-gcp-v6-nodeset-tpu]: ../community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md [htcondor-execute-point]: ../community/modules/compute/htcondor-execute-point/README.md [pbspro-execution]: ../community/modules/compute/pbspro-execution/README.md +[mig]: ../community/modules/compute/mig/README.md [notebook]: ../community/modules/compute/notebook/README.md [fsi-montecarlo-on-batch-tutorial]: ../docs/tutorials/fsi-montecarlo-on-batch/README.md From 7a07acb02c549eb6720cbcfa0c0761bfa0415045 Mon Sep 17 00:00:00 2001 From: Alyssa Date: Fri, 21 Jun 2024 00:21:29 +0000 Subject: [PATCH 14/51] Update internal usage of Toolkit modules to v1.35.0 --- .../modules/compute/htcondor-execute-point/README.md | 2 +- community/modules/compute/htcondor-execute-point/main.tf | 2 +- community/modules/compute/pbspro-execution/README.md | 6 +++--- community/modules/compute/pbspro-execution/main.tf | 6 +++--- .../remote-desktop/chrome-remote-desktop/README.md | 4 ++-- .../modules/remote-desktop/chrome-remote-desktop/main.tf | 4 ++-- .../modules/scheduler/htcondor-access-point/README.md | 2 +- community/modules/scheduler/htcondor-access-point/main.tf | 2 +- .../modules/scheduler/htcondor-central-manager/README.md | 2 +- .../modules/scheduler/htcondor-central-manager/main.tf | 2 +- .../modules/scheduler/htcondor-service-accounts/README.md | 6 +++--- .../modules/scheduler/htcondor-service-accounts/main.tf | 6 +++--- community/modules/scheduler/pbspro-client/README.md | 6 +++--- community/modules/scheduler/pbspro-client/main.tf | 6 +++--- community/modules/scheduler/pbspro-server/README.md | 8 ++++---- community/modules/scheduler/pbspro-server/main.tf | 8 ++++---- community/modules/scripts/ramble-execute/README.md | 2 +- community/modules/scripts/ramble-execute/main.tf | 2 +- community/modules/scripts/ramble-setup/README.md | 2 +- community/modules/scripts/ramble-setup/main.tf | 
2 +- community/modules/scripts/spack-execute/README.md | 2 +- community/modules/scripts/spack-execute/main.tf | 2 +- community/modules/scripts/spack-setup/README.md | 2 +- community/modules/scripts/spack-setup/main.tf | 2 +- modules/compute/vm-instance/README.md | 2 +- .../compute/vm-instance/startup_from_network_storage.tf | 2 +- modules/network/multivpc/README.md | 2 +- modules/network/multivpc/main.tf | 2 +- modules/scheduler/batch-job-template/README.md | 2 +- .../batch-job-template/startup_from_network_storage.tf | 2 +- modules/scheduler/batch-login-node/README.md | 2 +- modules/scheduler/batch-login-node/main.tf | 2 +- 32 files changed, 52 insertions(+), 52 deletions(-) diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index 181ae3a91e..b16897234a 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -212,7 +212,7 @@ limitations under the License. 
|------|--------|---------| | [execute\_point\_instance\_template](#module\_execute\_point\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | 10.1.1 | | [mig](#module\_mig) | terraform-google-modules/vm/google//modules/mig | 10.1.1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.34.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | ## Resources diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index 35740ac547..eb34422c54 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -125,7 +125,7 @@ resource "google_storage_bucket_object" "execute_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" project_id = var.project_id region = var.region diff --git a/community/modules/compute/pbspro-execution/README.md b/community/modules/compute/pbspro-execution/README.md index d875b8503f..e8334fdab2 100644 --- a/community/modules/compute/pbspro-execution/README.md +++ b/community/modules/compute/pbspro-execution/README.md @@ -74,9 +74,9 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [execution\_startup\_script](#module\_execution\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.34.0&depth=1 | -| [pbs\_execution](#module\_pbs\_execution) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.34.0&depth=1 | -| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.34.0&depth=1 | +| [execution\_startup\_script](#module\_execution\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | +| [pbs\_execution](#module\_pbs\_execution) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.35.0&depth=1 | +| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.35.0&depth=1 | ## Resources diff --git a/community/modules/compute/pbspro-execution/main.tf b/community/modules/compute/pbspro-execution/main.tf index 732630ff6c..172712e20d 100644 --- a/community/modules/compute/pbspro-execution/main.tf +++ b/community/modules/compute/pbspro-execution/main.tf @@ -42,7 +42,7 @@ locals { } module "pbs_install" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.35.0&depth=1" pbs_exec = var.pbs_exec pbs_home = var.pbs_home @@ -53,7 +53,7 @@ module "pbs_install" { } module "execution_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" deployment_name = var.deployment_name project_id = var.project_id @@ -68,7 +68,7 @@ module "execution_startup_script" { } 
module "pbs_execution" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.35.0&depth=1" instance_count = var.instance_count spot = var.spot diff --git a/community/modules/remote-desktop/chrome-remote-desktop/README.md b/community/modules/remote-desktop/chrome-remote-desktop/README.md index 212ed8e3ef..64894f4b09 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/README.md +++ b/community/modules/remote-desktop/chrome-remote-desktop/README.md @@ -63,8 +63,8 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.34.0&depth=1 | -| [instances](#module\_instances) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.34.0&depth=1 | +| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | +| [instances](#module\_instances) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.35.0&depth=1 | ## Resources diff --git a/community/modules/remote-desktop/chrome-remote-desktop/main.tf b/community/modules/remote-desktop/chrome-remote-desktop/main.tf index 7af80f6b88..8369dc9b12 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/main.tf +++ b/community/modules/remote-desktop/chrome-remote-desktop/main.tf @@ -55,7 +55,7 @@ locals { } module "client_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" deployment_name = var.deployment_name project_id = var.project_id @@ -71,7 +71,7 @@ module "client_startup_script" { 
} module "instances" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.35.0&depth=1" instance_count = var.instance_count name_prefix = var.name_prefix diff --git a/community/modules/scheduler/htcondor-access-point/README.md b/community/modules/scheduler/htcondor-access-point/README.md index 9441a17a0f..15e760c653 100644 --- a/community/modules/scheduler/htcondor-access-point/README.md +++ b/community/modules/scheduler/htcondor-access-point/README.md @@ -122,7 +122,7 @@ limitations under the License. |------|--------|---------| | [access\_point\_instance\_template](#module\_access\_point\_instance\_template) | github.com/terraform-google-modules/terraform-google-vm//modules/instance_template | 73dc845 | | [htcondor\_ap](#module\_htcondor\_ap) | terraform-google-modules/vm/google//modules/mig | 10.1.1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.34.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf index 79e0498d59..a8d3e9ba12 100644 --- a/community/modules/scheduler/htcondor-access-point/main.tf +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -171,7 +171,7 @@ resource "google_storage_bucket_object" "ap_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/htcondor-central-manager/README.md 
b/community/modules/scheduler/htcondor-central-manager/README.md index ad7981baf3..1f298cf64a 100644 --- a/community/modules/scheduler/htcondor-central-manager/README.md +++ b/community/modules/scheduler/htcondor-central-manager/README.md @@ -106,7 +106,7 @@ limitations under the License. |------|--------|---------| | [central\_manager\_instance\_template](#module\_central\_manager\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | 10.1.1 | | [htcondor\_cm](#module\_htcondor\_cm) | terraform-google-modules/vm/google//modules/mig | 10.1.1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.34.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/htcondor-central-manager/main.tf b/community/modules/scheduler/htcondor-central-manager/main.tf index 3d4bd7bb9e..293ebce7d3 100644 --- a/community/modules/scheduler/htcondor-central-manager/main.tf +++ b/community/modules/scheduler/htcondor-central-manager/main.tf @@ -110,7 +110,7 @@ resource "google_storage_bucket_object" "cm_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/htcondor-service-accounts/README.md b/community/modules/scheduler/htcondor-service-accounts/README.md index 59d15a5593..409c707d3e 100644 --- a/community/modules/scheduler/htcondor-service-accounts/README.md +++ b/community/modules/scheduler/htcondor-service-accounts/README.md @@ -100,9 +100,9 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [access\_point\_service\_account](#module\_access\_point\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.34.0&depth=1 | -| [central\_manager\_service\_account](#module\_central\_manager\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.34.0&depth=1 | -| [execute\_point\_service\_account](#module\_execute\_point\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.34.0&depth=1 | +| [access\_point\_service\_account](#module\_access\_point\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.35.0&depth=1 | +| [central\_manager\_service\_account](#module\_central\_manager\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.35.0&depth=1 | +| [execute\_point\_service\_account](#module\_execute\_point\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.35.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/htcondor-service-accounts/main.tf b/community/modules/scheduler/htcondor-service-accounts/main.tf index 7881fd3559..028fce2145 100644 --- a/community/modules/scheduler/htcondor-service-accounts/main.tf +++ b/community/modules/scheduler/htcondor-service-accounts/main.tf @@ -21,7 +21,7 @@ # require them module "access_point_service_account" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.35.0&depth=1" project_id = var.project_id display_name = "HTCondor Access Point" @@ -31,7 +31,7 @@ module "access_point_service_account" { } module "execute_point_service_account" { - 
source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.35.0&depth=1" project_id = var.project_id display_name = "HTCondor Execute Point" @@ -41,7 +41,7 @@ module "execute_point_service_account" { } module "central_manager_service_account" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.35.0&depth=1" project_id = var.project_id display_name = "HTCondor Central Manager" diff --git a/community/modules/scheduler/pbspro-client/README.md b/community/modules/scheduler/pbspro-client/README.md index 2d0871481f..4cabd2e375 100644 --- a/community/modules/scheduler/pbspro-client/README.md +++ b/community/modules/scheduler/pbspro-client/README.md @@ -74,9 +74,9 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.34.0&depth=1 | -| [pbs\_client](#module\_pbs\_client) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.34.0&depth=1 | -| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.34.0&depth=1 | +| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | +| [pbs\_client](#module\_pbs\_client) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.35.0&depth=1 | +| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.35.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/pbspro-client/main.tf b/community/modules/scheduler/pbspro-client/main.tf index b4a944656d..0d8c8e1247 100644 --- a/community/modules/scheduler/pbspro-client/main.tf +++ b/community/modules/scheduler/pbspro-client/main.tf @@ -32,7 +32,7 @@ locals { } module "pbs_install" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.35.0&depth=1" pbs_exec = var.pbs_exec pbs_home = var.pbs_home @@ -43,7 +43,7 @@ module "pbs_install" { } module "client_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" deployment_name = var.deployment_name project_id = var.project_id @@ -57,7 +57,7 @@ module "client_startup_script" { } module "pbs_client" { - source = 
"github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.35.0&depth=1" instance_count = var.instance_count spot = var.spot diff --git a/community/modules/scheduler/pbspro-server/README.md b/community/modules/scheduler/pbspro-server/README.md index b0541357bf..85ad925986 100644 --- a/community/modules/scheduler/pbspro-server/README.md +++ b/community/modules/scheduler/pbspro-server/README.md @@ -69,10 +69,10 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.34.0&depth=1 | -| [pbs\_qmgr](#module\_pbs\_qmgr) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr | v1.34.0&depth=1 | -| [pbs\_server](#module\_pbs\_server) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.34.0&depth=1 | -| [server\_startup\_script](#module\_server\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.34.0&depth=1 | +| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.35.0&depth=1 | +| [pbs\_qmgr](#module\_pbs\_qmgr) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr | v1.35.0&depth=1 | +| [pbs\_server](#module\_pbs\_server) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.35.0&depth=1 | +| [server\_startup\_script](#module\_server\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/pbspro-server/main.tf b/community/modules/scheduler/pbspro-server/main.tf index 53bb5fdfbe..8341e68564 100644 --- a/community/modules/scheduler/pbspro-server/main.tf +++ 
b/community/modules/scheduler/pbspro-server/main.tf @@ -32,7 +32,7 @@ locals { } module "pbs_install" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.35.0&depth=1" pbs_data_service_user = var.pbs_data_service_user pbs_exec = var.pbs_exec @@ -45,7 +45,7 @@ module "pbs_install" { } module "pbs_qmgr" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr?ref=v1.35.0&depth=1" client_host_count = var.client_host_count client_hostname_prefix = var.client_hostname_prefix @@ -55,7 +55,7 @@ module "pbs_qmgr" { } module "server_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" deployment_name = var.deployment_name project_id = var.project_id @@ -70,7 +70,7 @@ module "server_startup_script" { } module "pbs_server" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.35.0&depth=1" instance_count = var.instance_count spot = var.spot diff --git a/community/modules/scripts/ramble-execute/README.md b/community/modules/scripts/ramble-execute/README.md index f8bc48f9dc..aa752fe0e2 100644 --- a/community/modules/scripts/ramble-execute/README.md +++ b/community/modules/scripts/ramble-execute/README.md @@ -77,7 +77,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.34.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | ## Resources diff --git a/community/modules/scripts/ramble-execute/main.tf b/community/modules/scripts/ramble-execute/main.tf index d3505838c4..a44b9838a3 100644 --- a/community/modules/scripts/ramble-execute/main.tf +++ b/community/modules/scripts/ramble-execute/main.tf @@ -55,7 +55,7 @@ locals { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/ramble-setup/README.md b/community/modules/scripts/ramble-setup/README.md index fb0e2e0be0..d62bb3affe 100644 --- a/community/modules/scripts/ramble-setup/README.md +++ b/community/modules/scripts/ramble-setup/README.md @@ -86,7 +86,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.34.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | ## Resources diff --git a/community/modules/scripts/ramble-setup/main.tf b/community/modules/scripts/ramble-setup/main.tf index a14a71f3ab..6f2462b279 100644 --- a/community/modules/scripts/ramble-setup/main.tf +++ b/community/modules/scripts/ramble-setup/main.tf @@ -94,7 +94,7 @@ resource "google_storage_bucket" "bucket" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/spack-execute/README.md b/community/modules/scripts/spack-execute/README.md index 1df6b493ab..58595c2861 100644 --- a/community/modules/scripts/spack-execute/README.md +++ b/community/modules/scripts/spack-execute/README.md @@ -104,7 +104,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.34.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | ## Resources diff --git a/community/modules/scripts/spack-execute/main.tf b/community/modules/scripts/spack-execute/main.tf index 0465e3d7d5..93a121c2de 100644 --- a/community/modules/scripts/spack-execute/main.tf +++ b/community/modules/scripts/spack-execute/main.tf @@ -54,7 +54,7 @@ locals { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/spack-setup/README.md b/community/modules/scripts/spack-setup/README.md index 36ab18585f..3649c58f36 100644 --- a/community/modules/scripts/spack-setup/README.md +++ b/community/modules/scripts/spack-setup/README.md @@ -340,7 +340,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.34.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | ## Resources diff --git a/community/modules/scripts/spack-setup/main.tf b/community/modules/scripts/spack-setup/main.tf index f65ceb40cf..356e745715 100644 --- a/community/modules/scripts/spack-setup/main.tf +++ b/community/modules/scripts/spack-setup/main.tf @@ -100,7 +100,7 @@ resource "google_storage_bucket" "bucket" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index 55e6d9caa7..159019dc0c 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -185,7 +185,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.34.0&depth=1 | +| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | ## Resources diff --git a/modules/compute/vm-instance/startup_from_network_storage.tf b/modules/compute/vm-instance/startup_from_network_storage.tf index acf3fe037f..17dbd97bd2 100644 --- a/modules/compute/vm-instance/startup_from_network_storage.tf +++ b/modules/compute/vm-instance/startup_from_network_storage.tf @@ -55,7 +55,7 @@ locals { } module "netstorage_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/modules/network/multivpc/README.md b/modules/network/multivpc/README.md index 490c39c02f..83b5d1a524 100644 --- a/modules/network/multivpc/README.md +++ b/modules/network/multivpc/README.md @@ -88,7 +88,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [vpcs](#module\_vpcs) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc | v1.34.0&depth=1 | +| [vpcs](#module\_vpcs) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc | v1.35.0&depth=1 | ## Resources diff --git a/modules/network/multivpc/main.tf b/modules/network/multivpc/main.tf index 8ab4d438e1..782a3f45a4 100644 --- a/modules/network/multivpc/main.tf +++ b/modules/network/multivpc/main.tf @@ -44,7 +44,7 @@ resource "terraform_data" "global_ip_cidr_suffix" { } module "vpcs" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc?ref=v1.35.0&depth=1" count = var.network_count diff --git a/modules/scheduler/batch-job-template/README.md b/modules/scheduler/batch-job-template/README.md index 3891776c69..ef9b816346 100644 --- a/modules/scheduler/batch-job-template/README.md +++ b/modules/scheduler/batch-job-template/README.md @@ -139,7 +139,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| | [instance\_template](#module\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 10.1.1 | -| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.34.0&depth=1 | +| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | ## Resources diff --git a/modules/scheduler/batch-job-template/startup_from_network_storage.tf b/modules/scheduler/batch-job-template/startup_from_network_storage.tf index acf3fe037f..17dbd97bd2 100644 --- a/modules/scheduler/batch-job-template/startup_from_network_storage.tf +++ b/modules/scheduler/batch-job-template/startup_from_network_storage.tf @@ -55,7 +55,7 @@ locals { } module "netstorage_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/modules/scheduler/batch-login-node/README.md b/modules/scheduler/batch-login-node/README.md index d5a1f2ad4a..4eb7da8a3a 100644 --- a/modules/scheduler/batch-login-node/README.md +++ b/modules/scheduler/batch-login-node/README.md @@ -89,7 +89,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [login\_startup\_script](#module\_login\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.34.0&depth=1 | +| [login\_startup\_script](#module\_login\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | ## Resources diff --git a/modules/scheduler/batch-login-node/main.tf b/modules/scheduler/batch-login-node/main.tf index bc5d298817..50960758cd 100644 --- a/modules/scheduler/batch-login-node/main.tf +++ b/modules/scheduler/batch-login-node/main.tf @@ -94,7 +94,7 @@ locals { } module "login_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" labels = local.labels project_id = var.project_id deployment_name = var.deployment_name From ebf1fe82ee340b1d9c97468f5223bb9274d4b7b8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Jun 2024 10:22:23 +0000 Subject: [PATCH 15/51] Bump github.com/hashicorp/go-getter from 1.7.4 to 1.7.5 Bumps [github.com/hashicorp/go-getter](https://github.com/hashicorp/go-getter) from 1.7.4 to 1.7.5. - [Release notes](https://github.com/hashicorp/go-getter/releases) - [Changelog](https://github.com/hashicorp/go-getter/blob/main/.goreleaser.yml) - [Commits](https://github.com/hashicorp/go-getter/compare/v1.7.4...v1.7.5) --- updated-dependencies: - dependency-name: github.com/hashicorp/go-getter dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 5ef0af0e29..3cad66de62 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.21 require ( cloud.google.com/go/storage v1.40.0 // indirect github.com/go-git/go-git/v5 v5.12.0 - github.com/hashicorp/go-getter v1.7.4 + github.com/hashicorp/go-getter v1.7.5 github.com/hashicorp/hcl v1.0.0 // indirect github.com/hashicorp/hcl/v2 v2.20.1 github.com/hashicorp/terraform-config-inspect v0.0.0-20230925220900-5a6f8d18746d diff --git a/go.sum b/go.sum index 9c6bead171..dae7fbdc27 100644 --- a/go.sum +++ b/go.sum @@ -381,8 +381,8 @@ github.com/googleapis/go-type-adapters v1.0.0/go.mod h1:zHW75FOG2aur7gAO2B+MLby+ github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= -github.com/hashicorp/go-getter v1.7.4 h1:3yQjWuxICvSpYwqSayAdKRFcvBl1y/vogCxczWSmix0= -github.com/hashicorp/go-getter v1.7.4/go.mod h1:W7TalhMmbPmsSMdNjD0ZskARur/9GJ17cfHTRtXV744= +github.com/hashicorp/go-getter v1.7.5 h1:dT58k9hQ/vbxNMwoI5+xFYAJuv6152UNvdHokfI5wE4= +github.com/hashicorp/go-getter v1.7.5/go.mod h1:W7TalhMmbPmsSMdNjD0ZskARur/9GJ17cfHTRtXV744= github.com/hashicorp/go-safetemp v1.0.0 h1:2HR189eFNrjHQyENnQMMpCiBAsRxzbTMIgBhEyExpmo= github.com/hashicorp/go-safetemp v1.0.0/go.mod h1:oaerMy3BhqiTbVye6QuFhFtIceqFoDHxNAB65b+Rj1I= github.com/hashicorp/go-version v1.6.0 h1:feTTfFNnjP967rlCxM/I9g701jU+RN74YKx2mOkIeek= From a384c26daf4fa4f98cda6e2de5a9096141e3b962 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Jun 2024 10:22:52 +0000 Subject: [PATCH 16/51] Bump google.golang.org/api from 0.184.0 to 0.185.0 Bumps 
[google.golang.org/api](https://github.com/googleapis/google-api-go-client) from 0.184.0 to 0.185.0. - [Release notes](https://github.com/googleapis/google-api-go-client/releases) - [Changelog](https://github.com/googleapis/google-api-go-client/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-api-go-client/compare/v0.184.0...v0.185.0) --- updated-dependencies: - dependency-name: google.golang.org/api dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 14 +++++++------- go.sum | 28 ++++++++++++++-------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/go.mod b/go.mod index 5ef0af0e29..1aa9695ad7 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module hpc-toolkit go 1.21 require ( - cloud.google.com/go/storage v1.40.0 // indirect + cloud.google.com/go/storage v1.41.0 // indirect github.com/go-git/go-git/v5 v5.12.0 github.com/hashicorp/go-getter v1.7.4 github.com/hashicorp/hcl v1.0.0 // indirect @@ -15,7 +15,7 @@ require ( github.com/spf13/cobra v1.8.1 github.com/zclconf/go-cty v1.14.4 golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa - google.golang.org/genproto v0.0.0-20240604185151-ef581f913117 // indirect + google.golang.org/genproto v0.0.0-20240617180043-68d350f18fd4 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c gopkg.in/yaml.v3 v3.0.1 ) @@ -27,7 +27,7 @@ require ( github.com/hashicorp/terraform-exec v0.21.0 github.com/mattn/go-isatty v0.0.20 github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b - google.golang.org/api v0.184.0 + google.golang.org/api v0.185.0 ) require ( @@ -54,12 +54,12 @@ require ( golang.org/x/sync v0.7.0 // indirect golang.org/x/time v0.5.0 // indirect golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240604185151-ef581f913117 // 
indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240610135401-a8a62080eff3 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240617180043-68d350f18fd4 // indirect ) require ( - cloud.google.com/go v0.114.0 // indirect + cloud.google.com/go v0.115.0 // indirect cloud.google.com/go/compute/metadata v0.3.0 // indirect cloud.google.com/go/iam v1.1.8 // indirect github.com/Microsoft/go-winio v0.6.1 // indirect @@ -101,6 +101,6 @@ require ( golang.org/x/sys v0.21.0 golang.org/x/text v0.16.0 // indirect google.golang.org/grpc v1.64.0 // indirect - google.golang.org/protobuf v1.34.1 // indirect + google.golang.org/protobuf v1.34.2 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect ) diff --git a/go.sum b/go.sum index 9c6bead171..7d4d5cd393 100644 --- a/go.sum +++ b/go.sum @@ -30,8 +30,8 @@ cloud.google.com/go v0.100.2/go.mod h1:4Xra9TjzAeYHrl5+oeLlzbM2k3mjVhZh4UqTZ//w9 cloud.google.com/go v0.102.0/go.mod h1:oWcCzKlqJ5zgHQt9YsaeTY9KzIvjyy0ArmiBUgpQ+nc= cloud.google.com/go v0.102.1/go.mod h1:XZ77E9qnTEnrgEOvr4xzfdX5TRo7fB4T2F4O6+34hIU= cloud.google.com/go v0.104.0/go.mod h1:OO6xxXdJyvuJPcEPBLN9BJPD+jep5G1+2U5B5gkRYtA= -cloud.google.com/go v0.114.0 h1:OIPFAdfrFDFO2ve2U7r/H5SwSbBzEdrBdE7xkgwc+kY= -cloud.google.com/go v0.114.0/go.mod h1:ZV9La5YYxctro1HTPug5lXH/GefROyW8PPD4T8n9J8E= +cloud.google.com/go v0.115.0 h1:CnFSK6Xo3lDYRoBKEcAtia6VSC837/ZkJuRduSFnr14= +cloud.google.com/go v0.115.0/go.mod h1:8jIM5vVgoAEoiVxQ/O4BFTfHqulPZgs/ufEzMcFMdWU= cloud.google.com/go/aiplatform v1.22.0/go.mod h1:ig5Nct50bZlzV6NvKaTwmplLLddFx0YReh9WfTO5jKw= cloud.google.com/go/aiplatform v1.24.0/go.mod h1:67UUvRBKG6GTayHKV8DBv2RtR1t93YRu5B1P3x99mYY= cloud.google.com/go/analytics v0.11.0/go.mod h1:DjEWCu41bVbYcKyvlws9Er60YE4a//bK6mnhWvQeFNI= @@ -173,8 +173,8 @@ cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9 cloud.google.com/go/storage v1.22.1/go.mod h1:S8N1cAStu7BOeFfE8KAQzmyyLkK8p/vmRq6kuBTW58Y= cloud.google.com/go/storage v1.23.0/go.mod 
h1:vOEEDNFnciUMhBeT6hsJIn3ieU5cFRmzeLgDvXzfIXc= cloud.google.com/go/storage v1.27.0/go.mod h1:x9DOL8TK/ygDUMieqwfhdpQryTeEkhGKMi80i/iqR2s= -cloud.google.com/go/storage v1.40.0 h1:VEpDQV5CJxFmJ6ueWNsKxcr1QAYOXEgxDa+sBbJahPw= -cloud.google.com/go/storage v1.40.0/go.mod h1:Rrj7/hKlG87BLqDJYtwR0fbPld8uJPbQ2ucUMY7Ir0g= +cloud.google.com/go/storage v1.41.0 h1:RusiwatSu6lHeEXe3kglxakAmAbfV+rhtPqA6i8RBx0= +cloud.google.com/go/storage v1.41.0/go.mod h1:J1WCa/Z2FcgdEDuPUY8DxT5I+d9mFKsCepp5vR6Sq80= cloud.google.com/go/talent v1.1.0/go.mod h1:Vl4pt9jiHKvOgF9KoZo6Kob9oV4lwd/ZD5Cto54zDRw= cloud.google.com/go/talent v1.2.0/go.mod h1:MoNF9bhFQbiJ6eFD3uSsg0uBALw4n4gaCaEjBw9zo8g= cloud.google.com/go/videointelligence v1.6.0/go.mod h1:w0DIDlVRKtwPCn/C4iwZIJdvC69yInhW0cfi+p546uU= @@ -874,8 +874,8 @@ google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ google.golang.org/api v0.97.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.98.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.100.0/go.mod h1:ZE3Z2+ZOr87Rx7dqFsdRQkRBk36kDtp/h+QpHbB7a70= -google.golang.org/api v0.184.0 h1:dmEdk6ZkJNXy1JcDhn/ou0ZUq7n9zropG2/tR4z+RDg= -google.golang.org/api v0.184.0/go.mod h1:CeDTtUEiYENAf8PPG5VZW2yNp2VM3VWbCeTioAZBTBA= +google.golang.org/api v0.185.0 h1:ENEKk1k4jW8SmmaT6RE+ZasxmxezCrD5Vw4npvr+pAU= +google.golang.org/api v0.185.0/go.mod h1:HNfvIkJGlgrIlrbYkAm9W9IdkmKZjOTVh33YltygGbg= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -984,12 +984,12 @@ google.golang.org/genproto v0.0.0-20221010155953-15ba04fc1c0e/go.mod h1:3526vdqw google.golang.org/genproto v0.0.0-20221014173430-6e2ab493f96b/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto 
v0.0.0-20221014213838-99cd37c6964a/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221025140454-527a21cfbd71/go.mod h1:9qHF0xnpdSfF6knlcsnpzUu5y+rpwgbvsyGAZPBMg4s= -google.golang.org/genproto v0.0.0-20240604185151-ef581f913117 h1:HCZ6DlkKtCDAtD8ForECsY3tKuaR+p4R3grlK80uCCc= -google.golang.org/genproto v0.0.0-20240604185151-ef581f913117/go.mod h1:lesfX/+9iA+3OdqeCpoDddJaNxVB1AB6tD7EfqMmprc= -google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157 h1:7whR9kGa5LUwFtpLm2ArCEejtnxlGeLbAyjFY8sGNFw= -google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157/go.mod h1:99sLkeliLXfdj2J75X3Ho+rrVCaJze0uwN7zDDkjPVU= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240604185151-ef581f913117 h1:1GBuWVLM/KMVUv1t1En5Gs+gFZCNd360GGb4sSxtrhU= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240604185151-ef581f913117/go.mod h1:EfXuqaE1J41VCDicxHzUDm+8rk+7ZdXzHV0IhO/I6s0= +google.golang.org/genproto v0.0.0-20240617180043-68d350f18fd4 h1:CUiCqkPw1nNrNQzCCG4WA65m0nAmQiwXHpub3dNyruU= +google.golang.org/genproto v0.0.0-20240617180043-68d350f18fd4/go.mod h1:EvuUDCulqGgV80RvP1BHuom+smhX4qtlhnNatHuroGQ= +google.golang.org/genproto/googleapis/api v0.0.0-20240610135401-a8a62080eff3 h1:QW9+G6Fir4VcRXVH8x3LilNAb6cxBGLa6+GM4hRwexE= +google.golang.org/genproto/googleapis/api v0.0.0-20240610135401-a8a62080eff3/go.mod h1:kdrSS/OiLkPrNUpzD4aHgCq2rVuC/YRxok32HXZ4vRE= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240617180043-68d350f18fd4 h1:Di6ANFilr+S60a4S61ZM00vLdw0IrQOSMS2/6mrnOU0= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240617180043-68d350f18fd4/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -1043,8 +1043,8 @@ 
google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQ google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= -google.golang.org/protobuf v1.34.1 h1:9ddQBjfCyZPOHPUiPxpYESBLc+T8P3E+Vo4IbKZgFWg= -google.golang.org/protobuf v1.34.1/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= +google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= +google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= From 2e56f7f051f3455d8d958af3571e5de5d804481c Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Fri, 21 Jun 2024 21:16:04 +0000 Subject: [PATCH 17/51] Add a new post deploy test for gke cluster to do basic sanity check on the cluster --- .../test-validation/test-gke-job.yml | 45 +++++++++++++++++++ .../cloud-build/daily-tests/tests/ml-gke.yml | 8 +++- .../cloud-build/images/test-runner/Dockerfile | 3 ++ 3 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-job.yml diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-job.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-job.yml new file mode 100644 index 0000000000..f1be62e220 --- /dev/null +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-job.yml @@ -0,0 +1,45 @@ +# Copyright 2024 Google LLC 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +- name: Assert variables are defined + ansible.builtin.assert: + that: + - cli_deployment_vars.region is defined + - custom_vars.project is defined + +- name: Get cluster credentials for kubectl + delegate_to: localhost + ansible.builtin.command: gcloud container clusters get-credentials {{ deployment_name }} --region {{ cli_deployment_vars.region }} --project {{ custom_vars.project }} + +- name: Execute the job + delegate_to: localhost + ansible.builtin.shell: | + array=({{ workspace }}/{{ deployment_name }}/primary/my-job*) + kubectl create -f ${array[0]} + args: + executable: /bin/bash + changed_when: False + +- name: Wait for job to complete + delegate_to: localhost + ansible.builtin.command: | + kubectl get job --field-selector status.successful=1 + register: job_completion + until: job_completion.stdout_lines | length > 1 + retries: 40 + delay: 15 + +- name: Print job_completion debug output + ansible.builtin.debug: + var: job_completion.stdout_lines diff --git a/tools/cloud-build/daily-tests/tests/ml-gke.yml b/tools/cloud-build/daily-tests/tests/ml-gke.yml index c45659a1b1..7c475a2fae 100644 --- a/tools/cloud-build/daily-tests/tests/ml-gke.yml +++ b/tools/cloud-build/daily-tests/tests/ml-gke.yml @@ -14,9 +14,15 @@ --- test_name: ml-gke deployment_name: ml-gke-{{ build }} +region: asia-southeast1 zone: asia-southeast1-b # for remote node workspace: /workspace blueprint_yaml: "{{ workspace 
}}/community/examples/ml-gke.yaml" network: "{{ deployment_name }}-net" remote_node: "{{ deployment_name }}-0" -post_deploy_tests: [] +cli_deployment_vars: + region: "{{ region }}" +custom_vars: + project: "{{ project }}" +post_deploy_tests: +- test-validation/test-gke-job.yml diff --git a/tools/cloud-build/images/test-runner/Dockerfile b/tools/cloud-build/images/test-runner/Dockerfile index ea9e148789..5538328dfe 100644 --- a/tools/cloud-build/images/test-runner/Dockerfile +++ b/tools/cloud-build/images/test-runner/Dockerfile @@ -36,6 +36,9 @@ RUN curl -fsSL https://apt.releases.hashicorp.com/gpg | apt-key add - && \ curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \ | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ apt-get -y update && apt-get -y install google-cloud-sdk && \ + apt-get -y install kubectl && \ + # following is required to execute kubectl commands + apt-get -y install google-cloud-cli-gke-gcloud-auth-plugin && \ apt-get clean && rm -rf /var/lib/apt/lists/* && \ # install ansible and python dependencies pip install --no-cache-dir --upgrade pip && \ From 9f9d82acafa9e6fad231d18786983cefcd10ec65 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Jun 2024 22:51:50 +0000 Subject: [PATCH 18/51] Bump github.com/hashicorp/hcl/v2 from 2.20.1 to 2.21.0 Bumps [github.com/hashicorp/hcl/v2](https://github.com/hashicorp/hcl) from 2.20.1 to 2.21.0. - [Release notes](https://github.com/hashicorp/hcl/releases) - [Changelog](https://github.com/hashicorp/hcl/blob/main/CHANGELOG.md) - [Commits](https://github.com/hashicorp/hcl/compare/v2.20.1...v2.21.0) --- updated-dependencies: - dependency-name: github.com/hashicorp/hcl/v2 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- go.mod | 4 ++-- go.sum | 14 ++++---------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/go.mod b/go.mod index dc851f38a6..ace3c5616b 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( github.com/go-git/go-git/v5 v5.12.0 github.com/hashicorp/go-getter v1.7.5 github.com/hashicorp/hcl v1.0.0 // indirect - github.com/hashicorp/hcl/v2 v2.20.1 + github.com/hashicorp/hcl/v2 v2.21.0 github.com/hashicorp/terraform-config-inspect v0.0.0-20230925220900-5a6f8d18746d github.com/otiai10/copy v1.14.0 github.com/pkg/errors v0.9.1 @@ -26,7 +26,7 @@ require ( github.com/google/go-cmp v0.6.0 github.com/hashicorp/terraform-exec v0.21.0 github.com/mattn/go-isatty v0.0.20 - github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b + github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940 google.golang.org/api v0.185.0 ) diff --git a/go.sum b/go.sum index b3c4331516..2ac66a1a9d 100644 --- a/go.sum +++ b/go.sum @@ -202,7 +202,6 @@ github.com/agext/levenshtein v1.2.3/go.mod h1:JEDfjyjHDjOF/1e4FlBE/PkbqA9OfWu2ki github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be h1:9AeTilPcZAjCFIImctFaOjnTIavg87rW78vTPkQqLI8= github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be/go.mod h1:ySMOLuWl6zY27l47sB3qLNK6tF2fkHG55UZxx8oIVo4= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= -github.com/apparentlymart/go-textseg v1.0.0/go.mod h1:z96Txxhf3xSFMPmb5X/1W05FF/Nj9VFpLOpjS5yuumk= github.com/apparentlymart/go-textseg/v15 v15.0.0 h1:uYvfpb3DyLSCGWnctWKGj857c6ew1u1fNQOlOtuGxQY= github.com/apparentlymart/go-textseg/v15 v15.0.0/go.mod h1:K8XmNZdhEBkdlyDdvbmmsvpAG721bKi0joRfFdHIWJ4= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= @@ -293,7 +292,6 @@ github.com/golang/mock v1.4.3/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt github.com/golang/mock v1.4.4/go.mod h1:l3mdAwkq5BuhzHwde/uurv3sEJeZMXNpwsxVWU71h+4= 
github.com/golang/mock v1.5.0/go.mod h1:CWnOUgYIOo4TcNZ0wHX3YZCqsaM1I1Jvs6v3mP3KVu8= github.com/golang/mock v1.6.0/go.mod h1:p6yTPP+5HYm5mzsMV8JkE6ZKdX+/wYM6Hr+LicevLPs= -github.com/golang/protobuf v1.1.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= @@ -393,8 +391,8 @@ github.com/hashicorp/hc-install v0.6.4 h1:QLqlM56/+SIIGvGcfFiwMY3z5WGXT066suo/v9 github.com/hashicorp/hc-install v0.6.4/go.mod h1:05LWLy8TD842OtgcfBbOT0WMoInBMUSHjmDx10zuBIA= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= -github.com/hashicorp/hcl/v2 v2.20.1 h1:M6hgdyz7HYt1UN9e61j+qKJBqR3orTWbI1HKBJEdxtc= -github.com/hashicorp/hcl/v2 v2.20.1/go.mod h1:TZDqQ4kNKCbh1iJp99FdPiUaVDDUPivbqxZulxDYqL4= +github.com/hashicorp/hcl/v2 v2.21.0 h1:lve4q/o/2rqwYOgUg3y3V2YPyD1/zkCLGjIV74Jit14= +github.com/hashicorp/hcl/v2 v2.21.0/go.mod h1:62ZYHrXgPoX8xBnzl8QzbWq4dyDsDtfCRgIq1rbJEvA= github.com/hashicorp/terraform-config-inspect v0.0.0-20230925220900-5a6f8d18746d h1:g6kHlvZrFPFKeWRj5q/zyJA5gu7rlJGPf17h8hX7LHY= github.com/hashicorp/terraform-config-inspect v0.0.0-20230925220900-5a6f8d18746d/go.mod h1:l8HcFPm9cQh6Q0KSWoYPiePqMvRFenybP1CH2MjKdlg= github.com/hashicorp/terraform-exec v0.21.0 h1:uNkLAe95ey5Uux6KJdua6+cv8asgILFVWkd/RG0D2XQ= @@ -427,7 +425,6 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/kylelemons/godebug 
v0.0.0-20170820004349-d65d576e9348/go.mod h1:B69LEHPfb2qLo0BaaOLcbitczOKLWTsrBG9LczfCD4k= github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= @@ -491,7 +488,6 @@ github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsT github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/ulikunitz/xz v0.5.10 h1:t92gobL9l3HE202wg3rlk19F6X+JOxl9BBrCCMYEYd8= github.com/ulikunitz/xz v0.5.10/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= -github.com/vmihailenco/msgpack v3.3.3+incompatible/go.mod h1:fy3FlTQTDXWkZ7Bh6AcGMlsjHatGryHQYUTf1ShIgkk= github.com/xanzy/ssh-agent v0.3.3 h1:+/15pJfg/RsTxqYcX6fHqOXZwwMP+2VyYWJeWM2qQFM= github.com/xanzy/ssh-agent v0.3.3/go.mod h1:6dzNDKs0J9rVPHPhaGCukekBHKqfl+L3KghI1Bc68Uw= github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= @@ -500,11 +496,10 @@ github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -github.com/zclconf/go-cty v1.2.0/go.mod h1:hOPWgoHbaTUnI5k4D2ld+GRpFJSCe6bCM7m1q/N4PQ8= github.com/zclconf/go-cty v1.14.4 h1:uXXczd9QDGsgu0i/QFR/hzI5NYCHLf6NQw/atrbnhq8= github.com/zclconf/go-cty v1.14.4/go.mod h1:VvMs5i0vgZdhYawQNq5kePSpLAoz8u1xvZgrPIxfnZE= -github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b h1:FosyBZYxY34Wul7O/MSKey3txpPYyCqVO5ZyceuQJEI= -github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b/go.mod h1:ZRKQfBXbGkpdV6QMzT3rU1kSTAnfu1dO8dPKjYprgj8= +github.com/zclconf/go-cty-debug 
v0.0.0-20240509010212-0d6042c53940 h1:4r45xpDWB6ZMSMNJFMOjqrGHynW3DIBuR2H9j0ug+Mo= +github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940/go.mod h1:CmBdvvj3nqzfzJ6nTCIwDTPZ56aVGvDrmztiO5g3qrM= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= @@ -577,7 +572,6 @@ golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91 golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20180811021610-c39426892332/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= From a9abdadcb5a6dd8ca819851caa6b80e9108622b8 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Tue, 25 Jun 2024 07:37:54 +0000 Subject: [PATCH 19/51] fixed dilimiters issue in some Readme files --- community/modules/scheduler/gke-cluster/README.md | 2 +- community/modules/scheduler/pre-existing-gke-cluster/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/scheduler/gke-cluster/README.md b/community/modules/scheduler/gke-cluster/README.md index f98ce0fcc5..94e2dd48cb 100644 --- a/community/modules/scheduler/gke-cluster/README.md +++ b/community/modules/scheduler/gke-cluster/README.md @@ -155,7 +155,7 @@ limitations under the License. 
| Name | Description | |------|-------------| -| [cluster\_id](#output\_cluster\_id) | An identifier for the resource with format projects//locations//clusters/. | +| [cluster\_id](#output\_cluster\_id) | An identifier for the resource with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. | | [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster has been created. Needed by community/modules/scripts/kubernetes-operations. | | [instructions](#output\_instructions) | Instructions on how to connect to the created cluster. | | [k8s\_service\_account\_name](#output\_k8s\_service\_account\_name) | Name of k8s service account. | diff --git a/community/modules/scheduler/pre-existing-gke-cluster/README.md b/community/modules/scheduler/pre-existing-gke-cluster/README.md index 50676ba8d6..4bc8933908 100644 --- a/community/modules/scheduler/pre-existing-gke-cluster/README.md +++ b/community/modules/scheduler/pre-existing-gke-cluster/README.md @@ -83,6 +83,6 @@ No modules. | Name | Description | |------|-------------| -| [cluster\_id](#output\_cluster\_id) | An identifier for the gke cluster with format projects//locations//clusters/. | +| [cluster\_id](#output\_cluster\_id) | An identifier for the gke cluster with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. | | [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster exists. 
| From 5b6ca6059486e1641c0ece26a0d1681de61779a9 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Tue, 25 Jun 2024 16:21:12 +0000 Subject: [PATCH 20/51] minor changes on outputs description --- community/modules/scheduler/gke-cluster/outputs.tf | 2 +- .../modules/scheduler/pre-existing-gke-cluster/outputs.tf | 2 +- modules/README.md | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/community/modules/scheduler/gke-cluster/outputs.tf b/community/modules/scheduler/gke-cluster/outputs.tf index 408b9aec47..53ee068ca2 100644 --- a/community/modules/scheduler/gke-cluster/outputs.tf +++ b/community/modules/scheduler/gke-cluster/outputs.tf @@ -15,7 +15,7 @@ */ output "cluster_id" { - description = "An identifier for the resource with format projects/<project_id>/locations/<region>/clusters/<name>." + description = "An identifier for the resource with format projects/{{project_id}}/locations/{{region}}/clusters/{{name}}." value = google_container_cluster.gke_cluster.id } diff --git a/community/modules/scheduler/pre-existing-gke-cluster/outputs.tf b/community/modules/scheduler/pre-existing-gke-cluster/outputs.tf index 6166aa6bad..9bfd571b61 100644 --- a/community/modules/scheduler/pre-existing-gke-cluster/outputs.tf +++ b/community/modules/scheduler/pre-existing-gke-cluster/outputs.tf @@ -15,7 +15,7 @@ */ output "cluster_id" { - description = "An identifier for the gke cluster with format projects/<project_id>/locations/<region>/clusters/<name>." + description = "An identifier for the gke cluster with format projects/{{project_id}}/locations/{{region}}/clusters/{{name}}." value = data.google_container_cluster.existing_gke_cluster.id } diff --git a/modules/README.md b/modules/README.md index bae9d14143..56d7862947 100644 --- a/modules/README.md +++ b/modules/README.md @@ -184,8 +184,7 @@ Pub/Sub subscription. Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca submission of Google Cloud Batch jobs. 
* **[gke-cluster]** ![community-badge] ![experimental-badge] : Creates a Kubernetes cluster using GKE. -* **[pre-existing-gke-cluster]** ![community-badge] ![experimental-badge] : Retrieves an existing - GKE cluster's attributes to be used in other modules as a substitute for creating a new cluster ([gke-cluster]). +* **[pre-existing-gke-cluster]** ![community-badge] ![experimental-badge] : Retrieves an existing GKE cluster. Substitute for ([gke-cluster]) module. * **[schedmd-slurm-gcp-v5-controller]** ![community-badge] : Creates a Slurm controller node using [slurm-gcp-version-5]. * **[schedmd-slurm-gcp-v5-login]** ![community-badge] : From 4ef1c47b775a8be022ffef7dc768ae92031c05e5 Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Tue, 11 Jun 2024 18:18:52 +0000 Subject: [PATCH 21/51] Add known gpu types and their accelerators to gke module Also extracted accelerator type from known gpu machine families --- community/examples/ml-gke.yaml | 13 +---- .../compute/gke-node-pool/gpu_definition.tf | 56 +++++++++++++++++++ .../modules/compute/gke-node-pool/main.tf | 39 +++++++++---- examples/README.md | 40 ++++++++++--- tools/duplicate-diff.py | 1 + 5 files changed, 119 insertions(+), 30 deletions(-) create mode 100644 community/modules/compute/gke-node-pool/gpu_definition.tf diff --git a/community/examples/ml-gke.yaml b/community/examples/ml-gke.yaml index d6ae26b173..6963474810 100644 --- a/community/examples/ml-gke.yaml +++ b/community/examples/ml-gke.yaml @@ -50,23 +50,16 @@ deployment_groups: cidr_block: $(vars.authorized_cidr) outputs: [instructions] - - id: g2-pool + - id: g2_pool source: community/modules/compute/gke-node-pool use: [gke_cluster] settings: disk_type: pd-balanced machine_type: g2-standard-4 - guest_accelerator: - - type: nvidia-l4 - count: 1 - gpu_partition_size: null - gpu_sharing_config: null - gpu_driver_installation_config: - - gpu_driver_version: "DEFAULT" - - id: job-template + - id: job_template 
source: community/modules/compute/gke-job-template - use: [g2-pool] + use: [g2_pool] settings: image: nvidia/cuda:11.0.3-runtime-ubuntu20.04 command: diff --git a/community/modules/compute/gke-node-pool/gpu_definition.tf b/community/modules/compute/gke-node-pool/gpu_definition.tf new file mode 100644 index 0000000000..c3c16542b1 --- /dev/null +++ b/community/modules/compute/gke-node-pool/gpu_definition.tf @@ -0,0 +1,56 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+*/ + +## Required variables: +# guest_accelerator +# machine_type + +locals { + # example state; terraform will ignore diffs if last element of URL matches + # guest_accelerator = [ + # { + # count = 1 + # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" + # }, + # ] + accelerator_machines = { + "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, + "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, + "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, + "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, + "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, + "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, + "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, + "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, + "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, + "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "g2-standard-4" = { type = "nvidia-l4", count = 1 }, + "g2-standard-8" = { type = "nvidia-l4", count = 1 }, + "g2-standard-12" = { type = "nvidia-l4", count = 1 }, + "g2-standard-16" = { type = "nvidia-l4", count = 1 }, + "g2-standard-24" = { type = "nvidia-l4", count = 2 }, + "g2-standard-32" = { type = "nvidia-l4", count = 1 }, + "g2-standard-48" = { type = "nvidia-l4", count = 4 }, + "g2-standard-96" = { type = "nvidia-l4", count = 8 }, + } + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + + # Select in priority order: + # (1) var.guest_accelerator if not empty + # (2) local.generated_guest_accelerator if not empty + # (3) default to empty list if both are empty + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) +} diff --git a/community/modules/compute/gke-node-pool/main.tf b/community/modules/compute/gke-node-pool/main.tf index aba0a64489..58e7117b75 100644 --- 
a/community/modules/compute/gke-node-pool/main.tf +++ b/community/modules/compute/gke-node-pool/main.tf @@ -22,7 +22,8 @@ locals { locals { sa_email = var.service_account_email != null ? var.service_account_email : data.google_compute_default_service_account.default_sa.email - has_gpu = var.guest_accelerator != null || contains(["a2", "g2"], local.machine_family) + preattached_gpu_machine_family = contains(["a2", "a3", "g2"], local.machine_family) + has_gpu = local.guest_accelerator != null || local.preattached_gpu_machine_family gpu_taint = local.has_gpu ? [{ key = "nvidia.com/gpu" value = "present" @@ -73,16 +74,26 @@ resource "google_container_node_pool" "node_pool" { } node_config { - disk_size_gb = var.disk_size_gb - disk_type = var.disk_type - resource_labels = local.labels - labels = var.kubernetes_labels - service_account = var.service_account_email - oauth_scopes = var.service_account_scopes - machine_type = var.machine_type - spot = var.spot - image_type = var.image_type - guest_accelerator = var.guest_accelerator + disk_size_gb = var.disk_size_gb + disk_type = var.disk_type + resource_labels = local.labels + labels = var.kubernetes_labels + service_account = var.service_account_email + oauth_scopes = var.service_account_scopes + machine_type = var.machine_type + spot = var.spot + image_type = var.image_type + + dynamic "guest_accelerator" { + for_each = local.guest_accelerator + content { + type = guest_accelerator.value.type + count = guest_accelerator.value.count + gpu_driver_installation_config = try(guest_accelerator.value.gpu_driver_installation_config, [{ gpu_driver_version = "DEFAULT" }]) + gpu_partition_size = try(guest_accelerator.value.gpu_partition_size, null) + gpu_sharing_config = try(guest_accelerator.value.gpu_sharing_config, null) + } + } dynamic "taint" { for_each = concat(var.taints, local.gpu_taint) @@ -158,6 +169,12 @@ resource "google_container_node_pool" "node_pool" { condition = !(var.local_ssd_count_ephemeral_storage > 0 && 
var.local_ssd_count_nvme_block > 0) error_message = "Only one of local_ssd_count_ephemeral_storage or local_ssd_count_nvme_block can be set to a non-zero value." } + + # non preattached gpu machine type should always have guest_accelerator config defined + precondition { + condition = (local.preattached_gpu_machine_family && length(var.guest_accelerator != null ? var.guest_accelerator : []) >= 0) || (!local.preattached_gpu_machine_family && length(var.guest_accelerator != null ? var.guest_accelerator : []) > 0) + error_message = "Non-GPU machine types should have user defined guest_accelerator values" + } } } diff --git a/examples/README.md b/examples/README.md index e1aba7e807..6b184a84cd 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1382,8 +1382,8 @@ Toolkit. It includes: > work. See note below. * Creation of a regional GKE cluster. -* Creation of an autoscaling GKE node pool with `g2` machines each with 1 - attached L4 GPUs. Note: This blueprint has also been tested with `a2` machines, +* Creation of an autoscaling GKE node pool with `g2` machines. + Note: This blueprint has also been tested with `a2` machines, but as capacity is hard to find the example uses `g2` machines which have better obtainability. If using with `a2` machines it is recommended to first obtain an automatic reservation. @@ -1395,15 +1395,37 @@ Toolkit. 
It includes: settings: disk_type: pd-balanced machine_type: a2-highgpu-2g - guest_accelerator: - - type: nvidia-tesla-a100 - count: 2 - gpu_partition_size: null - gpu_sharing_config: null - gpu_driver_installation_config: - - gpu_driver_version: "DEFAULT" ``` +Users only need to provide machine type for standard ["a2", "a3" and "g2"] machine families however for +other and custom machine families users need to provide the entire configuration as follows: + +```yaml +machine_type: n1-standard-1 +guest_accelerator: +- type: nvidia-tesla-t4 + count: 1 + gpu_partition_size: null + gpu_sharing_config: null + gpu_driver_installation_config: + - gpu_driver_version: "DEFAULT" +``` + +Custom g2 pool + +```yaml +machine_type: g2-custom-16-55296 +guest_accelerator: +- type: nvidia-l4 + count: 1 + gpu_partition_size: null + gpu_sharing_config: + - max_shared_clients_per_gpu: 2 + gpu_sharing_strategy: "TIME_SHARING" + gpu_driver_installation_config: + - gpu_driver_version: "LATEST" +``` + * Configuration of the cluster using default drivers provided by GKE. * Creation of a job template yaml file that can be used to submit jobs to the GPU node pool. 
diff --git a/tools/duplicate-diff.py b/tools/duplicate-diff.py index d67fb391b3..05cb9fb872 100644 --- a/tools/duplicate-diff.py +++ b/tools/duplicate-diff.py @@ -44,6 +44,7 @@ "community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf", "community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf", "community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf", + "community/modules/compute/gke-node-pool/gpu_definition.tf", ], [ "community/modules/compute/gke-node-pool/threads_per_core_calc.tf", From f6bfeb40dec12c1c1b1c8a7e0c7f77a1edebe823 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Tue, 25 Jun 2024 16:39:31 +0000 Subject: [PATCH 22/51] terraform required version updated --- community/modules/scheduler/pre-existing-gke-cluster/README.md | 2 +- .../modules/scheduler/pre-existing-gke-cluster/versions.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/scheduler/pre-existing-gke-cluster/README.md b/community/modules/scheduler/pre-existing-gke-cluster/README.md index 4bc8933908..ebd4950e1b 100644 --- a/community/modules/scheduler/pre-existing-gke-cluster/README.md +++ b/community/modules/scheduler/pre-existing-gke-cluster/README.md @@ -52,7 +52,7 @@ limitations under the License. 
| Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 0.14.0 | +| [terraform](#requirement\_terraform) | >= 1.0.0 | | [google](#requirement\_google) | > 5.0 | ## Providers diff --git a/community/modules/scheduler/pre-existing-gke-cluster/versions.tf b/community/modules/scheduler/pre-existing-gke-cluster/versions.tf index 9083cd5299..30d00afe9c 100644 --- a/community/modules/scheduler/pre-existing-gke-cluster/versions.tf +++ b/community/modules/scheduler/pre-existing-gke-cluster/versions.tf @@ -26,5 +26,5 @@ terraform { module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.35.0" } - required_version = ">= 0.14.0" + required_version = ">= 1.0.0" } From 9c777ca3288fa7923c933eb90fdad6d316e23350 Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Tue, 25 Jun 2024 17:13:40 +0000 Subject: [PATCH 23/51] Lost the commit so re-adding the changes --- community/modules/compute/gke-node-pool/main.tf | 6 ------ examples/README.md | 7 +++++-- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/community/modules/compute/gke-node-pool/main.tf b/community/modules/compute/gke-node-pool/main.tf index 58e7117b75..772c7ebaf7 100644 --- a/community/modules/compute/gke-node-pool/main.tf +++ b/community/modules/compute/gke-node-pool/main.tf @@ -169,12 +169,6 @@ resource "google_container_node_pool" "node_pool" { condition = !(var.local_ssd_count_ephemeral_storage > 0 && var.local_ssd_count_nvme_block > 0) error_message = "Only one of local_ssd_count_ephemeral_storage or local_ssd_count_nvme_block can be set to a non-zero value." } - - # non preattached gpu machine type should always have guest_accelerator config defined - precondition { - condition = (local.preattached_gpu_machine_family && length(var.guest_accelerator != null ? var.guest_accelerator : []) >= 0) || (!local.preattached_gpu_machine_family && length(var.guest_accelerator != null ? 
var.guest_accelerator : []) > 0) - error_message = "Non-GPU machine types should have user defined guest_accelerator values" - } } } diff --git a/examples/README.md b/examples/README.md index 6b184a84cd..18d5784996 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1397,8 +1397,11 @@ Toolkit. It includes: machine_type: a2-highgpu-2g ``` -Users only need to provide machine type for standard ["a2", "a3" and "g2"] machine families however for -other and custom machine families users need to provide the entire configuration as follows: + Users only need to provide machine type for standard ["a2", "a3" and "g2"] machine families, + while the other settings like `type`, `count` , `gpu_driver_installation_config` will default to + machine family specific values. + However, for other standard or custom machine families users will need to provide + the entire configuration as follows: ```yaml machine_type: n1-standard-1 From 6fa10c8be3d484c568e6dbb4f5e18b1c74612874 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 25 Jun 2024 16:50:32 -0500 Subject: [PATCH 24/51] Disable force_destroy on bucket created in A3 Mega blueprint. 
Prevents accidental deletion of manually created objects --- .../machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml index 5e227950ac..a9f9ff4679 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml @@ -46,7 +46,6 @@ deployment_groups: local_mount: /gcs mount_options: defaults,rw,_netdev,implicit_dirs,allow_other,implicit_dirs,file_mode=777,dir_mode=777 random_suffix: true - force_destroy: true - id: gpunets source: modules/network/multivpc From 3aa73de728a7156c557c9523ecd85c6cb3f7db50 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 26 Jun 2024 19:01:49 +0000 Subject: [PATCH 25/51] Bump djangorestframework in /community/front-end/ofe Bumps [djangorestframework](https://github.com/encode/django-rest-framework) from 3.14.0 to 3.15.2. - [Release notes](https://github.com/encode/django-rest-framework/releases) - [Commits](https://github.com/encode/django-rest-framework/compare/3.14.0...3.15.2) --- updated-dependencies: - dependency-name: djangorestframework dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- community/front-end/ofe/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index 45059632b8..1866c67131 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -22,7 +22,7 @@ git+https://github.com/jazzband/django-revproxy.git@d2234005135dc0771b7c4e0bb046 Django==4.2.11 django-allauth==0.54.0 django-extensions==3.2.3 -djangorestframework==3.14.0 +djangorestframework==3.15.2 filelock==3.12.2 google-api-core==2.11.1 google-api-python-client==2.90.0 From f3aae44c97f5f506bd8e0fec026cccc0f3c167a3 Mon Sep 17 00:00:00 2001 From: Sam Skillman Date: Wed, 26 Jun 2024 21:02:25 +0000 Subject: [PATCH 26/51] Revert "Remove installation of enroot and pyxis from a3-highgpu-8g blueprint" This reverts commit c4f87dd2e59a6303e572cb203000f9e1960b257a. --- .../a3-highgpu-8g/ml-slurm-a3-1-image.yaml | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml index d3f7cf9310..99b2f40838 100644 --- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml @@ -127,12 +127,34 @@ deployment_groups: [Install] WantedBy=multi-user.target - - type: data - destination: /etc/enroot/enroot.conf + - type: shell + destination: install_enroot_pyxis.sh content: | + #!/bin/bash + set -e -o pipefail + ### Setting up Enroot + if ! 
dpkg -l enroot &>/dev/null; then + arch=\$(dpkg --print-architecture) + curl -fSsL -O https://github.com/NVIDIA/enroot/releases/download/v3.4.1/enroot_3.4.1-1_${arch}.deb + curl -fSsL -O https://github.com/NVIDIA/enroot/releases/download/v3.4.1/enroot+caps_3.4.1-1_${arch}.deb # optional + apt-get update + apt-get install --assume-yes ./*.deb + rm enroot*.deb + fi + # configure enroot + # use single quotes around EOT to avoid shell interpolation + cat <<'EOT' > /etc/enroot/enroot.conf ENROOT_RUNTIME_PATH /mnt/localssd/${UID}/enroot/runtime ENROOT_CACHE_PATH /mnt/localssd/${UID}/enroot/cache ENROOT_DATA_PATH /mnt/localssd/${UID}/enroot/data + EOT + ### Install Pyxis + if [ ! -f "/usr/local/lib/slurm/spank_pyxis.so" ]; then + git clone --depth 1 https://github.com/NVIDIA/pyxis.git + cd pyxis && make install && cd - + rm -rf pyxis + echo "required /usr/local/lib/slurm/spank_pyxis.so" > /etc/slurm/plugstack.conf + fi - type: shell destination: install_mdadm.sh content: | From bb55e2fe9c6b3abefdd69f25f291d45d7ede60de Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Thu, 27 Jun 2024 21:14:20 +0000 Subject: [PATCH 27/51] Only enable gpu taints if guest_acclerator list is not empty --- community/modules/compute/gke-node-pool/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/modules/compute/gke-node-pool/main.tf b/community/modules/compute/gke-node-pool/main.tf index 772c7ebaf7..e8bb2f7145 100644 --- a/community/modules/compute/gke-node-pool/main.tf +++ b/community/modules/compute/gke-node-pool/main.tf @@ -23,7 +23,7 @@ locals { sa_email = var.service_account_email != null ? 
var.service_account_email : data.google_compute_default_service_account.default_sa.email preattached_gpu_machine_family = contains(["a2", "a3", "g2"], local.machine_family) - has_gpu = local.guest_accelerator != null || local.preattached_gpu_machine_family + has_gpu = (local.guest_accelerator != null && length(local.guest_accelerator) > 0) || local.preattached_gpu_machine_family gpu_taint = local.has_gpu ? [{ key = "nvidia.com/gpu" value = "present" From 56094d10a0e85ad923a75a2b8a39e15a62a42a0b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Jul 2024 10:24:43 +0000 Subject: [PATCH 28/51] Bump google.golang.org/api from 0.185.0 to 0.186.0 Bumps [google.golang.org/api](https://github.com/googleapis/google-api-go-client) from 0.185.0 to 0.186.0. - [Release notes](https://github.com/googleapis/google-api-go-client/releases) - [Changelog](https://github.com/googleapis/google-api-go-client/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-api-go-client/compare/v0.185.0...v0.186.0) --- updated-dependencies: - dependency-name: google.golang.org/api dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- go.mod | 6 +++--- go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index ace3c5616b..7a7435fb98 100644 --- a/go.mod +++ b/go.mod @@ -27,11 +27,11 @@ require ( github.com/hashicorp/terraform-exec v0.21.0 github.com/mattn/go-isatty v0.0.20 github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940 - google.golang.org/api v0.185.0 + google.golang.org/api v0.186.0 ) require ( - cloud.google.com/go/auth v0.5.1 // indirect + cloud.google.com/go/auth v0.6.0 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect dario.cat/mergo v1.0.0 // indirect github.com/apparentlymart/go-textseg/v15 v15.0.0 // indirect @@ -40,7 +40,7 @@ require ( github.com/felixge/httpsnoop v1.0.4 // indirect github.com/go-logr/logr v1.4.1 // indirect github.com/go-logr/stdr v1.2.2 // indirect - github.com/googleapis/gax-go/v2 v2.12.4 // indirect + github.com/googleapis/gax-go/v2 v2.12.5 // indirect github.com/hashicorp/terraform-json v0.22.1 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect diff --git a/go.sum b/go.sum index 2ac66a1a9d..0c9ffbf1dc 100644 --- a/go.sum +++ b/go.sum @@ -46,8 +46,8 @@ cloud.google.com/go/asset v1.8.0/go.mod h1:mUNGKhiqIdbr8X7KNayoYvyc4HbbFO9URsjby cloud.google.com/go/assuredworkloads v1.5.0/go.mod h1:n8HOZ6pff6re5KYfBXcFvSViQjDwxFkAkmUFffJRbbY= cloud.google.com/go/assuredworkloads v1.6.0/go.mod h1:yo2YOk37Yc89Rsd5QMVECvjaMKymF9OP+QXWlKXUkXw= cloud.google.com/go/assuredworkloads v1.7.0/go.mod h1:z/736/oNmtGAyU47reJgGN+KVoYoxeLBoj4XkKYscNI= -cloud.google.com/go/auth v0.5.1 h1:0QNO7VThG54LUzKiQxv8C6x1YX7lUrzlAa1nVLF8CIw= -cloud.google.com/go/auth v0.5.1/go.mod h1:vbZT8GjzDf3AVqCcQmqeeM32U9HBFc32vVVAbwDsa6s= +cloud.google.com/go/auth v0.6.0 h1:5x+d6b5zdezZ7gmLWD1m/xNjnaQ2YDhmIz/HH3doy1g= +cloud.google.com/go/auth v0.6.0/go.mod h1:b4acV+jLQDyjwm4OXHYjNvRi4jvGBzHWJRtJcy+2P4g= 
cloud.google.com/go/auth/oauth2adapt v0.2.2 h1:+TTV8aXpjeChS9M+aTtN/TjdQnzJvmzKFt//oWu7HX4= cloud.google.com/go/auth/oauth2adapt v0.2.2/go.mod h1:wcYjgpZI9+Yu7LyYBg4pqSiaRkfEK3GQcpb7C/uyF1Q= cloud.google.com/go/automl v1.5.0/go.mod h1:34EjfoFGMZ5sgJ9EoLsRtdPSNZLcfflJR39VbVNS2M0= @@ -373,8 +373,8 @@ github.com/googleapis/gax-go/v2 v2.3.0/go.mod h1:b8LNqSzNabLiUpXKkY7HAR5jr6bIT99 github.com/googleapis/gax-go/v2 v2.4.0/go.mod h1:XOTVJ59hdnfJLIP/dh8n5CGryZR2LxK9wbMD5+iXC6c= github.com/googleapis/gax-go/v2 v2.5.1/go.mod h1:h6B0KMMFNtI2ddbGJn3T3ZbwkeT6yqEF02fYlzkUCyo= github.com/googleapis/gax-go/v2 v2.6.0/go.mod h1:1mjbznJAPHFpesgE5ucqfYEscaz5kMdcIDwU/6+DDoY= -github.com/googleapis/gax-go/v2 v2.12.4 h1:9gWcmF85Wvq4ryPFvGFaOgPIs1AQX0d0bcbGw4Z96qg= -github.com/googleapis/gax-go/v2 v2.12.4/go.mod h1:KYEYLorsnIGDi/rPC8b5TdlB9kbKoFubselGIoBMCwI= +github.com/googleapis/gax-go/v2 v2.12.5 h1:8gw9KZK8TiVKB6q3zHY3SBzLnrGp6HQjyfYBYGmXdxA= +github.com/googleapis/gax-go/v2 v2.12.5/go.mod h1:BUDKcWo+RaKq5SC9vVYL0wLADa3VcfswbOMMRmB9H3E= github.com/googleapis/go-type-adapters v1.0.0/go.mod h1:zHW75FOG2aur7gAO2B+MLby+cLsWGBF62rFAi7WjWO4= github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= @@ -868,8 +868,8 @@ google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ google.golang.org/api v0.97.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.98.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.100.0/go.mod h1:ZE3Z2+ZOr87Rx7dqFsdRQkRBk36kDtp/h+QpHbB7a70= -google.golang.org/api v0.185.0 h1:ENEKk1k4jW8SmmaT6RE+ZasxmxezCrD5Vw4npvr+pAU= -google.golang.org/api v0.185.0/go.mod h1:HNfvIkJGlgrIlrbYkAm9W9IdkmKZjOTVh33YltygGbg= +google.golang.org/api v0.186.0 h1:n2OPp+PPXX0Axh4GuSsL5QL8xQCTb2oDwyzPnQvqUug= +google.golang.org/api v0.186.0/go.mod 
h1:hvRbBmgoje49RV3xqVXrmP6w93n6ehGgIVPYrGtBFFc= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= From 8a70b97d43d662e653ae9596a090c0b510c59831 Mon Sep 17 00:00:00 2001 From: aneo-ssam Date: Mon, 1 Jul 2024 14:53:11 +0200 Subject: [PATCH 29/51] Set default disk type to pd-balanced --- community/modules/compute/htcondor-execute-point/README.md | 2 +- community/modules/compute/htcondor-execute-point/variables.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index dabaa58fef..8a08efd40f 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -229,7 +229,7 @@ limitations under the License. | [central\_manager\_ips](#input\_central\_manager\_ips) | List of IP addresses of HTCondor Central Managers | `list(string)` | n/a | yes | | [deployment\_name](#input\_deployment\_name) | HPC Toolkit deployment name. HTCondor cloud resource names will include this value. | `string` | n/a | yes | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `number` | `100` | no | -| [disk\_type](#input\_disk\_type) | Disk type for template | `string` | `"pd-standard"` | no | +| [disk\_type](#input\_disk\_type) | Disk type for template | `string` | `"pd-balanced"` | no | | [distribution\_policy\_target\_shape](#input\_distribution\_policy\_target\_shape) | Target shape across zones for instance group managing execute points | `string` | `"ANY"` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. 
| `string` | `"ENABLE"` | no | | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration (var.shielded\_instance\_config). | `bool` | `false` | no | diff --git a/community/modules/compute/htcondor-execute-point/variables.tf b/community/modules/compute/htcondor-execute-point/variables.tf index 2c54efe9ff..e6607ef76f 100644 --- a/community/modules/compute/htcondor-execute-point/variables.tf +++ b/community/modules/compute/htcondor-execute-point/variables.tf @@ -177,7 +177,7 @@ variable "disk_size_gb" { variable "disk_type" { description = "Disk type for template" type = string - default = "pd-standard" + default = "pd-balanced" } variable "windows_startup_ps1" { From 7eff272d8b77f9077ac3d6ea446d9fc51f1ff011 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Niesiob=C4=99dzki?= Date: Mon, 1 Jul 2024 15:45:53 +0000 Subject: [PATCH 30/51] Add controller and login instances to outputs --- .../schedmd-slurm-gcp-v6-controller/README.md | 2 ++ .../schedmd-slurm-gcp-v6-controller/outputs.tf | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 622eaf5835..5a2a9b0672 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -298,4 +298,6 @@ limitations under the License. | [instructions](#output\_instructions) | Post deployment instructions. | | [slurm\_bucket\_path](#output\_slurm\_bucket\_path) | Bucket path used by cluster. | | [slurm\_cluster\_name](#output\_slurm\_cluster\_name) | Slurm cluster name. 
| +| [slurm\_controller\_instance](#output\_slurm\_controller\_instance) | Compute instance of controller node | +| [slurm\_login\_instances](#output\_slurm\_login\_instances) | Compute instances of login nodes | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/outputs.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/outputs.tf index 0cf4521f5f..06ffb93594 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/outputs.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/outputs.tf @@ -17,6 +17,16 @@ output "slurm_cluster_name" { value = local.slurm_cluster_name } +output "slurm_controller_instance" { + description = "Compute instance of controller node" + value = module.slurm_controller_instance.slurm_instances[0] +} + +output "slurm_login_instances" { + description = "Compute instances of login nodes" + value = flatten([for k, v in module.slurm_login_instance : v.slurm_instances]) +} + output "slurm_bucket_path" { description = "Bucket path used by cluster." value = module.slurm_files.slurm_bucket_path From 251ce6e8f85a5f1c756d1ba08f2b68d4088e4019 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 1 Jul 2024 15:11:38 -0500 Subject: [PATCH 31/51] Move GCESysPrep to provisioner in Windows scripts We have observed that GCESysPrep is not always given enough time to run at shutdown when run from the shutdown script metadata. This moves the command to a PowerShell provisioner and continues to use the -no_shutdown argument to ensure that GCESysPrep itself does not power down the Packer VM. 
--- modules/packer/custom-image/image.pkr.hcl | 9 ++++++++- .../expectations/igc_pkr/one/image/image.pkr.hcl | 9 ++++++++- .../expectations/text_escape/zero/lime/image.pkr.hcl | 9 ++++++++- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/modules/packer/custom-image/image.pkr.hcl b/modules/packer/custom-image/image.pkr.hcl index d742fed982..44e34d95ed 100644 --- a/modules/packer/custom-image/image.pkr.hcl +++ b/modules/packer/custom-image/image.pkr.hcl @@ -52,7 +52,6 @@ locals { sysprep-specialize-script-cmd = "winrm quickconfig -quiet & net user /add ${local.windows_packer_user} & net localgroup administrators ${local.windows_packer_user} /add & winrm set winrm/config/service/auth @{Basic=\\\"true\\\"}" windows-shutdown-script-cmd = <<-EOT net user /delete ${local.windows_packer_user} - GCESysprep -no_shutdown EOT } user_metadata = local.communicator == "winrm" ? local.windows_user_metadata : local.linux_user_metadata @@ -159,6 +158,14 @@ build { } } + dynamic "provisioner" { + labels = ["powershell"] + for_each = local.communicator == "winrm" ? 
[1] : [] # GCESysprep applies to Windows (WinRM) builds only + content { + inline = ["GCESysprep -no_shutdown"] + } + } + # provisioner "ansible-local" blocks # this installs custom roles/collections from ansible-galaxy in /home/packer # which will be removed at the end; consider modifying /etc/ansible/ansible.cfg diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl index d742fed982..44e34d95ed 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl @@ -52,7 +52,6 @@ locals { sysprep-specialize-script-cmd = "winrm quickconfig -quiet & net user /add ${local.windows_packer_user} & net localgroup administrators ${local.windows_packer_user} /add & winrm set winrm/config/service/auth @{Basic=\\\"true\\\"}" windows-shutdown-script-cmd = <<-EOT net user /delete ${local.windows_packer_user} - GCESysprep -no_shutdown EOT } user_metadata = local.communicator == "winrm" ? local.windows_user_metadata : local.linux_user_metadata @@ -159,6 +158,14 @@ build { } } + dynamic "provisioner" { + labels = ["powershell"] + for_each = local.communicator == "winrm" ? 
[1] : [] # GCESysprep applies to Windows (WinRM) builds only + content { + inline = ["GCESysprep -no_shutdown"] + } + } + # provisioner "ansible-local" blocks # this installs custom roles/collections from ansible-galaxy in /home/packer # which will be removed at the end; consider modifying /etc/ansible/ansible.cfg diff --git a/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl b/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl index d742fed982..44e34d95ed 100644 --- a/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl +++ b/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl @@ -52,7 +52,6 @@ locals { sysprep-specialize-script-cmd = "winrm quickconfig -quiet & net user /add ${local.windows_packer_user} & net localgroup administrators ${local.windows_packer_user} /add & winrm set winrm/config/service/auth @{Basic=\\\"true\\\"}" windows-shutdown-script-cmd = <<-EOT net user /delete ${local.windows_packer_user} - GCESysprep -no_shutdown EOT } user_metadata = local.communicator == "winrm" ? local.windows_user_metadata : local.linux_user_metadata @@ -159,6 +158,14 @@ build { } } + dynamic "provisioner" { + labels = ["powershell"] + for_each = local.communicator == "winrm" ? 
[1] : [] # GCESysprep applies to Windows (WinRM) builds only + content { + inline = ["GCESysprep -no_shutdown"] + } + } + # provisioner "ansible-local" blocks # this installs custom roles/collections from ansible-galaxy in /home/packer # which will be removed at the end; consider modifying /etc/ansible/ansible.cfg From 1e30ab973425b030dd0aa3e02d8f5d42f57aa787 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 25 Jun 2024 23:17:32 +0000 Subject: [PATCH 32/51] Add parallelstore support in pre-existing-network-storage module --- .../pre-existing-network-storage/outputs.tf | 18 +++- .../scripts/install-daos-client.sh | 84 +++++++++++++++++++ .../scripts/mount-daos.sh | 40 +++++++++ 3 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh create mode 100644 modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh diff --git a/modules/file-system/pre-existing-network-storage/outputs.tf b/modules/file-system/pre-existing-network-storage/outputs.tf index aecf9b41ea..def20d97c8 100644 --- a/modules/file-system/pre-existing-network-storage/outputs.tf +++ b/modules/file-system/pre-existing-network-storage/outputs.tf @@ -35,6 +35,7 @@ locals { remote_mount = contains(local.mount_vanilla_supported_fstype, var.fs_type) ? 
( local.remote_mount_with_slash ) : var.remote_mount + # Client Install ddn_lustre_client_install_script = templatefile( "${path.module}/templates/ddn_exascaler_luster_client_install.tftpl", @@ -44,19 +45,22 @@ locals { local_mount = var.local_mount } ) - nfs_client_install_script = file("${path.module}/scripts/install-nfs-client.sh") - gcs_fuse_install_script = file("${path.module}/scripts/install-gcs-fuse.sh") + nfs_client_install_script = file("${path.module}/scripts/install-nfs-client.sh") + gcs_fuse_install_script = file("${path.module}/scripts/install-gcs-fuse.sh") + daos_client_install_script = file("${path.module}/scripts/install-daos-client.sh") install_scripts = { "lustre" = local.ddn_lustre_client_install_script "nfs" = local.nfs_client_install_script "gcsfuse" = local.gcs_fuse_install_script + "daos" = local.daos_client_install_script } client_install_runner = { "type" = "shell" "content" = lookup(local.install_scripts, var.fs_type, "echo 'skipping: client_install_runner not yet supported for ${var.fs_type}'") "destination" = "install_filesystem_client${replace(var.local_mount, "/", "_")}.sh" + "args" = var.fs_type == "daos" ? 
"--access_points=\"${var.remote_mount}\"" : "" } mount_vanilla_supported_fstype = ["lustre", "nfs"] @@ -77,11 +81,21 @@ locals { "args" = "\"not-used\" \"${local.gcsbucket}\" \"${var.local_mount}\" \"${var.fs_type}\" \"${var.mount_options}\"" "content" = file("${path.module}/scripts/mount.sh") } + + mount_runner_daos = { + "type" = "shell" + "content" = file("${path.module}/scripts/mount-daos.sh") + "args" = "--local_mount=\"${var.local_mount}\" --mount_options=\"${var.mount_options}\"" + "destination" = "mount_filesystem${replace(var.local_mount, "/", "_")}.sh" + } + mount_scripts = { "lustre" = local.mount_runner_vanilla "nfs" = local.mount_runner_vanilla "gcsfuse" = local.mount_runner_gcsfuse + "daos" = local.mount_runner_daos } + mount_runner = lookup(local.mount_scripts, var.fs_type, local.mount_runner_vanilla) } diff --git a/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh b/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh new file mode 100644 index 0000000000..84354710d2 --- /dev/null +++ b/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +# Parse access_points. 
+for arg in "$@"; do + if [[ $arg == --access_points=* ]]; then + access_points="${arg#*=}" + fi +done + +# Install the DAOS client library +# The following commands should be executed on each client vm. +## For Rocky linux 8. +if grep -q "ID=\"rocky\"" /etc/os-release && lsb_release -rs | grep -q "8\.[0-9]"; then + + # 1) Add the Parallelstore package repository + tee /etc/yum.repos.d/parallelstore-v2-4-el8.repo < --disable-wb-cache --eq-count=8. +for arg in "$@"; do + if [[ $arg == --local_mount=* ]]; then + local_mount="${arg#*=}" + fi + if [[ $arg == --mount_options=* ]]; then + mount_options="${arg#*=}" + mount_options="--${mount_options//,/ --}" + fi +done + +# Mount parallelstore instance to client vm. +mkdir -p "$local_mount" +chmod 777 "$local_mount" + +# Mount container for multi-user. +fuse_config=/etc/fuse.conf +sed -i "s/#.*user_allow_other/user_allow_other/g" $fuse_config +dfuse -m "$local_mount" --pool default-pool --container default-container --multi-user -o "$mount_options" + +exit 0 From c2077f6b231495e38d6890dd60e560113463cb29 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Mon, 24 Jun 2024 18:52:17 +0000 Subject: [PATCH 33/51] Add parallelstore module and support for rocky 8, ubuntu 22.04 and debian 12 --- examples/README.md | 35 ++++++++ modules/README.md | 2 + modules/file-system/parallelstore/README.md | 68 ++++++++++++++ modules/file-system/parallelstore/main.tf | 66 ++++++++++++++ .../file-system/parallelstore/metadata.yaml | 19 ++++ modules/file-system/parallelstore/outputs.tf | 40 +++++++++ .../scripts/install-daos-client.sh | 84 +++++++++++++++++ .../parallelstore/scripts/mount-daos.sh | 40 +++++++++ .../file-system/parallelstore/variables.tf | 89 +++++++++++++++++++ modules/file-system/parallelstore/versions.tf | 36 ++++++++ 10 files changed, 479 insertions(+) create mode 100644 modules/file-system/parallelstore/README.md create mode 100644 modules/file-system/parallelstore/main.tf create mode 100644 
modules/file-system/parallelstore/metadata.yaml create mode 100644 modules/file-system/parallelstore/outputs.tf create mode 100644 modules/file-system/parallelstore/scripts/install-daos-client.sh create mode 100644 modules/file-system/parallelstore/scripts/mount-daos.sh create mode 100644 modules/file-system/parallelstore/variables.tf create mode 100644 modules/file-system/parallelstore/versions.tf diff --git a/examples/README.md b/examples/README.md index 18d5784996..6a946e5061 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1888,3 +1888,38 @@ To avoid these issues, the `ghpc_stage` function can be used to copy a file (or The `ghpc_stage` function will always look first in the path specified in the blueprint. If the file is not found at this path then `ghpc_stage` will look for the staged file in the deployment folder, if a deployment folder exists. This means that you can redeploy a blueprint (`ghpc deploy -w`) so long as you have the deployment folder from the original deployment, even if locally referenced files are not available. + + +## Requirements + +No requirements. + +## Providers + +| Name | Version | +|------|---------| +| [google-beta](#provider\_google-beta) | n/a | + +## Modules + +No modules. 
+ +## Resources + +| Name | Type | +|------|------| +| [google-beta_google_compute_global_address.private_ip_alloc](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_compute_global_address) | resource | +| [google-beta_google_compute_network.network](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_compute_network) | resource | +| [google-beta_google_parallelstore_instance.instance](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_parallelstore_instance) | resource | +| [google-beta_google_service_networking_connection.default](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_service_networking_connection) | resource | + +## Inputs + +No inputs. + +## Outputs + +| Name | Description | +|------|-------------| +| [access\_points](#output\_access\_points) | Output access points | + diff --git a/modules/README.md b/modules/README.md index 41055bb85a..fa5f7c1409 100644 --- a/modules/README.md +++ b/modules/README.md @@ -91,6 +91,7 @@ Modules that are still in development and less stable are labeled with the ### File System * **[filestore]** ![core-badge] : Creates a [filestore](https://cloud.google.com/filestore) file system. +* **[parallelstore]** ![core-badge] ![experimental-badge]: Creates a [parallelstore](https://cloud.google.com/parallelstore) file system. * **[pre-existing-network-storage]** ![core-badge] : Specifies a pre-existing file system that can be mounted on a VM. * **[DDN-EXAScaler]** ![community-badge] : Creates @@ -105,6 +106,7 @@ Modules that are still in development and less stable are labeled with the configures an NFS server that can be mounted by other VM. 
[filestore]: file-system/filestore/README.md +[parallelstore]: file-system/parallelstore/README.md [pre-existing-network-storage]: file-system/pre-existing-network-storage/README.md [ddn-exascaler]: ../community/modules/file-system/DDN-EXAScaler/README.md [intel-daos]: ../community/modules/file-system/Intel-DAOS/README.md diff --git a/modules/file-system/parallelstore/README.md b/modules/file-system/parallelstore/README.md new file mode 100644 index 0000000000..32643d965c --- /dev/null +++ b/modules/file-system/parallelstore/README.md @@ -0,0 +1,68 @@ + +Copyright 2024 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 0.13 | +| [google-beta](#requirement\_google-beta) | >= 5.25.0 | +| [null](#requirement\_null) | ~> 3.0 | +| [random](#requirement\_random) | ~> 3.0 | + +## Providers + +| Name | Version | +|------|---------| +| [google-beta](#provider\_google-beta) | >= 5.25.0 | +| [null](#provider\_null) | ~> 3.0 | +| [random](#provider\_random) | ~> 3.0 | + +## Modules + +No modules. 
+ +## Resources + +| Name | Type | +|------|------| +| [google-beta_google_parallelstore_instance.instance](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_parallelstore_instance) | resource | +| [null_resource.hydration](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | +| [random_id.resource_name_suffix](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/id) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment. | `string` | n/a | yes | +| [destination\_hydration\_parallelstore](#input\_destination\_hydration\_parallelstore) | The name of local path to import data on parallelstore instance from GCS bucket. | `string` | `"/"` | no | +| [labels](#input\_labels) | Labels to add to parallel store instance. | `map(string)` | `{}` | no | +| [local\_mount](#input\_local\_mount) | The mount point where the contents of the device may be accessed after mounting. | `string` | `"/parallelstore"` | no | +| [mount\_options](#input\_mount\_options) | Options describing various aspects of the parallelstore instance. | `string` | `"disable-wb-cache,thread-count=16,eq-count=8"` | no | +| [name](#input\_name) | Name of parallelstore instance. | `string` | `null` | no | +| [network\_id](#input\_network\_id) | The ID of the GCE VPC network to which the instance is connected given in the format:
`projects/<project_id>/global/networks/<network_name>`" | `string` | n/a | yes | +| [private\_vpc\_connection\_peering](#input\_private\_vpc\_connection\_peering) | The name of the VPC Network peering connection. | `string` | n/a | yes | +| [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes | +| [size\_gb](#input\_size\_gb) | Storage size of the parallelstore instance in GB. | `number` | `12000` | no | +| [source\_gcs\_bucket\_uri](#input\_source\_gcs\_bucket\_uri) | The name of the GCS bucket to import data from to parallelstore. | `string` | `""` | no | +| [zone](#input\_zone) | Location for parallelstore instance. | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [instructions](#output\_instructions) | Instructions to monitor import-data operation from GCS bucket to parallelstore. | +| [network\_storage](#output\_network\_storage) | Describes a parallelstore instance. | + diff --git a/modules/file-system/parallelstore/main.tf b/modules/file-system/parallelstore/main.tf new file mode 100644 index 0000000000..7d3ee33c79 --- /dev/null +++ b/modules/file-system/parallelstore/main.tf @@ -0,0 +1,66 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +locals { + # This label allows for billing report tracking based on module. 
+ labels = merge(var.labels, { ghpc_module = "parallelstore", ghpc_role = "file-system" }) +} + +locals { + fs_type = "daos" + server_ip = "" + remote_mount = "" + access_points = jsonencode(google_parallelstore_instance.instance.access_points) + + client_install_runner = { + "type" = "shell" + "source" = "${path.module}/scripts/install-daos-client.sh" + "args" = "--access_points=\"${local.access_points}\"" + "destination" = "install_daos_client.sh" + } + + mount_runner = { + "type" = "shell" + "source" = "${path.module}/scripts/mount-daos.sh" + "args" = "--local_mount=\"${var.local_mount}\" --mount_options=\"${var.mount_options}\"" + "destination" = "mount_daos.sh" + } +} + +resource "random_id" "resource_name_suffix" { + byte_length = 4 +} + +resource "google_parallelstore_instance" "instance" { + instance_id = var.name != null ? var.name : "${var.deployment_name}-${random_id.resource_name_suffix.hex}" + location = var.zone + capacity_gib = var.size_gb + network = var.network_id + + labels = local.labels + + provider = google-beta + depends_on = [var.private_vpc_connection_peering] +} + +resource "null_resource" "hydration" { + count = var.source_gcs_bucket_uri != "" ? 
1 : 0 + + depends_on = [resource.google_parallelstore_instance.instance] + provisioner "local-exec" { + command = "curl -X POST -H \"Content-Type: application/json\" -H \"Authorization: Bearer $(gcloud auth print-access-token)\" -d '{\"source_gcs_bucket\": {\"uri\":\"${var.source_gcs_bucket_uri}\"}, \"destination_parallelstore\": {\"path\":\"${var.destination_hydration_parallelstore}\"}}' https://parallelstore.googleapis.com/v1beta/projects/${var.project_id}/locations/${var.zone}/instances/${google_parallelstore_instance.instance.instance_id}:importData" # use the resolved instance_id: var.name may be null, in which case the instance gets a generated name + } +} diff --git a/modules/file-system/parallelstore/metadata.yaml b/modules/file-system/parallelstore/metadata.yaml new file mode 100644 index 0000000000..c0994d15bb --- /dev/null +++ b/modules/file-system/parallelstore/metadata.yaml @@ -0,0 +1,19 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +spec: + requirements: + services: + - parallelstore.googleapis.com diff --git a/modules/file-system/parallelstore/outputs.tf b/modules/file-system/parallelstore/outputs.tf new file mode 100644 index 0000000000..6bae6b892d --- /dev/null +++ b/modules/file-system/parallelstore/outputs.tf @@ -0,0 +1,40 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +locals { + operation_instructions = <<-EOT + Data is being imported from GCS bucket to parallelstore instance. It may + not be available immediately. + EOT +} + +output "network_storage" { + description = "Describes a parallelstore instance." + value = { + server_ip = local.server_ip + remote_mount = local.remote_mount + local_mount = var.local_mount + fs_type = local.fs_type + mount_options = var.mount_options + client_install_runner = local.client_install_runner + mount_runner = local.mount_runner + } +} + +output "instructions" { + description = "Instructions to monitor import-data operation from GCS bucket to parallelstore." + value = var.source_gcs_bucket_uri != "" ? local.operation_instructions : "Data is not imported from GCS bucket." +} diff --git a/modules/file-system/parallelstore/scripts/install-daos-client.sh b/modules/file-system/parallelstore/scripts/install-daos-client.sh new file mode 100644 index 0000000000..84354710d2 --- /dev/null +++ b/modules/file-system/parallelstore/scripts/install-daos-client.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +# Parse access_points. +for arg in "$@"; do + if [[ $arg == --access_points=* ]]; then + access_points="${arg#*=}" + fi +done + +# Install the DAOS client library +# The following commands should be executed on each client vm. +## For Rocky linux 8. +if grep -q "ID=\"rocky\"" /etc/os-release && lsb_release -rs | grep -q "8\.[0-9]"; then + + # 1) Add the Parallelstore package repository + tee /etc/yum.repos.d/parallelstore-v2-4-el8.repo < --disable-wb-cache --eq-count=8. +for arg in "$@"; do + if [[ $arg == --local_mount=* ]]; then + local_mount="${arg#*=}" + fi + if [[ $arg == --mount_options=* ]]; then + mount_options="${arg#*=}" + mount_options="--${mount_options//,/ --}" + fi +done + +# Mount parallelstore instance to client vm. +mkdir -p "$local_mount" +chmod 777 "$local_mount" + +# Mount container for multi-user. +fuse_config=/etc/fuse.conf +sed -i "s/#.*user_allow_other/user_allow_other/g" $fuse_config +dfuse -m "$local_mount" --pool default-pool --container default-container --multi-user -o "$mount_options" + +exit 0 diff --git a/modules/file-system/parallelstore/variables.tf b/modules/file-system/parallelstore/variables.tf new file mode 100644 index 0000000000..2f6696f35f --- /dev/null +++ b/modules/file-system/parallelstore/variables.tf @@ -0,0 +1,89 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "project_id" { + description = "Project in which the HPC deployment will be created." + type = string +} + +variable "deployment_name" { + description = "Name of the HPC deployment." + type = string +} + +variable "name" { + description = "Name of parallelstore instance." + type = string + default = null +} + +variable "zone" { + description = "Location for parallelstore instance." + type = string +} + +variable "size_gb" { + description = "Storage size of the parallelstore instance in GB." + type = number + default = 12000 +} + +variable "labels" { + description = "Labels to add to parallel store instance." + type = map(string) + default = {} +} + +variable "local_mount" { + description = "The mount point where the contents of the device may be accessed after mounting." + type = string + default = "/parallelstore" +} + +variable "mount_options" { + description = "Options describing various aspects of the parallelstore instance." + type = string + default = "disable-wb-cache,thread-count=16,eq-count=8" +} + +variable "private_vpc_connection_peering" { + description = "The name of the VPC Network peering connection." + type = string +} + +variable "network_id" { + description = <<-EOT + The ID of the GCE VPC network to which the instance is connected given in the format: + `projects//global/networks/`" + EOT + type = string + validation { + condition = length(split("/", var.network_id)) == 5 + error_message = "The network id must be provided in the following format: projects//global/networks/." + } +} + +variable "source_gcs_bucket_uri" { + description = "The name of the GCS bucket to import data from to parallelstore." + type = string + default = "" +} + +variable "destination_hydration_parallelstore" { + description = "The name of local path to import data on parallelstore instance from GCS bucket." 
+ type = string + default = "/" +} diff --git a/modules/file-system/parallelstore/versions.tf b/modules/file-system/parallelstore/versions.tf new file mode 100644 index 0000000000..24069a479c --- /dev/null +++ b/modules/file-system/parallelstore/versions.tf @@ -0,0 +1,36 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +terraform { + required_version = ">= 0.13" + + required_providers { + google-beta = { + source = "hashicorp/google-beta" + version = ">= 5.25.0" + } + + random = { + source = "hashicorp/random" + version = "~> 3.0" + } + + null = { + source = "hashicorp/null" + version = "~> 3.0" + } + } +} From 98b6abb9dcf2f577ac603f2181d11ba186663e39 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Mon, 1 Jul 2024 21:49:16 +0000 Subject: [PATCH 34/51] Add parallelstore driver and mount scripts in tools/diff --- tools/duplicate-diff.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/duplicate-diff.py b/tools/duplicate-diff.py index 05cb9fb872..33de0e5565 100644 --- a/tools/duplicate-diff.py +++ b/tools/duplicate-diff.py @@ -75,7 +75,15 @@ [ "community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/long-prolog-slurm.conf.tpl", "community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/long-prolog-slurm.conf.tpl", - ] + ], + [ + "modules/file-system/parallelstore/scripts/install-daos-client.sh", + 
"modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh", + ], + [ + "modules/file-system/parallelstore/scripts/mount-daos.sh", + "modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh", + ], ] for group in duplicates: From e89a9fa60441a67568d9f21b436a3ed20a554e20 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 14 Jun 2024 22:40:12 +0000 Subject: [PATCH 35/51] Add `schedmd-slurm-gcp-v6-nodeset-dynamic` module --- .../README.md | 134 +++++++ .../gpu_definition.tf | 56 +++ .../main.tf | 100 +++++ .../metadata.yaml | 20 + .../outputs.tf | 36 ++ .../source_image_logic.tf | 72 ++++ .../variables.tf | 345 ++++++++++++++++++ .../versions.tf | 29 ++ modules/README.md | 3 + tools/duplicate-diff.py | 2 + 10 files changed, 797 insertions(+) create mode 100644 community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md create mode 100644 community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf create mode 100644 community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf create mode 100644 community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/metadata.yaml create mode 100644 community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/outputs.tf create mode 100644 community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf create mode 100644 community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf create mode 100644 community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md new file mode 100644 index 0000000000..4b57cd5dfa --- /dev/null +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -0,0 +1,134 @@ +## Description + +This module creates an instance template to be used by dynamic nodes, +also it creates a nodeset data 
structure intended to be input to the +[schedmd-slurm-gcp-v6-partition](../schedmd-slurm-gcp-v6-partition/) module. + +### Example + +The following code snippet creates an instance template to be used by MIG. + +```yaml + - id: dynamic_ns + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic + use: [network, controller] + settings: + machine_type: n2-standard-2 + + - id: dynamic_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [dynamic_ns] + settings: + partition_name: mp + is_default: true + + - id: controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller + use: [network, dynamic_partition] + + - id: mig + source: community/modules/compute/mig + settings: + versions: + - name: highlander # there can be only one + instance_template: $(dynamic_ns.instance_template_self_link) + base_instance_name: $(dynamic_ns.node_name_prefix) +``` + +## Custom Images + +For more information on creating valid custom images for the node group VM +instances or for custom instance templates, see our [vm-images.md] documentation +page. + +[vm-images.md]: ../../../../docs/vm-images.md#slurm-on-gcp-custom-images + +## GPU Support + +More information on GPU support in Slurm on GCP and other HPC Toolkit modules +can be found at [docs/gpu-support.md](../../../../docs/gpu-support.md) + +## Support +The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform +modules. For support with the underlying modules, see the instructions in the +[slurm-gcp README][slurm-gcp-readme]. 
+ +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | +| [google](#requirement\_google) | >= 5.11 | + +## Providers + +| Name | Version | +|------|---------| +| [google](#provider\_google) | >= 5.11 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.8 | + +## Resources + +| Name | Type | +|------|------| +| [google_compute_default_service_account.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | +| [google_compute_image.slurm](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | +| [additional\_disks](#input\_additional\_disks) | Configurations of additional disks to be included on the partition nodes. (do not use "disk\_type: local-ssd"; known issue being addressed) |
list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | +| [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | +| [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | +| [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | +| [disk\_size\_gb](#input\_disk\_size\_gb) | Size of boot disk to create for the partition compute nodes. | `number` | `50` | no | +| [disk\_type](#input\_disk\_type) | Boot disk type, can be either hyperdisk-balanced, hyperdisk-extreme, pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-standard"` | no | +| [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | +| [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | +| [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no | +| [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | +| [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | +| [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | +| [feature](#input\_feature) | The node feature, used to bind nodes to the nodeset. If not set, the nodeset name will be used. | `string` | `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-5-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | +| [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | +| [machine\_type](#input\_machine\_type) | Compute Platform machine type to use for this partition compute nodes. | `string` | `"c2-standard-60"` | no | +| [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | +| [min\_cpu\_platform](#input\_min\_cpu\_platform) | The name of the minimum CPU platform that you want the instance to use. | `string` | `null` | no | +| [name](#input\_name) | Name of the nodeset. Automatically populated by the module id if not set.
If setting manually, ensure a unique value across all nodesets. | `string` | n/a | yes | +| [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy.

Note: Placement groups are not supported when on\_host\_maintenance is set to
"MIGRATE" and will be deactivated regardless of the value of
enable\_placement. To support enable\_placement, ensure on\_host\_maintenance is
set to "TERMINATE". | `string` | `"TERMINATE"` | no | +| [preemptible](#input\_preemptible) | Should use preemptibles to burst. | `bool` | `false` | no | +| [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | +| [region](#input\_region) | The default region for Cloud resources. | `string` | n/a | yes | +| [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to attach to the compute instances. | `string` | `null` | no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to attach to the compute instances. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance. Note: not used unless
enable\_shielded\_vm is 'true'.
- enable\_integrity\_monitoring : Compare the most recent boot measurements to the
integrity policy baseline and return a pair of pass/fail results depending on
whether they match or not.
- enable\_secure\_boot : Verify the digital signature of all boot components, and
halt the boot process if signature verification fails.
- enable\_vtpm : Use a virtualized trusted platform module, which is a
specialized computer chip you can use to encrypt objects like keys and
certificates. |
object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | +| [slurm\_bucket\_path](#input\_slurm\_bucket\_path) | Path to the Slurm bucket. | `string` | n/a | yes | +| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Name of the Slurm cluster. | `string` | n/a | yes | +| [spot\_instance\_config](#input\_spot\_instance\_config) | Configuration for spot VMs. |
object({
termination_action = string
})
| `null` | no | +| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | n/a | yes | +| [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [instance\_template\_self\_link](#output\_instance\_template\_self\_link) | The URI of the template. | +| [node\_name\_prefix](#output\_node\_name\_prefix) | The prefix to be used for the node names.

Make sure that nodes are named `-`
This temporary required for proper functioning of the nodes.
While Slurm scheduler uses "features" to bind node and nodeset,
the SlurmGCP relies on node names for this (to be switched to features as well). | +| [nodeset\_dyn](#output\_nodeset\_dyn) | Details of the nodeset. Typically used as input to `schedmd-slurm-gcp-v6-partition`. | + diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf new file mode 100644 index 0000000000..c3c16542b1 --- /dev/null +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf @@ -0,0 +1,56 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+*/ + +## Required variables: +# guest_accelerator +# machine_type + +locals { + # example state; terraform will ignore diffs if last element of URL matches + # guest_accelerator = [ + # { + # count = 1 + # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" + # }, + # ] + accelerator_machines = { + "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, + "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, + "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, + "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, + "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, + "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, + "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, + "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, + "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, + "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "g2-standard-4" = { type = "nvidia-l4", count = 1 }, + "g2-standard-8" = { type = "nvidia-l4", count = 1 }, + "g2-standard-12" = { type = "nvidia-l4", count = 1 }, + "g2-standard-16" = { type = "nvidia-l4", count = 1 }, + "g2-standard-24" = { type = "nvidia-l4", count = 2 }, + "g2-standard-32" = { type = "nvidia-l4", count = 1 }, + "g2-standard-48" = { type = "nvidia-l4", count = 4 }, + "g2-standard-96" = { type = "nvidia-l4", count = 8 }, + } + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + + # Select in priority order: + # (1) var.guest_accelerator if not empty + # (2) local.generated_guest_accelerator if not empty + # (3) default to empty list if both are empty + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) +} diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf new file mode 100644 index 
0000000000..ca76477c4d --- /dev/null +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf @@ -0,0 +1,100 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + # This label allows for billing report tracking based on module. + labels = merge(var.labels, { ghpc_module = "schedmd-slurm-gcp-v6-nodeset-dynamic", ghpc_role = "compute" }) +} + +locals { + nodeset_name = substr(replace(var.name, "/[^a-z0-9]/", ""), 0, 14) + feature = coalesce(var.feature, local.nodeset_name) + metadata = merge(var.metadata, { slurmd_feature = local.feature }) + + nodeset = { + nodeset_name = local.nodeset_name + nodeset_feature : local.feature + } + + additional_disks = [ + for ad in var.additional_disks : { + disk_name = ad.disk_name + device_name = ad.device_name + disk_type = ad.disk_type + disk_size_gb = ad.disk_size_gb + disk_labels = merge(ad.disk_labels, local.labels) + auto_delete = ad.auto_delete + boot = ad.boot + } + ] + + public_access_config = var.enable_public_ips ? [{ nat_ip = null, network_tier = null }] : [] + access_config = length(var.access_config) == 0 ? 
local.public_access_config : var.access_config + + service_account = { + email = coalesce(var.service_account_email, data.google_compute_default_service_account.default.email) + scopes = var.service_account_scopes + } +} + +data "google_compute_default_service_account" "default" { + project = var.project_id +} + + +module "slurm_nodeset_template" { + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.8" + + project_id = var.project_id + region = var.region + name_prefix = local.nodeset_name + slurm_cluster_name = var.slurm_cluster_name + slurm_instance_role = "compute" + slurm_bucket_path = var.slurm_bucket_path + metadata = local.metadata + + additional_disks = local.additional_disks + disk_auto_delete = var.disk_auto_delete + disk_labels = merge(local.labels, var.disk_labels) + disk_size_gb = var.disk_size_gb + disk_type = var.disk_type + + bandwidth_tier = var.bandwidth_tier + can_ip_forward = var.can_ip_forward + + disable_smt = !var.enable_smt + enable_confidential_vm = var.enable_confidential_vm + enable_oslogin = var.enable_oslogin + enable_shielded_vm = var.enable_shielded_vm + shielded_instance_config = var.shielded_instance_config + + labels = local.labels + machine_type = var.machine_type + + min_cpu_platform = var.min_cpu_platform + on_host_maintenance = var.on_host_maintenance + termination_action = try(var.spot_instance_config.termination_action, null) + preemptible = var.preemptible + spot = var.enable_spot_vm + service_account = local.service_account + gpu = one(local.guest_accelerator) # requires gpu_definition.tf + source_image_family = local.source_image_family # requires source_image_logic.tf + source_image_project = local.source_image_project_normalized # requires source_image_logic.tf + source_image = local.source_image # requires source_image_logic.tf + + subnetwork = var.subnetwork_self_link + additional_networks = var.additional_networks + access_config = 
local.access_config + tags = concat([var.slurm_cluster_name], var.tags) +} diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/metadata.yaml b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/metadata.yaml new file mode 100644 index 0000000000..a99e59d09f --- /dev/null +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/metadata.yaml @@ -0,0 +1,20 @@ +# Copyright 2023 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +spec: + requirements: + services: [compute.googleapis.com] +ghpc: + inject_module_id: name diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/outputs.tf new file mode 100644 index 0000000000..2d2d1415cf --- /dev/null +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/outputs.tf @@ -0,0 +1,36 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +output "nodeset_dyn" { + description = "Details of the nodeset. Typically used as input to `schedmd-slurm-gcp-v6-partition`." + value = local.nodeset +} + +output "instance_template_self_link" { + description = "The URI of the template." + value = module.slurm_nodeset_template.self_link +} + +output "node_name_prefix" { + description = <<-EOD + The prefix to be used for the node names. + + Make sure that nodes are named `-` + This temporary required for proper functioning of the nodes. + While Slurm scheduler uses "features" to bind node and nodeset, + the SlurmGCP relies on node names for this (to be switched to features as well). + EOD + value = "${var.slurm_cluster_name}-${local.nodeset_name}" + +} diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf new file mode 100644 index 0000000000..40b2e53ef8 --- /dev/null +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf @@ -0,0 +1,72 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +locals { + # Currently supported images and projects + known_project_families = { + schedmd-slurm-public = [ + "slurm-gcp-6-5-debian-11", + "slurm-gcp-6-5-hpc-rocky-linux-8", + "slurm-gcp-6-5-ubuntu-2004-lts", + "slurm-gcp-6-5-ubuntu-2204-lts-arm64", + "slurm-gcp-6-5-hpc-centos-7" + ] + } + + # This approach to "hacking" the project name allows a chain of Terraform + # calls to set the instance source_image (boot disk) with a "relative + # resource name" that passes muster with VPC Service Control rules + # + # https://github.com/terraform-google-modules/terraform-google-vm/blob/735bd415fc5f034d46aa0de7922e8fada2327c0c/modules/instance_template/main.tf#L28 + # https://cloud.google.com/apis/design/resource_names#relative_resource_name + source_image_project_normalized = (can(var.instance_image.family) ? + "projects/${data.google_compute_image.slurm.project}/global/images/family" : + "projects/${data.google_compute_image.slurm.project}/global/images" + ) + source_image_family = can(var.instance_image.family) ? data.google_compute_image.slurm.family : "" + source_image = can(var.instance_image.name) ? data.google_compute_image.slurm.name : "" +} + +data "google_compute_image" "slurm" { + family = try(var.instance_image.family, null) + name = try(var.instance_image.name, null) + project = var.instance_image.project + + lifecycle { + precondition { + condition = length(regexall("^projects/.+?/global/images/family$", var.instance_image.project)) == 0 + error_message = "The \"project\" field in var.instance_image no longer supports a long-form ending in \"family\". Specify only the project ID." + } + + postcondition { + condition = var.instance_image_custom || contains(keys(local.known_project_families), self.project) + error_message = <<-EOD + Images in project ${self.project} are not published by SchedMD. Images must be created by compatible releases of the Terraform and Packer modules following the guidance at https://goo.gle/hpc-slurm-images. 
Set var.instance_image_custom to true to silence this error and acknowledge that you are using a compatible image. + EOD + } + postcondition { + condition = !contains(keys(local.known_project_families), self.project) || try(contains(local.known_project_families[self.project], self.family), false) + error_message = <<-EOD + Image family ${self.family} published by SchedMD in project ${self.project} is not compatible with this release of the Terraform Slurm modules. Select from known compatible releases: + ${join("\n", [for p in try(local.known_project_families[self.project], []) : "\t\"${p}\""])} + EOD + } + postcondition { + condition = var.disk_size_gb >= self.disk_size_gb + error_message = "'disk_size_gb: ${var.disk_size_gb}' is smaller than the image size (${self.disk_size_gb}GB), please increase the blueprint disk size" + } + } +} diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf new file mode 100644 index 0000000000..5021d57d14 --- /dev/null +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf @@ -0,0 +1,345 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +variable "name" { + description = <<-EOD + Name of the nodeset. Automatically populated by the module id if not set. + If setting manually, ensure a unique value across all nodesets. 
+ EOD + type = string +} + +variable "feature" { + type = string + description = "The node feature, used to bind nodes to the nodeset. If not set, the nodeset name will be used." + default = null +} + +variable "project_id" { + type = string + description = "Project ID to create resources in." +} + +variable "slurm_cluster_name" { + description = "Name of the Slurm cluster." + type = string +} + +variable "slurm_bucket_path" { + description = "Path to the Slurm bucket." + type = string +} + + +variable "machine_type" { + description = "Compute Platform machine type to use for this partition compute nodes." + type = string + default = "c2-standard-60" +} + +variable "metadata" { + type = map(string) + description = "Metadata, provided as a map." + default = {} +} + +variable "instance_image" { + description = <<-EOD + Defines the image that will be used in the Slurm node group VM instances. + + Expected Fields: + name: The name of the image. Mutually exclusive with family. + family: The image family to use. Mutually exclusive with name. + project: The project where the image is hosted. + + For more information on creating custom images that comply with Slurm on GCP + see the "Slurm on GCP Custom Images" section in docs/vm-images.md. + EOD + type = map(string) + default = { + family = "slurm-gcp-6-5-hpc-rocky-linux-8" + project = "schedmd-slurm-public" + } + + validation { + condition = can(coalesce(var.instance_image.project)) + error_message = "In var.instance_image, the \"project\" field must be a string set to the Cloud project ID." + } + + validation { + condition = can(coalesce(var.instance_image.name)) != can(coalesce(var.instance_image.family)) + error_message = "In var.instance_image, exactly one of \"family\" or \"name\" fields must be set to desired image family or name." 
+ } +} + +variable "instance_image_custom" { + description = <<-EOD + A flag that designates that the user is aware that they are requesting + to use a custom and potentially incompatible image for this Slurm on + GCP module. + + If the field is set to false, only the compatible families and project + names will be accepted. The deployment will fail with any other image + family or name. If set to true, no checks will be done. + + See: https://goo.gle/hpc-slurm-images + EOD + type = bool + default = false +} + +variable "tags" { + type = list(string) + description = "Network tag list." + default = [] +} + +variable "disk_type" { + description = "Boot disk type, can be either hyperdisk-balanced, hyperdisk-extreme, pd-ssd, pd-standard, pd-balanced, or pd-extreme." + type = string + default = "pd-standard" +} + +variable "disk_size_gb" { + description = "Size of boot disk to create for the partition compute nodes." + type = number + default = 50 +} + +variable "disk_auto_delete" { + type = bool + description = "Whether or not the boot disk should be auto-deleted." + default = true +} + +variable "disk_labels" { + description = "Labels specific to the boot disk. These will be merged with var.labels." + type = map(string) + default = {} +} + +variable "additional_disks" { + description = "Configurations of additional disks to be included on the partition nodes. (do not use \"disk_type: local-ssd\"; known issue being addressed)" + type = list(object({ + disk_name = string + device_name = string + disk_size_gb = number + disk_type = string + disk_labels = map(string) + auto_delete = bool + boot = bool + })) + default = [] +} + +variable "enable_confidential_vm" { + type = bool + description = "Enable the Confidential VM configuration. Note: the instance image must support option." + default = false +} + +variable "enable_shielded_vm" { + type = bool + description = "Enable the Shielded VM configuration. Note: the instance image must support option." 
+ default = false +} + +variable "shielded_instance_config" { + type = object({ + enable_integrity_monitoring = bool + enable_secure_boot = bool + enable_vtpm = bool + }) + description = <<-EOD + Shielded VM configuration for the instance. Note: not used unless + enable_shielded_vm is 'true'. + - enable_integrity_monitoring : Compare the most recent boot measurements to the + integrity policy baseline and return a pair of pass/fail results depending on + whether they match or not. + - enable_secure_boot : Verify the digital signature of all boot components, and + halt the boot process if signature verification fails. + - enable_vtpm : Use a virtualized trusted platform module, which is a + specialized computer chip you can use to encrypt objects like keys and + certificates. + EOD + default = { + enable_integrity_monitoring = true + enable_secure_boot = true + enable_vtpm = true + } +} + + +variable "enable_oslogin" { + type = bool + description = <<-EOD + Enables Google Cloud os-login for user login and authentication for VMs. + See https://cloud.google.com/compute/docs/oslogin + EOD + default = true +} + +variable "can_ip_forward" { + description = "Enable IP forwarding, for NAT instances for example." + type = bool + default = false +} + +variable "enable_smt" { + type = bool + description = "Enables Simultaneous Multi-Threading (SMT) on instance." + default = false +} + +variable "labels" { + description = "Labels to add to partition compute instances. Key-value pairs." + type = map(string) + default = {} +} + +variable "min_cpu_platform" { + description = "The name of the minimum CPU platform that you want the instance to use." + type = string + default = null +} + +variable "on_host_maintenance" { + type = string + description = <<-EOD + Instance availability Policy. + + Note: Placement groups are not supported when on_host_maintenance is set to + "MIGRATE" and will be deactivated regardless of the value of + enable_placement. 
To support enable_placement, ensure on_host_maintenance is + set to "TERMINATE". + EOD + default = "TERMINATE" +} + +variable "guest_accelerator" { + description = "List of the type and count of accelerator cards attached to the instance." + type = list(object({ + type = string, + count = number + })) + default = [] + nullable = false + + validation { + condition = length(var.guest_accelerator) <= 1 + error_message = "The Slurm modules supports 0 or 1 models of accelerator card on each node." + } +} + +variable "preemptible" { + description = "Should use preemptibles to burst." + type = bool + default = false +} + + +variable "service_account_email" { + description = "Service account e-mail address to attach to the compute instances." + type = string + default = null +} + +variable "service_account_scopes" { + description = "Scopes to attach to the compute instances." + type = set(string) + default = ["https://www.googleapis.com/auth/cloud-platform"] +} + +variable "enable_spot_vm" { + description = "Enable the partition to use spot VMs (https://cloud.google.com/spot-vms)." + type = bool + default = false +} + +variable "spot_instance_config" { + description = "Configuration for spot VMs." 
+ type = object({ + termination_action = string + }) + default = null +} + +variable "bandwidth_tier" { + description = < Date: Tue, 2 Jul 2024 10:53:41 -0500 Subject: [PATCH 36/51] Address bug in 251ce6e8 --- modules/packer/custom-image/image.pkr.hcl | 6 ++++-- .../expectations/igc_pkr/one/image/image.pkr.hcl | 6 ++++-- .../expectations/text_escape/zero/lime/image.pkr.hcl | 6 ++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/modules/packer/custom-image/image.pkr.hcl b/modules/packer/custom-image/image.pkr.hcl index 44e34d95ed..e4f30dfb58 100644 --- a/modules/packer/custom-image/image.pkr.hcl +++ b/modules/packer/custom-image/image.pkr.hcl @@ -160,9 +160,11 @@ build { dynamic "provisioner" { labels = ["powershell"] - for_each = length(var.windows_startup_ps1) == 0 ? [1] : [] + for_each = length(var.windows_startup_ps1) > 0 ? [1] : [] content { - inline = "GCESysprep -no_shutdown" + inline = [ + "GCESysprep -no_shutdown" + ] } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl index 44e34d95ed..e4f30dfb58 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl @@ -160,9 +160,11 @@ build { dynamic "provisioner" { labels = ["powershell"] - for_each = length(var.windows_startup_ps1) == 0 ? [1] : [] + for_each = length(var.windows_startup_ps1) > 0 ? 
[1] : [] content { - inline = "GCESysprep -no_shutdown" + inline = [ + "GCESysprep -no_shutdown" + ] } } diff --git a/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl b/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl index 44e34d95ed..e4f30dfb58 100644 --- a/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl +++ b/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl @@ -160,9 +160,11 @@ build { dynamic "provisioner" { labels = ["powershell"] - for_each = length(var.windows_startup_ps1) == 0 ? [1] : [] + for_each = length(var.windows_startup_ps1) > 0 ? [1] : [] content { - inline = "GCESysprep -no_shutdown" + inline = [ + "GCESysprep -no_shutdown" + ] } } From 9a451e46e82661d1059cc7f709b8ff2352e8f0dc Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 2 Jul 2024 15:25:08 -0500 Subject: [PATCH 37/51] Disable PBS Pro test This commit disables the PBS Pro test as the infrastructure only supports testing against CentOS 7 which is now EOL. The modules may work under CentOS 8, however they are untested. --- .../daily-tests/builds/pbspro.yaml | 51 ------------------- 1 file changed, 51 deletions(-) delete mode 100644 tools/cloud-build/daily-tests/builds/pbspro.yaml diff --git a/tools/cloud-build/daily-tests/builds/pbspro.yaml b/tools/cloud-build/daily-tests/builds/pbspro.yaml deleted file mode 100644 index 15b1ad9798..0000000000 --- a/tools/cloud-build/daily-tests/builds/pbspro.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -tags: -- m.filestore -- m.pbspro-client -- m.pbspro-execution -- m.pbspro-server -- m.pre-existing-vpc -- pbspro - -timeout: 14400s # 4hr -availableSecrets: - secretManager: - - versionName: projects/$PROJECT_ID/secrets/pbspro_rpm_bucket/versions/1 - env: PBSPRO_RPM_BUCKET -steps: -## Test simple golang build -- id: pbspro - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - secretEnv: ["PBSPRO_RPM_BUCKET"] - args: - - -c - - | - set -x -e - cd /workspace && make - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - SG_EXAMPLE=tools/validate_configs/test_configs/pbs-unwrapped.yaml - - sed -i "s,replace-pbspro-rpm-bucket,$${PBSPRO_RPM_BUCKET},g" $${SG_EXAMPLE} - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/pbspro.yml" From b83107e0630849aa3a59d1eebe6de9efd294d5ca Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 2 Jul 2024 17:14:32 -0500 Subject: [PATCH 38/51] CentOS 7 boot hotfix This hotfix applies changes to the yum repository configuration that are necessary after the EOL of CentOS 7. 
--- .../files/early_run_hotfixes.sh | 32 +++++++++++++++++++ modules/scripts/startup-script/main.tf | 7 ++++ 2 files changed, 39 insertions(+) create mode 100644 modules/scripts/startup-script/files/early_run_hotfixes.sh diff --git a/modules/scripts/startup-script/files/early_run_hotfixes.sh b/modules/scripts/startup-script/files/early_run_hotfixes.sh new file mode 100644 index 0000000000..682e1352a1 --- /dev/null +++ b/modules/scripts/startup-script/files/early_run_hotfixes.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script applies fixes to VMs that must occur early in boot. For example, +# when yum or apt repositories are misconfigured, preventing most package +# operations from completing successfully. 
+ +source /etc/os-release + +if [[ "$PRETTY_NAME" == "CentOS Linux 7 (Core)" ]]; then + echo "Applying hotfixes for CentOS 7" + if grep -q '^mirrorlist' /etc/yum.repos.d/CentOS-Base.repo; then + echo "Removing mirrorlist from default CentOS 7 repositories" + sed -i '/^mirrorlist/d' /etc/yum.repos.d/CentOS-Base.repo + fi + if grep -q '^#baseurl=http://mirror.centos.org' /etc/yum.repos.d/CentOS-Base.repo; then + echo "Reconfiguring default CentOS 7 repositories to use CentOS Vault" + sed -i 's,^#baseurl=http://mirror.centos.org/,baseurl=http://vault.centos.org/,' /etc/yum.repos.d/CentOS-Base.repo + fi +fi diff --git a/modules/scripts/startup-script/main.tf b/modules/scripts/startup-script/main.tf index b9d9163da7..0486d78533 100644 --- a/modules/scripts/startup-script/main.tf +++ b/modules/scripts/startup-script/main.tf @@ -108,8 +108,15 @@ locals { args = var.ansible_virtualenv_path }] : [] + hotfix_runner = [{ + type = "shell" + source = "${path.module}/files/early_run_hotfixes.sh" + destination = "early_run_hotfixes.sh" + }] + runners = concat( local.warnings, + local.hotfix_runner, local.proxy_runner, local.monitoring_agent_installer, local.ansible_installer, From 419968aaaf96556c2da68ca8e4cd830af3a26b28 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 2 Jul 2024 17:14:32 -0500 Subject: [PATCH 39/51] Adopt CentOS 7 boot fix commit in startup-script module --- modules/compute/vm-instance/README.md | 2 +- modules/compute/vm-instance/startup_from_network_storage.tf | 2 +- modules/scheduler/batch-job-template/README.md | 2 +- .../batch-job-template/startup_from_network_storage.tf | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index 159019dc0c..01ec6f6bd1 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -185,7 +185,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | +| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | b83107e0 | ## Resources diff --git a/modules/compute/vm-instance/startup_from_network_storage.tf b/modules/compute/vm-instance/startup_from_network_storage.tf index 17dbd97bd2..993952ad20 100644 --- a/modules/compute/vm-instance/startup_from_network_storage.tf +++ b/modules/compute/vm-instance/startup_from_network_storage.tf @@ -55,7 +55,7 @@ locals { } module "netstorage_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=b83107e0" labels = local.labels project_id = var.project_id diff --git a/modules/scheduler/batch-job-template/README.md b/modules/scheduler/batch-job-template/README.md index ef9b816346..2ac9dafc81 100644 --- a/modules/scheduler/batch-job-template/README.md +++ b/modules/scheduler/batch-job-template/README.md @@ -139,7 +139,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| | [instance\_template](#module\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 10.1.1 | -| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | +| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | b83107e0 | ## Resources diff --git a/modules/scheduler/batch-job-template/startup_from_network_storage.tf b/modules/scheduler/batch-job-template/startup_from_network_storage.tf index 17dbd97bd2..993952ad20 100644 --- a/modules/scheduler/batch-job-template/startup_from_network_storage.tf +++ b/modules/scheduler/batch-job-template/startup_from_network_storage.tf @@ -55,7 +55,7 @@ locals { } module "netstorage_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=b83107e0" labels = local.labels project_id = var.project_id From 07d2061d87d113dd4b8a72c9a202d49090f2ab98 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 2 Jul 2024 17:14:32 -0500 Subject: [PATCH 40/51] Address centos-7 image family deprecation The centos-7 family has been entirely deprecated. This change uses the final image from the family in integration tests until we elect to remove CentOS 7 testing entirely from the Toolkit. 
--- tools/cloud-build/daily-tests/blueprints/lustre-vm.yaml | 2 +- tools/validate_configs/os_compatibility_tests/vm-filestore.yaml | 2 +- tools/validate_configs/os_compatibility_tests/vm-lustre.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-vm.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-vm.yaml index 63cda04503..8da170c5b2 100644 --- a/tools/cloud-build/daily-tests/blueprints/lustre-vm.yaml +++ b/tools/cloud-build/daily-tests/blueprints/lustre-vm.yaml @@ -98,7 +98,7 @@ deployment_groups: name_prefix: centos add_deployment_name_before_prefix: true instance_image: - family: centos-7 + name: centos-7-v20240611 project: centos-cloud - id: wait-centos source: community/modules/scripts/wait-for-startup diff --git a/tools/validate_configs/os_compatibility_tests/vm-filestore.yaml b/tools/validate_configs/os_compatibility_tests/vm-filestore.yaml index 34557f50ad..feb20de9e9 100644 --- a/tools/validate_configs/os_compatibility_tests/vm-filestore.yaml +++ b/tools/validate_configs/os_compatibility_tests/vm-filestore.yaml @@ -71,7 +71,7 @@ deployment_groups: - homefs settings: instance_image: - family: centos-7 + name: centos-7-v20240611 project: centos-cloud name_prefix: workstation-centos instance_count: 1 diff --git a/tools/validate_configs/os_compatibility_tests/vm-lustre.yaml b/tools/validate_configs/os_compatibility_tests/vm-lustre.yaml index 4ad6c51b77..2263c7f1af 100644 --- a/tools/validate_configs/os_compatibility_tests/vm-lustre.yaml +++ b/tools/validate_configs/os_compatibility_tests/vm-lustre.yaml @@ -79,7 +79,7 @@ deployment_groups: name_prefix: centos instance_count: 1 instance_image: - family: centos-7 + name: centos-7-v20240611 project: centos-cloud - id: wait-centos source: community/modules/scripts/wait-for-startup From 2852bcf1d11fe514010bc9d2eb27f1350463bcf9 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 25 Jun 2024 23:20:52 +0000 Subject: [PATCH 41/51] Update 
network-storage interface to support runners and pass daos scripts during startup --- .../schedmd-slurm-gcp-v6-controller/README.md | 11 +-- .../slurm_files.tf | 69 ++++++++++++++++--- .../variables.tf | 38 +++++----- 3 files changed, 89 insertions(+), 29 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 5a2a9b0672..5c8d0cef7e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -100,7 +100,7 @@ example: max_hops: 1 ``` -> [!NOTE] +> [!NOTE] > `schedmd-slurm-gcp-v6-nodeset.settings.enable_placement: true` must also be > set for max-distance to take effect. @@ -195,6 +195,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 5.0 | +| [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.34.0&depth=1 | | [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.9 | | [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.9 | | [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 6.5.9 | @@ -244,7 +245,7 @@ limitations under the License. | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB. | `number` | `50` | no | | [disk\_type](#input\_disk\_type) | Boot disk type, can be either hyperdisk-balanced, hyperdisk-extreme, pd-ssd, pd-standard, pd-balanced, or pd-extreme. 
| `string` | `"pd-ssd"` | no | | [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enables loading of cluster job usage into big query.

NOTE: Requires Google Bigquery API. | `bool` | `false` | no | -| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

*WARNING*: Toggling this off will impact the running workload.
Deployed compute nodes and controller will be destroyed. | `bool` | `true` | no | +| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

*WARNING*: Toggling this off will impact the running workload.
Deployed compute nodes and controller will be destroyed. | `bool` | `true` | no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_controller\_public\_ips](#input\_enable\_controller\_public\_ips) | If set to true. The controller will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no | | [enable\_debug\_logging](#input\_enable\_debug\_logging) | Enables debug logging mode. | `bool` | `false` | no | @@ -269,10 +270,10 @@ limitations under the License. | [machine\_type](#input\_machine\_type) | Machine type to create. | `string` | `"c2-standard-4"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | -| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])
}))
| `[]` | no | +| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])
}))
| `[]` | no | | [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | -| [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | +| [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | | [partitions](#input\_partitions) | Cluster partitions as a list. See module slurm\_partition. |
list(object({
partition_name = string
partition_conf = optional(map(string), {})
partition_nodeset = optional(list(string), [])
partition_nodeset_dyn = optional(list(string), [])
partition_nodeset_tpu = optional(list(string), [])
enable_job_exclusive = optional(bool, false)
}))
| n/a | yes | | [preemptible](#input\_preemptible) | Allow the instance to be preempted. | `bool` | `false` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index cd0fc9a049..54ceea74b2 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -69,23 +69,67 @@ resource "google_storage_bucket_iam_binding" "legacy_readers" { members = compact(local.viewers) } +locals { + daos_ns = [ + for ns in var.network_storage : + ns if ns.fs_type == "daos" + ] + + daos_client_install_runners = [ + for ns in local.daos_ns : + ns.client_install_runner if ns.client_install_runner != null + ] + + daos_mount_runners = [ + for ns in local.daos_ns : + ns.mount_runner if ns.mount_runner != null + ] + + daos_network_storage_runners = concat( + local.daos_client_install_runners, + local.daos_mount_runners, + ) + + daos_install_mount_script = { + filename = "ghpc_daos_mount.sh" + content = length(local.daos_ns) > 0 ? module.daos_network_storage_scripts[0].startup_script : "" + } +} + # SLURM FILES locals { - ghpc_startup_script_controller = [{ + ghpc_startup_controller = { filename = "ghpc_startup.sh" content = var.controller_startup_script - }] - ghpc_startup_script_login = [{ + } + ghpc_startup_script_controller = length(local.daos_ns) > 0 ? [local.daos_install_mount_script, local.ghpc_startup_controller] : [local.ghpc_startup_controller] + + ghpc_startup_login = { filename = "ghpc_startup.sh" content = var.login_startup_script - }] - ghpc_startup_script_compute = [{ + } + ghpc_startup_script_login = length(local.daos_ns) > 0 ? 
[local.daos_install_mount_script, local.ghpc_startup_login] : [local.ghpc_startup_login] + + ghpc_startup_compute = { filename = "ghpc_startup.sh" content = var.compute_startup_script - }] + } + ghpc_startup_script_compute = length(local.daos_ns) > 0 ? [local.daos_install_mount_script, local.ghpc_startup_compute] : [local.ghpc_startup_compute] + nodeset_startup_scripts = { for k, v in local.nodeset_map : k => v.startup_script } } +module "daos_network_storage_scripts" { + count = length(local.daos_ns) > 0 ? 1 : 0 + + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.34.0&depth=1" + labels = local.labels + project_id = var.project_id + deployment_name = var.deployment_name + region = var.region + runners = local.daos_network_storage_runners +} + module "slurm_files" { source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=6.5.9" @@ -121,8 +165,17 @@ module "slurm_files" { enable_slurm_gcp_plugins = var.enable_slurm_gcp_plugins disable_default_mounts = !var.enable_default_mounts - network_storage = var.network_storage - login_network_storage = var.login_network_storage + network_storage = [ + for storage in var.network_storage : { + server_ip = storage.server_ip, + remote_mount = storage.remote_mount, + local_mount = storage.local_mount, + fs_type = storage.fs_type, + mount_options = storage.mount_options + } + if storage.fs_type != "daos" + ] + login_network_storage = var.login_network_storage partitions = [for p in var.partitions : { partition : p }] nodeset = values(module.slurm_nodeset)[*] diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 1013ec1178..5d28e8acb5 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -217,11 +217,13 @@ variable 
"nodeset" { min_cpu_platform = optional(string) network_tier = optional(string, "STANDARD") network_storage = optional(list(object({ - server_ip = string - remote_mount = string - local_mount = string - fs_type = string - mount_options = string + server_ip = string + remote_mount = string + local_mount = string + fs_type = string + mount_options = string + client_install_runner = optional(map(string)) + mount_runner = optional(map(string)) })), []) on_host_maintenance = optional(string) preemptible = optional(bool, false) @@ -298,11 +300,13 @@ variable "nodeset_tpu" { data_disks = optional(list(string), []) docker_image = optional(string, "") network_storage = optional(list(object({ - server_ip = string - remote_mount = string - local_mount = string - fs_type = string - mount_options = string + server_ip = string + remote_mount = string + local_mount = string + fs_type = string + mount_options = string + client_install_runner = optional(map(string)) + mount_runner = optional(map(string)) })), []) subnetwork = string service_account = optional(object({ @@ -379,7 +383,7 @@ variable "enable_cleanup_compute" { Enables automatic cleanup of compute nodes and resource policies (e.g. placement groups) managed by this module, when cluster is destroyed. -*WARNING*: Toggling this off will impact the running workload. +*WARNING*: Toggling this off will impact the running workload. Deployed compute nodes and controller will be destroyed. EOD type = bool @@ -438,11 +442,13 @@ variable "disable_default_mounts" { # tflint-ignore: terraform_unused_declaratio variable "network_storage" { description = "An array of network attached storage mounts to be configured on all instances." 
type = list(object({ - server_ip = string, - remote_mount = string, - local_mount = string, - fs_type = string, - mount_options = string, + server_ip = string, + remote_mount = string, + local_mount = string, + fs_type = string, + mount_options = string, + client_install_runner = optional(map(string)) + mount_runner = optional(map(string)) })) default = [] } From 22a9fa741d335fdcd1ca77dcfd2009a9327edc3c Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 8 Jul 2024 11:51:44 -0500 Subject: [PATCH 42/51] Modify a3-highgpu-8g image-building blueprint network to match updated values from 5cb9faa4 --- .../machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml index 99b2f40838..46f9ae59db 100644 --- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml @@ -27,8 +27,8 @@ vars: zone: customer-zone disk_size: 200 final_image_family: slurm-dlvm - network_name_system: slurm-sys-net - subnetwork_name_system: slurm-sys-subnet + network_name_system: slurm-a3-base-sysnet + subnetwork_name_system: slurm-a3-base-sysnet-subnet slurm_cluster_name: slurm0 source_image_project_id: source-image-project-id # use value supplied by Google Cloud staff source_image: source-image-name # use value supplied by Google Cloud staff From 97b05e42b7b684ac82acab911925d6e7c5abe776 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Mon, 8 Jul 2024 19:29:10 +0000 Subject: [PATCH 43/51] Fix hydration from GCS bucket if name is not provided --- modules/file-system/parallelstore/main.tf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/file-system/parallelstore/main.tf b/modules/file-system/parallelstore/main.tf index 7d3ee33c79..c2e4d2a43c 100644 --- a/modules/file-system/parallelstore/main.tf 
+++ b/modules/file-system/parallelstore/main.tf @@ -23,6 +23,7 @@ locals { fs_type = "daos" server_ip = "" remote_mount = "" + id = var.name != null ? var.name : "${var.deployment_name}-${random_id.resource_name_suffix.hex}" access_points = jsonencode(google_parallelstore_instance.instance.access_points) client_install_runner = { @@ -45,7 +46,7 @@ resource "random_id" "resource_name_suffix" { } resource "google_parallelstore_instance" "instance" { - instance_id = var.name != null ? var.name : "${var.deployment_name}-${random_id.resource_name_suffix.hex}" + instance_id = local.id location = var.zone capacity_gib = var.size_gb network = var.network_id @@ -61,6 +62,6 @@ resource "null_resource" "hydration" { depends_on = [resource.google_parallelstore_instance.instance] provisioner "local-exec" { - command = "curl -X POST -H \"Content-Type: application/json\" -H \"Authorization: Bearer $(gcloud auth print-access-token)\" -d '{\"source_gcs_bucket\": {\"uri\":\"${var.source_gcs_bucket_uri}\"}, \"destination_parallelstore\": {\"path\":\"${var.destination_hydration_parallelstore}\"}}' https://parallelstore.googleapis.com/v1beta/projects/${var.project_id}/locations/${var.zone}/instances/${var.name}:importData" + command = "curl -X POST -H \"Content-Type: application/json\" -H \"Authorization: Bearer $(gcloud auth print-access-token)\" -d '{\"source_gcs_bucket\": {\"uri\":\"${var.source_gcs_bucket_uri}\"}, \"destination_parallelstore\": {\"path\":\"${var.destination_hydration_parallelstore}\"}}' https://parallelstore.googleapis.com/v1beta/projects/${var.project_id}/locations/${var.zone}/instances/${local.id}:importData" } } From 5df1d3c89694acc1cdad092644f17d63e83a8c22 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 8 Jul 2024 21:14:44 +0000 Subject: [PATCH 44/51] Remove `enable_devel` from examples --- community/examples/hpc-build-slurm-image.yaml | 4 ++-- .../machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml | 1 - 2 files changed, 2 insertions(+), 3 
deletions(-) diff --git a/community/examples/hpc-build-slurm-image.yaml b/community/examples/hpc-build-slurm-image.yaml index 6271b3362c..f721019844 100644 --- a/community/examples/hpc-build-slurm-image.yaml +++ b/community/examples/hpc-build-slurm-image.yaml @@ -70,6 +70,8 @@ deployment_groups: } - type: shell destination: install_slurm.sh + # Note: changes to slurm-gcp `/scripts` folder in the built image will not reflect in the deployed cluster. + # Instead the scripts referenced in `schedmd-slurm-gcp-v6-controller/slurm_files` will be used. content: | #!/bin/bash set -e -o pipefail @@ -117,5 +119,3 @@ deployment_groups: settings: machine_type: n2d-standard-4 instance_image: $(vars.built_instance_image) - # Will cause Slurm auto-scaling scripts to be sourced from built image - enable_devel: false diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml index cbf70fe9db..5ec78fc43e 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml @@ -241,7 +241,6 @@ deployment_groups: machine_type: c2-standard-8 enable_cleanup_compute: true enable_external_prolog_epilog: true - enable_devel: false slurm_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/long-prolog-slurm.conf.tpl enable_controller_public_ips: false controller_startup_script: $(controller_startup.startup_script) From df8005242f9383f981580489f46b893275e5e331 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Mon, 8 Jul 2024 23:22:26 +0000 Subject: [PATCH 45/51] Update parameters for parallelstore module as per feedback --- modules/file-system/parallelstore/README.md | 6 +++--- modules/file-system/parallelstore/main.tf | 15 ++++++++------- modules/file-system/parallelstore/outputs.tf | 9 ++++++++- .../parallelstore/scripts/mount-daos.sh | 4 +++- 
modules/file-system/parallelstore/variables.tf | 14 +++++++++----- .../scripts/mount-daos.sh | 4 +++- 6 files changed, 34 insertions(+), 18 deletions(-) diff --git a/modules/file-system/parallelstore/README.md b/modules/file-system/parallelstore/README.md index 32643d965c..38bc60114f 100644 --- a/modules/file-system/parallelstore/README.md +++ b/modules/file-system/parallelstore/README.md @@ -47,16 +47,16 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment. | `string` | n/a | yes | -| [destination\_hydration\_parallelstore](#input\_destination\_hydration\_parallelstore) | The name of local path to import data on parallelstore instance from GCS bucket. | `string` | `"/"` | no | +| [import\_destination\_path](#input\_import\_destination\_path) | The name of local path to import data on parallelstore instance from GCS bucket. | `string` | `null` | no | +| [import\_gcs\_bucket\_uri](#input\_import\_gcs\_bucket\_uri) | The name of the GCS bucket to import data from to parallelstore. | `string` | `null` | no | | [labels](#input\_labels) | Labels to add to parallel store instance. | `map(string)` | `{}` | no | | [local\_mount](#input\_local\_mount) | The mount point where the contents of the device may be accessed after mounting. | `string` | `"/parallelstore"` | no | | [mount\_options](#input\_mount\_options) | Options describing various aspects of the parallelstore instance. | `string` | `"disable-wb-cache,thread-count=16,eq-count=8"` | no | | [name](#input\_name) | Name of parallelstore instance. | `string` | `null` | no | | [network\_id](#input\_network\_id) | The ID of the GCE VPC network to which the instance is connected given in the format:
`projects//global/networks/`" | `string` | n/a | yes | -| [private\_vpc\_connection\_peering](#input\_private\_vpc\_connection\_peering) | The name of the VPC Network peering connection. | `string` | n/a | yes | +| [private\_vpc\_connection\_peering](#input\_private\_vpc\_connection\_peering) | The name of the VPC Network peering connection.
If using new VPC, please use community/modules/network/private-service-access to create private-service-access and
If using existing VPC with private-service-access enabled, set this manually." | `string` | n/a | yes | | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes | | [size\_gb](#input\_size\_gb) | Storage size of the parallelstore instance in GB. | `number` | `12000` | no | -| [source\_gcs\_bucket\_uri](#input\_source\_gcs\_bucket\_uri) | The name of the GCS bucket to import data from to parallelstore. | `string` | `""` | no | | [zone](#input\_zone) | Location for parallelstore instance. | `string` | n/a | yes | ## Outputs diff --git a/modules/file-system/parallelstore/main.tf b/modules/file-system/parallelstore/main.tf index c2e4d2a43c..885ebc7484 100644 --- a/modules/file-system/parallelstore/main.tf +++ b/modules/file-system/parallelstore/main.tf @@ -20,11 +20,12 @@ locals { } locals { - fs_type = "daos" - server_ip = "" - remote_mount = "" - id = var.name != null ? var.name : "${var.deployment_name}-${random_id.resource_name_suffix.hex}" - access_points = jsonencode(google_parallelstore_instance.instance.access_points) + fs_type = "daos" + server_ip = "" + remote_mount = "" + id = var.name != null ? var.name : "${var.deployment_name}-${random_id.resource_name_suffix.hex}" + access_points = jsonencode(google_parallelstore_instance.instance.access_points) + destination_path = var.import_destination_path == null ? "/" : var.import_destination_path client_install_runner = { "type" = "shell" @@ -58,10 +59,10 @@ resource "google_parallelstore_instance" "instance" { } resource "null_resource" "hydration" { - count = var.source_gcs_bucket_uri != "" ? 1 : 0 + count = var.import_gcs_bucket_uri != null ? 
1 : 0 depends_on = [resource.google_parallelstore_instance.instance] provisioner "local-exec" { - command = "curl -X POST -H \"Content-Type: application/json\" -H \"Authorization: Bearer $(gcloud auth print-access-token)\" -d '{\"source_gcs_bucket\": {\"uri\":\"${var.source_gcs_bucket_uri}\"}, \"destination_parallelstore\": {\"path\":\"${var.destination_hydration_parallelstore}\"}}' https://parallelstore.googleapis.com/v1beta/projects/${var.project_id}/locations/${var.zone}/instances/${local.id}:importData" + command = "curl -X POST -H \"Content-Type: application/json\" -H \"Authorization: Bearer $(gcloud auth print-access-token)\" -d '{\"source_gcs_bucket\": {\"uri\":\"${var.import_gcs_bucket_uri}\"}, \"destination_parallelstore\": {\"path\":\"${local.destination_path}\"}}' https://parallelstore.googleapis.com/v1beta/projects/${var.project_id}/locations/${var.zone}/instances/${local.id}:importData" } } diff --git a/modules/file-system/parallelstore/outputs.tf b/modules/file-system/parallelstore/outputs.tf index 6bae6b892d..f6e817ac8a 100644 --- a/modules/file-system/parallelstore/outputs.tf +++ b/modules/file-system/parallelstore/outputs.tf @@ -32,9 +32,16 @@ output "network_storage" { client_install_runner = local.client_install_runner mount_runner = local.mount_runner } + + precondition { + condition = var.import_gcs_bucket_uri != null || var.import_destination_path == null + error_message = <<-EOD + Please specify import_gcs_bucket_uri to import data to parallelstore instance. + EOD + } } output "instructions" { description = "Instructions to monitor import-data operation from GCS bucket to parallelstore." - value = var.source_gcs_bucket_uri != "" ? local.operation_instructions : "Data is not imported from GCS bucket." + value = var.import_gcs_bucket_uri != null ? local.operation_instructions : "Data is not imported from GCS bucket." 
} diff --git a/modules/file-system/parallelstore/scripts/mount-daos.sh b/modules/file-system/parallelstore/scripts/mount-daos.sh index 9593f6e713..2be854f2d0 100644 --- a/modules/file-system/parallelstore/scripts/mount-daos.sh +++ b/modules/file-system/parallelstore/scripts/mount-daos.sh @@ -35,6 +35,8 @@ chmod 777 "$local_mount" # Mount container for multi-user. fuse_config=/etc/fuse.conf sed -i "s/#.*user_allow_other/user_allow_other/g" $fuse_config -dfuse -m "$local_mount" --pool default-pool --container default-container --multi-user -o "$mount_options" +# To parse mount_options as --disable-wb-cache --eq-count=8. +# shellcheck disable=SC2086 +dfuse -m "$local_mount" --pool default-pool --container default-container --multi-user $mount_options exit 0 diff --git a/modules/file-system/parallelstore/variables.tf b/modules/file-system/parallelstore/variables.tf index 2f6696f35f..8dcac7c528 100644 --- a/modules/file-system/parallelstore/variables.tf +++ b/modules/file-system/parallelstore/variables.tf @@ -60,7 +60,11 @@ variable "mount_options" { } variable "private_vpc_connection_peering" { - description = "The name of the VPC Network peering connection." + description = <<-EOT + The name of the VPC Network peering connection. + If using new VPC, please use community/modules/network/private-service-access to create private-service-access and + If using existing VPC with private-service-access enabled, set this manually." + EOT type = string } @@ -76,14 +80,14 @@ variable "network_id" { } } -variable "source_gcs_bucket_uri" { +variable "import_gcs_bucket_uri" { description = "The name of the GCS bucket to import data from to parallelstore." type = string - default = "" + default = null } -variable "destination_hydration_parallelstore" { +variable "import_destination_path" { description = "The name of local path to import data on parallelstore instance from GCS bucket." 
type = string - default = "/" + default = null } diff --git a/modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh b/modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh index 9593f6e713..2be854f2d0 100644 --- a/modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh +++ b/modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh @@ -35,6 +35,8 @@ chmod 777 "$local_mount" # Mount container for multi-user. fuse_config=/etc/fuse.conf sed -i "s/#.*user_allow_other/user_allow_other/g" $fuse_config -dfuse -m "$local_mount" --pool default-pool --container default-container --multi-user -o "$mount_options" +# To parse mount_options as --disable-wb-cache --eq-count=8. +# shellcheck disable=SC2086 +dfuse -m "$local_mount" --pool default-pool --container default-container --multi-user $mount_options exit 0 From 0968a5aa698ac94e352ba93fca3453943245b2e7 Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Wed, 17 Jul 2024 16:12:48 +0000 Subject: [PATCH 46/51] Update test image to workaround CentOS end of life issues --- examples/serverless-batch-mpi.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/serverless-batch-mpi.yaml b/examples/serverless-batch-mpi.yaml index 9d18a51a52..44775e7245 100644 --- a/examples/serverless-batch-mpi.yaml +++ b/examples/serverless-batch-mpi.yaml @@ -156,8 +156,8 @@ deployment_groups: task_count: 2 mpi_mode: true instance_image: - family: batch-centos-7-official - project: batch-custom-image + family: hpc-rocky-linux-8 + project: cloud-hpc-image-public - id: batch-login source: modules/scheduler/batch-login-node From f60056bbc7040734e4b80a53774b00d81f3d50c0 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 17 Jul 2024 18:42:02 +0000 Subject: [PATCH 47/51] Update A3 mega blueprint to use Slurm-GCP 6.5.12 --- examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index a77682abcf..607ce1b1d7 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ -104,7 +104,7 @@ deployment_groups: apt-get install -y git ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.5.9 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.5.12 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml From bc974c8084452c1cb13e9e5acf1504e6e9118af8 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 17 Jul 2024 18:42:02 +0000 Subject: [PATCH 48/51] Workaround GoogleCloudPlatform/guest-agent#401 Fix the versions for local google guest VM services so that they do not upgrade to versions that are known to have boot-time issues for the following combination: - building image using Packer on a build VM without local NVME devices - final image used on a VM with local NVME devices In this combination, network configurations persist that do not match the final naming conventions of the network interfaces because of differing PCI bus layout. --- .../machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index 607ce1b1d7..090f87b668 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ -57,13 +57,17 @@ deployment_groups: # if you follow this rule, any module which supports DKMS will be # properly configured at the end of image building (gVNIC, NVIDIA, ...) 
- type: shell - destination: disable_unattended_upgrades.sh + destination: prevent_unintentional_upgrades.sh content: | #!/bin/bash set -e -o pipefail systemctl stop unattended-upgrades.service systemctl disable unattended-upgrades.service systemctl mask unattended-upgrades.service + apt-mark hold google-compute-engine + apt-mark hold google-compute-engine-oslogin + apt-mark hold google-guest-agent + apt-mark hold google-osconfig-agent - type: ansible-local destination: install_headers_archive.yml content: | From 777374ea22998d61f5aac93266c36a21edb877c7 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 17 Jul 2024 22:38:57 +0000 Subject: [PATCH 49/51] Add validator for Terraform version and SlurmGCP6 --- pkg/validators/adhoc.go | 72 +++++++++++++++++++ pkg/validators/validators.go | 5 +- pkg/validators/validators_test.go | 11 +-- .../tasks/create_deployment_directory.yml | 1 + tools/enforce_coverage.pl | 2 +- tools/validate_configs/validate_configs.sh | 2 +- 6 files changed, 85 insertions(+), 8 deletions(-) create mode 100644 pkg/validators/adhoc.go diff --git a/pkg/validators/adhoc.go b/pkg/validators/adhoc.go new file mode 100644 index 0000000000..d7503593ca --- /dev/null +++ b/pkg/validators/adhoc.go @@ -0,0 +1,72 @@ +// Copyright 2023 "Google LLC" +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package validators + +import ( + "encoding/json" + "fmt" + "hpc-toolkit/pkg/config" + "os/exec" + "strings" +) + +func testTfVersionForSlurm(bp config.Blueprint, _ config.Dict) error { + slurm := false + bp.WalkModulesSafe(func(_ config.ModulePath, m *config.Module) { + if strings.HasSuffix(m.Source, "slurm-gcp-v6-controller") { + slurm = true + } + }) + + if !slurm { + return nil + } + + ver, err := tfVersion() + if err != nil { + return nil + } + + if ver <= "1.4.0" { + return nil + } + + return fmt.Errorf("using a newer version of Terraform can lead to controller replacement on reconfigure for Slurm GCP v6\n\n" + + "Please be advised of this known issue: https://github.com/GoogleCloudPlatform/hpc-toolkit/issues/2774\n" + + "Until resolved it is advised to use Terraform 1.4.0 with Slurm deployments.\n\n" + + "To silence this warning, add flag: --skip-validators=test_tf_version_for_slurm") + +} + +func tfVersion() (string, error) { + path, err := exec.LookPath("terraform") + if err != nil { + return "", err + } + + out, err := exec.Command(path, "version", "--json").Output() + if err != nil { + return "", err + } + + var version struct { + TerraformVersion string `json:"terraform_version"` + } + if err := json.Unmarshal(out, &version); err != nil { + return "", err + } + + return version.TerraformVersion, nil +} diff --git a/pkg/validators/validators.go b/pkg/validators/validators.go index 21cf622144..18397bfaa6 100644 --- a/pkg/validators/validators.go +++ b/pkg/validators/validators.go @@ -54,6 +54,7 @@ const ( testZoneInRegionName = "test_zone_in_region" testModuleNotUsedName = "test_module_not_used" testDeploymentVariableNotUsedName = "test_deployment_variable_not_used" + testTfVersionForSlurmName = "test_tf_version_for_slurm" ) func implementations() map[string]func(config.Blueprint, config.Dict) error { @@ -65,6 +66,7 @@ func implementations() map[string]func(config.Blueprint, config.Dict) error { testZoneInRegionName: testZoneInRegion, 
testModuleNotUsedName: testModuleNotUsed, testDeploymentVariableNotUsedName: testDeploymentVariableNotUsed, + testTfVersionForSlurmName: testTfVersionForSlurm, } } @@ -165,7 +167,8 @@ func defaults(bp config.Blueprint) []config.Validator { defaults := []config.Validator{ {Validator: testModuleNotUsedName}, - {Validator: testDeploymentVariableNotUsedName}} + {Validator: testDeploymentVariableNotUsedName}, + {Validator: testTfVersionForSlurmName}} // always add the project ID validator before subsequent validators that can // only succeed if credentials can access the project. If the project ID diff --git a/pkg/validators/validators_test.go b/pkg/validators/validators_test.go index 3f690e5821..4f8fc578ad 100644 --- a/pkg/validators/validators_test.go +++ b/pkg/validators/validators_test.go @@ -73,6 +73,7 @@ func (s *MySuite) TestCheckInputs(c *C) { func (s *MySuite) TestDefaultValidators(c *C) { unusedMods := config.Validator{Validator: "test_module_not_used"} unusedVars := config.Validator{Validator: "test_deployment_variable_not_used"} + slurmTf := config.Validator{Validator: "test_tf_version_for_slurm"} prjInp := config.Dict{}.With("project_id", config.GlobalRef("project_id").AsValue()) regInp := prjInp.With("region", config.GlobalRef("region").AsValue()) @@ -93,14 +94,14 @@ func (s *MySuite) TestDefaultValidators(c *C) { { bp := config.Blueprint{} c.Check(defaults(bp), DeepEquals, []config.Validator{ - unusedMods, unusedVars}) + unusedMods, unusedVars, slurmTf}) } { bp := config.Blueprint{Vars: config.Dict{}. 
With("project_id", cty.StringVal("f00b"))} c.Check(defaults(bp), DeepEquals, []config.Validator{ - unusedMods, unusedVars, projectExists, apisEnabled}) + unusedMods, unusedVars, slurmTf, projectExists, apisEnabled}) } { @@ -109,7 +110,7 @@ func (s *MySuite) TestDefaultValidators(c *C) { With("region", cty.StringVal("narnia"))} c.Check(defaults(bp), DeepEquals, []config.Validator{ - unusedMods, unusedVars, projectExists, apisEnabled, regionExists}) + unusedMods, unusedVars, slurmTf, projectExists, apisEnabled, regionExists}) } { @@ -118,7 +119,7 @@ func (s *MySuite) TestDefaultValidators(c *C) { With("zone", cty.StringVal("danger"))} c.Check(defaults(bp), DeepEquals, []config.Validator{ - unusedMods, unusedVars, projectExists, apisEnabled, zoneExists}) + unusedMods, unusedVars, slurmTf, projectExists, apisEnabled, zoneExists}) } { @@ -128,6 +129,6 @@ func (s *MySuite) TestDefaultValidators(c *C) { With("zone", cty.StringVal("danger"))} c.Check(defaults(bp), DeepEquals, []config.Validator{ - unusedMods, unusedVars, projectExists, apisEnabled, regionExists, zoneExists, zoneInRegion}) + unusedMods, unusedVars, slurmTf, projectExists, apisEnabled, regionExists, zoneExists, zoneInRegion}) } } diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/create_deployment_directory.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/create_deployment_directory.yml index 4f6d93eca8..f515e26aac 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/create_deployment_directory.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/create_deployment_directory.yml @@ -34,6 +34,7 @@ ansible.builtin.command: | ./ghpc create -l ERROR "{{ blueprint_yaml }}" \ --backend-config bucket={{ state_bucket }} \ + --skip-validators=test_tf_version_for_slurm \ --vars project_id={{ project }} \ --vars deployment_name={{ deployment_name }} \ {{ deployment_vars_str if deployment_vars_str is defined else '' }} diff --git a/tools/enforce_coverage.pl 
b/tools/enforce_coverage.pl index b258d07869..2f1f1211f0 100755 --- a/tools/enforce_coverage.pl +++ b/tools/enforce_coverage.pl @@ -24,7 +24,7 @@ cmd 40 pkg/shell 0 pkg/logging 0 - pkg/validators 13 + pkg/validators 10 pkg/inspect 60 pkg/modulewriter 79 pkg 80 diff --git a/tools/validate_configs/validate_configs.sh b/tools/validate_configs/validate_configs.sh index 3a3abde3e8..fb92005498 100755 --- a/tools/validate_configs/validate_configs.sh +++ b/tools/validate_configs/validate_configs.sh @@ -26,7 +26,7 @@ run_test() { exampleFile=$(basename "$example") DEPLOYMENT=$(echo "${exampleFile%.yaml}-$(basename "${tmpdir##*.}")" | sed -e 's/\(.*\)/\L\1/') PROJECT="invalid-project" - VALIDATORS_TO_SKIP="test_project_exists,test_apis_enabled,test_region_exists,test_zone_exists,test_zone_in_region" + VALIDATORS_TO_SKIP="test_project_exists,test_apis_enabled,test_region_exists,test_zone_exists,test_zone_in_region,test_tf_version_for_slurm" GHPC_PATH="${cwd}/ghpc" BP_PATH="${cwd}/${example}" # Cover the three possible starting sequences for local sources: ./ ../ / From 80fb26a25a23a211b04845d0fcd7d3ba17345bc4 Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Thu, 18 Jul 2024 18:18:24 +0000 Subject: [PATCH 50/51] Update image to new centos image for both login and builder nodes --- examples/serverless-batch-mpi.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/serverless-batch-mpi.yaml b/examples/serverless-batch-mpi.yaml index 44775e7245..af156a2a83 100644 --- a/examples/serverless-batch-mpi.yaml +++ b/examples/serverless-batch-mpi.yaml @@ -20,6 +20,9 @@ vars: deployment_name: batch-wrf region: us-central1 zone: us-central1-c + instance_image: + family: hpc-centos-7 + project: cloud-hpc-image-public deployment_groups: - group: primary @@ -155,9 +158,6 @@ deployment_groups: machine_type: c2-standard-60 task_count: 2 mpi_mode: true - instance_image: - family: hpc-rocky-linux-8 - project: 
cloud-hpc-image-public - id: batch-login source: modules/scheduler/batch-login-node From d1a5b9ce2b56bb2d5d3bf8b3f4fdab60bf3bcb50 Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Thu, 18 Jul 2024 22:20:08 +0000 Subject: [PATCH 51/51] Update version to v1.36.0 --- cmd/root.go | 2 +- community/modules/compute/gke-node-pool/versions.tf | 2 +- community/modules/compute/htcondor-execute-point/versions.tf | 2 +- community/modules/compute/mig/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v5-node-group/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v5-partition/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf | 2 +- .../modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-partition/versions.tf | 2 +- .../modules/database/slurm-cloudsql-federation/versions.tf | 4 ++-- .../modules/file-system/cloud-storage-bucket/versions.tf | 2 +- .../modules/file-system/gke-persistent-volume/versions.tf | 2 +- community/modules/file-system/nfs-server/versions.tf | 2 +- community/modules/files/fsi-montecarlo-on-batch/versions.tf | 4 ++-- community/modules/network/private-service-access/versions.tf | 4 ++-- community/modules/project/service-enablement/versions.tf | 2 +- community/modules/pubsub/bigquery-sub/versions.tf | 4 ++-- community/modules/pubsub/topic/versions.tf | 2 +- community/modules/scheduler/gke-cluster/versions.tf | 2 +- community/modules/scheduler/htcondor-access-point/versions.tf | 2 +- .../modules/scheduler/htcondor-central-manager/versions.tf | 2 +- community/modules/scheduler/htcondor-pool-secrets/versions.tf | 2 +- .../modules/scheduler/pre-existing-gke-cluster/versions.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v5-controller/versions.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v6-controller/versions.tf | 2 +- 
.../modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf | 2 +- community/modules/scripts/wait-for-startup/versions.tf | 2 +- community/modules/scripts/windows-startup-script/versions.tf | 2 +- modules/compute/vm-instance/versions.tf | 4 ++-- modules/file-system/filestore/versions.tf | 4 ++-- modules/monitoring/dashboard/versions.tf | 2 +- modules/network/firewall-rules/versions.tf | 2 +- modules/network/pre-existing-subnetwork/versions.tf | 2 +- modules/network/pre-existing-vpc/versions.tf | 2 +- modules/scheduler/batch-login-node/versions.tf | 2 +- modules/scripts/startup-script/versions.tf | 2 +- 38 files changed, 44 insertions(+), 44 deletions(-) diff --git a/cmd/root.go b/cmd/root.go index b58fee9041..b62fa5ee83 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -52,7 +52,7 @@ HPC deployments on the Google Cloud Platform.`, logging.Fatal("cmd.Help function failed: %s", err) } }, - Version: "v1.35.1", + Version: "v1.36.0", Annotations: annotation, } ) diff --git a/community/modules/compute/gke-node-pool/versions.tf b/community/modules/compute/gke-node-pool/versions.tf index ba5944ee6f..e1334464ae 100644 --- a/community/modules/compute/gke-node-pool/versions.tf +++ b/community/modules/compute/gke-node-pool/versions.tf @@ -26,6 +26,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.36.0" } } diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index 8a5ec8d8b7..113cbceb5c 100644 --- a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.36.0" } } diff --git 
a/community/modules/compute/mig/versions.tf b/community/modules/compute/mig/versions.tf index da70130db2..70996dd4e3 100644 --- a/community/modules/compute/mig/versions.tf +++ b/community/modules/compute/mig/versions.tf @@ -22,6 +22,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:mig/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:mig/v1.36.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf index 584088b456..84372410ac 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.36.0" } required_version = ">= 1.1" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf index 676d2a3283..6d8fba3689 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.36.0" } required_version = ">= 0.13.0" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf index d6962d04c1..0a80ea1fd7 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf +++ 
b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.36.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf index d3c4ce50b6..22625edb8b 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.36.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf index 3b0e1e5f0d..8e91575ea5 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.36.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf index d627faaa07..5f23b4e776 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = 
"blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.36.0" } } diff --git a/community/modules/database/slurm-cloudsql-federation/versions.tf b/community/modules/database/slurm-cloudsql-federation/versions.tf index 1e30114de0..7f652a6e43 100644 --- a/community/modules/database/slurm-cloudsql-federation/versions.tf +++ b/community/modules/database/slurm-cloudsql-federation/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.36.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.36.0" } required_version = ">= 0.13.0" diff --git a/community/modules/file-system/cloud-storage-bucket/versions.tf b/community/modules/file-system/cloud-storage-bucket/versions.tf index b0ac4ee4df..3a8e088ee2 100644 --- a/community/modules/file-system/cloud-storage-bucket/versions.tf +++ b/community/modules/file-system/cloud-storage-bucket/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.36.0" } required_version = ">= 0.14.0" } diff --git a/community/modules/file-system/gke-persistent-volume/versions.tf b/community/modules/file-system/gke-persistent-volume/versions.tf index 9625ba58bd..81e86616f8 100644 --- a/community/modules/file-system/gke-persistent-volume/versions.tf +++ b/community/modules/file-system/gke-persistent-volume/versions.tf @@ -29,6 +29,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.35.1" + module_name 
= "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.36.0" } } diff --git a/community/modules/file-system/nfs-server/versions.tf b/community/modules/file-system/nfs-server/versions.tf index f5203ed9f1..70f6acdb64 100644 --- a/community/modules/file-system/nfs-server/versions.tf +++ b/community/modules/file-system/nfs-server/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.36.0" } required_version = ">= 0.14.0" diff --git a/community/modules/files/fsi-montecarlo-on-batch/versions.tf b/community/modules/files/fsi-montecarlo-on-batch/versions.tf index 37aefd1d46..eb60000962 100644 --- a/community/modules/files/fsi-montecarlo-on-batch/versions.tf +++ b/community/modules/files/fsi-montecarlo-on-batch/versions.tf @@ -35,9 +35,9 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.36.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.36.0" } } diff --git a/community/modules/network/private-service-access/versions.tf b/community/modules/network/private-service-access/versions.tf index a90e59fdf8..88364ff571 100644 --- a/community/modules/network/private-service-access/versions.tf +++ b/community/modules/network/private-service-access/versions.tf @@ -30,11 +30,11 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.36.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.35.1" + module_name = 
"blueprints/terraform/hpc-toolkit:private-service-access/v1.36.0" } required_version = ">= 1.2" diff --git a/community/modules/project/service-enablement/versions.tf b/community/modules/project/service-enablement/versions.tf index ce48375ef1..a6c6472bf3 100644 --- a/community/modules/project/service-enablement/versions.tf +++ b/community/modules/project/service-enablement/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.36.0" } required_version = ">= 0.14.0" diff --git a/community/modules/pubsub/bigquery-sub/versions.tf b/community/modules/pubsub/bigquery-sub/versions.tf index a2bf6e424c..fe9863bbc4 100644 --- a/community/modules/pubsub/bigquery-sub/versions.tf +++ b/community/modules/pubsub/bigquery-sub/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.36.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.36.0" } required_version = ">= 1.0" } diff --git a/community/modules/pubsub/topic/versions.tf b/community/modules/pubsub/topic/versions.tf index 16bb48cf06..793a29bb17 100644 --- a/community/modules/pubsub/topic/versions.tf +++ b/community/modules/pubsub/topic/versions.tf @@ -27,6 +27,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:topic/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:topic/v1.36.0" } } diff --git a/community/modules/scheduler/gke-cluster/versions.tf b/community/modules/scheduler/gke-cluster/versions.tf index 70e1cc70f7..47b970beca 100644 --- a/community/modules/scheduler/gke-cluster/versions.tf +++ 
b/community/modules/scheduler/gke-cluster/versions.tf @@ -30,6 +30,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.36.0" } } diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf index 330a76515a..8cfd060a3d 100644 --- a/community/modules/scheduler/htcondor-access-point/versions.tf +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.36.0" } required_version = ">= 1.1" diff --git a/community/modules/scheduler/htcondor-central-manager/versions.tf b/community/modules/scheduler/htcondor-central-manager/versions.tf index f952fc3c36..2cf04df48f 100644 --- a/community/modules/scheduler/htcondor-central-manager/versions.tf +++ b/community/modules/scheduler/htcondor-central-manager/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.36.0" } required_version = ">= 1.1.0" diff --git a/community/modules/scheduler/htcondor-pool-secrets/versions.tf b/community/modules/scheduler/htcondor-pool-secrets/versions.tf index b553665acc..6ea61b0009 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/versions.tf +++ b/community/modules/scheduler/htcondor-pool-secrets/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.36.0" } required_version = ">= 1.3.0" diff --git 
a/community/modules/scheduler/pre-existing-gke-cluster/versions.tf b/community/modules/scheduler/pre-existing-gke-cluster/versions.tf index 30d00afe9c..dbb9fd4df3 100644 --- a/community/modules/scheduler/pre-existing-gke-cluster/versions.tf +++ b/community/modules/scheduler/pre-existing-gke-cluster/versions.tf @@ -23,7 +23,7 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.35.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.36.0" } required_version = ">= 1.0.0" diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf index 835482578e..2196381b7f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.36.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf index 14639af5d3..18cd72a152 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.36.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf index 9758993f5f..0bf1434778 100644 --- 
a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf @@ -30,6 +30,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.36.0" } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf index 2cdf6f8ea6..ecefe85753 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.36.0" } } diff --git a/community/modules/scripts/wait-for-startup/versions.tf b/community/modules/scripts/wait-for-startup/versions.tf index c171fed095..7b500d155a 100644 --- a/community/modules/scripts/wait-for-startup/versions.tf +++ b/community/modules/scripts/wait-for-startup/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.36.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scripts/windows-startup-script/versions.tf b/community/modules/scripts/windows-startup-script/versions.tf index 04b21d10d1..337c503acf 100644 --- a/community/modules/scripts/windows-startup-script/versions.tf +++ b/community/modules/scripts/windows-startup-script/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.35.1" + module_name = 
"blueprints/terraform/hpc-toolkit:windows-startup-script/v1.36.0" } required_version = ">= 0.14.0" diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index 58c93f7b9e..d6fa92f966 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -31,10 +31,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.36.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.36.0" } required_version = ">= 1.3.0" diff --git a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf index 072623b9f7..930903d87d 100644 --- a/modules/file-system/filestore/versions.tf +++ b/modules/file-system/filestore/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.36.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.36.0" } required_version = ">= 0.14.0" diff --git a/modules/monitoring/dashboard/versions.tf b/modules/monitoring/dashboard/versions.tf index af7ac58b23..169fcea376 100644 --- a/modules/monitoring/dashboard/versions.tf +++ b/modules/monitoring/dashboard/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.36.0" } required_version = ">= 0.14.0" diff --git a/modules/network/firewall-rules/versions.tf b/modules/network/firewall-rules/versions.tf index 44fbf7b38f..0cc69859da 100644 --- 
a/modules/network/firewall-rules/versions.tf +++ b/modules/network/firewall-rules/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.36.0" } required_version = ">= 1.3" diff --git a/modules/network/pre-existing-subnetwork/versions.tf b/modules/network/pre-existing-subnetwork/versions.tf index 6bac79625a..2d68222016 100644 --- a/modules/network/pre-existing-subnetwork/versions.tf +++ b/modules/network/pre-existing-subnetwork/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.33.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.36.0" } required_version = ">= 0.14.0" diff --git a/modules/network/pre-existing-vpc/versions.tf b/modules/network/pre-existing-vpc/versions.tf index 28f06cbefe..55abfb23cc 100644 --- a/modules/network/pre-existing-vpc/versions.tf +++ b/modules/network/pre-existing-vpc/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.36.0" } required_version = ">= 0.14.0" diff --git a/modules/scheduler/batch-login-node/versions.tf b/modules/scheduler/batch-login-node/versions.tf index e62084fdc5..aa32ab692e 100644 --- a/modules/scheduler/batch-login-node/versions.tf +++ b/modules/scheduler/batch-login-node/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.36.0" } required_version = ">= 0.14.0" diff --git a/modules/scripts/startup-script/versions.tf b/modules/scripts/startup-script/versions.tf index 9b544eabfb..964e9e3567 100644 --- 
a/modules/scripts/startup-script/versions.tf +++ b/modules/scripts/startup-script/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.35.1" + module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.36.0" } required_version = ">= 0.14.0"