From 85e3ce834297860abf0faeba0231e3a27e4cd2d0 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 27 Sep 2024 11:52:11 +0000 Subject: [PATCH 1/3] Added compatibility check for GPUDirect and GKE version --- modules/compute/gke-node-pool/README.md | 1 + modules/compute/gke-node-pool/gpu_direct.tf | 28 +++++++++++++++++++ modules/compute/gke-node-pool/variables.tf | 5 ++++ modules/scheduler/gke-cluster/README.md | 1 + modules/scheduler/gke-cluster/outputs.tf | 5 ++++ .../pre-existing-gke-cluster/README.md | 1 + .../pre-existing-gke-cluster/outputs.tf | 5 ++++ 7 files changed, 46 insertions(+) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index fcf7414af6..7b1cffbf68 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -294,6 +294,7 @@ limitations under the License. | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | +| [gke\_master\_version](#input\_gke\_master\_version) | GKE master version | `string` | n/a | yes | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index b22c353f69..4fef57e914 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -33,6 +33,12 @@ locals { updated_workload_path = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml") rxdm_version = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9 min_additional_networks = 4 + min_gke_versions = { + "1.27" = "1.27.7-gke.1121000" + "1.28" = "1.28.8-gke.1095000" + "1.29" = "1.29.3-gke.1093000" + "1.30" = "1.30.2-gke.1023000" + } } "a3-megagpu-8g" = { # Manifest to be installed for enabling TCPXO on a3-megagpu-8g machines @@ -43,10 +49,25 @@ locals { updated_workload_path = replace(local.workload_path_tcpxo, ".yaml", "-tcpxo.yaml") rxdm_version = "v1.0.10" # matching nccl-tcpxo-installer version v1.0.4 min_additional_networks = 8 + min_gke_versions = { + "1.28" = "1.28.9-gke.1250000" + "1.29" = "1.29.4-gke.1542000" + "1.30" = "1.30.4-gke.1129000" + } } } min_additional_networks = try(local.gpu_direct_settings[var.machine_type].min_additional_networks, 0) + + gke_version_regex = "(\\d+\\.\\d+)\\.(\\d+)-gke\\.(\\d+)" # GKE version format: 1.X.Y-gke.Z , regex output: ["1.X" , "Y", "Z"] + + gke_version_parts = regex(local.gke_version_regex, var.gke_master_version) + gke_version_major = local.gke_version_parts[0] + + min_gke_versions = try(local.gpu_direct_setting[var.machine_type].min_gke_versions, null) + min_version = try(contains(keys(local.min_gke_versions), local.gke_version_major), false) ? 
local.min_gke_versions[local.gke_version_major] : "1.0.0-gke.0" + min_version_parts = regex(local.gke_version_regex, local.min_version) + gke_gpudirect_compatible = local.gke_version_parts[1] > local.min_version_parts[1] || (local.gke_version_parts[1] == local.min_version_parts[1] && local.gke_version_parts[2] >= local.min_version_parts[2]) } check "gpu_direct_check_multi_vpc" { @@ -55,3 +76,10 @@ check "gpu_direct_check_multi_vpc" { error_message = "To achieve optimal performance for ${var.machine_type} machine, at least ${local.min_additional_networks} additional vpc is recommended. You could configure it in the blueprint through modules/network/multivpc with network_count set as ${local.min_additional_networks}" } } + +check "gke_master_version_requirements" { + assert { + condition = local.gke_gpudirect_compatible + error_message = "GPUDirect is not supported on GKE master version ${var.gke_master_version} for ${var.machine_type} machine. For supported version details visit https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#requirements" + } +} diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index ef1277744f..62160a2448 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -360,3 +360,8 @@ variable "initial_node_count" { type = number default = null } + +variable "gke_master_version" { + description = "GKE master version" + type = string +} diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 3a72e1149b..4548db2fc9 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -194,6 +194,7 @@ limitations under the License. |------|-------------| | [cluster\_id](#output\_cluster\_id) | An identifier for the resource with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. 
| | [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster has been created. Needed by community/modules/scripts/kubernetes-operations. | +| [gke\_master\_version](#output\_gke\_master\_version) | GKE cluster's master version. | | [instructions](#output\_instructions) | Instructions on how to connect to the created cluster. | | [k8s\_service\_account\_name](#output\_k8s\_service\_account\_name) | Name of k8s service account. | diff --git a/modules/scheduler/gke-cluster/outputs.tf b/modules/scheduler/gke-cluster/outputs.tf index 53ee068ca2..4daed8ee25 100644 --- a/modules/scheduler/gke-cluster/outputs.tf +++ b/modules/scheduler/gke-cluster/outputs.tf @@ -74,3 +74,8 @@ output "k8s_service_account_name" { description = "Name of k8s service account." value = one(module.workload_identity[*].k8s_service_account_name) } + +output "gke_master_version" { + description = "GKE cluster's master version." + value = google_container_cluster.gke_cluster.master_version +} diff --git a/modules/scheduler/pre-existing-gke-cluster/README.md b/modules/scheduler/pre-existing-gke-cluster/README.md index 519715480d..1f2904d889 100644 --- a/modules/scheduler/pre-existing-gke-cluster/README.md +++ b/modules/scheduler/pre-existing-gke-cluster/README.md @@ -111,4 +111,5 @@ limitations under the License. |------|-------------| | [cluster\_id](#output\_cluster\_id) | An identifier for the gke cluster with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. | | [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster exists. | +| [gke\_master\_version](#output\_gke\_master\_version) | GKE cluster's master version. 
| diff --git a/modules/scheduler/pre-existing-gke-cluster/outputs.tf b/modules/scheduler/pre-existing-gke-cluster/outputs.tf index 9bfd571b61..90772d3dae 100644 --- a/modules/scheduler/pre-existing-gke-cluster/outputs.tf +++ b/modules/scheduler/pre-existing-gke-cluster/outputs.tf @@ -26,3 +26,8 @@ output "gke_cluster_exists" { data.google_container_cluster.existing_gke_cluster ] } + +output "gke_master_version" { + description = "GKE cluster's master version." + value = data.google_container_cluster.existing_gke_cluster.master_version +} From 7780f46e729c237e197947b5c6324257734bee20 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Wed, 2 Oct 2024 19:53:33 +0000 Subject: [PATCH 2/3] wordings updated --- modules/compute/gke-node-pool/README.md | 2 +- modules/compute/gke-node-pool/gpu_direct.tf | 18 +++++++++--------- modules/compute/gke-node-pool/variables.tf | 4 ++-- modules/scheduler/gke-cluster/README.md | 2 +- modules/scheduler/gke-cluster/outputs.tf | 4 ++-- .../pre-existing-gke-cluster/README.md | 2 +- .../pre-existing-gke-cluster/outputs.tf | 4 ++-- 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 7b1cffbf68..03652cf29e 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -294,7 +294,7 @@ limitations under the License. | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. 
| `bool` | `true` | no | -| [gke\_master\_version](#input\_gke\_master\_version) | GKE master version | `string` | n/a | yes | +| [gke\_version](#input\_gke\_version) | GKE version | `string` | n/a | yes | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index 4fef57e914..27c61f0256 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -33,7 +33,7 @@ locals { updated_workload_path = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml") rxdm_version = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9 min_additional_networks = 4 - min_gke_versions = { + major_minor_version_acceptable_map = { "1.27" = "1.27.7-gke.1121000" "1.28" = "1.28.8-gke.1095000" "1.29" = "1.29.3-gke.1093000" @@ -49,7 +49,7 @@ locals { updated_workload_path = replace(local.workload_path_tcpxo, ".yaml", "-tcpxo.yaml") rxdm_version = "v1.0.10" # matching nccl-tcpxo-installer version v1.0.4 min_additional_networks = 8 - min_gke_versions = { + major_minor_version_acceptable_map = { "1.28" = "1.28.9-gke.1250000" "1.29" = "1.29.4-gke.1542000" "1.30" = "1.30.4-gke.1129000" @@ -61,13 +61,13 @@ locals { gke_version_regex = "(\\d+\\.\\d+)\\.(\\d+)-gke\\.(\\d+)" # GKE version format: 1.X.Y-gke.Z , regex output: ["1.X" , "Y", "Z"] - gke_version_parts = regex(local.gke_version_regex, var.gke_master_version) + gke_version_parts = regex(local.gke_version_regex, var.gke_version) gke_version_major = local.gke_version_parts[0] - min_gke_versions = try(local.gpu_direct_setting[var.machine_type].min_gke_versions, null) - min_version = try(contains(keys(local.min_gke_versions), local.gke_version_major), false) ? 
local.min_gke_versions[local.gke_version_major] : "1.0.0-gke.0" - min_version_parts = regex(local.gke_version_regex, local.min_version) - gke_gpudirect_compatible = local.gke_version_parts[1] > local.min_version_parts[1] || (local.gke_version_parts[1] == local.min_version_parts[1] && local.gke_version_parts[2] >= local.min_version_parts[2]) + major_minor_version_acceptable_map = try(local.gpu_direct_settings[var.machine_type].major_minor_version_acceptable_map, null) + minor_version_acceptable = try(contains(keys(local.major_minor_version_acceptable_map), local.gke_version_major), false) ? local.major_minor_version_acceptable_map[local.gke_version_major] : "1.0.0-gke.0" + minor_version_acceptable_parts = regex(local.gke_version_regex, local.minor_version_acceptable) + gke_gpudirect_compatible = local.gke_version_parts[1] > local.minor_version_acceptable_parts[1] || (local.gke_version_parts[1] == local.minor_version_acceptable_parts[1] && local.gke_version_parts[2] >= local.minor_version_acceptable_parts[2]) } check "gpu_direct_check_multi_vpc" { @@ -77,9 +77,9 @@ check "gpu_direct_check_multi_vpc" { } } -check "gke_master_version_requirements" { +check "gke_version_requirements" { assert { condition = local.gke_gpudirect_compatible - error_message = "GPUDirect is not supported on GKE master version ${var.gke_master_version} for ${var.machine_type} machine. For supported version details visit https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#requirements" + error_message = "GPUDirect is not supported on GKE master version ${var.gke_version} for ${var.machine_type} machine. 
For supported version details visit https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#requirements" } } diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 62160a2448..b24aef91df 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -361,7 +361,7 @@ variable "initial_node_count" { default = null } -variable "gke_master_version" { - description = "GKE master version" +variable "gke_version" { + description = "GKE version" type = string } diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 4548db2fc9..583af203da 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -194,7 +194,7 @@ limitations under the License. |------|-------------| | [cluster\_id](#output\_cluster\_id) | An identifier for the resource with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. | | [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster has been created. Needed by community/modules/scripts/kubernetes-operations. | -| [gke\_master\_version](#output\_gke\_master\_version) | GKE cluster's master version. | +| [gke\_version](#output\_gke\_version) | GKE cluster's version. | | [instructions](#output\_instructions) | Instructions on how to connect to the created cluster. | | [k8s\_service\_account\_name](#output\_k8s\_service\_account\_name) | Name of k8s service account. 
| diff --git a/modules/scheduler/gke-cluster/outputs.tf b/modules/scheduler/gke-cluster/outputs.tf index 4daed8ee25..28e00171ff 100644 --- a/modules/scheduler/gke-cluster/outputs.tf +++ b/modules/scheduler/gke-cluster/outputs.tf @@ -75,7 +75,7 @@ output "k8s_service_account_name" { value = one(module.workload_identity[*].k8s_service_account_name) } -output "gke_master_version" { - description = "GKE cluster's master version." +output "gke_version" { + description = "GKE cluster's version." value = google_container_cluster.gke_cluster.master_version } diff --git a/modules/scheduler/pre-existing-gke-cluster/README.md b/modules/scheduler/pre-existing-gke-cluster/README.md index 1f2904d889..4caf7ff258 100644 --- a/modules/scheduler/pre-existing-gke-cluster/README.md +++ b/modules/scheduler/pre-existing-gke-cluster/README.md @@ -111,5 +111,5 @@ limitations under the License. |------|-------------| | [cluster\_id](#output\_cluster\_id) | An identifier for the gke cluster with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. | | [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster exists. | -| [gke\_master\_version](#output\_gke\_master\_version) | GKE cluster's master version. | +| [gke\_version](#output\_gke\_version) | GKE cluster's version. | diff --git a/modules/scheduler/pre-existing-gke-cluster/outputs.tf b/modules/scheduler/pre-existing-gke-cluster/outputs.tf index 90772d3dae..8884ee30b0 100644 --- a/modules/scheduler/pre-existing-gke-cluster/outputs.tf +++ b/modules/scheduler/pre-existing-gke-cluster/outputs.tf @@ -27,7 +27,7 @@ output "gke_cluster_exists" { ] } -output "gke_master_version" { - description = "GKE cluster's master version." +output "gke_version" { + description = "GKE cluster's version." 
value = data.google_container_cluster.existing_gke_cluster.master_version } From 14864db2a7e9c8587cd8caeb5d288d0ba0266a34 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Wed, 2 Oct 2024 21:45:48 +0000 Subject: [PATCH 3/3] minor wording update --- modules/compute/gke-node-pool/gpu_direct.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index 27c61f0256..00dd298971 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -80,6 +80,6 @@ check "gpu_direct_check_multi_vpc" { check "gke_version_requirements" { assert { condition = local.gke_gpudirect_compatible - error_message = "GPUDirect is not supported on GKE master version ${var.gke_version} for ${var.machine_type} machine. For supported version details visit https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#requirements" + error_message = "GPUDirect is not supported on GKE version ${var.gke_version} for ${var.machine_type} machine. For supported version details visit https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#requirements" } }