From e4390e3649f85dea65bae085f90ffb2e3fc1aeb7 Mon Sep 17 00:00:00 2001
From: annuay <annuay@google.com>
Date: Thu, 3 Oct 2024 15:46:35 +0000
Subject: [PATCH] update modules to accept kubectl provider from root

---
 modules/compute/gke-node-pool/README.md       |  1 +
 modules/compute/gke-node-pool/gpu_direct.tf   | 28 +++++++++++++++++++
 modules/compute/gke-node-pool/main.tf         |  3 --
 modules/compute/gke-node-pool/variables.tf    |  5 ++++
 modules/management/kubectl-apply/README.md    | 13 ++-------
 modules/management/kubectl-apply/main.tf      | 25 +++--------------
 modules/management/kubectl-apply/providers.tf |  8 ------
 modules/management/kubectl-apply/variables.tf | 11 --------
 modules/management/kubectl-apply/versions.tf  | 12 --------
 modules/scheduler/gke-cluster/README.md       |  4 +++
 modules/scheduler/gke-cluster/main.tf         |  3 --
 modules/scheduler/gke-cluster/outputs.tf      | 20 +++++++++++++
 .../pre-existing-gke-cluster/README.md        |  5 ++++
 .../pre-existing-gke-cluster/main.tf          |  5 ++--
 .../pre-existing-gke-cluster/outputs.tf       | 21 ++++++++++++++
 15 files changed, 92 insertions(+), 72 deletions(-)
diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md
index fcf7414af6..03652cf29e 100644
--- a/modules/compute/gke-node-pool/README.md
+++ b/modules/compute/gke-node-pool/README.md
@@ -294,6 +294,7 @@ limitations under the License.
 | <a name="input_disk_type"></a> [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no |
 | <a name="input_enable_gcfs"></a> [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no |
 | <a name="input_enable_secure_boot"></a> [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes.  Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no |
+| <a name="input_gke_version"></a> [gke\_version](#input\_gke\_version) | GKE version | `string` | n/a | yes |
 | <a name="input_guest_accelerator"></a> [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. | <pre>list(object({<br/>    type  = optional(string)<br/>    count = optional(number, 0)<br/>    gpu_driver_installation_config = optional(list(object({<br/>      gpu_driver_version = string<br/>    })))<br/>    gpu_partition_size = optional(string)<br/>    gpu_sharing_config = optional(list(object({<br/>      gpu_sharing_strategy       = optional(string)<br/>      max_shared_clients_per_gpu = optional(number)<br/>    })))<br/>  }))</pre> | `null` | no |
 | <a name="input_host_maintenance_interval"></a> [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no |
 | <a name="input_image_type"></a> [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no |
diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf
index b22c353f69..00dd298971 100644
--- a/modules/compute/gke-node-pool/gpu_direct.tf
+++ b/modules/compute/gke-node-pool/gpu_direct.tf
@@ -33,6 +33,12 @@ locals {
       updated_workload_path   = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml")
       rxdm_version            = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9
       min_additional_networks = 4
+      major_minor_version_acceptable_map = {
+        "1.27" = "1.27.7-gke.1121000"
+        "1.28" = "1.28.8-gke.1095000"
+        "1.29" = "1.29.3-gke.1093000"
+        "1.30" = "1.30.2-gke.1023000"
+      }
     }
     "a3-megagpu-8g" = {
       # Manifest to be installed for enabling TCPXO on a3-megagpu-8g machines
@@ -43,10 +49,25 @@ locals {
       updated_workload_path   = replace(local.workload_path_tcpxo, ".yaml", "-tcpxo.yaml")
       rxdm_version            = "v1.0.10" # matching nccl-tcpxo-installer version v1.0.4
       min_additional_networks = 8
+      major_minor_version_acceptable_map = {
+        "1.28" = "1.28.9-gke.1250000"
+        "1.29" = "1.29.4-gke.1542000"
+        "1.30" = "1.30.4-gke.1129000"
+      }
     }
   }
 
   min_additional_networks = try(local.gpu_direct_settings[var.machine_type].min_additional_networks, 0)
+
+  gke_version_regex = "(\\d+\\.\\d+)\\.(\\d+)-gke\\.(\\d+)" # GKE version format: 1.X.Y-gke.Z , regex output: ["1.X" , "Y", "Z"]
+  # Split var.gke_version into ["<major>.<minor>", "<patch>", "<gke build>"]; note gke_version_major holds "<major>.<minor>".
+  gke_version_parts = regex(local.gke_version_regex, var.gke_version)
+  gke_version_major = local.gke_version_parts[0]
+  # Minimum supported release per "<major>.<minor>" for var.machine_type; unlisted machine types / release lines fall back to 1.0.0-gke.0, which every version satisfies.
+  major_minor_version_acceptable_map = try(local.gpu_direct_settings[var.machine_type].major_minor_version_acceptable_map, null)
+  minor_version_acceptable           = try(contains(keys(local.major_minor_version_acceptable_map), local.gke_version_major), false) ? local.major_minor_version_acceptable_map[local.gke_version_major] : "1.0.0-gke.0"
+  minor_version_acceptable_parts     = regex(local.gke_version_regex, local.minor_version_acceptable)
+  gke_gpudirect_compatible           = local.gke_version_parts[1] > local.minor_version_acceptable_parts[1] || (local.gke_version_parts[1] == local.minor_version_acceptable_parts[1] && local.gke_version_parts[2] >= local.minor_version_acceptable_parts[2])
 }
 
 check "gpu_direct_check_multi_vpc" {
@@ -55,3 +76,10 @@ check "gpu_direct_check_multi_vpc" {
     error_message = "To achieve optimal performance for ${var.machine_type} machine, at least ${local.min_additional_networks} additional vpc is recommended. You could configure it in the blueprint through modules/network/multivpc with network_count set as ${local.min_additional_networks}"
   }
 }
+
+check "gke_version_requirements" { # Asserts that var.gke_version meets the minimum GPUDirect release computed in local.gke_gpudirect_compatible for var.machine_type
+  assert {
+    condition     = local.gke_gpudirect_compatible
+    error_message = "GPUDirect is not supported on GKE version ${var.gke_version} for ${var.machine_type} machine. For supported version details visit https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#requirements"
+  }
+}
diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf
index f391532976..f53737527b 100644
--- a/modules/compute/gke-node-pool/main.tf
+++ b/modules/compute/gke-node-pool/main.tf
@@ -316,9 +316,6 @@ resource "null_resource" "enable_tcpxo_in_workload" {
 module "kubectl_apply" {
   source = "../../management/kubectl-apply"
 
-  cluster_id = var.cluster_id
-  project_id = var.project_id
-
   apply_manifests = flatten([
     for manifest in local.gpu_direct_setting.gpu_direct_manifests : [
       {
diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf
index ef1277744f..b24aef91df 100644
--- a/modules/compute/gke-node-pool/variables.tf
+++ b/modules/compute/gke-node-pool/variables.tf
@@ -360,3 +360,8 @@ variable "initial_node_count" {
   type        = number
   default     = null
 }
+
+variable "gke_version" { # Required (no default). Expected format 1.X.Y-gke.Z; gpu_direct.tf parses it with local.gke_version_regex
+  description = "GKE version"
+  type        = string
+}
diff --git a/modules/management/kubectl-apply/README.md b/modules/management/kubectl-apply/README.md
index bd91e424dc..37dbaa4115 100644
--- a/modules/management/kubectl-apply/README.md
+++ b/modules/management/kubectl-apply/README.md
@@ -86,15 +86,11 @@ limitations under the License.
 | Name | Version |
 |------|---------|
 | <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 1.3 |
-| <a name="requirement_google"></a> [google](#requirement\_google) | > 5.0 |
 | <a name="requirement_http"></a> [http](#requirement\_http) | ~> 3.0 |
-| <a name="requirement_kubectl"></a> [kubectl](#requirement\_kubectl) | >= 1.7.0 |
 
 ## Providers
 
-| Name | Version |
-|------|---------|
-| <a name="provider_google"></a> [google](#provider\_google) | > 5.0 |
+No providers.
 
 ## Modules
 
@@ -107,20 +103,15 @@ limitations under the License.
 
 ## Resources
 
-| Name | Type |
-|------|------|
-| [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source |
-| [google_container_cluster.gke_cluster](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/container_cluster) | data source |
+No resources.
 
 ## Inputs
 
 | Name | Description | Type | Default | Required |
 |------|-------------|------|---------|:--------:|
 | <a name="input_apply_manifests"></a> [apply\_manifests](#input\_apply\_manifests) | A list of manifests to apply to GKE cluster using kubectl. For more details see [kubectl module's inputs](kubectl/README.md). | <pre>list(object({<br/>    content           = optional(string, null)<br/>    source            = optional(string, null)<br/>    template_vars     = optional(map(any), null)<br/>    server_side_apply = optional(bool, false)<br/>    wait_for_rollout  = optional(bool, true)<br/>  }))</pre> | `[]` | no |
-| <a name="input_cluster_id"></a> [cluster\_id](#input\_cluster\_id) | An identifier for the gke cluster resource with format projects/<project\_id>/locations/<region>/clusters/<name>. | `string` | n/a | yes |
 | <a name="input_jobset"></a> [jobset](#input\_jobset) | Install [Jobset](https://github.com/kubernetes-sigs/jobset) which manages a group of K8s [jobs](https://kubernetes.io/docs/concepts/workloads/controllers/job/) as a unit. | <pre>object({<br/>    install = optional(bool, false)<br/>    version = optional(string, "v0.5.2")<br/>  })</pre> | `{}` | no |
 | <a name="input_kueue"></a> [kueue](#input\_kueue) | Install and configure [Kueue](https://kueue.sigs.k8s.io/docs/overview/) workload scheduler. | <pre>object({<br/>    install     = optional(bool, false)<br/>    version     = optional(string, "v0.8.1")<br/>    config_path = optional(string, null)<br/>  })</pre> | `{}` | no |
-| <a name="input_project_id"></a> [project\_id](#input\_project\_id) | The project ID that hosts the gke cluster. | `string` | n/a | yes |
 
 ## Outputs
 
diff --git a/modules/management/kubectl-apply/main.tf b/modules/management/kubectl-apply/main.tf
index dd68be57f6..fdcdf80c7f 100644
--- a/modules/management/kubectl-apply/main.tf
+++ b/modules/management/kubectl-apply/main.tf
@@ -15,11 +15,6 @@
   */
 
 locals {
-  cluster_id_parts = split("/", var.cluster_id)
-  cluster_name     = local.cluster_id_parts[5]
-  cluster_location = local.cluster_id_parts[3]
-  project_id       = var.project_id != null ? var.project_id : local.cluster_id_parts[1]
-
   apply_manifests_map = tomap({
     for index, manifest in var.apply_manifests : index => manifest
   })
@@ -30,14 +25,6 @@ locals {
   jobset_install_source = format("${path.module}/manifests/jobset-%s.yaml", try(var.jobset.version, ""))
 }
 
-data "google_container_cluster" "gke_cluster" {
-  project  = local.project_id
-  name     = local.cluster_name
-  location = local.cluster_location
-}
-
-data "google_client_config" "default" {}
-
 module "kubectl_apply_manifests" {
   for_each = local.apply_manifests_map
   source   = "./kubectl"
@@ -49,8 +36,7 @@ module "kubectl_apply_manifests" {
   wait_for_rollout  = each.value.wait_for_rollout
 
   providers = {
-    kubectl = kubectl
-    http    = http.h
+    http = http.h
   }
 }
 
@@ -60,8 +46,7 @@ module "install_kueue" {
   server_side_apply = true
 
   providers = {
-    kubectl = kubectl
-    http    = http.h
+    http = http.h
   }
 }
 
@@ -71,8 +56,7 @@ module "install_jobset" {
   server_side_apply = true
 
   providers = {
-    kubectl = kubectl
-    http    = http.h
+    http = http.h
   }
 }
 
@@ -85,7 +69,6 @@ module "configure_kueue" {
   wait_for_rollout  = true
 
   providers = {
-    kubectl = kubectl
-    http    = http.h
+    http = http.h
   }
 }
diff --git a/modules/management/kubectl-apply/providers.tf b/modules/management/kubectl-apply/providers.tf
index 74d157b93b..d5577975f3 100644
--- a/modules/management/kubectl-apply/providers.tf
+++ b/modules/management/kubectl-apply/providers.tf
@@ -14,14 +14,6 @@
   * limitations under the License.
   */
 
-provider "kubectl" {
-  host                   = "https://${data.google_container_cluster.gke_cluster.endpoint}"
-  token                  = data.google_client_config.default.access_token
-  cluster_ca_certificate = base64decode(data.google_container_cluster.gke_cluster.master_auth[0].cluster_ca_certificate)
-  load_config_file       = false
-  apply_retry_count      = 15 # Terraform may apply resources in parallel, leading to potential dependency issues. This retry mechanism ensures that if a resource's dependencies aren't ready, Terraform will attempt to apply it again.
-}
-
 provider "http" {
   alias = "h"
 }
diff --git a/modules/management/kubectl-apply/variables.tf b/modules/management/kubectl-apply/variables.tf
index e0dd6430f5..653ea5bb8a 100644
--- a/modules/management/kubectl-apply/variables.tf
+++ b/modules/management/kubectl-apply/variables.tf
@@ -14,17 +14,6 @@
   * limitations under the License.
   */
 
-variable "project_id" {
-  description = "The project ID that hosts the gke cluster."
-  type        = string
-}
-
-variable "cluster_id" {
-  description = "An identifier for the gke cluster resource with format projects/<project_id>/locations/<region>/clusters/<name>."
-  type        = string
-  nullable    = false
-}
-
 variable "apply_manifests" {
   description = "A list of manifests to apply to GKE cluster using kubectl. For more details see [kubectl module's inputs](kubectl/README.md)."
   type = list(object({
diff --git a/modules/management/kubectl-apply/versions.tf b/modules/management/kubectl-apply/versions.tf
index 227838747c..3ec8f3337f 100644
--- a/modules/management/kubectl-apply/versions.tf
+++ b/modules/management/kubectl-apply/versions.tf
@@ -16,23 +16,11 @@
 
 terraform {
   required_providers {
-    google = {
-      source  = "hashicorp/google"
-      version = "> 5.0"
-    }
-    kubectl = {
-      source  = "gavinbunney/kubectl"
-      version = ">= 1.7.0"
-    }
     http = {
       source  = "hashicorp/http"
       version = "~> 3.0"
     }
   }
 
-  provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:kubectl-apply/v1.37.2"
-  }
-
   required_version = ">= 1.3"
 }
diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md
index 3a72e1149b..0ce00c06f3 100644
--- a/modules/scheduler/gke-cluster/README.md
+++ b/modules/scheduler/gke-cluster/README.md
@@ -192,8 +192,12 @@ limitations under the License.
 
 | Name | Description |
 |------|-------------|
+| <a name="output_access_token"></a> [access\_token](#output\_access\_token) | Google client config access token. |
+| <a name="output_cluster_ca_certificate"></a> [cluster\_ca\_certificate](#output\_cluster\_ca\_certificate) | GKE cluster CA certificate. |
 | <a name="output_cluster_id"></a> [cluster\_id](#output\_cluster\_id) | An identifier for the resource with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. |
+| <a name="output_gke_cluster_endpoint"></a> [gke\_cluster\_endpoint](#output\_gke\_cluster\_endpoint) | GKE cluster endpoint. |
 | <a name="output_gke_cluster_exists"></a> [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster has been created. Needed by community/modules/scripts/kubernetes-operations. |
+| <a name="output_gke_version"></a> [gke\_version](#output\_gke\_version) | GKE cluster's version. |
 | <a name="output_instructions"></a> [instructions](#output\_instructions) | Instructions on how to connect to the created cluster. |
 | <a name="output_k8s_service_account_name"></a> [k8s\_service\_account\_name](#output\_k8s\_service\_account\_name) | Name of k8s service account. |
 <!-- END OF PRE-COMMIT-TERRAFORM DOCS HOOK -->
diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf
index 480d5b7d58..c265fb1d52 100644
--- a/modules/scheduler/gke-cluster/main.tf
+++ b/modules/scheduler/gke-cluster/main.tf
@@ -335,9 +335,6 @@ module "workload_identity" {
 module "kubectl_apply" {
   source = "../../management/kubectl-apply"
 
-  cluster_id = google_container_cluster.gke_cluster.id
-  project_id = var.project_id
-
   apply_manifests = flatten([
     for idx, network_info in var.additional_networks : [
       {
diff --git a/modules/scheduler/gke-cluster/outputs.tf b/modules/scheduler/gke-cluster/outputs.tf
index 53ee068ca2..a633675a0a 100644
--- a/modules/scheduler/gke-cluster/outputs.tf
+++ b/modules/scheduler/gke-cluster/outputs.tf
@@ -74,3 +74,24 @@ output "k8s_service_account_name" {
   description = "Name of k8s service account."
   value       = one(module.workload_identity[*].k8s_service_account_name)
 }
+
+output "gke_version" {
+  description = "GKE cluster's version."
+  value       = google_container_cluster.gke_cluster.master_version
+}
+
+output "gke_cluster_endpoint" {
+  description = "GKE cluster endpoint."
+  value       = google_container_cluster.gke_cluster.endpoint
+}
+
+output "cluster_ca_certificate" {
+  description = "GKE cluster CA certificate."
+  value       = google_container_cluster.gke_cluster.master_auth[0].cluster_ca_certificate
+}
+
+output "access_token" {
+  description = "Google client config access token."
+  value       = data.google_client_config.default.access_token
+  sensitive   = true # bearer credential: keep it redacted in plan/apply output
+}
diff --git a/modules/scheduler/pre-existing-gke-cluster/README.md b/modules/scheduler/pre-existing-gke-cluster/README.md
index 519715480d..1ec6460261 100644
--- a/modules/scheduler/pre-existing-gke-cluster/README.md
+++ b/modules/scheduler/pre-existing-gke-cluster/README.md
@@ -94,6 +94,7 @@ limitations under the License.
 
 | Name | Type |
 |------|------|
+| [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source |
 | [google_container_cluster.existing_gke_cluster](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/container_cluster) | data source |
 
 ## Inputs
@@ -109,6 +110,10 @@ limitations under the License.
 
 | Name | Description |
 |------|-------------|
+| <a name="output_access_token"></a> [access\_token](#output\_access\_token) | Google client config access token. |
+| <a name="output_cluster_ca_certificate"></a> [cluster\_ca\_certificate](#output\_cluster\_ca\_certificate) | GKE cluster CA certificate. |
 | <a name="output_cluster_id"></a> [cluster\_id](#output\_cluster\_id) | An identifier for the gke cluster with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. |
+| <a name="output_gke_cluster_endpoint"></a> [gke\_cluster\_endpoint](#output\_gke\_cluster\_endpoint) | GKE cluster endpoint. |
 | <a name="output_gke_cluster_exists"></a> [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster exists. |
+| <a name="output_gke_version"></a> [gke\_version](#output\_gke\_version) | GKE cluster's version. |
 <!-- END OF PRE-COMMIT-TERRAFORM DOCS HOOK -->
diff --git a/modules/scheduler/pre-existing-gke-cluster/main.tf b/modules/scheduler/pre-existing-gke-cluster/main.tf
index 4b65ebe365..67afe01cad 100644
--- a/modules/scheduler/pre-existing-gke-cluster/main.tf
+++ b/modules/scheduler/pre-existing-gke-cluster/main.tf
@@ -20,12 +20,11 @@ data "google_container_cluster" "existing_gke_cluster" {
   location = var.region
 }
 
+data "google_client_config" "default" {}
+
 module "kubectl_apply" {
   source = "../../management/kubectl-apply" # can point to github
 
-  cluster_id = data.google_container_cluster.existing_gke_cluster.id
-  project_id = var.project_id
-
   apply_manifests = flatten([
     for idx, network_info in var.additional_networks : [
       {
diff --git a/modules/scheduler/pre-existing-gke-cluster/outputs.tf b/modules/scheduler/pre-existing-gke-cluster/outputs.tf
index 9bfd571b61..543148fa16 100644
--- a/modules/scheduler/pre-existing-gke-cluster/outputs.tf
+++ b/modules/scheduler/pre-existing-gke-cluster/outputs.tf
@@ -26,3 +26,24 @@ output "gke_cluster_exists" {
     data.google_container_cluster.existing_gke_cluster
   ]
 }
+
+output "gke_version" {
+  description = "GKE cluster's version."
+  value       = data.google_container_cluster.existing_gke_cluster.master_version
+}
+
+output "gke_cluster_endpoint" {
+  description = "GKE cluster endpoint."
+  value       = data.google_container_cluster.existing_gke_cluster.endpoint
+}
+
+output "cluster_ca_certificate" {
+  description = "GKE cluster CA certificate."
+  value       = data.google_container_cluster.existing_gke_cluster.master_auth[0].cluster_ca_certificate
+}
+
+output "access_token" {
+  description = "Google client config access token."
+  value       = data.google_client_config.default.access_token
+  sensitive   = true # bearer credential: keep it redacted in plan/apply output
+}