diff --git a/cmd/root.go b/cmd/root.go index cbe76ebfc6..c39e3887fe 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -53,7 +53,7 @@ HPC deployments on the Google Cloud Platform.`, logging.Fatal("cmd.Help function failed: %s", err) } }, - Version: "v1.57.0", + Version: "v1.58.0", Annotations: annotation, } ) diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index 47c905b352..40014c4bfb 100644 --- a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -29,6 +29,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.58.0" } } diff --git a/community/modules/compute/mig/versions.tf b/community/modules/compute/mig/versions.tf index a1045b9a6b..ed97d77150 100644 --- a/community/modules/compute/mig/versions.tf +++ b/community/modules/compute/mig/versions.tf @@ -22,6 +22,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:mig/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:mig/v1.58.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf index 73100a9d52..c3e2a4ac5f 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.58.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf 
b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf index 8b9d422d23..da994918ae 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.58.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf index b879e1bd17..e3784c71d2 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.58.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf index 5298c89dbe..22b8b5aaab 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.58.0" } } diff --git a/community/modules/database/slurm-cloudsql-federation/versions.tf b/community/modules/database/slurm-cloudsql-federation/versions.tf index 922e383a69..d210843592 100644 --- a/community/modules/database/slurm-cloudsql-federation/versions.tf +++ 
b/community/modules/database/slurm-cloudsql-federation/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.58.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.58.0" } required_version = ">= 0.13.0" diff --git a/community/modules/file-system/cloud-storage-bucket/versions.tf b/community/modules/file-system/cloud-storage-bucket/versions.tf index e80c28ca47..4e4591c750 100644 --- a/community/modules/file-system/cloud-storage-bucket/versions.tf +++ b/community/modules/file-system/cloud-storage-bucket/versions.tf @@ -30,10 +30,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.58.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.58.0" } required_version = ">= 0.14.0" } diff --git a/community/modules/file-system/nfs-server/versions.tf b/community/modules/file-system/nfs-server/versions.tf index 8c8c39fda7..b747233d9c 100644 --- a/community/modules/file-system/nfs-server/versions.tf +++ b/community/modules/file-system/nfs-server/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.58.0" } required_version = ">= 0.14.0" diff --git a/community/modules/files/fsi-montecarlo-on-batch/versions.tf b/community/modules/files/fsi-montecarlo-on-batch/versions.tf index 07270f9cfe..5dc7905f10 100644 --- 
a/community/modules/files/fsi-montecarlo-on-batch/versions.tf +++ b/community/modules/files/fsi-montecarlo-on-batch/versions.tf @@ -35,9 +35,9 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.58.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.58.0" } } diff --git a/community/modules/internal/slurm-gcp/login/versions.tf b/community/modules/internal/slurm-gcp/login/versions.tf index 23b07fc64a..c01a91173b 100644 --- a/community/modules/internal/slurm-gcp/login/versions.tf +++ b/community/modules/internal/slurm-gcp/login/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.58.0" } } diff --git a/community/modules/network/private-service-access/versions.tf b/community/modules/network/private-service-access/versions.tf index c5a889017f..aaa1b4a4c1 100644 --- a/community/modules/network/private-service-access/versions.tf +++ b/community/modules/network/private-service-access/versions.tf @@ -30,11 +30,11 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.58.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.58.0" } required_version = ">= 1.2" diff --git a/community/modules/project/service-enablement/versions.tf b/community/modules/project/service-enablement/versions.tf index 67eb446cde..e508099479 
100644 --- a/community/modules/project/service-enablement/versions.tf +++ b/community/modules/project/service-enablement/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.58.0" } required_version = ">= 0.14.0" diff --git a/community/modules/pubsub/bigquery-sub/versions.tf b/community/modules/pubsub/bigquery-sub/versions.tf index 54f9ccf8bb..48aaaab724 100644 --- a/community/modules/pubsub/bigquery-sub/versions.tf +++ b/community/modules/pubsub/bigquery-sub/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.58.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.58.0" } required_version = ">= 1.0" } diff --git a/community/modules/pubsub/topic/versions.tf b/community/modules/pubsub/topic/versions.tf index c2ef87845b..7a19611197 100644 --- a/community/modules/pubsub/topic/versions.tf +++ b/community/modules/pubsub/topic/versions.tf @@ -27,6 +27,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:topic/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:topic/v1.58.0" } } diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf index e64942c840..560c17beae 100644 --- a/community/modules/scheduler/htcondor-access-point/versions.tf +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.57.0" + module_name = 
"blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.58.0" } required_version = ">= 1.1" diff --git a/community/modules/scheduler/htcondor-central-manager/versions.tf b/community/modules/scheduler/htcondor-central-manager/versions.tf index 24051328ff..657de15676 100644 --- a/community/modules/scheduler/htcondor-central-manager/versions.tf +++ b/community/modules/scheduler/htcondor-central-manager/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.58.0" } required_version = ">= 1.1.0" diff --git a/community/modules/scheduler/htcondor-pool-secrets/versions.tf b/community/modules/scheduler/htcondor-pool-secrets/versions.tf index 87a806a0ea..1b12009cc4 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/versions.tf +++ b/community/modules/scheduler/htcondor-pool-secrets/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.58.0" } required_version = ">= 1.3.0" diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf index 7feb700a3f..797a4c2bca 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf @@ -28,6 +28,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.58.0" } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf index 
51b3aa7271..9246bf10b0 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.58.0" } } diff --git a/community/modules/scripts/wait-for-startup/versions.tf b/community/modules/scripts/wait-for-startup/versions.tf index bb5a32e08d..4388f65349 100644 --- a/community/modules/scripts/wait-for-startup/versions.tf +++ b/community/modules/scripts/wait-for-startup/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.58.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scripts/windows-startup-script/versions.tf b/community/modules/scripts/windows-startup-script/versions.tf index 9d5f6bd68d..8ef2163712 100644 --- a/community/modules/scripts/windows-startup-script/versions.tf +++ b/community/modules/scripts/windows-startup-script/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.58.0" } required_version = ">= 0.14.0" diff --git a/examples/README.md b/examples/README.md index 9da06df179..e4a9a8dde6 100644 --- a/examples/README.md +++ b/examples/README.md @@ -59,6 +59,7 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [tutorial-fluent.yaml](#tutorial-fluentyaml--) ![community-badge] ![experimental-badge] * [gke-tpu-v6](#gke-tpu-v6--) ![community-badge] ![experimental-badge] * [xpk-n2-filestore](#xpk-n2-filestore--) ![community-badge] ![experimental-badge] + * [gke-h4d](#gke-h4d-) ![core-badge] * 
[Blueprint Schema](#blueprint-schema) * [Writing an HPC Blueprint](#writing-an-hpc-blueprint) * [Blueprint Boilerplate](#blueprint-boilerplate) @@ -1453,6 +1454,12 @@ python3 xpk.py info --cluster xpk-01 [xpk-n2-filestore]: ../community/examples/xpk-n2-filestore/xpk-n2-filestore.yaml +### [gke-h4d] ![core-badge] + +This blueprint uses GKE to provision a Kubernetes cluster and an H4D node pool, along with networks and service accounts. Information about H4D machines can be found [here](https://cloud.google.com/blog/products/compute/new-h4d-vms-optimized-for-hpc). The deployment instructions can be found in the [README](/examples/gke-h4d/README.md). + +[gke-h4d]: ../examples/gke-h4d + ## Blueprint Schema Similar documentation can be found on diff --git a/examples/gke-h4d/README.md b/examples/gke-h4d/README.md new file mode 100644 index 0000000000..a008b172b4 --- /dev/null +++ b/examples/gke-h4d/README.md @@ -0,0 +1,61 @@ +# GKE H4D Blueprint + +This blueprint uses GKE to provision a Kubernetes cluster and an H4D node pool, along with networks and service accounts. Information about H4D machines can be found [here](https://cloud.google.com/blog/products/compute/new-h4d-vms-optimized-for-hpc). + +> **_NOTE:_** The required GKE version for H4D support is >= 1.32.3-gke.1170000. + +## Steps to deploy the H4D blueprint + +1. Install Cluster Toolkit + 1. Install [dependencies](https://cloud.google.com/cluster-toolkit/docs/setup/install-dependencies). + 1. Set up [Cluster Toolkit](https://cloud.google.com/cluster-toolkit/docs/setup/configure-environment). +1. Switch to the Cluster Toolkit directory + + ```sh + cd cluster-toolkit + ``` + +1. Get the IP address for your host machine + + ```sh + curl ifconfig.me + ``` + +1. Update the `bucket` under `terraform_backend_defaults` and the vars block of the `gke-h4d-deployment.yaml` file. + 1. `project_id`: ID of the project where you are deploying the cluster. + 1. `deployment_name`: Name of the deployment. + 1. `region`: Compute region used for the deployment. + 1. 
`zone`: Compute zone used for the deployment. + 1. `static_node_count`: Number of nodes to create. + 1. `authorized_cidr`: IP address of your host machine (from the previous step) in CIDR notation, for example `<IP_ADDRESS>/32`. +1. Build the Cluster Toolkit binary + + ```sh + make + ``` + +1. Provision the GKE cluster + + ```sh + ./gcluster deploy -d examples/gke-h4d/gke-h4d-deployment.yaml examples/gke-h4d/gke-h4d.yaml + ``` + + These four options are displayed: + + ```sh + (D)isplay full proposed changes, + (A)pply proposed changes, + (S)top and exit, + (C)ontinue without applying + ``` + + Type `a` and hit enter to create the cluster. + +## Clean Up +To destroy all resources associated with creating the GKE cluster, run the following command: + +```sh +./gcluster destroy CLUSTER-NAME +``` + +Replace `CLUSTER-NAME` with the `deployment_name` set in the vars block of `gke-h4d-deployment.yaml`. diff --git a/examples/gke-h4d/gke-h4d-deployment.yaml b/examples/gke-h4d/gke-h4d-deployment.yaml new file mode 100644 index 0000000000..eb44b53611 --- /dev/null +++ b/examples/gke-h4d/gke-h4d-deployment.yaml @@ -0,0 +1,40 @@ +# Copyright 2025 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +terraform_backend_defaults: + type: gcs + configuration: + # The GCS bucket used for storing terraform state + bucket: BUCKET_NAME + +vars: + # Your GCP Project ID + project_id: PROJECT_ID + + # This should be unique across all of your Cluster + # Toolkit Deployments. + deployment_name: DEPLOYMENT_NAME + + # The GCP Region used for this deployment. 
+ region: COMPUTE_REGION + + # The GCP Zone used for this deployment. + zone: COMPUTE_ZONE + + # The number of nodes to be created. + static_node_count: NODE_COUNT + + # CIDR block containing the IP of the machine calling terraform. + # The following line must be updated for this example to work. + authorized_cidr: IP_ADDRESS/SUFFIX diff --git a/examples/gke-h4d/gke-h4d.yaml b/examples/gke-h4d/gke-h4d.yaml new file mode 100644 index 0000000000..832b076561 --- /dev/null +++ b/examples/gke-h4d/gke-h4d.yaml @@ -0,0 +1,184 @@ +# Copyright 2025 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +blueprint_name: gke-h4d + +vars: + # The following variables should be overwritten in the gke-h4d-deployment.yaml file. + # Your GCP Project ID + project_id: + + # This should be unique across all of your Cluster + # Toolkit Deployments. + deployment_name: gke-h4d + + # The GCP Region used for this deployment. + region: + + # The GCP Zone used for this deployment. + zone: + + # The number of nodes to be created. + static_node_count: + + # CIDR block containing the IP of the machine calling terraform. + # The following line must be updated for this example to work. 
+ authorized_cidr: + + system_node_pool_disk_size_gb: 100 + h4d_node_pool_disk_size_gb: 100 + + +deployment_groups: +- group: primary + modules: + - id: gke-h4d-net + source: modules/network/vpc + settings: + network_name: $(vars.deployment_name)-net + subnetworks: + - subnet_name: $(vars.deployment_name)-sub + subnet_region: $(vars.region) + subnet_ip: 192.168.0.0/24 + secondary_ranges_list: + - subnetwork_name: $(vars.deployment_name)-sub + ranges: + - range_name: pods + ip_cidr_range: 10.64.0.0/19 + - range_name: services + ip_cidr_range: 10.65.0.0/19 + firewall_rules: + - name: $(vars.deployment_name)-internal + ranges: [192.168.0.0/24] + allow: + - protocol: tcp + ports: ["0-65535"] + - protocol: udp + ports: ["0-65535"] + - protocol: icmp + + - id: gke-h4d-rdma-net + source: modules/network/vpc + settings: + network_name: $(vars.deployment_name)-rdma-net + network_profile: https://www.googleapis.com/compute/beta/projects/$(vars.project_id)/global/networkProfiles/$(vars.zone)-vpc-falcon + network_routing_mode: REGIONAL + enable_cloud_router: false + enable_cloud_nat: false + subnetworks: + - subnet_name: $(vars.deployment_name)-rdma-sub + subnet_region: $(vars.region) + subnet_ip: 192.168.1.0/24 + region: $(vars.region) + + - id: node_pool_service_account + source: community/modules/project/service-account + settings: + name: gke-np-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + + - id: workload_service_account + source: community/modules/project/service-account + settings: + name: gke-wl-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectAdmin + - artifactregistry.reader + + - id: h4d-cluster + source: modules/scheduler/gke-cluster + use: [gke-h4d-net, workload_service_account] + settings: + system_node_pool_machine_type: 
"e2-standard-16" + system_node_pool_disk_size_gb: $(vars.system_node_pool_disk_size_gb) + system_node_pool_taints: [] + enable_multi_networking: true + enable_dcgm_monitoring: true + gcp_public_cidrs_access_enabled: false + enable_private_endpoint: false # Allows access from authorized public IPs + configure_workload_identity_sa: true + master_authorized_networks: + - cidr_block: $(vars.authorized_cidr) # Allows your machine to run the kubectl command. Required for multi network setup. + display_name: "kubectl-access-network" + additional_networks: + $(concat( + [{ + network=gke-h4d-rdma-net.network_name, + subnetwork=gke-h4d-rdma-net.subnetwork_name, + subnetwork_project=vars.project_id, + nic_type="IRDMA", + queue_count=null, + network_ip=null, + stack_type=null, + access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}], + ipv6_access_config=[], + alias_ip_range=[] + }] + )) + # Cluster versions cannot be updated through the toolkit after creation + # Please manage cluster version from the Google Cloud Console directly + version_prefix: "1.32." 
+ release_channel: RAPID + maintenance_exclusions: + - name: no-minor-or-node-upgrades-indefinite + start_time: "2024-12-01T00:00:00Z" + end_time: "2025-12-22T00:00:00Z" + exclusion_scope: NO_MINOR_OR_NODE_UPGRADES + outputs: [instructions] + + - id: h4d-pool + source: modules/compute/gke-node-pool + use: [h4d-cluster, node_pool_service_account] + settings: + machine_type: h4d-highmem-192-lssd + auto_upgrade: true + zones: [$(vars.zone)] + disk_size_gb: $(vars.h4d_node_pool_disk_size_gb) + static_node_count: $(vars.static_node_count) + additional_networks: + $(concat( + [{ + network=gke-h4d-rdma-net.network_name, + subnetwork=gke-h4d-rdma-net.subnetwork_name, + subnetwork_project=vars.project_id, + nic_type="IRDMA", + queue_count=null, + network_ip=null, + stack_type=null, + access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}], + ipv6_access_config=[], + alias_ip_range=[] + }] + )) + outputs: [instructions] + + # Install Kueue and Jobset + - id: workload-manager-install + source: modules/management/kubectl-apply + use: [h4d-cluster] + settings: + kueue: + install: true + jobset: + install: true diff --git a/examples/gke-managed-hyperdisk.yaml b/examples/gke-managed-hyperdisk.yaml index 6e28acb5c5..c5c12d2ed3 100644 --- a/examples/gke-managed-hyperdisk.yaml +++ b/examples/gke-managed-hyperdisk.yaml @@ -154,7 +154,8 @@ deployment_groups: from transformers import AutoTokenizer import numpy as np tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") - tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True) + sentences = [str(s) for s in dataset["sentence"]] + tokenized_data = tokenizer(sentences, return_tensors="np", padding=True) tokenized_data = dict(tokenized_data) labels = np.array(dataset["label"]) from transformers import TFAutoModelForSequenceClassification @@ -195,7 +196,8 @@ deployment_groups: from transformers import AutoTokenizer import numpy as np tokenizer = 
AutoTokenizer.from_pretrained("bert-base-cased") - tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True) + sentences = [str(s) for s in dataset["sentence"]] + tokenized_data = tokenizer(sentences, return_tensors="np", padding=True) tokenized_data = dict(tokenized_data) labels = np.array(dataset["label"]) from transformers import TFAutoModelForSequenceClassification @@ -236,7 +238,8 @@ deployment_groups: from transformers import AutoTokenizer import numpy as np tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") - tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True) + sentences = [str(s) for s in dataset["sentence"]] + tokenized_data = tokenizer(sentences, return_tensors="np", padding=True) tokenized_data = dict(tokenized_data) labels = np.array(dataset["label"]) from transformers import TFAutoModelForSequenceClassification diff --git a/modules/compute/gke-node-pool/versions.tf b/modules/compute/gke-node-pool/versions.tf index 2388b350b1..2770b828df 100644 --- a/modules/compute/gke-node-pool/versions.tf +++ b/modules/compute/gke-node-pool/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.58.0" } provider_meta "google-beta" { module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.45.0" diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index 4b2786e2af..4ab83d076f 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -31,10 +31,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.58.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.57.0" + module_name = 
"blueprints/terraform/hpc-toolkit:vm-instance/v1.58.0" } required_version = ">= 1.3.0" diff --git a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf index efde3ac158..9ebef7d908 100644 --- a/modules/file-system/filestore/versions.tf +++ b/modules/file-system/filestore/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.58.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.58.0" } required_version = ">= 1.3.0" diff --git a/modules/file-system/gke-persistent-volume/versions.tf b/modules/file-system/gke-persistent-volume/versions.tf index 1c6b08e279..7bed64f237 100644 --- a/modules/file-system/gke-persistent-volume/versions.tf +++ b/modules/file-system/gke-persistent-volume/versions.tf @@ -29,6 +29,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.58.0" } } diff --git a/modules/file-system/gke-storage/versions.tf b/modules/file-system/gke-storage/versions.tf index ab0ea408ce..9c447d78ee 100644 --- a/modules/file-system/gke-storage/versions.tf +++ b/modules/file-system/gke-storage/versions.tf @@ -16,6 +16,6 @@ terraform { required_version = ">= 1.5" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-storage/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-storage/v1.58.0" } } diff --git a/modules/file-system/managed-lustre/versions.tf b/modules/file-system/managed-lustre/versions.tf index 59808977a6..f01ee79664 100644 --- a/modules/file-system/managed-lustre/versions.tf +++ b/modules/file-system/managed-lustre/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta 
"google" { - module_name = "blueprints/terraform/hpc-toolkit:managed-lustre/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:managed-lustre/v1.58.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:managed-lustre/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:managed-lustre/v1.58.0" } required_version = ">= 1.3.0" diff --git a/modules/monitoring/dashboard/versions.tf b/modules/monitoring/dashboard/versions.tf index d8c4587051..1be5e2b924 100644 --- a/modules/monitoring/dashboard/versions.tf +++ b/modules/monitoring/dashboard/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.58.0" } required_version = ">= 0.14.0" diff --git a/modules/network/firewall-rules/versions.tf b/modules/network/firewall-rules/versions.tf index c892bc0435..b6cc8cb1f1 100644 --- a/modules/network/firewall-rules/versions.tf +++ b/modules/network/firewall-rules/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.58.0" } required_version = ">= 1.5" diff --git a/modules/network/pre-existing-subnetwork/versions.tf b/modules/network/pre-existing-subnetwork/versions.tf index e90ae5991d..a0da152f75 100644 --- a/modules/network/pre-existing-subnetwork/versions.tf +++ b/modules/network/pre-existing-subnetwork/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.58.0" } required_version = ">= 1.5" diff --git a/modules/network/pre-existing-vpc/versions.tf b/modules/network/pre-existing-vpc/versions.tf index d798ca3f8c..01ed0ef2f1 100644 --- 
a/modules/network/pre-existing-vpc/versions.tf +++ b/modules/network/pre-existing-vpc/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.58.0" } required_version = ">= 1.5" diff --git a/modules/scheduler/batch-login-node/versions.tf b/modules/scheduler/batch-login-node/versions.tf index 5109db2aee..9ed48fca45 100644 --- a/modules/scheduler/batch-login-node/versions.tf +++ b/modules/scheduler/batch-login-node/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.58.0" } required_version = ">= 0.14.0" diff --git a/modules/scheduler/gke-cluster/versions.tf b/modules/scheduler/gke-cluster/versions.tf index 39dc44bd48..68063bc9df 100644 --- a/modules/scheduler/gke-cluster/versions.tf +++ b/modules/scheduler/gke-cluster/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.58.0" } provider_meta "google-beta" { diff --git a/modules/scheduler/pre-existing-gke-cluster/versions.tf b/modules/scheduler/pre-existing-gke-cluster/versions.tf index 1e458a6b4c..c5be94d28d 100644 --- a/modules/scheduler/pre-existing-gke-cluster/versions.tf +++ b/modules/scheduler/pre-existing-gke-cluster/versions.tf @@ -23,7 +23,7 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.58.0" } required_version = ">= 1.3" diff --git a/modules/scripts/startup-script/versions.tf b/modules/scripts/startup-script/versions.tf index 91aff38497..7ec6cd59b6 100644 --- 
a/modules/scripts/startup-script/versions.tf +++ b/modules/scripts/startup-script/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.57.0" + module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.58.0" } required_version = ">= 1.5" diff --git a/tools/cloud-build/daily-tests/builds/gke-h4d.yaml b/tools/cloud-build/daily-tests/builds/gke-h4d.yaml new file mode 100644 index 0000000000..e33cc4e882 --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/gke-h4d.yaml @@ -0,0 +1,63 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +tags: +- gke +- m.gke-cluster +- m.gke-node-pool +- m.service-account +- m.vpc +- m.kubectl-apply + +timeout: 14400s # 4hr +steps: +- id: gke-h4d + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + cd /workspace && make + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + EXAMPLE_BP=examples/gke-h4d/gke-h4d.yaml + + # adding vm to act as remote node + echo ' - id: remote-node' >> $${EXAMPLE_BP} + echo ' source: modules/compute/vm-instance' >> $${EXAMPLE_BP} + echo ' use: [gke-h4d-net]' >> $${EXAMPLE_BP} + echo ' settings:' >> $${EXAMPLE_BP} + echo ' machine_type: e2-standard-2' >> $${EXAMPLE_BP} + echo ' name_prefix: remote-node' >> $${EXAMPLE_BP} + echo ' add_deployment_name_before_prefix: true' >> $${EXAMPLE_BP} + echo '' + echo ' - id: job_template_hostname' >> $${EXAMPLE_BP} + echo ' source: modules/compute/gke-job-template' >> $${EXAMPLE_BP} + echo ' use: [h4d-pool]' >> $${EXAMPLE_BP} + echo ' settings:' >> $${EXAMPLE_BP} + echo ' image: busybox' >> $${EXAMPLE_BP} + echo ' command:' >> $${EXAMPLE_BP} + echo ' - echo' >> $${EXAMPLE_BP} + echo ' - Hello World' >> $${EXAMPLE_BP} + echo ' node_count: 1' >> $${EXAMPLE_BP} + echo ' outputs: [instructions]' >> $${EXAMPLE_BP} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-h4d.yml" diff --git a/tools/cloud-build/daily-tests/tests/gke-h4d.yml b/tools/cloud-build/daily-tests/tests/gke-h4d.yml new file mode 100644 index 0000000000..5ced2941ac --- /dev/null +++ b/tools/cloud-build/daily-tests/tests/gke-h4d.yml @@ -0,0 +1,36 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +# region, zone must be defined +# in build file with --extra-vars flag! +test_name: gke-h4d +deployment_name: gke-h4d-{{ build }} +workspace: /workspace +blueprint_yaml: "{{ workspace }}/examples/gke-h4d/gke-h4d.yaml" +network: "{{ deployment_name }}-net" +region: us-central1 +zone: us-central1-b +remote_node: "{{ deployment_name }}-remote-node-0" +static_node_count: 2 +cli_deployment_vars: + region: "{{ region }}" + zone: "{{ zone }}" + static_node_count: "{{ static_node_count }}" + authorized_cidr: "{{ build_ip.stdout }}/32" +custom_vars: + project: "{{ project }}" +post_deploy_tests: +- test-validation/test-gke-job.yml diff --git a/tools/validate_configs/validate_configs.sh b/tools/validate_configs/validate_configs.sh index 518c7ab304..30dc4c67c4 100755 --- a/tools/validate_configs/validate_configs.sh +++ b/tools/validate_configs/validate_configs.sh @@ -120,7 +120,7 @@ check_background() { fi } -CONFIGS=$(find examples/ community/examples/ tools/validate_configs/test_configs/ docs/tutorials/ docs/videos/build-your-own-blueprint/ -name "*.yaml" -type f -not -path 'examples/machine-learning/a3-megagpu-8g/*' -not -path 'examples/machine-learning/a3-ultragpu-8g/*' -not -path 'examples/machine-learning/build-service-images/*' -not -path 'examples/gke-a3-ultragpu/*' -not -path 'examples/hypercompute_clusters/*' -not -path 'examples/gke-consumption-options/*' -not -path 'examples/gke-a4/*' -not -path 'examples/gke-a3-megagpu/*' -not -path 
'examples/machine-learning/a4-highgpu-8g/*' -not -path 'community/examples/gke-tpu-v6/*' -not -path 'community/examples/xpk-n2-filestore/*' -not -path 'examples/gke-a4x/*' -not -path 'examples/science/af3-slurm/*') +CONFIGS=$(find examples/ community/examples/ tools/validate_configs/test_configs/ docs/tutorials/ docs/videos/build-your-own-blueprint/ -name "*.yaml" -type f -not -path 'examples/machine-learning/a3-megagpu-8g/*' -not -path 'examples/machine-learning/a3-ultragpu-8g/*' -not -path 'examples/machine-learning/build-service-images/*' -not -path 'examples/gke-a3-ultragpu/*' -not -path 'examples/hypercompute_clusters/*' -not -path 'examples/gke-consumption-options/*' -not -path 'examples/gke-a4/*' -not -path 'examples/gke-a3-megagpu/*' -not -path 'examples/machine-learning/a4-highgpu-8g/*' -not -path 'community/examples/gke-tpu-v6/*' -not -path 'community/examples/xpk-n2-filestore/*' -not -path 'examples/gke-a4x/*' -not -path 'examples/science/af3-slurm/*' -not -path 'examples/gke-h4d/*') # Exclude blueprints that use v5 modules. declare -A EXCLUDE_EXAMPLE EXCLUDE_EXAMPLE["tools/validate_configs/test_configs/two-clusters-sql.yaml"]=