# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
blueprint_name: hpc-intel-select-slurm
vars:
  deployment_name: hpc-intel-select
  region: us-central1
  zone: us-central1-c
  controller_image:
    family: slurm-intel-hpc-controller
    project: $(vars.project_id)
  compute_image:
    family: slurm-intel-hpc-compute
    project: $(vars.project_id)
  network_name: intel-select-net
  subnetwork_name: intel-select-primary-subnet
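
# Note: project_id is referenced below as $(vars.project_id) but is not defined in
# this blueprint, so it must be supplied at deployment time; for example, with the
# Toolkit's ghpc CLI, something like:
#   ghpc create hpc-intel-select-slurm.yaml --vars project_id=<your-project-id>
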
# Documentation for each of the modules used below can be found at
# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md
deployment_groups:
- group: primary
  modules:
  - id: network1
    source: modules/network/vpc

  - id: startup_controller
    source: modules/scripts/startup-script
    settings:
      runners:
      - type: shell
        destination: /var/tmp/install_intel_controller.sh
        content: |
          #!/bin/bash
          yum -y update google-hpc-compute
          google_install_mpi --prefix /apps --intel_compliance
    outputs:
    - startup_script

  - id: startup_compute
    source: modules/scripts/startup-script
    settings:
      runners:
      - type: shell
        destination: /var/tmp/install_intel_compute.sh
        content: |
          #!/bin/bash
          yum -y update google-hpc-compute
          google_install_mpi --intel_comp_meta
      - type: data
        destination: /var/tmp/dgemm_job.sh
        content: |
          #!/bin/bash
          #SBATCH --nodes=4
          #SBATCH --ntasks-per-node=30
          #SBATCH --time=01:00:00
          #SBATCH --job-name=clckjob
          #SBATCH --output=job_%j.log
          #SBATCH --partition=compute
          . /apps/clck/2019.10/env/vars.sh
          export CLCK_SHARED_TEMP_DIR=$HOME
          cd $SLURM_SUBMIT_DIR
          # select_solutions_sim_mod_user_base_2018.0 | select_solutions_sim_mod_user_plus_2018.0
          FWD=select_solutions_sim_mod_user_base_2018.0
          clck -D ${FWD}.db -F ${FWD} -l debug
    outputs:
    - startup_script
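
# The two startup-script modules above only expose their scripts as outputs: the
# controller script installs the Intel HPC platform / compliance tooling via
# google_install_mpi --intel_compliance, while the compute script installs the Intel
# runtime meta-packages and stages a Slurm batch script that runs Intel Cluster
# Checker (clck) against a Select Solutions framework definition. They are intended
# to be applied when building the custom images in the groups below.
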
- group: build1
  modules:
  - id: controller-image
    source: modules/packer/custom-image
    kind: packer
    settings:
      disk_size: 20
      source_image_project_id: [schedmd-slurm-public]
      source_image_family: schedmd-slurm-21-08-8-hpc-centos-7
      image_family: $(vars.controller_image.family)
- group: build2
  modules:
  - id: compute-image
    source: modules/packer/custom-image
    kind: packer
    settings:
      disk_size: 20
      source_image_project_id: [schedmd-slurm-public]
      source_image_family: schedmd-slurm-21-08-8-hpc-centos-7
      image_family: $(vars.compute_image.family)
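
# The build1 and build2 groups each build a custom image family
# (slurm-intel-hpc-controller and slurm-intel-hpc-compute) on top of the SchedMD
# Slurm HPC CentOS 7 base image. The cluster group below consumes these families
# via $(vars.controller_image) and $(vars.compute_image), so both image builds must
# complete before the cluster group is deployed.
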
- group: cluster
  modules:
  - id: cluster-network
    source: modules/network/pre-existing-vpc

  - id: homefs
    source: modules/file-system/filestore
    use:
    - cluster-network
    settings:
      local_mount: /home

  # This debug_partition will work out of the box without requesting additional GCP quota.
  - id: debug_partition
    source: community/modules/compute/SchedMD-slurm-on-gcp-partition
    use:
    - cluster-network
    - homefs
    settings:
      partition_name: debug
      max_node_count: 4
      enable_placement: false
      exclusive: false
      machine_type: n2-standard-4
      instance_image: $(vars.compute_image)

  - id: compute_partition
    source: community/modules/compute/SchedMD-slurm-on-gcp-partition
    use:
    - cluster-network
    - homefs
    settings:
      partition_name: compute
      instance_image: $(vars.compute_image)
      max_node_count: 100
      machine_type: c2-standard-60
      bandwidth_tier: gvnic_enabled
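
  # Note: at its full size (100 x c2-standard-60) the compute partition above may
  # require additional C2 vCPU quota in the chosen region; only the debug partition
  # is sized to fit default quotas.
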
  - id: slurm_controller
    source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller
    use:
    - cluster-network
    - compute_partition
    - homefs
    settings:
      login_node_count: 1
      instance_image: $(vars.controller_image)
      controller_machine_type: c2-standard-4

  - id: slurm_login
    source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node
    use:
    - cluster-network
    - slurm_controller
    - homefs
    settings:
      instance_image: $(vars.compute_image)
      login_machine_type: c2-standard-4
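
# After deployment, the staged Cluster Checker job can be submitted from a cluster
# node with Slurm, e.g. `sbatch /var/tmp/dgemm_job.sh` (the path written by the
# startup_compute data runner above); adjust --nodes and --ntasks-per-node to match
# the nodes actually available in the compute partition.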