# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
blueprint_name: hpc-intel-select-slurm
vars:
  deployment_name: hpc-intel-select
  region: us-central1
  zone: us-central1-c
  controller_image:
    family: slurm-intel-hpc-controller
    project: $(vars.project_id)
  compute_image:
    family: slurm-intel-hpc-compute
    project: $(vars.project_id)
  network_name: intel-select-net
  subnetwork_name: intel-select-primary-subnet
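
# Note: project_id is referenced below as $(vars.project_id) but is not defined in
# this blueprint, so it must be supplied at deployment time; for example, with the
# Toolkit's ghpc CLI, something like:
#   ghpc create hpc-intel-select-slurm.yaml --vars project_id=<your-project-id>
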
# Documentation for each of the modules used below can be found at
# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md
deployment_groups:
- group: primary
  modules:
  - id: network1
    source: modules/network/vpc

  - id: startup_controller
    source: modules/scripts/startup-script
    settings:
      runners:
      - type: shell
        destination: /var/tmp/install_intel_controller.sh
        content: |
          #!/bin/bash
          yum -y update google-hpc-compute
          google_install_mpi --prefix /apps --intel_compliance
    outputs:
    - startup_script

  - id: startup_compute
    source: modules/scripts/startup-script
    settings:
      runners:
      - type: shell
        destination: /var/tmp/install_intel_compute.sh
        content: |
          #!/bin/bash
          yum -y update google-hpc-compute
          google_install_mpi --intel_comp_meta
      - type: data
        destination: /var/tmp/dgemm_job.sh
        content: |
          #!/bin/bash
          #SBATCH --nodes=4
          #SBATCH --ntasks-per-node=30
          #SBATCH --time=01:00:00
          #SBATCH --job-name=clckjob
          #SBATCH --output=job_%j.log
          #SBATCH --partition=compute
          . /apps/clck/2019.10/env/vars.sh
          export CLCK_SHARED_TEMP_DIR=$HOME
          cd $SLURM_SUBMIT_DIR
          # select_solutions_sim_mod_user_base_2018.0 | select_solutions_sim_mod_user_plus_2018.0
          FWD=select_solutions_sim_mod_user_base_2018.0
          clck -D ${FWD}.db -F ${FWD} -l debug
    outputs:
    - startup_script
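
# The two startup-script modules above only expose their scripts as outputs: the
# controller script installs the Intel HPC platform / compliance tooling via
# google_install_mpi --intel_compliance, while the compute script installs the Intel
# runtime meta-packages and stages a Slurm batch script that runs Intel Cluster
# Checker (clck) against a Select Solutions framework definition. They are intended
# to be applied when building the custom images in the groups below.
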
- group: build1
  modules:
  - id: controller-image
    source: modules/packer/custom-image
    kind: packer
    settings:
      disk_size: 20
      source_image_project_id: [schedmd-slurm-public]
      source_image_family: schedmd-slurm-21-08-8-hpc-centos-7
      image_family: $(vars.controller_image.family)
- group: build2
  modules:
  - id: compute-image
    source: modules/packer/custom-image
    kind: packer
    settings:
      disk_size: 20
      source_image_project_id: [schedmd-slurm-public]
      source_image_family: schedmd-slurm-21-08-8-hpc-centos-7
      image_family: $(vars.compute_image.family)
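
# The build1 and build2 groups each build a custom image family
# (slurm-intel-hpc-controller and slurm-intel-hpc-compute) on top of the SchedMD
# Slurm HPC CentOS 7 base image. The cluster group below consumes these families
# via $(vars.controller_image) and $(vars.compute_image), so both image builds must
# complete before the cluster group is deployed.
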
- group: cluster
  modules:
  - id: cluster-network
    source: modules/network/pre-existing-vpc

  - id: homefs
    source: modules/file-system/filestore
    use:
    - cluster-network
    settings:
      local_mount: /home

  # This debug_partition will work out of the box without requesting additional GCP quota.
  - id: debug_partition
    source: community/modules/compute/SchedMD-slurm-on-gcp-partition
    use:
    - cluster-network
    - homefs
    settings:
      partition_name: debug
      max_node_count: 4
      enable_placement: false
      exclusive: false
      machine_type: n2-standard-4
      instance_image: $(vars.compute_image)

  - id: compute_partition
    source: community/modules/compute/SchedMD-slurm-on-gcp-partition
    use:
    - cluster-network
    - homefs
    settings:
      partition_name: compute
      instance_image: $(vars.compute_image)
      max_node_count: 100
      machine_type: c2-standard-60
      bandwidth_tier: gvnic_enabled
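
  # Note: at its full size (100 x c2-standard-60) the compute partition above may
  # require additional C2 vCPU quota in the chosen region; only the debug partition
  # is sized to fit default quotas.
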
  - id: slurm_controller
    source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller
    use:
    - cluster-network
    - compute_partition
    - homefs
    settings:
      login_node_count: 1
      instance_image: $(vars.controller_image)
      controller_machine_type: c2-standard-4

  - id: slurm_login
    source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node
    use:
    - cluster-network
    - slurm_controller
    - homefs
    settings:
      instance_image: $(vars.compute_image)
      login_machine_type: c2-standard-4
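
# After deployment, the staged Cluster Checker job can be submitted from a cluster
# node with Slurm, e.g. `sbatch /var/tmp/dgemm_job.sh` (the path written by the
# startup_compute data runner above); adjust --nodes and --ntasks-per-node to match
# the nodes actually available in the compute partition.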