From 482e804b7db0b2b01d3f032c5d4cf8fbe85b8b16 Mon Sep 17 00:00:00 2001
From: Allan Carter
Date: Tue, 20 Aug 2024 03:05:15 +0000
Subject: [PATCH] Generalize RES configuration of login nodes and user/group json

Add new config variable: ExternalLoginNodes

Resolves #250

=====

Fix SlurmFsxLustre ingress rule.
CDK creates an egress rule without a matching ingress rule.

Resolves #252

=====

Fix FSxZ egress rules
Compensate for a bug in ParallelCluster that requires egress rules.
Leave the bug open so that we can remove the rules when the ParallelCluster bug is fixed.

Addresses #253

=====

Document FSx configuration

=====

Add IAM policy required to mount FSx
Add AttachRolePolicy, DetachRolePolicy for HeadNodePolicy.

Resolves #254

=====

Fix SNS notification bug when the CreateParallelCluster Lambda is missing a parameter
---
 .../create_slurm_security_groups_stack.py     |  24 +-
 docs/config.md                                |  67 +++++
 docs/deployment-prerequisites.md              |  55 ++++
 source/cdk/cdk_slurm_stack.py                 | 259 +++++++++++-------
 source/cdk/config_schema.py                   |  28 +-
 .../ConfigureExternalLoginNodes.py            | 143 ++++++++++
 .../ConfigureRESSubmitters.py                 | 127 ---------
 .../ConfigureUsersGroupsJson.py}              |  55 ++--
 .../CreateParallelCluster.py                  |   7 +-
 .../DeconfigureExternalLoginNodes.py}         | 173 ++++++------
 .../cfnresponse.py                            |   0
 .../DeconfigureUsersGroupsJson.py}            | 127 ++++-----
 .../cfnresponse.py                            |   0
 .../config/bin/on_head_node_configured.sh     |  16 +-
 14 files changed, 660 insertions(+), 421 deletions(-)
 create mode 100644 source/resources/lambdas/ConfigureExternalLoginNodes/ConfigureExternalLoginNodes.py
 delete mode 100644 source/resources/lambdas/ConfigureRESSubmitters/ConfigureRESSubmitters.py
 rename source/resources/lambdas/{ConfigureRESUsersGroupsJson/ConfigureRESUsersGroupsJson.py => ConfigureUsersGroupsJson/ConfigureUsersGroupsJson.py} (70%)
 rename source/resources/lambdas/{DeconfigureRESSubmitters/DeconfigureRESSubmitters.py => DeconfigureExternalLoginNodes/DeconfigureExternalLoginNodes.py} (53%)
 rename source/resources/lambdas/{DeconfigureRESSubmitters => DeconfigureExternalLoginNodes}/cfnresponse.py (100%)
 rename source/resources/lambdas/{DeconfigureRESUsersGroupsJson/DeconfigureRESUsersGroupsJson.py => DeconfigureUsersGroupsJson/DeconfigureUsersGroupsJson.py} (68%)
 rename source/resources/lambdas/{DeconfigureRESUsersGroupsJson => DeconfigureUsersGroupsJson}/cfnresponse.py (100%)

diff --git a/create-slurm-security-groups/create_slurm_security_groups/create_slurm_security_groups_stack.py b/create-slurm-security-groups/create_slurm_security_groups/create_slurm_security_groups_stack.py
index 55de164d..993ccbd9 100644
--- a/create-slurm-security-groups/create_slurm_security_groups/create_slurm_security_groups_stack.py
+++ b/create-slurm-security-groups/create_slurm_security_groups/create_slurm_security_groups_stack.py
@@ -122,6 +122,9 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
         for dst_sg_name, dst_sg in lustre_security_groups.items():
             src_sg.connections.allow_to(dst_sg, ec2.Port.tcp(988), f"{src_sg_name} to {dst_sg_name} lustre")
             src_sg.connections.allow_to(dst_sg, ec2.Port.tcp_range(1018, 1023), f"{src_sg_name} to {dst_sg_name} lustre")
+            # It shouldn't be necessary to do allow_to and allow_from, but CDK left off the ingress rule from lustre to lustre if I didn't add the allow_from.
+            dst_sg.connections.allow_from(src_sg, ec2.Port.tcp(988), f"{src_sg_name} to {dst_sg_name} lustre")
+            dst_sg.connections.allow_from(src_sg, ec2.Port.tcp_range(1018, 1023), f"{src_sg_name} to {dst_sg_name} lustre")
 
         # Rules for FSx Ontap
         for fsx_client_sg_name, fsx_client_sg in fsx_client_security_groups.items():
@@ -138,12 +141,21 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
                 fsx_client_sg.connections.allow_to(fsx_ontap_sg, ec2.Port.udp(4046), f"{fsx_client_sg_name} to {fsx_ontap_sg_name} Network status monitor for NFS")
 
             for fsx_zfs_sg_name, fsx_zfs_sg in zfs_security_groups.items():
-                fsx_client_sg.connections.allow_to(slurm_fsx_zfs_sg, ec2.Port.tcp(111), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} rpc for NFS")
-                fsx_client_sg.connections.allow_to(slurm_fsx_zfs_sg, ec2.Port.udp(111), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} rpc for NFS")
-                fsx_client_sg.connections.allow_to(slurm_fsx_zfs_sg, ec2.Port.tcp(2049), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} NFS server daemon")
-                fsx_client_sg.connections.allow_to(slurm_fsx_zfs_sg, ec2.Port.udp(2049), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} NFS server daemon")
-                fsx_client_sg.connections.allow_to(slurm_fsx_zfs_sg, ec2.Port.tcp_range(20001, 20003), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} NFS mount, status monitor, and lock daemon")
-                fsx_client_sg.connections.allow_to(slurm_fsx_zfs_sg, ec2.Port.udp_range(20001, 20003), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} NFS mount, status monitor, and lock daemon")
+                fsx_client_sg.connections.allow_to(fsx_zfs_sg, ec2.Port.tcp(111), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} rpc for NFS")
+                fsx_client_sg.connections.allow_to(fsx_zfs_sg, ec2.Port.udp(111), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} rpc for NFS")
+                fsx_client_sg.connections.allow_to(fsx_zfs_sg, ec2.Port.tcp(2049), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} NFS server daemon")
+                fsx_client_sg.connections.allow_to(fsx_zfs_sg, ec2.Port.udp(2049), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} NFS server daemon")
+                fsx_client_sg.connections.allow_to(fsx_zfs_sg, ec2.Port.tcp_range(20001, 20003), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} NFS mount, status monitor, and lock daemon")
+                fsx_client_sg.connections.allow_to(fsx_zfs_sg, ec2.Port.udp_range(20001, 20003), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} NFS mount, status monitor, and lock daemon")
+                # There is a bug in PC 3.10.1 that requires outbound traffic to be enabled even though ZFS doesn't require it.
+                # Remove these rules when the bug in PC is fixed.
+                # Tracked by https://github.com/aws-samples/aws-eda-slurm-cluster/issues/253
+                fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.tcp(111), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} rpc for NFS")
+                fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.udp(111), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} rpc for NFS")
+                fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.tcp(2049), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS server daemon")
+                fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.udp(2049), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS server daemon")
+                fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.tcp_range(20001, 20003), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS mount, status monitor, and lock daemon")
+                fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.udp_range(20001, 20003), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS mount, status monitor, and lock daemon")
 
         for sg_name, sg in security_groups.items():
             CfnOutput(self, f"{sg_name}Id",
diff --git a/docs/config.md b/docs/config.md
index fa0d41b8..88a5afb2 100644
--- a/docs/config.md
+++ b/docs/config.md
@@ -14,6 +14,16 @@ This project creates a ParallelCluster configuration file that is documented in
 TimeZone: str
 AdditionalSecurityGroupsStackName: str
 RESStackName: str
+ExternalLoginNodes:
+  - Tags:
+    - Key: str
+      Values: [ str ]
+    SecurityGroupId: str
+DomainJoinedInstance:
+  Tags:
+    - Key: str
+      Values: [ str ]
+  SecurityGroupId: str
 slurm:
   ParallelClusterConfig:
     Version: str
@@ -212,6 +222,63 @@ This requires you to [configure security groups for external login nodes](../dep
 The Slurm binaries will be compiled for the OS of the desktops and and environment modulefile will be created so that the users just need to load the cluster modulefile to use the cluster.
 
+### ExternalLoginNodes
+
+An array of specifications for instances that should automatically be configured as Slurm login nodes.
+Each array element contains one or more tags that will be used to select login node instances.
+It also includes the security group id that must be attached to the login nodes to give them access to the slurm cluster.
+The tags for a group of instances are an array of tag keys, each with an array of values.
+
+A lambda function processes each login node specification.
+It uses the tags to select running instances.
+If the instances do not have the security group attached, then it will attach the security group.
+It will then run a script on each instance to configure it as a login node for the slurm cluster.
+To use the cluster, users simply load the environment modulefile that is created by the script.
+
+For example, to configure RES virtual desktops as Slurm login nodes, the following configuration is added.
+
+```
+---
+ExternalLoginNodes:
+- Tags:
+  - Key: 'res:EnvironmentName'
+    Values: [ 'res-eda' ]
+  - Key: 'res:NodeType'
+    Values: ['virtual-desktop-dcv-host']
+  SecurityGroupId:
+```
+
+### DomainJoinedInstance
+
+A specification for a domain joined instance that will be used to create and update users_groups.json.
+It also includes the security group id that must be attached to the instance to give it access to the slurm head node so that it can mount the slurm configuration file system.
+The tags for the instance are an array of tag keys, each with an array of values.
+
+A lambda function processes the specification.
+It uses the tags to select a running instance.
+If the instance does not have the security group attached, then it will attach the security group.
+It will then run a script on the instance to configure it to save all of the users and groups into a json file that
+is used to create local users and groups on compute nodes when they boot.
+
+For example, to configure the RES cluster manager, the following configuration is added.
+
+```
+---
+DomainJoinedInstance:
+  Tags:
+  - Key: 'Name'
+    Values: [ 'res-eda-cluster-manager' ]
+  - Key: 'res:EnvironmentName'
+    Values: [ 'res-eda' ]
+  - Key: 'res:ModuleName'
+    Values: [ 'cluster-manager' ]
+  - Key: 'res:ModuleId'
+    Values: [ 'cluster-manager' ]
+  - Key: 'res:NodeType'
+    Values: ['app']
+  SecurityGroupId:
+```
+
 ## slurm
 
 Slurm configuration parameters.
diff --git a/docs/deployment-prerequisites.md b/docs/deployment-prerequisites.md
index 03e5eaa3..70c6b28e 100644
--- a/docs/deployment-prerequisites.md
+++ b/docs/deployment-prerequisites.md
@@ -441,3 +441,58 @@ slurm:
       ansys:
         Count: 1
 ```
+
+### Configure File Systems
+
+The slurm/storage/ExtraMounts parameter allows you to configure additional file systems to mount on compute nodes.
+Note that the security groups for the file systems must allow connections from the compute nodes.
+
+#### Lustre
+
+The following example shows how to add an FSx for Lustre file system.
+The mount information can be found in the FSx console.
+
+```
+  storage:
+    ExtraMounts:
+    - dest: /lustre
+      src: .fsx..amazonaws.com@tcp:/
+      StorageType: FsxLustre
+      FileSystemId:
+      type: lustre
+      options: relatime,flock
+```
+
+#### ONTAP
+
+The following example shows how to add an FSx for NetApp ONTAP file system.
+The mount information can be found in the FSx console.
+
+```
+  storage:
+    ExtraMounts:
+    - dest: /ontap
+      src: ..fsx..amazonaws.com:/vol1
+      StorageType: FsxOntap
+      FileSystemId:
+      VolumeId:
+      type: nfs
+      options: default
+```
+
+#### ZFS
+
+The following example shows how to add an FSx for OpenZFS file system.
+The mount information can be found in the FSx console.
+ +``` + storage: + ExtraMounts + - dest: /zfs + src: .fsx..amazonaws.com:/fsx + StorageType: FsxOpenZfs + FileSystemId: + VolumeId: + type: nfs + options: noatime,nfsvers=3,sync,nconnect=16,rsize=1048576,wsize=1048576 +``` diff --git a/source/cdk/cdk_slurm_stack.py b/source/cdk/cdk_slurm_stack.py index 847bfec9..01058d09 100644 --- a/source/cdk/cdk_slurm_stack.py +++ b/source/cdk/cdk_slurm_stack.py @@ -610,8 +610,6 @@ def update_config_for_res(self): self.config['SubnetId'] = subnet_ids[0] logger.info(f" SubnetId: {self.config['SubnetId']}") - # self.config['slurm']['SubmitterInstanceTags'] = {'res:EnvironmentName': [self.res_environment_name]} - # Get RES VDI Security Group res_vdc_stack_name = f"{self.res_environment_name}-vdc" if res_vdc_stack_name not in stack_statuses: @@ -678,6 +676,52 @@ def update_config_for_res(self): logger.error(f"RES VDC controller security group not found.") exit(1) + self.config['ExternalLoginNodes'] = self.config.get('ExternalLoginNodes', []) + self.config['ExternalLoginNodes'].append( + { + 'Tags': [ + { + 'Key': 'res:EnvironmentName', + 'Values': [self.res_environment_name] + }, + { + 'Key': 'res:NodeType', + 'Values': ['virtual-desktop-dcv-host'] + } + ], + 'SecurityGroupId': self.slurm_login_node_sg_id + } + ) + + if 'DomainJoinedInstance' in self.config: + logger.error(f"Can't specify both DomainJoinedInstance and RESStackName in config file.") + exit(1) + self.config['DomainJoinedInstance'] = { + 'Tags': [ + { + 'Key': 'Name', + 'Values': [f"{self.res_environment_name}-cluster-manager",] + }, + { + 'Key': 'res:EnvironmentName', + 'Values': [self.res_environment_name] + }, + { + 'Key': 'res:ModuleName', + 'Values': ['cluster-manager'] + }, + { + 'Key': 'res:ModuleId', + 'Values': ['cluster-manager'] + }, + { + 'Key': 'res:NodeType', + 'Values': ['app'] + } + ], + 'SecurityGroupId': self.slurm_login_node_sg_id + } + # Configure the /home mount from RES if /home not already configured home_mount_found = False for extra_mount in self.config['slurm'].get('storage', {}).get('ExtraMounts', []): @@ -803,21 +847,24 @@ def create_parallel_cluster_assets(self): ) # W47:SNS Topic should specify KmsMasterKeyId property self.suppress_cfn_nag(self.create_head_node_a_record_sns_topic, 'W47', 'Use default KMS key.') - if 'RESStackName' in self.config: + + if 'DomainJoinedInstance' in self.config: # SNS topic that gets notified when cluster is created and triggers a lambda to configure the cluster manager - self.configure_res_users_groups_json_sns_topic = sns.Topic( - self, "ConfigureRESUsersGroupsJsonSnsTopic", - topic_name = f"{self.config['slurm']['ClusterName']}ConfigureRESUsersGroupsJson" + self.configure_users_groups_json_sns_topic = sns.Topic( + self, "ConfigureUsersGroupsJsonSnsTopic", + topic_name = f"{self.config['slurm']['ClusterName']}ConfigureUsersGroupsJson" ) # W47:SNS Topic should specify KmsMasterKeyId property - self.suppress_cfn_nag(self.configure_res_users_groups_json_sns_topic, 'W47', 'Use default KMS key.') - # SNS topic that gets notified when cluster is created and triggers a lambda to configure the cluster manager - self.configure_res_submitters_sns_topic = sns.Topic( - self, "ConfigureRESSubmittersSnsTopic", - topic_name = f"{self.config['slurm']['ClusterName']}ConfigureRESSubmitters" + self.suppress_cfn_nag(self.configure_users_groups_json_sns_topic, 'W47', 'Use default KMS key.') + + if 'ExternalLoginNodes' in self.config: + # SNS topic that gets notified when cluster is created and triggers a lambda to configure external login nodes + 
self.configure_external_login_nodes_sns_topic = sns.Topic( + self, "ConfigureExternalLoginNodesSnsTopic", + topic_name = f"{self.config['slurm']['ClusterName']}ConfigureExternalLoginNodes" ) # W47:SNS Topic should specify KmsMasterKeyId property - self.suppress_cfn_nag(self.configure_res_submitters_sns_topic, 'W47', 'Use default KMS key.') + self.suppress_cfn_nag(self.configure_external_login_nodes_sns_topic, 'W47', 'Use default KMS key.') # Create an SSM parameter to store the JWT tokens for root and slurmrestd self.jwt_token_for_root_ssm_parameter_name = f"/{self.config['slurm']['ClusterName']}/slurmrestd/jwt/root" @@ -852,22 +899,23 @@ def create_parallel_cluster_assets(self): ) os.remove(playbooks_zipfile_filename) - if 'RESStackName' in self.config: - self.configure_res_users_groups_json_sns_topic_arn_parameter_name = f"/{self.config['slurm']['ClusterName']}/ConfigureRESUsersGroupsJsonSnsTopicArn" - self.configure_res_users_groups_json_sns_topic_arn_parameter = ssm.StringParameter( - self, f"ConfigureRESUsersGroupsJsonSnsTopicArnParameter", - parameter_name = self.configure_res_users_groups_json_sns_topic_arn_parameter_name, - string_value = self.configure_res_users_groups_json_sns_topic.topic_arn + if 'DomainJoinedInstance' in self.config: + self.configure_users_groups_json_sns_topic_arn_parameter_name = f"/{self.config['slurm']['ClusterName']}/ConfigureUsersGroupsJsonSnsTopicArn" + self.configure_users_groups_json_sns_topic_arn_parameter = ssm.StringParameter( + self, f"ConfigureUsersGroupsJsonSnsTopicArnParameter", + parameter_name = self.configure_users_groups_json_sns_topic_arn_parameter_name, + string_value = self.configure_users_groups_json_sns_topic.topic_arn ) - self.configure_res_users_groups_json_sns_topic_arn_parameter.grant_read(self.parallel_cluster_asset_read_policy) - - self.configure_res_submitters_sns_topic_arn_parameter_name = f"/{self.config['slurm']['ClusterName']}/ConfigureRESSubmittersSnsTopicArn" - self.configure_res_submitters_sns_topic_arn_parameter = ssm.StringParameter( - self, f"ConfigureRESSubmittersSnsTopicArnParameter", - parameter_name = self.configure_res_submitters_sns_topic_arn_parameter_name, - string_value = self.configure_res_submitters_sns_topic.topic_arn + self.configure_users_groups_json_sns_topic_arn_parameter.grant_read(self.parallel_cluster_asset_read_policy) + + if 'ExternalLoginNodes' in self.config: + self.configure_external_login_nodes_sns_topic_arn_parameter_name = f"/{self.config['slurm']['ClusterName']}/ConfigureExternalLoginNodesSnsTopicArn" + self.configure_external_login_nodes_sns_topic_arn_parameter = ssm.StringParameter( + self, f"ConfigureExternalLoginNodesSnsTopicArnParameter", + parameter_name = self.configure_external_login_nodes_sns_topic_arn_parameter_name, + string_value = self.configure_external_login_nodes_sns_topic.topic_arn ) - self.configure_res_submitters_sns_topic_arn_parameter.grant_read(self.parallel_cluster_asset_read_policy) + self.configure_external_login_nodes_sns_topic_arn_parameter.grant_read(self.parallel_cluster_asset_read_policy) self.create_head_node_a_record_sns_topic_arn_parameter_name = f"/{self.config['slurm']['ClusterName']}/CreateHeadNodeARecordSnsTopicArn" self.create_head_node_a_record_sns_topic_arn_parameter = ssm.StringParameter( @@ -881,8 +929,8 @@ def create_parallel_cluster_assets(self): 'assets_bucket': self.assets_bucket, 'assets_base_key': self.assets_base_key, 'ClusterName': self.config['slurm']['ClusterName'], - 'ConfigureRESUsersGroupsJsonSnsTopicArnParameter': '', - 
'ConfigureRESSubmittersSnsTopicArnParameter': '', + 'ConfigureUsersGroupsJsonSnsTopicArnParameter': '', + 'ConfigureExternalLoginNodesSnsTopicArnParameter': '', 'CreateHeadNodeARecordSnsTopicArnParameter': self.create_head_node_a_record_sns_topic_arn_parameter_name, 'ErrorSnsTopicArn': self.config.get('ErrorSnsTopicArn', ''), 'playbooks_s3_url': self.playbooks_s3_url, @@ -898,9 +946,10 @@ def create_parallel_cluster_assets(self): template_vars['HomeMountSrc'] = self.mount_home_src else: template_vars['HomeMountSrc'] = '' - if 'RESStackName' in self.config: - template_vars['ConfigureRESUsersGroupsJsonSnsTopicArnParameter'] = self.configure_res_users_groups_json_sns_topic_arn_parameter_name - template_vars['ConfigureRESSubmittersSnsTopicArnParameter'] = self.configure_res_submitters_sns_topic_arn_parameter_name + if 'DomainJoinedInstance' in self.config: + template_vars['ConfigureUsersGroupsJsonSnsTopicArnParameter'] = self.configure_users_groups_json_sns_topic_arn_parameter_name + if 'ExternalLoginNodes' in self.config: + template_vars['ConfigureExternalLoginNodesSnsTopicArnParameter'] = self.configure_external_login_nodes_sns_topic_arn_parameter_name # Additions or deletions to the list should be reflected in config_scripts in on_head_node_start.sh. files_to_upload = [ @@ -1539,6 +1588,18 @@ def create_parallel_cluster_lambdas(self): ] ) ) + self.create_parallel_cluster_lambda.add_to_role_policy( + statement=iam.PolicyStatement( + effect=iam.Effect.ALLOW, + actions=[ + "iam:AttachRolePolicy", + "iam:DetachRolePolicy", + ], + resources=[ + f"arn:{Aws.PARTITION}:iam::{Aws.ACCOUNT_ID}:role/*" + ] + ) + ) if self.munge_key_secret_arn: self.create_parallel_cluster_lambda.add_to_role_policy( statement=iam.PolicyStatement( @@ -1639,36 +1700,28 @@ def create_parallel_cluster_lambdas(self): ) ) - if 'RESStackName' in self.config: - configureRESUsersGroupsJsonLambdaAsset = s3_assets.Asset(self, "ConfigureRESUsersGroupsJsonAsset", path="resources/lambdas/ConfigureRESUsersGroupsJson") - self.configure_res_users_groups_json_lambda = aws_lambda.Function( - self, "ConfigRESUsersGroupsJsonLambda", - function_name=f"{self.stack_name}-ConfigRESUsersGroupsJson", - description="Configure RES users and groups json file", + if 'DomainJoinedInstance' in self.config: + configureUsersGroupsJsonLambdaAsset = s3_assets.Asset(self, "ConfigureUsersGroupsJsonAsset", path="resources/lambdas/ConfigureUsersGroupsJson") + self.configure_users_groups_json_lambda = aws_lambda.Function( + self, "ConfigUsersGroupsJsonLambda", + function_name=f"{self.stack_name}-ConfigUsersGroupsJson", + description="Configure users and groups json file", memory_size=2048, runtime=aws_lambda.Runtime.PYTHON_3_9, architecture=aws_lambda.Architecture.X86_64, timeout=Duration.minutes(15), log_retention=logs.RetentionDays.INFINITE, - handler="ConfigureRESUsersGroupsJson.lambda_handler", - code=aws_lambda.Code.from_bucket(configureRESUsersGroupsJsonLambdaAsset.bucket, configureRESUsersGroupsJsonLambdaAsset.s3_object_key), + handler="ConfigureUsersGroupsJson.lambda_handler", + code=aws_lambda.Code.from_bucket(configureUsersGroupsJsonLambdaAsset.bucket, configureUsersGroupsJsonLambdaAsset.s3_object_key), environment = { 'ClusterName': self.config['slurm']['ClusterName'], 'ErrorSnsTopicArn': self.config.get('ErrorSnsTopicArn', ''), 'Region': self.cluster_region, - 'RESStackName': self.config['RESStackName'], - 'RESEnvironmentName': self.res_environment_name, - # 'RESDomainJoinedInstanceName': f"{self.res_environment_name}-vdc-controller", - # 
'RESDomainJoinedInstanceModuleName': 'virtual-desktop-controller', - # 'RESDomainJoinedInstanceModuleId': 'vdc', - 'RESDomainJoinedInstanceName': f"{self.res_environment_name}-cluster-manager", - 'RESDomainJoinedInstanceModuleName': 'cluster-manager', - 'RESDomainJoinedInstanceModuleId': 'cluster-manager', - 'RESDomainJoinedInstanceNodeType': 'app', - 'SlurmLoginNodeSGId': self.slurm_login_node_sg_id + 'DomainJoinedInstanceTagsJson': json.dumps(self.config['DomainJoinedInstance']['Tags']), + 'SlurmLoginNodeSGId': self.config['DomainJoinedInstance'].get('SecurityGroupId', 'None') } ) - self.configure_res_users_groups_json_lambda.add_to_role_policy( + self.configure_users_groups_json_lambda.add_to_role_policy( statement=iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=[ @@ -1681,7 +1734,7 @@ def create_parallel_cluster_lambdas(self): ) ) if 'ErrorSnsTopicArn' in self.config: - self.configure_res_users_groups_json_lambda.add_to_role_policy( + self.configure_users_groups_json_lambda.add_to_role_policy( statement=iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=[ @@ -1690,33 +1743,32 @@ def create_parallel_cluster_lambdas(self): resources=[self.config['ErrorSnsTopicArn']] ) ) - self.configure_res_users_groups_json_lambda.add_event_source( - lambda_event_sources.SnsEventSource(self.configure_res_users_groups_json_sns_topic) + self.configure_users_groups_json_lambda.add_event_source( + lambda_event_sources.SnsEventSource(self.configure_users_groups_json_sns_topic) ) - self.configure_res_users_groups_json_sns_topic.grant_publish(self.parallel_cluster_sns_publish_policy) - - configureRESSubmittersLambdaAsset = s3_assets.Asset(self, "ConfigureRESSubmittersAsset", path="resources/lambdas/ConfigureRESSubmitters") - self.configure_res_submitters_lambda = aws_lambda.Function( - self, "ConfigRESSubmittersLambda", - function_name=f"{self.stack_name}-ConfigRESSubmitters", - description="Configure RES submitters", + self.configure_users_groups_json_sns_topic.grant_publish(self.parallel_cluster_sns_publish_policy) + + if 'ExternalLoginNodes' in self.config: + configureExternalLoginNodesLambdaAsset = s3_assets.Asset(self, "ConfigureExternalLoginNodesAsset", path="resources/lambdas/ConfigureExternalLoginNodes") + self.configure_external_login_nodes_lambda = aws_lambda.Function( + self, "ConfigExternalLoginNodesLambda", + function_name=f"{self.stack_name}-ConfigExternalLoginNodes", + description="Configure external login nodes", memory_size=2048, runtime=aws_lambda.Runtime.PYTHON_3_9, architecture=aws_lambda.Architecture.X86_64, timeout=Duration.minutes(15), log_retention=logs.RetentionDays.INFINITE, - handler="ConfigureRESSubmitters.lambda_handler", - code=aws_lambda.Code.from_bucket(configureRESSubmittersLambdaAsset.bucket, configureRESSubmittersLambdaAsset.s3_object_key), + handler="ConfigureExternalLoginNodes.lambda_handler", + code=aws_lambda.Code.from_bucket(configureExternalLoginNodesLambdaAsset.bucket, configureExternalLoginNodesLambdaAsset.s3_object_key), environment = { 'Region': self.cluster_region, 'ClusterName': self.config['slurm']['ClusterName'], 'ErrorSnsTopicArn': self.config.get('ErrorSnsTopicArn', ''), - 'RESStackName': self.config['RESStackName'], - 'RESEnvironmentName': self.res_environment_name, - 'SlurmLoginNodeSGId': self.slurm_login_node_sg_id + 'ExternalLoginNodesConfigJson': json.dumps(self.config['ExternalLoginNodes']) } ) - self.configure_res_submitters_lambda.add_to_role_policy( + self.configure_external_login_nodes_lambda.add_to_role_policy( 
statement=iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=[ @@ -1729,7 +1781,7 @@ def create_parallel_cluster_lambdas(self): ) ) if 'ErrorSnsTopicArn' in self.config: - self.configure_res_submitters_lambda.add_to_role_policy( + self.configure_external_login_nodes_lambda.add_to_role_policy( statement=iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=[ @@ -1738,38 +1790,32 @@ def create_parallel_cluster_lambdas(self): resources=[self.config['ErrorSnsTopicArn']] ) ) - self.configure_res_submitters_lambda.add_event_source( - lambda_event_sources.SnsEventSource(self.configure_res_submitters_sns_topic) + self.configure_external_login_nodes_lambda.add_event_source( + lambda_event_sources.SnsEventSource(self.configure_external_login_nodes_sns_topic) ) - self.configure_res_submitters_sns_topic.grant_publish(self.parallel_cluster_sns_publish_policy) + self.configure_external_login_nodes_sns_topic.grant_publish(self.parallel_cluster_sns_publish_policy) - self.deconfigureRESUsersGroupsJsonLambdaAsset = s3_assets.Asset(self, "DeconfigureRESUsersGroupsJsonAsset", path="resources/lambdas/DeconfigureRESUsersGroupsJson") - self.deconfigure_res_users_groups_json_lambda = aws_lambda.Function( - self, "DeconfigRESUsersGroupsJsonLambda", - function_name=f"{self.stack_name}-DeconfigRESUsersGroupsJson", + if 'DomainJoinedInstance' in self.config: + self.deconfigureUsersGroupsJsonLambdaAsset = s3_assets.Asset(self, "DeconfigureUsersGroupsJsonAsset", path="resources/lambdas/DeconfigureUsersGroupsJson") + self.deconfigure_users_groups_json_lambda = aws_lambda.Function( + self, "DeconfigUsersGroupsJsonLambda", + function_name=f"{self.stack_name}-DeconfigUsersGroupsJson", description="Deconfigure RES users and groups json file", memory_size=2048, runtime=aws_lambda.Runtime.PYTHON_3_9, architecture=aws_lambda.Architecture.X86_64, timeout=Duration.minutes(15), log_retention=logs.RetentionDays.INFINITE, - handler="DeconfigureRESUsersGroupsJson.lambda_handler", - code=aws_lambda.Code.from_bucket(self.deconfigureRESUsersGroupsJsonLambdaAsset.bucket, self.deconfigureRESUsersGroupsJsonLambdaAsset.s3_object_key), + handler="DeconfigureUsersGroupsJson.lambda_handler", + code=aws_lambda.Code.from_bucket(self.deconfigureUsersGroupsJsonLambdaAsset.bucket, self.deconfigureUsersGroupsJsonLambdaAsset.s3_object_key), environment = { 'ClusterName': self.config['slurm']['ClusterName'], 'ErrorSnsTopicArn': self.config.get('ErrorSnsTopicArn', ''), 'Region': self.cluster_region, - 'RESEnvironmentName': self.res_environment_name, - # 'RESDomainJoinedInstanceName': f"{self.res_environment_name}-vdc-controller", - # 'RESDomainJoinedInstanceModuleName': 'virtual-desktop-controller', - # 'RESDomainJoinedInstanceModuleId': 'vdc', - 'RESDomainJoinedInstanceName': f"{self.res_environment_name}-cluster-manager", - 'RESDomainJoinedInstanceModuleName': 'cluster-manager', - 'RESDomainJoinedInstanceModuleId': 'cluster-manager', - 'RESDomainJoinedInstanceNodeType': 'app' + 'DomainJoinedInstanceTagsJson': json.dumps(self.config['DomainJoinedInstance']['Tags']) } ) - self.deconfigure_res_users_groups_json_lambda.add_to_role_policy( + self.deconfigure_users_groups_json_lambda.add_to_role_policy( statement=iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=[ @@ -1781,7 +1827,7 @@ def create_parallel_cluster_lambdas(self): ) ) if 'ErrorSnsTopicArn' in self.config: - self.deconfigure_res_users_groups_json_lambda.add_to_role_policy( + self.deconfigure_users_groups_json_lambda.add_to_role_policy( statement=iam.PolicyStatement( 
effect=iam.Effect.ALLOW, actions=[ @@ -1791,26 +1837,27 @@ def create_parallel_cluster_lambdas(self): ) ) - deconfigureRESSubmittersLambdaAsset = s3_assets.Asset(self, "DeconfigureRESSubmittersAsset", path="resources/lambdas/DeconfigureRESSubmitters") - self.deconfigure_res_submitters_lambda = aws_lambda.Function( - self, "DeconfigRESSubmittersLambda", - function_name=f"{self.stack_name}-DeconfigRESSubmitters", - description="Deconfigure RES submitters", + if 'ExternalLoginNodes' in self.config: + deconfigureExternalLoginNodesLambdaAsset = s3_assets.Asset(self, "DeconfigureExternalLoginNodesAsset", path="resources/lambdas/DeconfigureExternalLoginNodes") + self.deconfigure_external_login_nodes_lambda = aws_lambda.Function( + self, "DeconfigExternalLoginNodesLambda", + function_name=f"{self.stack_name}-DeconfigExternalLoginNodes", + description="Deconfigure external login nodes", memory_size=2048, runtime=aws_lambda.Runtime.PYTHON_3_9, architecture=aws_lambda.Architecture.X86_64, timeout=Duration.minutes(15), log_retention=logs.RetentionDays.INFINITE, - handler="DeconfigureRESSubmitters.lambda_handler", - code=aws_lambda.Code.from_bucket(deconfigureRESSubmittersLambdaAsset.bucket, deconfigureRESSubmittersLambdaAsset.s3_object_key), + handler="DeconfigureExternalLoginNodes.lambda_handler", + code=aws_lambda.Code.from_bucket(deconfigureExternalLoginNodesLambdaAsset.bucket, deconfigureExternalLoginNodesLambdaAsset.s3_object_key), environment = { 'ClusterName': self.config['slurm']['ClusterName'], 'ErrorSnsTopicArn': self.config.get('ErrorSnsTopicArn', ''), 'Region': self.cluster_region, - 'RESEnvironmentName': self.res_environment_name + 'ExternalLoginNodesConfigJson': json.dumps(self.config['ExternalLoginNodes']) } ) - self.deconfigure_res_submitters_lambda.add_to_role_policy( + self.deconfigure_external_login_nodes_lambda.add_to_role_policy( statement=iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=[ @@ -1822,7 +1869,7 @@ def create_parallel_cluster_lambdas(self): ) ) if 'ErrorSnsTopicArn' in self.config: - self.deconfigure_res_submitters_lambda.add_to_role_policy( + self.deconfigure_external_login_nodes_lambda.add_to_role_policy( statement=iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=[ @@ -2707,10 +2754,11 @@ def create_parallel_cluster_config(self): # The lambda to create an A record for the head node must be built before the parallel cluster. self.parallel_cluster.node.add_dependency(self.create_head_node_a_record_lambda) self.parallel_cluster.node.add_dependency(self.update_head_node_lambda) - # The lambdas to configure instances must exist befor the cluster so they can be called. - if 'RESStackName' in self.config: - self.parallel_cluster.node.add_dependency(self.configure_res_users_groups_json_lambda) - self.parallel_cluster.node.add_dependency(self.configure_res_submitters_lambda) + # The lambdas to configure instances must exist before the cluster so they can be called. 
+ if 'DomainJoinedInstance' in self.config: + self.parallel_cluster.node.add_dependency(self.configure_users_groups_json_lambda) + if 'ExternalLoginNodes' in self.config: + self.parallel_cluster.node.add_dependency(self.configure_external_login_nodes_lambda) # Build config files need to be created before cluster so that they can be downloaded as part of on_head_node_configures self.parallel_cluster.node.add_dependency(self.build_config_files) self.parallel_cluster.node.add_dependency(self.parallel_cluster_config) @@ -2728,20 +2776,21 @@ def create_parallel_cluster_config(self): ) self.update_head_node.node.add_dependency(self.parallel_cluster) - if 'RESStackName' in self.config: + if 'DomainJoinedInstance' in self.config: # Custom resource to deconfigure cluster manager before deleting cluster self.deconfigure_res_users_groups_json = CustomResource( - self, "DeconfigureRESUsersGroupsJson", - service_token = self.deconfigure_res_users_groups_json_lambda.function_arn, + self, "DeconfigureUsersGroupsJson", + service_token = self.deconfigure_users_groups_json_lambda.function_arn, properties = { } ) self.deconfigure_res_users_groups_json.node.add_dependency(self.parallel_cluster) + if 'ExternalLoginNodes' in self.config: # Custom resource to deconfigure submitters before deleting cluster self.deconfigure_res_submitters = CustomResource( - self, "DeconfigureRESSubmitters", - service_token = self.deconfigure_res_submitters_lambda.function_arn, + self, "DeconfigureExternalLoginNodes", + service_token = self.deconfigure_external_login_nodes_lambda.function_arn, properties = { } ) diff --git a/source/cdk/config_schema.py b/source/cdk/config_schema.py index 17209bf3..9b8a4360 100644 --- a/source/cdk/config_schema.py +++ b/source/cdk/config_schema.py @@ -569,6 +569,30 @@ def get_config_schema(config): Optional('TimeZone', default='US/Central'): str, Optional('AdditionalSecurityGroupsStackName'): str, Optional('RESStackName'): str, + # ExternalLoginNodes: + # Configure external login nodes + # Tags of instances that can be configured to submit to the cluster. + # When the cluster is deleted, the tag is used to unmount the slurm filesystem from the instances using SSM. + Optional('ExternalLoginNodes'): [ + { + 'Tags': [ + { + 'Key': str, + 'Values': [str] + } + ], + Optional('SecurityGroupId'): str + } + ], + Optional('DomainJoinedInstance'): { + 'Tags': [ + { + 'Key': str, + 'Values': [str] + } + ], + Optional('SecurityGroupId'): str + }, 'slurm': { Optional('ParallelClusterConfig'): { Optional('Enable', default=True): And(bool, lambda s: s == True), @@ -681,10 +705,6 @@ def get_config_schema(config): Optional('Secured', default=True): bool } }, - # SubmitterInstanceTags: - # Tags of instances that can be configured to submit to the cluster. - # When the cluster is deleted, the tag is used to unmount the slurm filesystem from the instances using SSM. - Optional('SubmitterInstanceTags'): {str: [str]}, # # InstanceConfig: # Configure the instances used by the cluster diff --git a/source/resources/lambdas/ConfigureExternalLoginNodes/ConfigureExternalLoginNodes.py b/source/resources/lambdas/ConfigureExternalLoginNodes/ConfigureExternalLoginNodes.py new file mode 100644 index 00000000..8cc7db0b --- /dev/null +++ b/source/resources/lambdas/ConfigureExternalLoginNodes/ConfigureExternalLoginNodes.py @@ -0,0 +1,143 @@ +""" +Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+SPDX-License-Identifier: MIT-0 + +Permission is hereby granted, free of charge, to any person obtaining a copy of this +software and associated documentation files (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, copy, modify, +merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +""" + +''' +Call /opt/slurm/{{ClusterName}}/config/bin/on_head_node_updated.sh using ssm run command. +''' +import boto3 +from textwrap import dedent +import json +import logging +from os import environ as environ + +logger=logging.getLogger(__file__) +logger_formatter = logging.Formatter('%(levelname)s: %(message)s') +logger_streamHandler = logging.StreamHandler() +logger_streamHandler.setFormatter(logger_formatter) +logger.addHandler(logger_streamHandler) +logger.setLevel(logging.INFO) +logger.propagate = False + +def lambda_handler(event, context): + try: + logger.info(f"event:\n{json.dumps(event, indent=4)}") + + cluster_name = environ['ClusterName'] + cluster_region = environ['Region'] + external_login_nodes_config = json.loads(environ['ExternalLoginNodesConfigJson']) + + logger.info(f"Configure external login nodes for {cluster_name} in {cluster_region}") + + ec2_client = boto3.client('ec2', region_name=cluster_region) + + login_node_instance_ids = [] + for external_login_node_config in external_login_nodes_config: + slurm_login_node_sg_id = external_login_node_config.get('SecurityGroupId', None) + + tags_message = '' + describe_instances_kwargs = { + 'Filters': [ + {'Name': 'instance-state-name', 'Values': ['running']} + ] + } + for tag_dict in external_login_node_config['Tags']: + tag = tag_dict['Key'] + values = tag_dict['Values'] + tags_message += f"\n{tag}: {values}" + describe_instances_kwargs['Filters'].append( + {'Name': f"tag:{tag}", 'Values': values} + ) + logger.info(f"Configure instances with the following tags as login nodes:{tags_message}") + + describe_instances_paginator = ec2_client.get_paginator('describe_instances') + describe_instances_iterator = describe_instances_paginator.paginate(**describe_instances_kwargs) + reservation_index = 0 + for response in describe_instances_iterator: + for reservation_info in response['Reservations']: + logger.info(f"reservation[{reservation_index}]:") + reservation_index += 1 + instance_index = 0 + for instance_info in reservation_info['Instances']: + logger.info(f" instance[{instance_index}]:") + instance_index += 1 + logger.info(f" instance_id: {instance_info['InstanceId']}") + if instance_info['State']['Name'] != 'running': + logger.info(f" Skipping because state = {instance_info['State']['Name']}") + continue + instance_id = instance_info['InstanceId'] + login_node_instance_ids.append(instance_id) + security_group_ids = [] + for security_group_dict in instance_info['SecurityGroups']: + security_group_ids.append(security_group_dict['GroupId']) + if slurm_login_node_sg_id: + if slurm_login_node_sg_id not in security_group_ids: + # Attach the 
security group + logger.info(f"Attaching {slurm_login_node_sg_id} to {instance_id}.") + security_group_ids.append(slurm_login_node_sg_id) + ec2_client.modify_instance_attribute(InstanceId=instance_id, Groups=security_group_ids) + else: + logger.info(f"{slurm_login_node_sg_id} already attached to {instance_id}") + + if login_node_instance_ids: + logger.info(f"Found {len(login_node_instance_ids)} login nodes. instance_ids:" + "\n".join(login_node_instance_ids)) + else: + logger.info("No running login nodes.") + return + + ssm_client = boto3.client('ssm', region_name=cluster_region) + + ssm_script = dedent(f""" + set -ex + + if ! [ -e /opt/slurm/{cluster_name} ]; then + sudo mkdir -p /opt/slurm/{cluster_name} + fi + if ! mountpoint /opt/slurm/{cluster_name} ; then + sudo mount head_node.{cluster_name}.pcluster:/opt/slurm /opt/slurm/{cluster_name} || true + fi + + script="/opt/slurm/{cluster_name}/config/bin/submitter_configure.sh" + if ! [ -e $script ]; then + echo "$script doesn't exist" + exit 1 + fi + + sudo $script + """) + + TIMEOUT_MINUTES = 90 + TIMEOUT_SECONDS = TIMEOUT_MINUTES * 60 + send_command_response = ssm_client.send_command( + DocumentName = 'AWS-RunShellScript', + InstanceIds = login_node_instance_ids, + Parameters = {'commands': [ssm_script]}, + Comment = f"Configure external login nodes for {cluster_name}", + TimeoutSeconds = TIMEOUT_SECONDS + ) + logger.info(f"Sent SSM command {send_command_response['Command']['CommandId']}") + + except Exception as e: + logger.exception(str(e)) + sns_client = boto3.client('sns') + sns_client.publish( + TopicArn = environ['ErrorSnsTopicArn'], + Subject = f"{cluster_name} ConfigureRESSubmitters failed", + Message = str(e) + ) + logger.info(f"Published error to {environ['ErrorSnsTopicArn']}") + raise diff --git a/source/resources/lambdas/ConfigureRESSubmitters/ConfigureRESSubmitters.py b/source/resources/lambdas/ConfigureRESSubmitters/ConfigureRESSubmitters.py deleted file mode 100644 index 5beabd4b..00000000 --- a/source/resources/lambdas/ConfigureRESSubmitters/ConfigureRESSubmitters.py +++ /dev/null @@ -1,127 +0,0 @@ -""" -Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -SPDX-License-Identifier: MIT-0 - -Permission is hereby granted, free of charge, to any person obtaining a copy of this -software and associated documentation files (the "Software"), to deal in the Software -without restriction, including without limitation the rights to use, copy, modify, -merge, publish, distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A -PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -""" - -''' -Call /opt/slurm/{{ClusterName}}/config/bin/on_head_node_updated.sh using ssm run command. 
-''' -import boto3 -import json -import logging -from os import environ as environ - -logger=logging.getLogger(__file__) -logger_formatter = logging.Formatter('%(levelname)s: %(message)s') -logger_streamHandler = logging.StreamHandler() -logger_streamHandler.setFormatter(logger_formatter) -logger.addHandler(logger_streamHandler) -logger.setLevel(logging.INFO) -logger.propagate = False - -def lambda_handler(event, context): - try: - logger.info(f"event:\n{json.dumps(event, indent=4)}") - - cluster_name = environ['ClusterName'] - cluster_region = environ['Region'] - environment_name = environ['RESEnvironmentName'] - slurm_login_node_sg_id = environ['SlurmLoginNodeSGId'] - logger.info(f"Configure RES({environment_name}) submitters for {cluster_name} in {cluster_region}") - - ec2_client = boto3.client('ec2', region_name=cluster_region) - describe_instances_paginator = ec2_client.get_paginator('describe_instances') - describe_instances_iterator = describe_instances_paginator.paginate( - Filters = [ - {'Name': 'tag:res:EnvironmentName', 'Values': [environment_name]}, - {'Name': 'tag:res:NodeType', 'Values': ['virtual-desktop-dcv-host']}, - {'Name': 'instance-state-name', 'Values': ['running']} - ] - ) - submitter_instance_ids = [] - reservation_index = 0 - for response in describe_instances_iterator: - for reservation_info in response['Reservations']: - logger.info(f"reservation[{reservation_index}]:") - reservation_index += 1 - instance_index = 0 - for instance_info in reservation_info['Instances']: - logger.info(f" instance[{instance_index}]:") - instance_index += 1 - logger.info(f" instance_id: {instance_info['InstanceId']}") - if instance_info['State']['Name'] != 'running': - logger.info(f" Skipping because state = {instance_info['State']['Name']}") - continue - for tags in instance_info['Tags']: - logger.info(f" {tags}") - instance_id = instance_info['InstanceId'] - submitter_instance_ids.append(instance_id) - security_group_ids = [] - for security_group_dict in instance_info['SecurityGroups']: - security_group_ids.append(security_group_dict['GroupId']) - if slurm_login_node_sg_id not in security_group_ids: - # Attach the security group - logger.info(f"Attaching {slurm_login_node_sg_id} to {instance_id}.") - security_group_ids.append(slurm_login_node_sg_id) - ec2_client.modify_instance_attribute(InstanceId=instance_id, Groups=security_group_ids) - else: - logger.info(f"{slurm_login_node_sg_id} already attached to {instance_id}") - - logger.info(f"submitter_instance_ids: {submitter_instance_ids}") - if not submitter_instance_ids: - logger.info("No running submitters.") - return - - ssm_client = boto3.client('ssm', region_name=cluster_region) - commands = f""" -set -ex - -if ! [ -e /opt/slurm/{cluster_name} ]; then - sudo mkdir -p /opt/slurm/{cluster_name} -fi -if ! mountpoint /opt/slurm/{cluster_name} ; then - sudo mount head_node.{cluster_name}.pcluster:/opt/slurm /opt/slurm/{cluster_name} || true -fi - -script="/opt/slurm/{cluster_name}/config/bin/submitter_configure.sh" -if ! 
[ -e $script ]; then - echo "$script doesn't exist" - exit 1 -fi - -sudo $script - """ - TIMEOUT_MINUTES = 90 - TIMEOUT_SECONDS = TIMEOUT_MINUTES * 60 - send_command_response = ssm_client.send_command( - DocumentName = 'AWS-RunShellScript', - InstanceIds = submitter_instance_ids, - Parameters = {'commands': [commands]}, - Comment = f"Configure {environment_name} submitters for {cluster_name}", - TimeoutSeconds = TIMEOUT_SECONDS - ) - logger.info(f"Sent SSM command {send_command_response['Command']['CommandId']}") - - except Exception as e: - logger.exception(str(e)) - sns_client = boto3.client('sns') - sns_client.publish( - TopicArn = environ['ErrorSnsTopicArn'], - Subject = f"{cluster_name} ConfigureRESSubmitters failed", - Message = str(e) - ) - logger.info(f"Published error to {environ['ErrorSnsTopicArn']}") - raise diff --git a/source/resources/lambdas/ConfigureRESUsersGroupsJson/ConfigureRESUsersGroupsJson.py b/source/resources/lambdas/ConfigureUsersGroupsJson/ConfigureUsersGroupsJson.py similarity index 70% rename from source/resources/lambdas/ConfigureRESUsersGroupsJson/ConfigureRESUsersGroupsJson.py rename to source/resources/lambdas/ConfigureUsersGroupsJson/ConfigureUsersGroupsJson.py index f08de760..86e9971d 100644 --- a/source/resources/lambdas/ConfigureRESUsersGroupsJson/ConfigureRESUsersGroupsJson.py +++ b/source/resources/lambdas/ConfigureUsersGroupsJson/ConfigureUsersGroupsJson.py @@ -38,28 +38,30 @@ def lambda_handler(event, context): cluster_name = environ['ClusterName'] cluster_region = environ['Region'] - res_environment_name = environ['RESEnvironmentName'] - res_domain_joined_instance_name = environ['RESDomainJoinedInstanceName'] - res_domain_joined_instance_module_name = environ['RESDomainJoinedInstanceModuleName'] - res_domain_joined_instance_module_id = environ['RESDomainJoinedInstanceModuleId'] - res_domain_joined_instance_node_type = environ['RESDomainJoinedInstanceNodeType'] + domain_joined_instance_tags = json.loads(environ['DomainJoinedInstanceTagsJson']) slurm_login_node_sg_id = environ['SlurmLoginNodeSGId'] - logger.info(f"Configure update of /opt/slurm/{cluster_name}/config/users_groups.json from RES {res_environment_name} domain joined instance with following tags:\nName={res_domain_joined_instance_name}\nres:ModuleName={res_domain_joined_instance_module_name}\nres:ModuleId={res_domain_joined_instance_module_name}\nres:NodeType={res_domain_joined_instance_node_type}\nstate=running") + if slurm_login_node_sg_id == 'None': + slurm_login_node_sg_id = None - domain_joined_instance_id = None - domain_joined_instance_security_group_ids = [] - ec2_client = boto3.client('ec2', region_name=cluster_region) - describe_instances_paginator = ec2_client.get_paginator('describe_instances') + tags_message = '' describe_instances_kwargs = { 'Filters': [ - {'Name': 'tag:res:EnvironmentName', 'Values': [res_environment_name]}, - {'Name': 'tag:Name', 'Values': [res_domain_joined_instance_name]}, - {'Name': 'tag:res:ModuleName', 'Values': [res_domain_joined_instance_module_name]}, - {'Name': 'tag:res:ModuleId', 'Values': [res_domain_joined_instance_module_id]}, - {'Name': 'tag:res:NodeType', 'Values': [res_domain_joined_instance_node_type]}, {'Name': 'instance-state-name', 'Values': ['running']} ] } + for tag_dict in domain_joined_instance_tags: + tag = tag_dict['Key'] + values = tag_dict['Values'] + tags_message += f"\n{tag}: {values}" + describe_instances_kwargs['Filters'].append( + {'Name': f"tag:{tag}", 'Values': values} + ) + logger.info(f"Configure update of 
/opt/slurm/{cluster_name}/config/users_groups.json from domain joined instance with following tags:{tags_message}") + + domain_joined_instance_id = None + domain_joined_instance_security_group_ids = [] + ec2_client = boto3.client('ec2', region_name=cluster_region) + describe_instances_paginator = ec2_client.get_paginator('describe_instances') for describe_instances_response in describe_instances_paginator.paginate(**describe_instances_kwargs): for reservation_dict in describe_instances_response['Reservations']: domain_joined_instance_info = reservation_dict['Instances'][0] @@ -68,16 +70,17 @@ def lambda_handler(event, context): for security_group_dict in domain_joined_instance_info['SecurityGroups']: domain_joined_instance_security_group_ids.append(security_group_dict['GroupId']) if not domain_joined_instance_id: - raise RuntimeError(f"No running instances found with tags res:EnvironmentName={res_environment_name}, Name={res_domain_joined_instance_name}, res:ModuleName={res_domain_joined_instance_module_name}, res:ModuleId={res_domain_joined_instance_module_id}, res:NodeType={res_domain_joined_instance_node_type}") - - # Make sure that the RES login nodes have the required security group attached. - if slurm_login_node_sg_id not in domain_joined_instance_security_group_ids: - # Attach the security group - logger.info(f"Attaching {slurm_login_node_sg_id} to {domain_joined_instance_id}.") - domain_joined_instance_security_group_ids.append(slurm_login_node_sg_id) - ec2_client.modify_instance_attribute(InstanceId=domain_joined_instance_id, Groups=domain_joined_instance_security_group_ids) - else: - logger.info(f"{slurm_login_node_sg_id} already attached to {domain_joined_instance_id}") + raise RuntimeError(f"No running instances found with tags:{tags_message}") + + # Make sure that the instance has the required security group attached. + if slurm_login_node_sg_id: + if slurm_login_node_sg_id not in domain_joined_instance_security_group_ids: + # Attach the security group + logger.info(f"Attaching {slurm_login_node_sg_id} to {domain_joined_instance_id}.") + domain_joined_instance_security_group_ids.append(slurm_login_node_sg_id) + ec2_client.modify_instance_attribute(InstanceId=domain_joined_instance_id, Groups=domain_joined_instance_security_group_ids) + else: + logger.info(f"{slurm_login_node_sg_id} already attached to {domain_joined_instance_id}") ssm_client = boto3.client('ssm', region_name=cluster_region) commands = f""" @@ -102,7 +105,7 @@ def lambda_handler(event, context): DocumentName = 'AWS-RunShellScript', InstanceIds = [domain_joined_instance_id], Parameters = {'commands': [commands]}, - Comment = f"Configure {res_environment_name} users and groups for {cluster_name}", + Comment = f"Configure users and groups for {cluster_name}", TimeoutSeconds = 5 * 60 # 5 minutes ) command_id = send_command_response['Command']['CommandId'] diff --git a/source/resources/lambdas/CreateParallelCluster/CreateParallelCluster.py b/source/resources/lambdas/CreateParallelCluster/CreateParallelCluster.py index c8b78086..0e825667 100644 --- a/source/resources/lambdas/CreateParallelCluster/CreateParallelCluster.py +++ b/source/resources/lambdas/CreateParallelCluster/CreateParallelCluster.py @@ -67,6 +67,10 @@ def get_cluster_status(cluster_name, cluster_region): def lambda_handler(event, context): try: logger.info(f"event:\n{json.dumps(event, indent=4)}") + + # Create sns client so can send notifications on any errors. 
+ sns_client = boto3.client('sns') + cluster_name = None requestType = event['RequestType'] properties = event['ResourceProperties'] @@ -91,9 +95,6 @@ def lambda_handler(event, context): cluster_region = environ['Region'] logger.info(f"{requestType} request for {cluster_name} in {cluster_region}") - # Create sns client so can send notifications on any errors. - sns_client = boto3.client('sns') - cluster_status = get_cluster_status(cluster_name, cluster_region) if cluster_status: valid_statuses = ['CREATE_COMPLETE', 'UPDATE_COMPLETE', 'UPDATE_ROLLBACK_COMPLETE'] diff --git a/source/resources/lambdas/DeconfigureRESSubmitters/DeconfigureRESSubmitters.py b/source/resources/lambdas/DeconfigureExternalLoginNodes/DeconfigureExternalLoginNodes.py similarity index 53% rename from source/resources/lambdas/DeconfigureRESSubmitters/DeconfigureRESSubmitters.py rename to source/resources/lambdas/DeconfigureExternalLoginNodes/DeconfigureExternalLoginNodes.py index 141c7452..ceea354e 100644 --- a/source/resources/lambdas/DeconfigureRESSubmitters/DeconfigureRESSubmitters.py +++ b/source/resources/lambdas/DeconfigureExternalLoginNodes/DeconfigureExternalLoginNodes.py @@ -24,6 +24,7 @@ import json import logging from os import environ as environ +from textwrap import dedent import time logger=logging.getLogger(__file__) @@ -58,7 +59,8 @@ def lambda_handler(event, context): cluster_name = environ['ClusterName'] cluster_region = environ['Region'] - environment_name = environ['RESEnvironmentName'] + external_login_nodes_config = json.loads(environ['ExternalLoginNodesConfigJson']) + logger.info(f"{requestType} request for {cluster_name} in {cluster_region}") if requestType != 'Delete': @@ -66,101 +68,114 @@ def lambda_handler(event, context): cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physicalResourceId=cluster_name) return - logger.info(f"Deconfigure RES({environment_name}) submitters for {cluster_name} in {cluster_region}") + logger.info(f"Deconfigure external login nodes for {cluster_name} in {cluster_region}") ec2_client = boto3.client('ec2', region_name=cluster_region) - describe_instances_paginator = ec2_client.get_paginator('describe_instances') - describe_instances_iterator = describe_instances_paginator.paginate( - Filters = [ - {'Name': 'tag:res:EnvironmentName', 'Values': [environment_name]}, - {'Name': 'tag:res:NodeType', 'Values': ['virtual-desktop-dcv-host']}, - {'Name': 'instance-state-name', 'Values': ['running']} - ] - ) - submitter_instance_ids = [] - reservation_index = 0 - for response in describe_instances_iterator: - for reservation_info in response['Reservations']: - logger.info(f"reservation[{reservation_index}]:") - reservation_index += 1 - instance_index = 0 - for instance_info in reservation_info['Instances']: - logger.info(f" instance[{instance_index}]:") - instance_index += 1 - logger.info(f" instance_id: {instance_info['InstanceId']}") - if instance_info['State']['Name'] != 'running': - logger.info(f" Skipping because state = {instance_info['State']['Name']}") - continue - for tags in instance_info['Tags']: - logger.info(f" {tags}") - submitter_instance_ids.append(instance_info['InstanceId']) - logger.info(f"submitter_instance_ids: {submitter_instance_ids}") - if not submitter_instance_ids: - logger.info("No running submitters.") + + login_node_instance_ids = [] + for external_login_node_config in external_login_nodes_config: + tags_message = '' + describe_instances_kwargs = { + 'Filters': [ + {'Name': 'instance-state-name', 'Values': ['running']} + ] + } + for 
tag_dict in external_login_node_config['Tags']: + tag = tag_dict['Key'] + values = tag_dict['Values'] + tags_message += f"\n{tag}: {values}" + describe_instances_kwargs['Filters'].append( + {'Name': f"tag:{tag}", 'Values': values} + ) + logger.info(f"Deconfigure instances with the following tags as login nodes:{tags_message}") + + describe_instances_paginator = ec2_client.get_paginator('describe_instances') + describe_instances_iterator = describe_instances_paginator.paginate(**describe_instances_kwargs) + reservation_index = 0 + for response in describe_instances_iterator: + for reservation_info in response['Reservations']: + logger.info(f"reservation[{reservation_index}]:") + reservation_index += 1 + instance_index = 0 + for instance_info in reservation_info['Instances']: + logger.info(f" instance[{instance_index}]:") + instance_index += 1 + logger.info(f" instance_id: {instance_info['InstanceId']}") + if instance_info['State']['Name'] != 'running': + logger.info(f" Skipping because state = {instance_info['State']['Name']}") + continue + login_node_instance_ids.append(instance_info['InstanceId']) + + if login_node_instance_ids: + logger.info(f"Found {len(login_node_instance_ids)} login nodes. instance_ids:"+"\n" + '\n'.join(login_node_instance_ids)) + else: + logger.info("No running login nodes.") cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physicalResourceId=cluster_name) return ssm_client = boto3.client('ssm', region_name=cluster_region) - commands = f""" -set -ex - -mount_dest=/opt/slurm/{cluster_name} - -# Make sure that the cluster is still mounted and mount is accessible. -# If the cluster has already been deleted then the mount will be hung and we have to do manual cleanup. -if mount | grep " $mount_dest "; then - echo "$mount_dest is mounted." - if ! timeout 1s ls $mount_dest; then - echo "Mount point ($mount_dest) is hung. Source may have already been deleted." - timeout 5s sudo umount -lf $mount_dest - timeout 1s rm -rf $mount_dest - fi -fi - -script="$mount_dest/config/bin/submitter_deconfigure.sh" -if ! timeout 1s ls $script; then - echo "$script doesn't exist" -else - sudo $script -fi - -# Do manual cleanup just in case something above failed. - -sudo rm -f /etc/profile.d/slurm_{cluster_name}_modulefiles.sh - -sudo grep -v ' $mount_dest ' /etc/fstab > /etc/fstab.new -if diff -q /etc/fstab /etc/fstab.new; then - sudo rm -f /etc/fstab.new -else - sudo cp /etc/fstab /etc/fstab.$(date '+%Y-%m-%d@%H:%M:%S~') - sudo mv -f /etc/fstab.new /etc/fstab -fi - -if timeout 1s mountpoint $mount_dest; then - echo "$mount_dest is a mountpoint" - sudo umount -lf $mount_dest -fi - -if timeout 1s ls $mount_dest; then - sudo rmdir $mount_dest -fi - """ + + ssm_script = dedent(f""" + set -ex + + mount_dest=/opt/slurm/{cluster_name} + + # Make sure that the cluster is still mounted and mount is accessible. + # If the cluster has already been deleted then the mount will be hung and we have to do manual cleanup. + if mount | grep " $mount_dest "; then + echo "$mount_dest is mounted." + if ! timeout 1s ls $mount_dest; then + echo "Mount point ($mount_dest) is hung. Source may have already been deleted." + timeout 5s sudo umount -lf $mount_dest + timeout 1s rm -rf $mount_dest + fi + fi + + script="$mount_dest/config/bin/submitter_deconfigure.sh" + if ! timeout 1s ls $script; then + echo "$script doesn't exist" + else + sudo $script + fi + + # Do manual cleanup just in case something above failed. 
+ + sudo rm -f /etc/profile.d/slurm_{cluster_name}_modulefiles.sh + + sudo grep -v ' $mount_dest ' /etc/fstab > /etc/fstab.new + if diff -q /etc/fstab /etc/fstab.new; then + sudo rm -f /etc/fstab.new + else + sudo cp /etc/fstab /etc/fstab.$(date '+%Y-%m-%d@%H:%M:%S~') + sudo mv -f /etc/fstab.new /etc/fstab + fi + + if timeout 1s mountpoint $mount_dest; then + echo "$mount_dest is a mountpoint" + sudo umount -lf $mount_dest + fi + + if timeout 1s ls $mount_dest; then + sudo rmdir $mount_dest + fi + """) + response = ssm_client.send_command( DocumentName = 'AWS-RunShellScript', - InstanceIds = submitter_instance_ids, - Parameters = {'commands': [commands]}, - Comment = f"Deconfigure {environment_name} submitters for {cluster_name}" + InstanceIds = login_node_instance_ids, + Parameters = {'commands': [ssm_script]}, + Comment = f"Deconfigure external login nodes for {cluster_name}" ) command_id = response['Command']['CommandId'] logger.info(f"Sent SSM command {command_id}") # Wait for the command invocations to be made time.sleep(5) - # Wait for the command to complete before returning so that the cluster resources aren't removed before the command completes. + # Wait for the commands to complete before returning so that the cluster resources aren't removed before the command completes. num_errors = 0 MAX_WAIT_TIME = 13 * 60 wait_time = 0 - for instance_id in submitter_instance_ids: + for instance_id in login_node_instance_ids: command_complete = False while not command_complete: response = ssm_client.get_command_invocation( diff --git a/source/resources/lambdas/DeconfigureRESSubmitters/cfnresponse.py b/source/resources/lambdas/DeconfigureExternalLoginNodes/cfnresponse.py similarity index 100% rename from source/resources/lambdas/DeconfigureRESSubmitters/cfnresponse.py rename to source/resources/lambdas/DeconfigureExternalLoginNodes/cfnresponse.py diff --git a/source/resources/lambdas/DeconfigureRESUsersGroupsJson/DeconfigureRESUsersGroupsJson.py b/source/resources/lambdas/DeconfigureUsersGroupsJson/DeconfigureUsersGroupsJson.py similarity index 68% rename from source/resources/lambdas/DeconfigureRESUsersGroupsJson/DeconfigureRESUsersGroupsJson.py rename to source/resources/lambdas/DeconfigureUsersGroupsJson/DeconfigureUsersGroupsJson.py index 21adfc54..375ab9a1 100644 --- a/source/resources/lambdas/DeconfigureRESUsersGroupsJson/DeconfigureRESUsersGroupsJson.py +++ b/source/resources/lambdas/DeconfigureUsersGroupsJson/DeconfigureUsersGroupsJson.py @@ -24,6 +24,7 @@ import json import logging from os import environ as environ +from textwrap import dedent import time logger=logging.getLogger(__file__) @@ -57,12 +58,10 @@ def lambda_handler(event, context): raise KeyError(error_message) cluster_name = environ['ClusterName'] + error_sns_topic_arn = environ['ErrorSnsTopicArn'] cluster_region = environ['Region'] - environment_name = environ['RESEnvironmentName'] - res_domain_joined_instance_name = environ['RESDomainJoinedInstanceName'] - res_domain_joined_instance_module_name = environ['RESDomainJoinedInstanceModuleName'] - res_domain_joined_instance_module_id = environ['RESDomainJoinedInstanceModuleId'] - res_domain_joined_instance_node_type = environ['RESDomainJoinedInstanceNodeType'] + domain_joined_instance_tags = json.loads(environ['DomainJoinedInstanceTagsJson']) + logger.info(f"{requestType} request for {cluster_name} in {cluster_region}") if requestType != 'Delete': @@ -70,81 +69,83 @@ def lambda_handler(event, context): cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, 
physicalResourceId=cluster_name) return - logger.info(f"Deconfigure update of /opt/slurm/{cluster_name}/config/users_groups.json from RES {environment_name} domain joined instance with following tags:\nName={res_domain_joined_instance_name}\nres:ModuleName={res_domain_joined_instance_module_name}\nres:ModuleId={res_domain_joined_instance_module_name}\nres:NodeType={res_domain_joined_instance_node_type}\nstate=running") - - domain_joined_instance_id = None - ec2_client = boto3.client('ec2', region_name=cluster_region) - describe_instances_paginator = ec2_client.get_paginator('describe_instances') + tags_message = '' describe_instances_kwargs = { 'Filters': [ - {'Name': 'tag:res:EnvironmentName', 'Values': [environment_name]}, - {'Name': 'tag:Name', 'Values': [res_domain_joined_instance_name]}, - {'Name': 'tag:res:ModuleName', 'Values': [res_domain_joined_instance_module_name]}, - {'Name': 'tag:res:ModuleId', 'Values': [res_domain_joined_instance_module_id]}, - {'Name': 'tag:res:NodeType', 'Values': [res_domain_joined_instance_node_type]}, {'Name': 'instance-state-name', 'Values': ['running']} ] } + for tag_dict in domain_joined_instance_tags: + tag = tag_dict['Key'] + values = tag_dict['Values'] + tags_message += f"\n{tag}: {values}" + describe_instances_kwargs['Filters'].append( + {'Name': f"tag:{tag}", 'Values': values} + ) + logger.info(f"Deconfigure update of /opt/slurm/{cluster_name}/config/users_groups.json from domain joined instance with following tags:{tags_message}") + + domain_joined_instance_id = None + ec2_client = boto3.client('ec2', region_name=cluster_region) + describe_instances_paginator = ec2_client.get_paginator('describe_instances') for describe_instances_response in describe_instances_paginator.paginate(**describe_instances_kwargs): for reservation_dict in describe_instances_response['Reservations']: domain_joined_instance_info = reservation_dict['Instances'][0] domain_joined_instance_id = domain_joined_instance_info['InstanceId'] logger.info(f"Domain joined instance id: {domain_joined_instance_id}") if not domain_joined_instance_id: - raise RuntimeError(f"No running instances found with tags res:EnvironmentName={environment_name} and res:ModuleId={res_domain_joined_instance_module_name}") + raise RuntimeError(f"No running instances found with tags:{tags_message}") ssm_client = boto3.client('ssm', region_name=cluster_region) - commands = f""" -set -ex - -mount_dest=/opt/slurm/{cluster_name} - -# Make sure that the cluster is still mounted and mount is accessible. -# If the cluster has already been deleted then the mount will be hung and we have to do manual cleanup. -# Another failure mechanism is if the cluster didn't deploy in which case the mount may not even exist. -if mount | grep " $mount_dest "; then - echo "$mount_dest is mounted." - if ! timeout 1s ls $mount_dest; then - echo "Mount point ($mount_dest) is hung. Source may have already been deleted." - timeout 5s sudo umount -lf $mount_dest - timeout 1s rm -rf $mount_dest - fi -fi - -script="$mount_dest/config/bin/create_users_groups_json_deconfigure.sh" -if ! timeout 1s ls $script; then - echo "$script doesn't exist or isn't accessible." -else - sudo $script -fi - -# Do manual cleanup just in case something above failed. 
- -sudo grep -v " $mount_dest " /etc/fstab > /etc/fstab.new -if diff -q /etc/fstab /etc/fstab.new; then - sudo rm -f /etc/fstab.new -else - sudo cp /etc/fstab /etc/fstab.$(date '+%Y-%m-%d@%H:%M:%S~') - sudo mv -f /etc/fstab.new /etc/fstab -fi - -if timeout 1s mountpoint $mount_dest; then - echo "$mount_dest is a mountpoint" - sudo umount -lf $mount_dest -fi - -if timeout 1s ls $mount_dest; then - sudo rmdir $mount_dest -fi - -true - """ + commands = dedent(f"""set -ex + + mount_dest=/opt/slurm/{cluster_name} + + # Make sure that the cluster is still mounted and mount is accessible. + # If the cluster has already been deleted then the mount will be hung and we have to do manual cleanup. + # Another failure mechanism is if the cluster didn't deploy in which case the mount may not even exist. + if mount | grep " $mount_dest "; then + echo "$mount_dest is mounted." + if ! timeout 1s ls $mount_dest; then + echo "Mount point ($mount_dest) is hung. Source may have already been deleted." + timeout 5s sudo umount -lf $mount_dest + timeout 1s rm -rf $mount_dest + fi + fi + + script="$mount_dest/config/bin/create_users_groups_json_deconfigure.sh" + if ! timeout 1s ls $script; then + echo "$script doesn't exist or isn't accessible." + else + sudo $script + fi + + # Do manual cleanup just in case something above failed. + + sudo grep -v " $mount_dest " /etc/fstab > /etc/fstab.new + if diff -q /etc/fstab /etc/fstab.new; then + sudo rm -f /etc/fstab.new + else + sudo cp /etc/fstab /etc/fstab.$(date '+%Y-%m-%d@%H:%M:%S~') + sudo mv -f /etc/fstab.new /etc/fstab + fi + + if timeout 1s mountpoint $mount_dest; then + echo "$mount_dest is a mountpoint" + sudo umount -lf $mount_dest + fi + + if timeout 1s ls $mount_dest; then + sudo rmdir $mount_dest + fi + + true + """) logger.info(f"Submitting SSM command") send_command_response = ssm_client.send_command( DocumentName = 'AWS-RunShellScript', InstanceIds = [domain_joined_instance_id], Parameters = {'commands': [commands]}, - Comment = f"Deconfigure {environment_name} users and groups for {cluster_name}" + Comment = f"Deconfigure users and groups for {cluster_name}" ) command_id = send_command_response['Command']['CommandId'] logger.info(f"Sent SSM command {command_id}") @@ -185,7 +186,7 @@ def lambda_handler(event, context): cfnresponse.send(event, context, cfnresponse.FAILED, {'error': str(e)}, physicalResourceId=cluster_name) sns_client = boto3.client('sns') sns_client.publish( - TopicArn = environ['ErrorSnsTopicArn'], + TopicArn = error_sns_topic_arn, Subject = f"{cluster_name} DeconfigureRESUsersGroupsJson failed", Message = str(e) ) diff --git a/source/resources/lambdas/DeconfigureRESUsersGroupsJson/cfnresponse.py b/source/resources/lambdas/DeconfigureUsersGroupsJson/cfnresponse.py similarity index 100% rename from source/resources/lambdas/DeconfigureRESUsersGroupsJson/cfnresponse.py rename to source/resources/lambdas/DeconfigureUsersGroupsJson/cfnresponse.py diff --git a/source/resources/parallel-cluster/config/bin/on_head_node_configured.sh b/source/resources/parallel-cluster/config/bin/on_head_node_configured.sh index 21b4184c..4a49b61d 100755 --- a/source/resources/parallel-cluster/config/bin/on_head_node_configured.sh +++ b/source/resources/parallel-cluster/config/bin/on_head_node_configured.sh @@ -84,16 +84,16 @@ ansible-playbook $PLAYBOOKS_PATH/ParallelClusterHeadNode.yml \ popd # Notify SNS topic that trigger configuration of cluster manager and submitters 
-ConfigureRESUsersGroupsJsonSnsTopicArnParameter={{ConfigureRESUsersGroupsJsonSnsTopicArnParameter}} -if ! [[ -z "$ConfigureRESUsersGroupsJsonSnsTopicArnParameter" ]]; then - ConfigureRESClusterManagerSnsTopicArn=$(aws ssm get-parameter --name $ConfigureRESUsersGroupsJsonSnsTopicArnParameter --query 'Parameter.Value' --output text) - aws sns publish --topic-arn $ConfigureRESClusterManagerSnsTopicArn --message 'Configure {{ClusterName}} RES ClusterManager' +ConfigureUsersGroupsJsonSnsTopicArnParameter={{ConfigureUsersGroupsJsonSnsTopicArnParameter}} +if ! [[ -z "$ConfigureUsersGroupsJsonSnsTopicArnParameter" ]]; then + ConfigureUsersGroupsJsonSnsTopicArn=$(aws ssm get-parameter --name $ConfigureUsersGroupsJsonSnsTopicArnParameter --query 'Parameter.Value' --output text) + aws sns publish --topic-arn $ConfigureUsersGroupsJsonSnsTopicArn --message 'Configure {{ClusterName}} users_groups.json' fi -ConfigureRESSubmittersSnsTopicArnParameter={{ConfigureRESSubmittersSnsTopicArnParameter}} -if ! [[ -z "$ConfigureRESSubmittersSnsTopicArnParameter" ]]; then - ConfigureRESSubmittersSnsTopicArn=$(aws ssm get-parameter --name $ConfigureRESSubmittersSnsTopicArnParameter --query 'Parameter.Value' --output text) - aws sns publish --topic-arn $ConfigureRESSubmittersSnsTopicArn --message 'Configure {{ClusterName}} RES submitters' +ConfigureExternalLoginNodesSnsTopicArnParameter={{ConfigureExternalLoginNodesSnsTopicArnParameter}} +if ! [[ -z "$ConfigureExternalLoginNodesSnsTopicArnParameter" ]]; then + ConfigureExternalLoginNodesSnsTopicArn=$(aws ssm get-parameter --name $ConfigureExternalLoginNodesSnsTopicArnParameter --query 'Parameter.Value' --output text) + aws sns publish --topic-arn $ConfigureExternalLoginNodesSnsTopicArn --message 'Configure {{ClusterName}} login nodes' fi echo "$(date): Finished ${script_name}"
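
Note on the generalized tag matching (illustrative, not part of the patch): the renamed Configure/Deconfigure lambdas above all consume the same input shape — a JSON list of {Key, Values} tag specs (ExternalLoginNodesConfigJson / DomainJoinedInstanceTagsJson) that is translated into EC2 DescribeInstances filters, replacing the previously hard-coded res:* tag names. The following is a minimal standalone sketch of that tag-to-filter lookup, assuming boto3 credentials are configured; the tag values are hypothetical examples, with the res:* keys taken from the removed RES-specific code only for illustration.

    import boto3

    # Hypothetical tag spec in the shape the lambdas read from
    # ExternalLoginNodesConfigJson: a list of {"Key": ..., "Values": [...]} dicts.
    example_tag_spec = [
        {'Key': 'res:EnvironmentName', 'Values': ['res-demo']},  # example environment name
        {'Key': 'res:NodeType', 'Values': ['virtual-desktop-dcv-host']},
    ]

    def find_running_instance_ids(tag_spec, region):
        """Return the ids of running instances that match every tag in tag_spec."""
        # Always restrict to running instances, then AND in one filter per tag.
        filters = [{'Name': 'instance-state-name', 'Values': ['running']}]
        for tag_dict in tag_spec:
            filters.append({'Name': f"tag:{tag_dict['Key']}", 'Values': tag_dict['Values']})
        ec2_client = boto3.client('ec2', region_name=region)
        instance_ids = []
        paginator = ec2_client.get_paginator('describe_instances')
        for page in paginator.paginate(Filters=filters):
            for reservation in page['Reservations']:
                for instance in reservation['Instances']:
                    instance_ids.append(instance['InstanceId'])
        return instance_ids

Instances found this way are then targeted with ssm_client.send_command(DocumentName='AWS-RunShellScript', ...) to run the configure or deconfigure script, exactly as in the lambdas above.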