From 9968c716119737a9e78cf90f05bf322a3260cf9b Mon Sep 17 00:00:00 2001
From: Allan Carter
Date: Tue, 20 Aug 2024 03:05:15 +0000
Subject: [PATCH] Generalize RES configuration of login nodes and user/group json

Resolves #250

=====

Fix SlurmFsxLustre ingress rule.

CDK creates egress rule without matching ingress rule.

Resolves #252

=====

Fix FSxZ egress rules

Compensate for a bug in ParallelCluster that requires egress rules.
Leave the bug open so that we can remove the rules when the ParallelCluster bug is fixed.

Addresses #253

=====

Document FSx configuration

=====

Add IAM policy required to mount FSx

Add AttachRolePolicy, DetachRolePolicy for HeadNodePolicy.

Resolves #254

=====

Fix SNS notification bug when the CreateParallelCluster Lambda is missing a parameter
---
 .../create_slurm_security_groups_stack.py | 24 +-
 docs/deployment-prerequisites.md | 55 ++++
 source/cdk/cdk_slurm_stack.py | 285 ++++++++++--------
 source/cdk/config_schema.py | 28 +-
 .../ConfigureExternalLoginNodes.py | 143 +++++++++
 .../ConfigureRESSubmitters.py | 127 --------
 .../ConfigureUsersGroupsJson.py} | 55 ++--
 .../CreateParallelCluster.py | 7 +-
 .../DeconfigureExternalLoginNodes.py} | 173 ++++++-----
 .../cfnresponse.py | 0
 .../DeconfigureUsersGroupsJson.py} | 0
 .../cfnresponse.py | 0
 .../config/bin/on_head_node_configured.sh | 16 +-
 13 files changed, 542 insertions(+), 371 deletions(-)
 create mode 100644 source/resources/lambdas/ConfigureExternalLoginNodes/ConfigureExternalLoginNodes.py
 delete mode 100644 source/resources/lambdas/ConfigureRESSubmitters/ConfigureRESSubmitters.py
 rename source/resources/lambdas/{ConfigureRESUsersGroupsJson/ConfigureRESUsersGroupsJson.py => ConfigureUsersGroupsJson/ConfigureUsersGroupsJson.py} (70%)
 rename source/resources/lambdas/{DeconfigureRESSubmitters/DeconfigureRESSubmitters.py => DeconfigureExternalLoginNodes/DeconfigureExternalLoginNodes.py} (53%)
 rename source/resources/lambdas/{DeconfigureRESSubmitters => DeconfigureExternalLoginNodes}/cfnresponse.py (100%)
 rename source/resources/lambdas/{DeconfigureRESUsersGroupsJson/DeconfigureRESUsersGroupsJson.py => DeconfigureUsersGroupsJson/DeconfigureUsersGroupsJson.py} (100%)
 rename source/resources/lambdas/{DeconfigureRESUsersGroupsJson => DeconfigureUsersGroupsJson}/cfnresponse.py (100%)

diff --git a/create-slurm-security-groups/create_slurm_security_groups/create_slurm_security_groups_stack.py b/create-slurm-security-groups/create_slurm_security_groups/create_slurm_security_groups_stack.py
index 55de164d..993ccbd9 100644
--- a/create-slurm-security-groups/create_slurm_security_groups/create_slurm_security_groups_stack.py
+++ b/create-slurm-security-groups/create_slurm_security_groups/create_slurm_security_groups_stack.py
@@ -122,6 +122,9 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
             for dst_sg_name, dst_sg in lustre_security_groups.items():
                 src_sg.connections.allow_to(dst_sg, ec2.Port.tcp(988), f"{src_sg_name} to {dst_sg_name} lustre")
                 src_sg.connections.allow_to(dst_sg, ec2.Port.tcp_range(1018, 1023), f"{src_sg_name} to {dst_sg_name} lustre")
+                # It shouldn't be necessary to do allow_to and allow_from, but CDK left off the ingress rule from lustre to lustre if I didn't add the allow_from.
+                dst_sg.connections.allow_from(src_sg, ec2.Port.tcp(988), f"{src_sg_name} to {dst_sg_name} lustre")
+                dst_sg.connections.allow_from(src_sg, ec2.Port.tcp_range(1018, 1023), f"{src_sg_name} to {dst_sg_name} lustre")

         # Rules for FSx Ontap
         for fsx_client_sg_name, fsx_client_sg in fsx_client_security_groups.items():
@@ -138,12 +141,21 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
                 fsx_client_sg.connections.allow_to(fsx_ontap_sg, ec2.Port.udp(4046), f"{fsx_client_sg_name} to {fsx_ontap_sg_name} Network status monitor for NFS")

             for fsx_zfs_sg_name, fsx_zfs_sg in zfs_security_groups.items():
-                fsx_client_sg.connections.allow_to(slurm_fsx_zfs_sg, ec2.Port.tcp(111), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} rpc for NFS")
-                fsx_client_sg.connections.allow_to(slurm_fsx_zfs_sg, ec2.Port.udp(111), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} rpc for NFS")
-                fsx_client_sg.connections.allow_to(slurm_fsx_zfs_sg, ec2.Port.tcp(2049), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} NFS server daemon")
-                fsx_client_sg.connections.allow_to(slurm_fsx_zfs_sg, ec2.Port.udp(2049), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} NFS server daemon")
-                fsx_client_sg.connections.allow_to(slurm_fsx_zfs_sg, ec2.Port.tcp_range(20001, 20003), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} NFS mount, status monitor, and lock daemon")
-                fsx_client_sg.connections.allow_to(slurm_fsx_zfs_sg, ec2.Port.udp_range(20001, 20003), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} NFS mount, status monitor, and lock daemon")
+                fsx_client_sg.connections.allow_to(fsx_zfs_sg, ec2.Port.tcp(111), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} rpc for NFS")
+                fsx_client_sg.connections.allow_to(fsx_zfs_sg, ec2.Port.udp(111), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} rpc for NFS")
+                fsx_client_sg.connections.allow_to(fsx_zfs_sg, ec2.Port.tcp(2049), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} NFS server daemon")
+                fsx_client_sg.connections.allow_to(fsx_zfs_sg, ec2.Port.udp(2049), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} NFS server daemon")
+                fsx_client_sg.connections.allow_to(fsx_zfs_sg, ec2.Port.tcp_range(20001, 20003), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} NFS mount, status monitor, and lock daemon")
+                fsx_client_sg.connections.allow_to(fsx_zfs_sg, ec2.Port.udp_range(20001, 20003), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} NFS mount, status monitor, and lock daemon")
+                # There is a bug in ParallelCluster 3.10.1 that requires outbound traffic to be enabled even though ZFS doesn't require it.
+                # Remove these rules when the ParallelCluster bug is fixed.
+                # Tracked by https://github.com/aws-samples/aws-eda-slurm-cluster/issues/253
+                fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.tcp(111), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} rpc for NFS")
+                fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.udp(111), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} rpc for NFS")
+                fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.tcp(2049), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS server daemon")
+                fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.udp(2049), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS server daemon")
+                fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.tcp_range(20001, 20003), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS mount, status monitor, and lock daemon")
+                fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.udp_range(20001, 20003), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS mount, status monitor, and lock daemon")

         for sg_name, sg in security_groups.items():
             CfnOutput(self, f"{sg_name}Id",
diff --git a/docs/deployment-prerequisites.md b/docs/deployment-prerequisites.md
index 03e5eaa3..70c6b28e 100644
--- a/docs/deployment-prerequisites.md
+++ b/docs/deployment-prerequisites.md
@@ -441,3 +441,58 @@ slurm:
     ansys:
       Count: 1
 ```
+
+### Configure File Systems
+
+The slurm/storage/ExtraMounts parameter allows you to configure additional file systems to mount on compute nodes.
+Note that the security groups for the file systems must allow connections from the compute nodes.
+
+#### Lustre
+
+The following example shows how to add an FSx for Lustre file system.
+The mount information can be found in the FSx console.
+
+```
+  storage:
+    ExtraMounts:
+    - dest: /lustre
+      src: <file-system-id>.fsx.<region>.amazonaws.com@tcp:/<mount-name>
+      StorageType: FsxLustre
+      FileSystemId: <file-system-id>
+      type: lustre
+      options: relatime,flock
+```
+
+#### ONTAP
+
+The following example shows how to add an FSx for NetApp ONTAP file system.
+The mount information can be found in the FSx console.
+
+```
+  storage:
+    ExtraMounts:
+    - dest: /ontap
+      src: <svm-id>.<file-system-id>.fsx.<region>.amazonaws.com:/vol1
+      StorageType: FsxOntap
+      FileSystemId: <file-system-id>
+      VolumeId: <volume-id>
+      type: nfs
+      options: default
+```
+
+#### ZFS
+
+The following example shows how to add an FSx for OpenZFS file system.
+The mount information can be found in the FSx console.
+
+```
+  storage:
+    ExtraMounts:
+    - dest: /zfs
+      src: <file-system-id>.fsx.<region>.amazonaws.com:/fsx
+      StorageType: FsxOpenZfs
+      FileSystemId: <file-system-id>
+      VolumeId: <volume-id>
+      type: nfs
+      options: noatime,nfsvers=3,sync,nconnect=16,rsize=1048576,wsize=1048576
+```
diff --git a/source/cdk/cdk_slurm_stack.py b/source/cdk/cdk_slurm_stack.py
index 847bfec9..621617f6 100644
--- a/source/cdk/cdk_slurm_stack.py
+++ b/source/cdk/cdk_slurm_stack.py
@@ -610,8 +610,6 @@ def update_config_for_res(self):
         self.config['SubnetId'] = subnet_ids[0]
         logger.info(f"    SubnetId: {self.config['SubnetId']}")

-        # self.config['slurm']['SubmitterInstanceTags'] = {'res:EnvironmentName': [self.res_environment_name]}
-
         # Get RES VDI Security Group
         res_vdc_stack_name = f"{self.res_environment_name}-vdc"
         if res_vdc_stack_name not in stack_statuses:
@@ -678,6 +676,52 @@ def update_config_for_res(self):
             logger.error(f"RES VDC controller security group not found.")
             exit(1)

+        self.config['ExternalLoginNodes'] = self.config.get('ExternalLoginNodes', [])
+        self.config['ExternalLoginNodes'].append(
+            {
+                'Tags': [
+                    {
+                        'Key': 'res:EnvironmentName',
+                        'Values': [self.res_environment_name]
+                    },
+                    {
+                        'Key': 'res:NodeType',
+                        'Values': ['virtual-desktop-dcv-host']
+                    }
+                ],
+                'SecurityGroupId': self.slurm_login_node_sg_id
+            }
+        )
+
+        if 'DomainJoinedInstance' in self.config:
+            logger.error(f"Can't specify both DomainJoinedInstance and RESStackName in config file.")
+            exit(1)
+        self.config['DomainJoinedInstance'] = {
+            'Tags': [
+                {
+                    'Key': 'Name',
+                    'Values': [f"{self.res_environment_name}-cluster-manager",]
+                },
+                {
+                    'Key': 'res:EnvironmentName',
+                    'Values': [self.res_environment_name]
+                },
+                {
+                    'Key': 'res:ModuleName',
+                    'Values': ['cluster-manager']
+                },
+                {
+                    'Key': 'res:ModuleId',
+                    'Values': ['cluster-manager']
+                },
+                {
+                    'Key': 'res:NodeType',
+                    'Values': ['app']
+                }
+            ],
+            'SecurityGroupId': self.slurm_login_node_sg_id
+        }
+
         # Configure the /home mount from RES if /home not already configured
         home_mount_found = False
         for extra_mount in self.config['slurm'].get('storage', {}).get('ExtraMounts', []):
@@ -803,21 +847,24 @@ def create_parallel_cluster_assets(self):
         )
         # W47:SNS Topic should specify KmsMasterKeyId property
         self.suppress_cfn_nag(self.create_head_node_a_record_sns_topic, 'W47', 'Use default KMS key.')
-        if 'RESStackName' in self.config:
+
+        if 'DomainJoinedInstance' in self.config:
             # SNS topic that gets notified when cluster is created and triggers a lambda to configure the cluster manager
-            self.configure_res_users_groups_json_sns_topic = sns.Topic(
-                self, "ConfigureRESUsersGroupsJsonSnsTopic",
-                topic_name = f"{self.config['slurm']['ClusterName']}ConfigureRESUsersGroupsJson"
+            self.configure_users_groups_json_sns_topic = sns.Topic(
+                self, "ConfigureUsersGroupsJsonSnsTopic",
+                topic_name = f"{self.config['slurm']['ClusterName']}ConfigureUsersGroupsJson"
             )
             # W47:SNS Topic should specify KmsMasterKeyId property
-            self.suppress_cfn_nag(self.configure_res_users_groups_json_sns_topic, 'W47', 'Use default KMS key.')
-            # SNS topic that gets notified when cluster is created and triggers a lambda to configure the cluster manager
-            self.configure_res_submitters_sns_topic = sns.Topic(
-                self, "ConfigureRESSubmittersSnsTopic",
-                topic_name = f"{self.config['slurm']['ClusterName']}ConfigureRESSubmitters"
+            self.suppress_cfn_nag(self.configure_users_groups_json_sns_topic, 'W47', 'Use default KMS key.')
+
+        if 'ExternalLoginNodes' in self.config:
+            # SNS topic that gets notified when cluster is created and triggers a lambda to configure external login nodes
+
self.configure_external_login_nodes_sns_topic = sns.Topic( + self, "ConfigureExternalLoginNodesSnsTopic", + topic_name = f"{self.config['slurm']['ClusterName']}ConfigureExternalLoginNodes" ) # W47:SNS Topic should specify KmsMasterKeyId property - self.suppress_cfn_nag(self.configure_res_submitters_sns_topic, 'W47', 'Use default KMS key.') + self.suppress_cfn_nag(self.configure_external_login_nodes_sns_topic, 'W47', 'Use default KMS key.') # Create an SSM parameter to store the JWT tokens for root and slurmrestd self.jwt_token_for_root_ssm_parameter_name = f"/{self.config['slurm']['ClusterName']}/slurmrestd/jwt/root" @@ -852,22 +899,23 @@ def create_parallel_cluster_assets(self): ) os.remove(playbooks_zipfile_filename) - if 'RESStackName' in self.config: - self.configure_res_users_groups_json_sns_topic_arn_parameter_name = f"/{self.config['slurm']['ClusterName']}/ConfigureRESUsersGroupsJsonSnsTopicArn" - self.configure_res_users_groups_json_sns_topic_arn_parameter = ssm.StringParameter( - self, f"ConfigureRESUsersGroupsJsonSnsTopicArnParameter", - parameter_name = self.configure_res_users_groups_json_sns_topic_arn_parameter_name, - string_value = self.configure_res_users_groups_json_sns_topic.topic_arn + if 'DomainJoinedInstance' in self.config: + self.configure_users_groups_json_sns_topic_arn_parameter_name = f"/{self.config['slurm']['ClusterName']}/ConfigureUsersGroupsJsonSnsTopicArn" + self.configure_users_groups_json_sns_topic_arn_parameter = ssm.StringParameter( + self, f"ConfigureUsersGroupsJsonSnsTopicArnParameter", + parameter_name = self.configure_users_groups_json_sns_topic_arn_parameter_name, + string_value = self.configure_users_groups_json_sns_topic.topic_arn ) - self.configure_res_users_groups_json_sns_topic_arn_parameter.grant_read(self.parallel_cluster_asset_read_policy) - - self.configure_res_submitters_sns_topic_arn_parameter_name = f"/{self.config['slurm']['ClusterName']}/ConfigureRESSubmittersSnsTopicArn" - self.configure_res_submitters_sns_topic_arn_parameter = ssm.StringParameter( - self, f"ConfigureRESSubmittersSnsTopicArnParameter", - parameter_name = self.configure_res_submitters_sns_topic_arn_parameter_name, - string_value = self.configure_res_submitters_sns_topic.topic_arn + self.configure_users_groups_json_sns_topic_arn_parameter.grant_read(self.parallel_cluster_asset_read_policy) + + if 'ExternalLoginNodes' in self.config: + self.configure_external_login_nodes_sns_topic_arn_parameter_name = f"/{self.config['slurm']['ClusterName']}/ConfigureExternalLoginNodesSnsTopicArn" + self.configure_external_login_nodes_sns_topic_arn_parameter = ssm.StringParameter( + self, f"ConfigureExternalLoginNodesSnsTopicArnParameter", + parameter_name = self.configure_external_login_nodes_sns_topic_arn_parameter_name, + string_value = self.configure_external_login_nodes_sns_topic.topic_arn ) - self.configure_res_submitters_sns_topic_arn_parameter.grant_read(self.parallel_cluster_asset_read_policy) + self.configure_external_login_nodes_sns_topic_arn_parameter.grant_read(self.parallel_cluster_asset_read_policy) self.create_head_node_a_record_sns_topic_arn_parameter_name = f"/{self.config['slurm']['ClusterName']}/CreateHeadNodeARecordSnsTopicArn" self.create_head_node_a_record_sns_topic_arn_parameter = ssm.StringParameter( @@ -881,8 +929,8 @@ def create_parallel_cluster_assets(self): 'assets_bucket': self.assets_bucket, 'assets_base_key': self.assets_base_key, 'ClusterName': self.config['slurm']['ClusterName'], - 'ConfigureRESUsersGroupsJsonSnsTopicArnParameter': '', - 
'ConfigureRESSubmittersSnsTopicArnParameter': '', + 'ConfigureUsersGroupsJsonSnsTopicArnParameter': '', + 'ConfigureExternalLoginNodesSnsTopicArnParameter': '', 'CreateHeadNodeARecordSnsTopicArnParameter': self.create_head_node_a_record_sns_topic_arn_parameter_name, 'ErrorSnsTopicArn': self.config.get('ErrorSnsTopicArn', ''), 'playbooks_s3_url': self.playbooks_s3_url, @@ -898,9 +946,10 @@ def create_parallel_cluster_assets(self): template_vars['HomeMountSrc'] = self.mount_home_src else: template_vars['HomeMountSrc'] = '' - if 'RESStackName' in self.config: - template_vars['ConfigureRESUsersGroupsJsonSnsTopicArnParameter'] = self.configure_res_users_groups_json_sns_topic_arn_parameter_name - template_vars['ConfigureRESSubmittersSnsTopicArnParameter'] = self.configure_res_submitters_sns_topic_arn_parameter_name + if 'DomainJoinedInstance' in self.config: + template_vars['ConfigureUsersGroupsJsonSnsTopicArnParameter'] = self.configure_users_groups_json_sns_topic_arn_parameter_name + if 'ExternalLoginNodes' in self.config: + template_vars['ConfigureExternalLoginNodesSnsTopicArnParameter'] = self.configure_external_login_nodes_sns_topic_arn_parameter_name # Additions or deletions to the list should be reflected in config_scripts in on_head_node_start.sh. files_to_upload = [ @@ -1539,6 +1588,18 @@ def create_parallel_cluster_lambdas(self): ] ) ) + self.create_parallel_cluster_lambda.add_to_role_policy( + statement=iam.PolicyStatement( + effect=iam.Effect.ALLOW, + actions=[ + "iam:AttachRolePolicy", + "iam:DetachRolePolicy", + ], + resources=[ + f"arn:{Aws.PARTITION}:iam::{Aws.ACCOUNT_ID}:role/*" + ] + ) + ) if self.munge_key_secret_arn: self.create_parallel_cluster_lambda.add_to_role_policy( statement=iam.PolicyStatement( @@ -1639,36 +1700,28 @@ def create_parallel_cluster_lambdas(self): ) ) - if 'RESStackName' in self.config: - configureRESUsersGroupsJsonLambdaAsset = s3_assets.Asset(self, "ConfigureRESUsersGroupsJsonAsset", path="resources/lambdas/ConfigureRESUsersGroupsJson") - self.configure_res_users_groups_json_lambda = aws_lambda.Function( - self, "ConfigRESUsersGroupsJsonLambda", - function_name=f"{self.stack_name}-ConfigRESUsersGroupsJson", - description="Configure RES users and groups json file", + if 'DomainJoinedInstance' in self.config: + configureUsersGroupsJsonLambdaAsset = s3_assets.Asset(self, "ConfigureUsersGroupsJsonAsset", path="resources/lambdas/ConfigureUsersGroupsJson") + self.configure_users_groups_json_lambda = aws_lambda.Function( + self, "ConfigUsersGroupsJsonLambda", + function_name=f"{self.stack_name}-ConfigUsersGroupsJson", + description="Configure users and groups json file", memory_size=2048, runtime=aws_lambda.Runtime.PYTHON_3_9, architecture=aws_lambda.Architecture.X86_64, timeout=Duration.minutes(15), log_retention=logs.RetentionDays.INFINITE, - handler="ConfigureRESUsersGroupsJson.lambda_handler", - code=aws_lambda.Code.from_bucket(configureRESUsersGroupsJsonLambdaAsset.bucket, configureRESUsersGroupsJsonLambdaAsset.s3_object_key), + handler="ConfigureUsersGroupsJson.lambda_handler", + code=aws_lambda.Code.from_bucket(configureUsersGroupsJsonLambdaAsset.bucket, configureUsersGroupsJsonLambdaAsset.s3_object_key), environment = { 'ClusterName': self.config['slurm']['ClusterName'], 'ErrorSnsTopicArn': self.config.get('ErrorSnsTopicArn', ''), 'Region': self.cluster_region, - 'RESStackName': self.config['RESStackName'], - 'RESEnvironmentName': self.res_environment_name, - # 'RESDomainJoinedInstanceName': f"{self.res_environment_name}-vdc-controller", - # 
'RESDomainJoinedInstanceModuleName': 'virtual-desktop-controller', - # 'RESDomainJoinedInstanceModuleId': 'vdc', - 'RESDomainJoinedInstanceName': f"{self.res_environment_name}-cluster-manager", - 'RESDomainJoinedInstanceModuleName': 'cluster-manager', - 'RESDomainJoinedInstanceModuleId': 'cluster-manager', - 'RESDomainJoinedInstanceNodeType': 'app', - 'SlurmLoginNodeSGId': self.slurm_login_node_sg_id + 'DomainJoinedInstanceTagsJson': json.dumps(self.config['DomainJoinedInstance']['Tags']), + 'SlurmLoginNodeSGId': self.config['DomainJoinedInstance'].get('SecurityGroupId', 'None') } ) - self.configure_res_users_groups_json_lambda.add_to_role_policy( + self.configure_users_groups_json_lambda.add_to_role_policy( statement=iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=[ @@ -1681,7 +1734,7 @@ def create_parallel_cluster_lambdas(self): ) ) if 'ErrorSnsTopicArn' in self.config: - self.configure_res_users_groups_json_lambda.add_to_role_policy( + self.configure_users_groups_json_lambda.add_to_role_policy( statement=iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=[ @@ -1690,33 +1743,32 @@ def create_parallel_cluster_lambdas(self): resources=[self.config['ErrorSnsTopicArn']] ) ) - self.configure_res_users_groups_json_lambda.add_event_source( - lambda_event_sources.SnsEventSource(self.configure_res_users_groups_json_sns_topic) + self.configure_users_groups_json_lambda.add_event_source( + lambda_event_sources.SnsEventSource(self.configure_users_groups_json_sns_topic) ) - self.configure_res_users_groups_json_sns_topic.grant_publish(self.parallel_cluster_sns_publish_policy) - - configureRESSubmittersLambdaAsset = s3_assets.Asset(self, "ConfigureRESSubmittersAsset", path="resources/lambdas/ConfigureRESSubmitters") - self.configure_res_submitters_lambda = aws_lambda.Function( - self, "ConfigRESSubmittersLambda", - function_name=f"{self.stack_name}-ConfigRESSubmitters", - description="Configure RES submitters", + self.configure_users_groups_json_sns_topic.grant_publish(self.parallel_cluster_sns_publish_policy) + + if 'ExternalLoginNodes' in self.config: + configureExternalLoginNodesLambdaAsset = s3_assets.Asset(self, "ConfigureExternalLoginNodesAsset", path="resources/lambdas/ConfigureExternalLoginNodes") + self.configure_external_login_nodes_lambda = aws_lambda.Function( + self, "ConfigExternalLoginNodesLambda", + function_name=f"{self.stack_name}-ConfigExternalLoginNodes", + description="Configure external login nodes", memory_size=2048, runtime=aws_lambda.Runtime.PYTHON_3_9, architecture=aws_lambda.Architecture.X86_64, timeout=Duration.minutes(15), log_retention=logs.RetentionDays.INFINITE, - handler="ConfigureRESSubmitters.lambda_handler", - code=aws_lambda.Code.from_bucket(configureRESSubmittersLambdaAsset.bucket, configureRESSubmittersLambdaAsset.s3_object_key), + handler="ConfigureExternalLoginNodes.lambda_handler", + code=aws_lambda.Code.from_bucket(configureExternalLoginNodesLambdaAsset.bucket, configureExternalLoginNodesLambdaAsset.s3_object_key), environment = { 'Region': self.cluster_region, 'ClusterName': self.config['slurm']['ClusterName'], 'ErrorSnsTopicArn': self.config.get('ErrorSnsTopicArn', ''), - 'RESStackName': self.config['RESStackName'], - 'RESEnvironmentName': self.res_environment_name, - 'SlurmLoginNodeSGId': self.slurm_login_node_sg_id + 'ExternalLoginNodesConfigJson': json.dumps(self.config['ExternalLoginNodes']) } ) - self.configure_res_submitters_lambda.add_to_role_policy( + self.configure_external_login_nodes_lambda.add_to_role_policy( 
statement=iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=[ @@ -1729,7 +1781,7 @@ def create_parallel_cluster_lambdas(self): ) ) if 'ErrorSnsTopicArn' in self.config: - self.configure_res_submitters_lambda.add_to_role_policy( + self.configure_external_login_nodes_lambda.add_to_role_policy( statement=iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=[ @@ -1738,38 +1790,32 @@ def create_parallel_cluster_lambdas(self): resources=[self.config['ErrorSnsTopicArn']] ) ) - self.configure_res_submitters_lambda.add_event_source( - lambda_event_sources.SnsEventSource(self.configure_res_submitters_sns_topic) + self.configure_external_login_nodes_lambda.add_event_source( + lambda_event_sources.SnsEventSource(self.configure_external_login_nodes_sns_topic) ) - self.configure_res_submitters_sns_topic.grant_publish(self.parallel_cluster_sns_publish_policy) + self.configure_external_login_nodes_sns_topic.grant_publish(self.parallel_cluster_sns_publish_policy) - self.deconfigureRESUsersGroupsJsonLambdaAsset = s3_assets.Asset(self, "DeconfigureRESUsersGroupsJsonAsset", path="resources/lambdas/DeconfigureRESUsersGroupsJson") - self.deconfigure_res_users_groups_json_lambda = aws_lambda.Function( - self, "DeconfigRESUsersGroupsJsonLambda", - function_name=f"{self.stack_name}-DeconfigRESUsersGroupsJson", + if 'DomainJoinedInstance' in self.config: + self.deconfigureUsersGroupsJsonLambdaAsset = s3_assets.Asset(self, "DeconfigureUsersGroupsJsonAsset", path="resources/lambdas/DeconfigureUsersGroupsJson") + self.deconfigure_users_groups_json_lambda = aws_lambda.Function( + self, "DeconfigUsersGroupsJsonLambda", + function_name=f"{self.stack_name}-DeconfigUsersGroupsJson", description="Deconfigure RES users and groups json file", memory_size=2048, runtime=aws_lambda.Runtime.PYTHON_3_9, architecture=aws_lambda.Architecture.X86_64, timeout=Duration.minutes(15), log_retention=logs.RetentionDays.INFINITE, - handler="DeconfigureRESUsersGroupsJson.lambda_handler", - code=aws_lambda.Code.from_bucket(self.deconfigureRESUsersGroupsJsonLambdaAsset.bucket, self.deconfigureRESUsersGroupsJsonLambdaAsset.s3_object_key), + handler="DeconfigureUsersGroupsJson.lambda_handler", + code=aws_lambda.Code.from_bucket(self.deconfigureUsersGroupsJsonLambdaAsset.bucket, self.deconfigureUsersGroupsJsonLambdaAsset.s3_object_key), environment = { 'ClusterName': self.config['slurm']['ClusterName'], 'ErrorSnsTopicArn': self.config.get('ErrorSnsTopicArn', ''), 'Region': self.cluster_region, - 'RESEnvironmentName': self.res_environment_name, - # 'RESDomainJoinedInstanceName': f"{self.res_environment_name}-vdc-controller", - # 'RESDomainJoinedInstanceModuleName': 'virtual-desktop-controller', - # 'RESDomainJoinedInstanceModuleId': 'vdc', - 'RESDomainJoinedInstanceName': f"{self.res_environment_name}-cluster-manager", - 'RESDomainJoinedInstanceModuleName': 'cluster-manager', - 'RESDomainJoinedInstanceModuleId': 'cluster-manager', - 'RESDomainJoinedInstanceNodeType': 'app' + 'DomainJoinedInstanceTagsJson': json.dumps(self.config['DomainJoinedInstance']['Tags']) } ) - self.deconfigure_res_users_groups_json_lambda.add_to_role_policy( + self.deconfigure_users_groups_json_lambda.add_to_role_policy( statement=iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=[ @@ -1781,7 +1827,7 @@ def create_parallel_cluster_lambdas(self): ) ) if 'ErrorSnsTopicArn' in self.config: - self.deconfigure_res_users_groups_json_lambda.add_to_role_policy( + self.deconfigure_users_groups_json_lambda.add_to_role_policy( statement=iam.PolicyStatement( 
effect=iam.Effect.ALLOW, actions=[ @@ -1791,26 +1837,27 @@ def create_parallel_cluster_lambdas(self): ) ) - deconfigureRESSubmittersLambdaAsset = s3_assets.Asset(self, "DeconfigureRESSubmittersAsset", path="resources/lambdas/DeconfigureRESSubmitters") - self.deconfigure_res_submitters_lambda = aws_lambda.Function( - self, "DeconfigRESSubmittersLambda", - function_name=f"{self.stack_name}-DeconfigRESSubmitters", - description="Deconfigure RES submitters", + if 'ExternalLoginNodes' in self.config: + deconfigureExternalLoginNodesLambdaAsset = s3_assets.Asset(self, "DeconfigureExternalLoginNodesAsset", path="resources/lambdas/DeconfigureExternalLoginNodes") + self.deconfigure_external_login_nodes_lambda = aws_lambda.Function( + self, "DeconfigExternalLoginNodesLambda", + function_name=f"{self.stack_name}-DeconfigExternalLoginNodes", + description="Deconfigure external login nodes", memory_size=2048, runtime=aws_lambda.Runtime.PYTHON_3_9, architecture=aws_lambda.Architecture.X86_64, timeout=Duration.minutes(15), log_retention=logs.RetentionDays.INFINITE, - handler="DeconfigureRESSubmitters.lambda_handler", - code=aws_lambda.Code.from_bucket(deconfigureRESSubmittersLambdaAsset.bucket, deconfigureRESSubmittersLambdaAsset.s3_object_key), + handler="DeconfigureExternalLoginNodes.lambda_handler", + code=aws_lambda.Code.from_bucket(deconfigureExternalLoginNodesLambdaAsset.bucket, deconfigureExternalLoginNodesLambdaAsset.s3_object_key), environment = { 'ClusterName': self.config['slurm']['ClusterName'], 'ErrorSnsTopicArn': self.config.get('ErrorSnsTopicArn', ''), 'Region': self.cluster_region, - 'RESEnvironmentName': self.res_environment_name + 'ExternalLoginNodesConfigJson': json.dumps(self.config['ExternalLoginNodes']) } ) - self.deconfigure_res_submitters_lambda.add_to_role_policy( + self.deconfigure_external_login_nodes_lambda.add_to_role_policy( statement=iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=[ @@ -1822,7 +1869,7 @@ def create_parallel_cluster_lambdas(self): ) ) if 'ErrorSnsTopicArn' in self.config: - self.deconfigure_res_submitters_lambda.add_to_role_policy( + self.deconfigure_external_login_nodes_lambda.add_to_role_policy( statement=iam.PolicyStatement( effect=iam.Effect.ALLOW, actions=[ @@ -2707,10 +2754,11 @@ def create_parallel_cluster_config(self): # The lambda to create an A record for the head node must be built before the parallel cluster. self.parallel_cluster.node.add_dependency(self.create_head_node_a_record_lambda) self.parallel_cluster.node.add_dependency(self.update_head_node_lambda) - # The lambdas to configure instances must exist befor the cluster so they can be called. - if 'RESStackName' in self.config: - self.parallel_cluster.node.add_dependency(self.configure_res_users_groups_json_lambda) - self.parallel_cluster.node.add_dependency(self.configure_res_submitters_lambda) + # The lambdas to configure instances must exist before the cluster so they can be called. 
+ if 'DomainJoinedInstance' in self.config: + self.parallel_cluster.node.add_dependency(self.configure_users_groups_json_lambda) + if 'ExternalLoginNodes' in self.config: + self.parallel_cluster.node.add_dependency(self.configure_external_login_nodes_lambda) # Build config files need to be created before cluster so that they can be downloaded as part of on_head_node_configures self.parallel_cluster.node.add_dependency(self.build_config_files) self.parallel_cluster.node.add_dependency(self.parallel_cluster_config) @@ -2728,24 +2776,25 @@ def create_parallel_cluster_config(self): ) self.update_head_node.node.add_dependency(self.parallel_cluster) - if 'RESStackName' in self.config: - # Custom resource to deconfigure cluster manager before deleting cluster - self.deconfigure_res_users_groups_json = CustomResource( - self, "DeconfigureRESUsersGroupsJson", - service_token = self.deconfigure_res_users_groups_json_lambda.function_arn, - properties = { - } - ) - self.deconfigure_res_users_groups_json.node.add_dependency(self.parallel_cluster) - - # Custom resource to deconfigure submitters before deleting cluster - self.deconfigure_res_submitters = CustomResource( - self, "DeconfigureRESSubmitters", - service_token = self.deconfigure_res_submitters_lambda.function_arn, - properties = { - } - ) - self.deconfigure_res_submitters.node.add_dependency(self.parallel_cluster) + # if 'DomainJoinedInstance' in self.config: + # # Custom resource to deconfigure cluster manager before deleting cluster + # self.deconfigure_res_users_groups_json = CustomResource( + # self, "DeconfigureUsersGroupsJson", + # service_token = self.deconfigure_users_groups_json_lambda.function_arn, + # properties = { + # } + # ) + # self.deconfigure_res_users_groups_json.node.add_dependency(self.parallel_cluster) + + # if 'ExternalLoginNodes' in self.config: + # # Custom resource to deconfigure submitters before deleting cluster + # self.deconfigure_res_submitters = CustomResource( + # self, "DeconfigureExternalLoginNodes", + # service_token = self.deconfigure_external_login_nodes_lambda.function_arn, + # properties = { + # } + # ) + # self.deconfigure_res_submitters.node.add_dependency(self.parallel_cluster) CfnOutput(self, "ParallelClusterConfigTemplateYamlS3Url", value = self.parallel_cluster_config_template_yaml_s3_url diff --git a/source/cdk/config_schema.py b/source/cdk/config_schema.py index 17209bf3..9b8a4360 100644 --- a/source/cdk/config_schema.py +++ b/source/cdk/config_schema.py @@ -569,6 +569,30 @@ def get_config_schema(config): Optional('TimeZone', default='US/Central'): str, Optional('AdditionalSecurityGroupsStackName'): str, Optional('RESStackName'): str, + # ExternalLoginNodes: + # Configure external login nodes + # Tags of instances that can be configured to submit to the cluster. + # When the cluster is deleted, the tag is used to unmount the slurm filesystem from the instances using SSM. + Optional('ExternalLoginNodes'): [ + { + 'Tags': [ + { + 'Key': str, + 'Values': [str] + } + ], + Optional('SecurityGroupId'): str + } + ], + Optional('DomainJoinedInstance'): { + 'Tags': [ + { + 'Key': str, + 'Values': [str] + } + ], + Optional('SecurityGroupId'): str + }, 'slurm': { Optional('ParallelClusterConfig'): { Optional('Enable', default=True): And(bool, lambda s: s == True), @@ -681,10 +705,6 @@ def get_config_schema(config): Optional('Secured', default=True): bool } }, - # SubmitterInstanceTags: - # Tags of instances that can be configured to submit to the cluster. 
- # When the cluster is deleted, the tag is used to unmount the slurm filesystem from the instances using SSM. - Optional('SubmitterInstanceTags'): {str: [str]}, # # InstanceConfig: # Configure the instances used by the cluster diff --git a/source/resources/lambdas/ConfigureExternalLoginNodes/ConfigureExternalLoginNodes.py b/source/resources/lambdas/ConfigureExternalLoginNodes/ConfigureExternalLoginNodes.py new file mode 100644 index 00000000..8cc7db0b --- /dev/null +++ b/source/resources/lambdas/ConfigureExternalLoginNodes/ConfigureExternalLoginNodes.py @@ -0,0 +1,143 @@ +""" +Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +SPDX-License-Identifier: MIT-0 + +Permission is hereby granted, free of charge, to any person obtaining a copy of this +software and associated documentation files (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, copy, modify, +merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +""" + +''' +Call /opt/slurm/{{ClusterName}}/config/bin/on_head_node_updated.sh using ssm run command. +''' +import boto3 +from textwrap import dedent +import json +import logging +from os import environ as environ + +logger=logging.getLogger(__file__) +logger_formatter = logging.Formatter('%(levelname)s: %(message)s') +logger_streamHandler = logging.StreamHandler() +logger_streamHandler.setFormatter(logger_formatter) +logger.addHandler(logger_streamHandler) +logger.setLevel(logging.INFO) +logger.propagate = False + +def lambda_handler(event, context): + try: + logger.info(f"event:\n{json.dumps(event, indent=4)}") + + cluster_name = environ['ClusterName'] + cluster_region = environ['Region'] + external_login_nodes_config = json.loads(environ['ExternalLoginNodesConfigJson']) + + logger.info(f"Configure external login nodes for {cluster_name} in {cluster_region}") + + ec2_client = boto3.client('ec2', region_name=cluster_region) + + login_node_instance_ids = [] + for external_login_node_config in external_login_nodes_config: + slurm_login_node_sg_id = external_login_node_config.get('SecurityGroupId', None) + + tags_message = '' + describe_instances_kwargs = { + 'Filters': [ + {'Name': 'instance-state-name', 'Values': ['running']} + ] + } + for tag_dict in external_login_node_config['Tags']: + tag = tag_dict['Key'] + values = tag_dict['Values'] + tags_message += f"\n{tag}: {values}" + describe_instances_kwargs['Filters'].append( + {'Name': f"tag:{tag}", 'Values': values} + ) + logger.info(f"Configure instances with the following tags as login nodes:{tags_message}") + + describe_instances_paginator = ec2_client.get_paginator('describe_instances') + describe_instances_iterator = describe_instances_paginator.paginate(**describe_instances_kwargs) + reservation_index = 0 + for response in describe_instances_iterator: + for reservation_info in response['Reservations']: + logger.info(f"reservation[{reservation_index}]:") + reservation_index += 1 + instance_index = 0 + for 
instance_info in reservation_info['Instances']: + logger.info(f" instance[{instance_index}]:") + instance_index += 1 + logger.info(f" instance_id: {instance_info['InstanceId']}") + if instance_info['State']['Name'] != 'running': + logger.info(f" Skipping because state = {instance_info['State']['Name']}") + continue + instance_id = instance_info['InstanceId'] + login_node_instance_ids.append(instance_id) + security_group_ids = [] + for security_group_dict in instance_info['SecurityGroups']: + security_group_ids.append(security_group_dict['GroupId']) + if slurm_login_node_sg_id: + if slurm_login_node_sg_id not in security_group_ids: + # Attach the security group + logger.info(f"Attaching {slurm_login_node_sg_id} to {instance_id}.") + security_group_ids.append(slurm_login_node_sg_id) + ec2_client.modify_instance_attribute(InstanceId=instance_id, Groups=security_group_ids) + else: + logger.info(f"{slurm_login_node_sg_id} already attached to {instance_id}") + + if login_node_instance_ids: + logger.info(f"Found {len(login_node_instance_ids)} login nodes. instance_ids:" + "\n".join(login_node_instance_ids)) + else: + logger.info("No running login nodes.") + return + + ssm_client = boto3.client('ssm', region_name=cluster_region) + + ssm_script = dedent(f""" + set -ex + + if ! [ -e /opt/slurm/{cluster_name} ]; then + sudo mkdir -p /opt/slurm/{cluster_name} + fi + if ! mountpoint /opt/slurm/{cluster_name} ; then + sudo mount head_node.{cluster_name}.pcluster:/opt/slurm /opt/slurm/{cluster_name} || true + fi + + script="/opt/slurm/{cluster_name}/config/bin/submitter_configure.sh" + if ! [ -e $script ]; then + echo "$script doesn't exist" + exit 1 + fi + + sudo $script + """) + + TIMEOUT_MINUTES = 90 + TIMEOUT_SECONDS = TIMEOUT_MINUTES * 60 + send_command_response = ssm_client.send_command( + DocumentName = 'AWS-RunShellScript', + InstanceIds = login_node_instance_ids, + Parameters = {'commands': [ssm_script]}, + Comment = f"Configure external login nodes for {cluster_name}", + TimeoutSeconds = TIMEOUT_SECONDS + ) + logger.info(f"Sent SSM command {send_command_response['Command']['CommandId']}") + + except Exception as e: + logger.exception(str(e)) + sns_client = boto3.client('sns') + sns_client.publish( + TopicArn = environ['ErrorSnsTopicArn'], + Subject = f"{cluster_name} ConfigureRESSubmitters failed", + Message = str(e) + ) + logger.info(f"Published error to {environ['ErrorSnsTopicArn']}") + raise diff --git a/source/resources/lambdas/ConfigureRESSubmitters/ConfigureRESSubmitters.py b/source/resources/lambdas/ConfigureRESSubmitters/ConfigureRESSubmitters.py deleted file mode 100644 index 5beabd4b..00000000 --- a/source/resources/lambdas/ConfigureRESSubmitters/ConfigureRESSubmitters.py +++ /dev/null @@ -1,127 +0,0 @@ -""" -Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -SPDX-License-Identifier: MIT-0 - -Permission is hereby granted, free of charge, to any person obtaining a copy of this -software and associated documentation files (the "Software"), to deal in the Software -without restriction, including without limitation the rights to use, copy, modify, -merge, publish, distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A -PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -""" - -''' -Call /opt/slurm/{{ClusterName}}/config/bin/on_head_node_updated.sh using ssm run command. -''' -import boto3 -import json -import logging -from os import environ as environ - -logger=logging.getLogger(__file__) -logger_formatter = logging.Formatter('%(levelname)s: %(message)s') -logger_streamHandler = logging.StreamHandler() -logger_streamHandler.setFormatter(logger_formatter) -logger.addHandler(logger_streamHandler) -logger.setLevel(logging.INFO) -logger.propagate = False - -def lambda_handler(event, context): - try: - logger.info(f"event:\n{json.dumps(event, indent=4)}") - - cluster_name = environ['ClusterName'] - cluster_region = environ['Region'] - environment_name = environ['RESEnvironmentName'] - slurm_login_node_sg_id = environ['SlurmLoginNodeSGId'] - logger.info(f"Configure RES({environment_name}) submitters for {cluster_name} in {cluster_region}") - - ec2_client = boto3.client('ec2', region_name=cluster_region) - describe_instances_paginator = ec2_client.get_paginator('describe_instances') - describe_instances_iterator = describe_instances_paginator.paginate( - Filters = [ - {'Name': 'tag:res:EnvironmentName', 'Values': [environment_name]}, - {'Name': 'tag:res:NodeType', 'Values': ['virtual-desktop-dcv-host']}, - {'Name': 'instance-state-name', 'Values': ['running']} - ] - ) - submitter_instance_ids = [] - reservation_index = 0 - for response in describe_instances_iterator: - for reservation_info in response['Reservations']: - logger.info(f"reservation[{reservation_index}]:") - reservation_index += 1 - instance_index = 0 - for instance_info in reservation_info['Instances']: - logger.info(f" instance[{instance_index}]:") - instance_index += 1 - logger.info(f" instance_id: {instance_info['InstanceId']}") - if instance_info['State']['Name'] != 'running': - logger.info(f" Skipping because state = {instance_info['State']['Name']}") - continue - for tags in instance_info['Tags']: - logger.info(f" {tags}") - instance_id = instance_info['InstanceId'] - submitter_instance_ids.append(instance_id) - security_group_ids = [] - for security_group_dict in instance_info['SecurityGroups']: - security_group_ids.append(security_group_dict['GroupId']) - if slurm_login_node_sg_id not in security_group_ids: - # Attach the security group - logger.info(f"Attaching {slurm_login_node_sg_id} to {instance_id}.") - security_group_ids.append(slurm_login_node_sg_id) - ec2_client.modify_instance_attribute(InstanceId=instance_id, Groups=security_group_ids) - else: - logger.info(f"{slurm_login_node_sg_id} already attached to {instance_id}") - - logger.info(f"submitter_instance_ids: {submitter_instance_ids}") - if not submitter_instance_ids: - logger.info("No running submitters.") - return - - ssm_client = boto3.client('ssm', region_name=cluster_region) - commands = f""" -set -ex - -if ! [ -e /opt/slurm/{cluster_name} ]; then - sudo mkdir -p /opt/slurm/{cluster_name} -fi -if ! mountpoint /opt/slurm/{cluster_name} ; then - sudo mount head_node.{cluster_name}.pcluster:/opt/slurm /opt/slurm/{cluster_name} || true -fi - -script="/opt/slurm/{cluster_name}/config/bin/submitter_configure.sh" -if ! 
[ -e $script ]; then - echo "$script doesn't exist" - exit 1 -fi - -sudo $script - """ - TIMEOUT_MINUTES = 90 - TIMEOUT_SECONDS = TIMEOUT_MINUTES * 60 - send_command_response = ssm_client.send_command( - DocumentName = 'AWS-RunShellScript', - InstanceIds = submitter_instance_ids, - Parameters = {'commands': [commands]}, - Comment = f"Configure {environment_name} submitters for {cluster_name}", - TimeoutSeconds = TIMEOUT_SECONDS - ) - logger.info(f"Sent SSM command {send_command_response['Command']['CommandId']}") - - except Exception as e: - logger.exception(str(e)) - sns_client = boto3.client('sns') - sns_client.publish( - TopicArn = environ['ErrorSnsTopicArn'], - Subject = f"{cluster_name} ConfigureRESSubmitters failed", - Message = str(e) - ) - logger.info(f"Published error to {environ['ErrorSnsTopicArn']}") - raise diff --git a/source/resources/lambdas/ConfigureRESUsersGroupsJson/ConfigureRESUsersGroupsJson.py b/source/resources/lambdas/ConfigureUsersGroupsJson/ConfigureUsersGroupsJson.py similarity index 70% rename from source/resources/lambdas/ConfigureRESUsersGroupsJson/ConfigureRESUsersGroupsJson.py rename to source/resources/lambdas/ConfigureUsersGroupsJson/ConfigureUsersGroupsJson.py index f08de760..482105f4 100644 --- a/source/resources/lambdas/ConfigureRESUsersGroupsJson/ConfigureRESUsersGroupsJson.py +++ b/source/resources/lambdas/ConfigureUsersGroupsJson/ConfigureUsersGroupsJson.py @@ -38,28 +38,30 @@ def lambda_handler(event, context): cluster_name = environ['ClusterName'] cluster_region = environ['Region'] - res_environment_name = environ['RESEnvironmentName'] - res_domain_joined_instance_name = environ['RESDomainJoinedInstanceName'] - res_domain_joined_instance_module_name = environ['RESDomainJoinedInstanceModuleName'] - res_domain_joined_instance_module_id = environ['RESDomainJoinedInstanceModuleId'] - res_domain_joined_instance_node_type = environ['RESDomainJoinedInstanceNodeType'] + instance_tags = json.loads(environ['DomainJoinedInstanceTagsJson']) slurm_login_node_sg_id = environ['SlurmLoginNodeSGId'] - logger.info(f"Configure update of /opt/slurm/{cluster_name}/config/users_groups.json from RES {res_environment_name} domain joined instance with following tags:\nName={res_domain_joined_instance_name}\nres:ModuleName={res_domain_joined_instance_module_name}\nres:ModuleId={res_domain_joined_instance_module_name}\nres:NodeType={res_domain_joined_instance_node_type}\nstate=running") + if slurm_login_node_sg_id == 'None': + slurm_login_node_sg_id = None - domain_joined_instance_id = None - domain_joined_instance_security_group_ids = [] - ec2_client = boto3.client('ec2', region_name=cluster_region) - describe_instances_paginator = ec2_client.get_paginator('describe_instances') + tags_message = '' describe_instances_kwargs = { 'Filters': [ - {'Name': 'tag:res:EnvironmentName', 'Values': [res_environment_name]}, - {'Name': 'tag:Name', 'Values': [res_domain_joined_instance_name]}, - {'Name': 'tag:res:ModuleName', 'Values': [res_domain_joined_instance_module_name]}, - {'Name': 'tag:res:ModuleId', 'Values': [res_domain_joined_instance_module_id]}, - {'Name': 'tag:res:NodeType', 'Values': [res_domain_joined_instance_node_type]}, {'Name': 'instance-state-name', 'Values': ['running']} ] } + for tag_dict in instance_tags: + tag = tag_dict['Key'] + values = tag_dict['Values'] + tags_message += f"\n{tag}: {values}" + describe_instances_kwargs['Filters'].append( + {'Name': f"tag:{tag}", 'Values': values} + ) + logger.info(f"Configure update of 
/opt/slurm/{cluster_name}/config/users_groups.json from domain joined instance with following tags:{tags_message}") + + domain_joined_instance_id = None + domain_joined_instance_security_group_ids = [] + ec2_client = boto3.client('ec2', region_name=cluster_region) + describe_instances_paginator = ec2_client.get_paginator('describe_instances') for describe_instances_response in describe_instances_paginator.paginate(**describe_instances_kwargs): for reservation_dict in describe_instances_response['Reservations']: domain_joined_instance_info = reservation_dict['Instances'][0] @@ -68,16 +70,17 @@ def lambda_handler(event, context): for security_group_dict in domain_joined_instance_info['SecurityGroups']: domain_joined_instance_security_group_ids.append(security_group_dict['GroupId']) if not domain_joined_instance_id: - raise RuntimeError(f"No running instances found with tags res:EnvironmentName={res_environment_name}, Name={res_domain_joined_instance_name}, res:ModuleName={res_domain_joined_instance_module_name}, res:ModuleId={res_domain_joined_instance_module_id}, res:NodeType={res_domain_joined_instance_node_type}") - - # Make sure that the RES login nodes have the required security group attached. - if slurm_login_node_sg_id not in domain_joined_instance_security_group_ids: - # Attach the security group - logger.info(f"Attaching {slurm_login_node_sg_id} to {domain_joined_instance_id}.") - domain_joined_instance_security_group_ids.append(slurm_login_node_sg_id) - ec2_client.modify_instance_attribute(InstanceId=domain_joined_instance_id, Groups=domain_joined_instance_security_group_ids) - else: - logger.info(f"{slurm_login_node_sg_id} already attached to {domain_joined_instance_id}") + raise RuntimeError(f"No running instances found with tags:{tags_message}") + + # Make sure that the instance has the required security group attached. + if slurm_login_node_sg_id: + if slurm_login_node_sg_id not in domain_joined_instance_security_group_ids: + # Attach the security group + logger.info(f"Attaching {slurm_login_node_sg_id} to {domain_joined_instance_id}.") + domain_joined_instance_security_group_ids.append(slurm_login_node_sg_id) + ec2_client.modify_instance_attribute(InstanceId=domain_joined_instance_id, Groups=domain_joined_instance_security_group_ids) + else: + logger.info(f"{slurm_login_node_sg_id} already attached to {domain_joined_instance_id}") ssm_client = boto3.client('ssm', region_name=cluster_region) commands = f""" @@ -102,7 +105,7 @@ def lambda_handler(event, context): DocumentName = 'AWS-RunShellScript', InstanceIds = [domain_joined_instance_id], Parameters = {'commands': [commands]}, - Comment = f"Configure {res_environment_name} users and groups for {cluster_name}", + Comment = f"Configure users and groups for {cluster_name}", TimeoutSeconds = 5 * 60 # 5 minutes ) command_id = send_command_response['Command']['CommandId'] diff --git a/source/resources/lambdas/CreateParallelCluster/CreateParallelCluster.py b/source/resources/lambdas/CreateParallelCluster/CreateParallelCluster.py index c8b78086..0e825667 100644 --- a/source/resources/lambdas/CreateParallelCluster/CreateParallelCluster.py +++ b/source/resources/lambdas/CreateParallelCluster/CreateParallelCluster.py @@ -67,6 +67,10 @@ def get_cluster_status(cluster_name, cluster_region): def lambda_handler(event, context): try: logger.info(f"event:\n{json.dumps(event, indent=4)}") + + # Create sns client so can send notifications on any errors. 
+ sns_client = boto3.client('sns') + cluster_name = None requestType = event['RequestType'] properties = event['ResourceProperties'] @@ -91,9 +95,6 @@ def lambda_handler(event, context): cluster_region = environ['Region'] logger.info(f"{requestType} request for {cluster_name} in {cluster_region}") - # Create sns client so can send notifications on any errors. - sns_client = boto3.client('sns') - cluster_status = get_cluster_status(cluster_name, cluster_region) if cluster_status: valid_statuses = ['CREATE_COMPLETE', 'UPDATE_COMPLETE', 'UPDATE_ROLLBACK_COMPLETE'] diff --git a/source/resources/lambdas/DeconfigureRESSubmitters/DeconfigureRESSubmitters.py b/source/resources/lambdas/DeconfigureExternalLoginNodes/DeconfigureExternalLoginNodes.py similarity index 53% rename from source/resources/lambdas/DeconfigureRESSubmitters/DeconfigureRESSubmitters.py rename to source/resources/lambdas/DeconfigureExternalLoginNodes/DeconfigureExternalLoginNodes.py index 141c7452..ceea354e 100644 --- a/source/resources/lambdas/DeconfigureRESSubmitters/DeconfigureRESSubmitters.py +++ b/source/resources/lambdas/DeconfigureExternalLoginNodes/DeconfigureExternalLoginNodes.py @@ -24,6 +24,7 @@ import json import logging from os import environ as environ +from textwrap import dedent import time logger=logging.getLogger(__file__) @@ -58,7 +59,8 @@ def lambda_handler(event, context): cluster_name = environ['ClusterName'] cluster_region = environ['Region'] - environment_name = environ['RESEnvironmentName'] + external_login_nodes_config = json.loads(environ['ExternalLoginNodesConfigJson']) + logger.info(f"{requestType} request for {cluster_name} in {cluster_region}") if requestType != 'Delete': @@ -66,101 +68,114 @@ def lambda_handler(event, context): cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physicalResourceId=cluster_name) return - logger.info(f"Deconfigure RES({environment_name}) submitters for {cluster_name} in {cluster_region}") + logger.info(f"Deconfigure external login nodes for {cluster_name} in {cluster_region}") ec2_client = boto3.client('ec2', region_name=cluster_region) - describe_instances_paginator = ec2_client.get_paginator('describe_instances') - describe_instances_iterator = describe_instances_paginator.paginate( - Filters = [ - {'Name': 'tag:res:EnvironmentName', 'Values': [environment_name]}, - {'Name': 'tag:res:NodeType', 'Values': ['virtual-desktop-dcv-host']}, - {'Name': 'instance-state-name', 'Values': ['running']} - ] - ) - submitter_instance_ids = [] - reservation_index = 0 - for response in describe_instances_iterator: - for reservation_info in response['Reservations']: - logger.info(f"reservation[{reservation_index}]:") - reservation_index += 1 - instance_index = 0 - for instance_info in reservation_info['Instances']: - logger.info(f" instance[{instance_index}]:") - instance_index += 1 - logger.info(f" instance_id: {instance_info['InstanceId']}") - if instance_info['State']['Name'] != 'running': - logger.info(f" Skipping because state = {instance_info['State']['Name']}") - continue - for tags in instance_info['Tags']: - logger.info(f" {tags}") - submitter_instance_ids.append(instance_info['InstanceId']) - logger.info(f"submitter_instance_ids: {submitter_instance_ids}") - if not submitter_instance_ids: - logger.info("No running submitters.") + + login_node_instance_ids = [] + for external_login_node_config in external_login_nodes_config: + tags_message = '' + describe_instances_kwargs = { + 'Filters': [ + {'Name': 'instance-state-name', 'Values': ['running']} + ] + } + for 
tag_dict in external_login_node_config['Tags']: + tag = tag_dict['Key'] + values = tag_dict['Values'] + tags_message += f"\n{tag}: {values}" + describe_instances_kwargs['Filters'].append( + {'Name': f"tag:{tag}", 'Values': values} + ) + logger.info(f"Deconfigure instances with the following tags as login nodes:{tags_message}") + + describe_instances_paginator = ec2_client.get_paginator('describe_instances') + describe_instances_iterator = describe_instances_paginator.paginate(**describe_instances_kwargs) + reservation_index = 0 + for response in describe_instances_iterator: + for reservation_info in response['Reservations']: + logger.info(f"reservation[{reservation_index}]:") + reservation_index += 1 + instance_index = 0 + for instance_info in reservation_info['Instances']: + logger.info(f" instance[{instance_index}]:") + instance_index += 1 + logger.info(f" instance_id: {instance_info['InstanceId']}") + if instance_info['State']['Name'] != 'running': + logger.info(f" Skipping because state = {instance_info['State']['Name']}") + continue + login_node_instance_ids.append(instance_info['InstanceId']) + + if login_node_instance_ids: + logger.info(f"Found {len(login_node_instance_ids)} login nodes. instance_ids:"+"\n" + '\n'.join(login_node_instance_ids)) + else: + logger.info("No running login nodes.") cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physicalResourceId=cluster_name) return ssm_client = boto3.client('ssm', region_name=cluster_region) - commands = f""" -set -ex - -mount_dest=/opt/slurm/{cluster_name} - -# Make sure that the cluster is still mounted and mount is accessible. -# If the cluster has already been deleted then the mount will be hung and we have to do manual cleanup. -if mount | grep " $mount_dest "; then - echo "$mount_dest is mounted." - if ! timeout 1s ls $mount_dest; then - echo "Mount point ($mount_dest) is hung. Source may have already been deleted." - timeout 5s sudo umount -lf $mount_dest - timeout 1s rm -rf $mount_dest - fi -fi - -script="$mount_dest/config/bin/submitter_deconfigure.sh" -if ! timeout 1s ls $script; then - echo "$script doesn't exist" -else - sudo $script -fi - -# Do manual cleanup just in case something above failed. - -sudo rm -f /etc/profile.d/slurm_{cluster_name}_modulefiles.sh - -sudo grep -v ' $mount_dest ' /etc/fstab > /etc/fstab.new -if diff -q /etc/fstab /etc/fstab.new; then - sudo rm -f /etc/fstab.new -else - sudo cp /etc/fstab /etc/fstab.$(date '+%Y-%m-%d@%H:%M:%S~') - sudo mv -f /etc/fstab.new /etc/fstab -fi - -if timeout 1s mountpoint $mount_dest; then - echo "$mount_dest is a mountpoint" - sudo umount -lf $mount_dest -fi - -if timeout 1s ls $mount_dest; then - sudo rmdir $mount_dest -fi - """ + + ssm_script = dedent(f""" + set -ex + + mount_dest=/opt/slurm/{cluster_name} + + # Make sure that the cluster is still mounted and mount is accessible. + # If the cluster has already been deleted then the mount will be hung and we have to do manual cleanup. + if mount | grep " $mount_dest "; then + echo "$mount_dest is mounted." + if ! timeout 1s ls $mount_dest; then + echo "Mount point ($mount_dest) is hung. Source may have already been deleted." + timeout 5s sudo umount -lf $mount_dest + timeout 1s rm -rf $mount_dest + fi + fi + + script="$mount_dest/config/bin/submitter_deconfigure.sh" + if ! timeout 1s ls $script; then + echo "$script doesn't exist" + else + sudo $script + fi + + # Do manual cleanup just in case something above failed. 
+ + sudo rm -f /etc/profile.d/slurm_{cluster_name}_modulefiles.sh + + sudo grep -v ' $mount_dest ' /etc/fstab > /etc/fstab.new + if diff -q /etc/fstab /etc/fstab.new; then + sudo rm -f /etc/fstab.new + else + sudo cp /etc/fstab /etc/fstab.$(date '+%Y-%m-%d@%H:%M:%S~') + sudo mv -f /etc/fstab.new /etc/fstab + fi + + if timeout 1s mountpoint $mount_dest; then + echo "$mount_dest is a mountpoint" + sudo umount -lf $mount_dest + fi + + if timeout 1s ls $mount_dest; then + sudo rmdir $mount_dest + fi + """) + response = ssm_client.send_command( DocumentName = 'AWS-RunShellScript', - InstanceIds = submitter_instance_ids, - Parameters = {'commands': [commands]}, - Comment = f"Deconfigure {environment_name} submitters for {cluster_name}" + InstanceIds = login_node_instance_ids, + Parameters = {'commands': [ssm_script]}, + Comment = f"Deconfigure external login nodes for {cluster_name}" ) command_id = response['Command']['CommandId'] logger.info(f"Sent SSM command {command_id}") # Wait for the command invocations to be made time.sleep(5) - # Wait for the command to complete before returning so that the cluster resources aren't removed before the command completes. + # Wait for the commands to complete before returning so that the cluster resources aren't removed before the command completes. num_errors = 0 MAX_WAIT_TIME = 13 * 60 wait_time = 0 - for instance_id in submitter_instance_ids: + for instance_id in login_node_instance_ids: command_complete = False while not command_complete: response = ssm_client.get_command_invocation( diff --git a/source/resources/lambdas/DeconfigureRESSubmitters/cfnresponse.py b/source/resources/lambdas/DeconfigureExternalLoginNodes/cfnresponse.py similarity index 100% rename from source/resources/lambdas/DeconfigureRESSubmitters/cfnresponse.py rename to source/resources/lambdas/DeconfigureExternalLoginNodes/cfnresponse.py diff --git a/source/resources/lambdas/DeconfigureRESUsersGroupsJson/DeconfigureRESUsersGroupsJson.py b/source/resources/lambdas/DeconfigureUsersGroupsJson/DeconfigureUsersGroupsJson.py similarity index 100% rename from source/resources/lambdas/DeconfigureRESUsersGroupsJson/DeconfigureRESUsersGroupsJson.py rename to source/resources/lambdas/DeconfigureUsersGroupsJson/DeconfigureUsersGroupsJson.py diff --git a/source/resources/lambdas/DeconfigureRESUsersGroupsJson/cfnresponse.py b/source/resources/lambdas/DeconfigureUsersGroupsJson/cfnresponse.py similarity index 100% rename from source/resources/lambdas/DeconfigureRESUsersGroupsJson/cfnresponse.py rename to source/resources/lambdas/DeconfigureUsersGroupsJson/cfnresponse.py diff --git a/source/resources/parallel-cluster/config/bin/on_head_node_configured.sh b/source/resources/parallel-cluster/config/bin/on_head_node_configured.sh index 21b4184c..4a49b61d 100755 --- a/source/resources/parallel-cluster/config/bin/on_head_node_configured.sh +++ b/source/resources/parallel-cluster/config/bin/on_head_node_configured.sh @@ -84,16 +84,16 @@ ansible-playbook $PLAYBOOKS_PATH/ParallelClusterHeadNode.yml \ popd # Notify SNS topic that trigger configuration of cluster manager and submitters -ConfigureRESUsersGroupsJsonSnsTopicArnParameter={{ConfigureRESUsersGroupsJsonSnsTopicArnParameter}} -if ! 
[[ -z "$ConfigureRESUsersGroupsJsonSnsTopicArnParameter" ]]; then - ConfigureRESClusterManagerSnsTopicArn=$(aws ssm get-parameter --name $ConfigureRESUsersGroupsJsonSnsTopicArnParameter --query 'Parameter.Value' --output text) - aws sns publish --topic-arn $ConfigureRESClusterManagerSnsTopicArn --message 'Configure {{ClusterName}} RES ClusterManager' +ConfigureUsersGroupsJsonSnsTopicArnParameter={{ConfigureUsersGroupsJsonSnsTopicArnParameter}} +if ! [[ -z "$ConfigureUsersGroupsJsonSnsTopicArnParameter" ]]; then + ConfigureUsersGroupsJsonSnsTopicArn=$(aws ssm get-parameter --name $ConfigureUsersGroupsJsonSnsTopicArnParameter --query 'Parameter.Value' --output text) + aws sns publish --topic-arn $ConfigureUsersGroupsJsonSnsTopicArn --message 'Configure {{ClusterName}} users_groups.json' fi -ConfigureRESSubmittersSnsTopicArnParameter={{ConfigureRESSubmittersSnsTopicArnParameter}} -if ! [[ -z "$ConfigureRESSubmittersSnsTopicArnParameter" ]]; then - ConfigureRESSubmittersSnsTopicArn=$(aws ssm get-parameter --name $ConfigureRESSubmittersSnsTopicArnParameter --query 'Parameter.Value' --output text) - aws sns publish --topic-arn $ConfigureRESSubmittersSnsTopicArn --message 'Configure {{ClusterName}} RES submitters' +ConfigureExternalLoginNodesSnsTopicArnParameter={{ConfigureExternalLoginNodesSnsTopicArnParameter}} +if ! [[ -z "$ConfigureExternalLoginNodesSnsTopicArnParameter" ]]; then + ConfigureExternalLoginNodesSnsTopicArn=$(aws ssm get-parameter --name $ConfigureExternalLoginNodesSnsTopicArnParameter --query 'Parameter.Value' --output text) + aws sns publish --topic-arn $ConfigureExternalLoginNodesSnsTopicArn --message 'Configure {{ClusterName}} login nodes' fi echo "$(date): Finished ${script_name}"
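
Usage note: this patch adds two new top-level config keys, `ExternalLoginNodes` and `DomainJoinedInstance` (see the `config_schema.py` hunk above), but the docs do not yet show an example. A minimal illustrative snippet follows; the tag keys, tag values, and security group ID are hypothetical placeholders, not values required by the schema.

```
ExternalLoginNodes:
  - Tags:
      - Key: role                           # hypothetical tag key
        Values: ['slurm-login']
    SecurityGroupId: sg-0123456789abcdef0   # optional; attached to matching instances if not already present
DomainJoinedInstance:
  Tags:
    - Key: Name
      Values: ['my-cluster-manager']        # hypothetical instance name
  SecurityGroupId: sg-0123456789abcdef0     # optional
```

When `RESStackName` is set, `update_config_for_res()` fills both keys in automatically from the RES environment and rejects a user-supplied `DomainJoinedInstance`, so hand-written entries are only needed for non-RES login nodes.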