From 1113c0d6cf751abfbc041255a016e9feaf37dfbf Mon Sep 17 00:00:00 2001
From: Jean Cochrane
Date: Fri, 20 Sep 2024 19:44:43 +0000
Subject: [PATCH 1/5] Pin build-and-run-batch-job to branch that fixes
 Terraform configs

---
 .github/workflows/build-and-run-model.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-and-run-model.yaml b/.github/workflows/build-and-run-model.yaml
index 50a625cf..d11d0675 100644
--- a/.github/workflows/build-and-run-model.yaml
+++ b/.github/workflows/build-and-run-model.yaml
@@ -69,7 +69,7 @@ jobs:
       # required in order to allow the reusable called workflow to push to
       # GitHub Container Registry
       packages: write
-    uses: ccao-data/actions/.github/workflows/build-and-run-batch-job.yaml@main
+    uses: ccao-data/actions/.github/workflows/build-and-run-batch-job.yaml@jeancochrane/persist-batch-compute-environment-update-policy
     with:
       backend: "ec2"
       vcpu: "40"

From 2db2f787ad2bbf92f192d1b7b6dff12f43eea6e9 Mon Sep 17 00:00:00 2001
From: Jean Cochrane
Date: Fri, 20 Sep 2024 19:46:55 +0000
Subject: [PATCH 2/5] Delete terraform directory that has been unused since
 #59

---
 terraform/.gitignore | 1 -
 terraform/main.tf | 183 ------------------------------------------
 terraform/outputs.tf | 23 ------
 3 files changed, 207 deletions(-)
 delete mode 100644 terraform/.gitignore
 delete mode 100644 terraform/main.tf
 delete mode 100644 terraform/outputs.tf

diff --git a/terraform/.gitignore b/terraform/.gitignore
deleted file mode 100644
index e79eb231..00000000
--- a/terraform/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-.terraform*
diff --git a/terraform/main.tf b/terraform/main.tf
deleted file mode 100644
index 75d72aa3..00000000
--- a/terraform/main.tf
+++ /dev/null
@@ -1,183 +0,0 @@
-# Terraform configuration for AWS Batch job resources that can run the model.
-#
-# Note that some stable resources that are shared between all Batch environments
-# are either builtin to all AWS accounts or are defined in our core AWS
-# infrastructure repository, which is separate from this module. These resources
-# are thus referenced using Terraform `data` entities rather than `resource`
-# entities, which means Terraform will not attempt to create or update them.
-# These resources include, but are not limited to:
-#
-# * The VPC, subnets, and security group used for container networking
-# * IAM roles for task execution and Batch provisioning
-
-terraform {
-  required_providers {
-    aws = {
-      source = "hashicorp/aws"
-      version = "~> 5.23"
-    }
-  }
-
-  required_version = ">= 1.5.7"
-
-  # Backend configs change based on the calling repo, so we leave it empty here
-  # and then leave it up to the caller of `terraform init` to pass the required
-  # S3 backend config attributes in via `-backend-config` flags.
-  backend "s3" {}
-}
-
-provider "aws" {
-  region = "us-east-1"
-}
-
-# This variable defines the name of all of the Batch resources managed by
-# this configuration. The name should have appropriate prefixes depending on
-# whether it's intended for a staging or prod environment, so we leave the
-# naming logic up to the Terraform caller, which we expect to be a CI
-# environment with knowledge of the current git branch name
-variable "batch_job_name" {
-  type = string
-}
-
-# Set the name of the container image that the Batch job should pull to use as
-# its execution environment, e.g. "ghcr.io/ccao-data/model-res-avm:master".
-# This is defined as a variable so that CI environments can point Batch
-# job definitions to freshly built images
-variable "batch_container_image_name" {
-  type = string
-}
-
-# How many vCPUs should be provisioned for Batch jobs
-variable "batch_job_definition_vcpu" {
-  type = string
-}
-
-# How much memory should be provisioned for Batch jobs
-variable "batch_job_definition_memory" {
-  type = string
-}
-
-# Retrieve the default VPC for this region, which is builtin to AWS.
-# Containers in the Batch compute environment will be deployed into this VPC
-data "aws_vpc" "default" {
-  default = true
-}
-
-# Retrieve the default subnets in the default VPC, which are builtin to AWS.
-# Containers in the Batch compute environment are connected to the Internet
-# using these subnets. Note that these subnets are public by default, but the
-# use of an additional security group ensures that we block all ingress to the
-# compute environment
-data "aws_subnets" "default" {
-  filter {
-    name = "vpc-id"
-    values = [data.aws_vpc.default.id]
-  }
-  filter {
-    name = "default-for-az"
-    values = [true]
-  }
-  filter {
-    name = "state"
-    values = ["available"]
-  }
-}
-
-# Retrieve the security group that blocks all ingress and allows egress over
-# HTTPS only
-data "aws_security_group" "outbound_https" {
-  name = "outbound-https"
-  vpc_id = data.aws_vpc.default.id
-}
-
-# Retrieve the IAM role that the Batch compute environment uses to manage
-# EC2/ECS resources
-data "aws_iam_role" "batch_service_role" {
-  name = "AWSServiceRoleForBatch"
-}
-
-# Retrieve the IAM role that the Batch job definition uses to execute ECS
-# operations like pulling Docker images and pushing logs to CloudWatch
-data "aws_iam_role" "ecs_task_execution_role" {
-  name = "ecsTaskExecutionRole"
-}
-
-# Retrieve the IAM role that the Batch job passes on to the containers, allowing
-# those containers to access AWS resources like data stored in S3
-data "aws_iam_role" "ecs_job_role" {
-  name = "ccao-ecs-model-runner"
-}
-
-# Create a Batch compute environment to run containers. Compute environments
-# define the underlying ECS or EC2 resources that will be provisioned to
-# use for running jobs in containers. Docs here:
-# https://docs.aws.amazon.com/batch/latest/userguide/compute_environments.html
-resource "aws_batch_compute_environment" "main" {
-  compute_environment_name = var.batch_job_name
-  service_role = data.aws_iam_role.batch_service_role.arn
-  state = "ENABLED"
-  type = "MANAGED"
-
-  compute_resources {
-    type = "FARGATE"
-    min_vcpus = 0
-    max_vcpus = 64 # Max across all jobs, not within one job
-    security_group_ids = [data.aws_security_group.outbound_https.id]
-    subnets = data.aws_subnets.default.ids
-  }
-}
-
-# Create a Batch job queue to run jobs. Job queues keep track of which jobs
-# are waiting to run and in what order they should be prioritized in cases
-# where its associated compute environment has reached max capacity. Docs here:
-# https://docs.aws.amazon.com/batch/latest/userguide/job_queues.html
-resource "aws_batch_job_queue" "main" {
-  name = var.batch_job_name
-  compute_environments = [aws_batch_compute_environment.main.arn]
-  priority = 0
-  state = "ENABLED"
-}
-
-# Create a Batch job definition to define jobs. Job definitions provide the
-# information about how containers should be configured in a compute
-# environment and what those containers should do. When combined with a
-# job queue, they provide enough information to submit a job. Docs here:
-# https://docs.aws.amazon.com/batch/latest/userguide/job_definitions.html
-#
-# Note that jobs using this job definition cannot be provisioned by Terraform,
-# and are instead submitted via calls to `aws batch submit-job` in the same CI
-# workflow that provisions these resources
-resource "aws_batch_job_definition" "main" {
-  name = var.batch_job_name
-  platform_capabilities = ["FARGATE"]
-  type = "container"
-
-  container_properties = jsonencode({
-    executionRoleArn = data.aws_iam_role.ecs_task_execution_role.arn
-    fargatePlatformConfiguration = {
-      platformVersion = "LATEST"
-    }
-    image = var.batch_container_image_name
-    jobRoleArn = data.aws_iam_role.ecs_job_role.arn
-    logConfiguration = {
-      logDriver = "awslogs"
-    }
-    networkConfiguration = {
-      assignPublicIp = "ENABLED"
-    }
-    resourceRequirements = [
-      {
-        type = "VCPU"
-        value = var.batch_job_definition_vcpu
-      },
-      {
-        type = "MEMORY"
-        value = var.batch_job_definition_memory
-      }
-    ]
-    runtimePlatform = {
-      cpuArchitecture = "X86_64"
-      operatingSystemFamily = "LINUX"
-    }
-  })
-}
diff --git a/terraform/outputs.tf b/terraform/outputs.tf
deleted file mode 100644
index 51c438cf..00000000
--- a/terraform/outputs.tf
+++ /dev/null
@@ -1,23 +0,0 @@
-# Define output variables that can be used by Terraform callers to access
-# attributes of resources created by the config or input values that were
-# passed into the config at build time
-
-output "batch_job_definition_arn" {
-  description = "ARN of the Batch job definition"
-  value = aws_batch_job_definition.main.arn
-}
-
-output "batch_job_queue_arn" {
-  description = "ARN of the Batch job queue"
-  value = aws_batch_job_queue.main.arn
-}
-
-output "batch_job_name" {
-  description = "Name of the Batch job"
-  value = var.batch_job_name
-}
-
-output "batch_container_image_name" {
-  description = "Name of the container image to use for the Batch job"
-  value = var.batch_container_image_name
-}

From 6f9a02a748f4e871d889b3a1a2eec10233450b15 Mon Sep 17 00:00:00 2001
From: Jean Cochrane
Date: Fri, 20 Sep 2024 20:20:47 +0000
Subject: [PATCH 3/5] Lower resource requirements in build-and-run-model to
 test changing job definition

---
 .github/workflows/build-and-run-model.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-and-run-model.yaml b/.github/workflows/build-and-run-model.yaml
index d11d0675..e4ebaba2 100644
--- a/.github/workflows/build-and-run-model.yaml
+++ b/.github/workflows/build-and-run-model.yaml
@@ -72,8 +72,8 @@ jobs:
     uses: ccao-data/actions/.github/workflows/build-and-run-batch-job.yaml@jeancochrane/persist-batch-compute-environment-update-policy
     with:
       backend: "ec2"
-      vcpu: "40"
-      memory: "158000"
+      vcpu: "8"
+      memory: "32000"
       # Maximum pipeline runtime. This is slightly below 6 hours, which
       # is the maximum length of any single GitHub Actions job
       role-duration-seconds: 21000

From 757d149505fe5bfd103614706af3dbfd75dafc77 Mon Sep 17 00:00:00 2001
From: Jean Cochrane
Date: Fri, 20 Sep 2024 20:37:45 +0000
Subject: [PATCH 4/5] Add missing ref, which is necessary when testing a
 different branch of ccao-data/actions

---
 .github/workflows/build-and-run-model.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/build-and-run-model.yaml b/.github/workflows/build-and-run-model.yaml
index e4ebaba2..4166c3c5 100644
--- a/.github/workflows/build-and-run-model.yaml
+++ b/.github/workflows/build-and-run-model.yaml
@@ -71,6 +71,7 @@ jobs:
       packages: write
     uses: ccao-data/actions/.github/workflows/build-and-run-batch-job.yaml@jeancochrane/persist-batch-compute-environment-update-policy
     with:
+      ref: jeancochrane/persist-batch-compute-environment-update-policy
      backend: "ec2"
       vcpu: "8"
       memory: "32000"

From 465104abfa183fc8bc92b3374a66ce72359e3116 Mon Sep 17 00:00:00 2001
From: Jean Cochrane
Date: Fri, 20 Sep 2024 20:47:14 +0000
Subject: [PATCH 5/5] Switch build-and-run-model params back to their usual
 values after testing

---
 .github/workflows/build-and-run-model.yaml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build-and-run-model.yaml b/.github/workflows/build-and-run-model.yaml
index 4166c3c5..50a625cf 100644
--- a/.github/workflows/build-and-run-model.yaml
+++ b/.github/workflows/build-and-run-model.yaml
@@ -69,12 +69,11 @@ jobs:
       # required in order to allow the reusable called workflow to push to
       # GitHub Container Registry
       packages: write
-    uses: ccao-data/actions/.github/workflows/build-and-run-batch-job.yaml@jeancochrane/persist-batch-compute-environment-update-policy
+    uses: ccao-data/actions/.github/workflows/build-and-run-batch-job.yaml@main
     with:
-      ref: jeancochrane/persist-batch-compute-environment-update-policy
       backend: "ec2"
-      vcpu: "8"
-      memory: "32000"
+      vcpu: "40"
+      memory: "158000"
       # Maximum pipeline runtime. This is slightly below 6 hours, which
       # is the maximum length of any single GitHub Actions job
       role-duration-seconds: 21000
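
---
Summary of the testing pattern exercised by this series: to test an unmerged
branch of ccao-data/actions, the branch name has to appear in two places in
the caller. The `uses:` ref selects which revision of the reusable workflow
definition runs, while the `ref` input is consumed by the workflow itself,
presumably so it can check out matching files (such as its Terraform configs)
from its own repository at runtime. A minimal sketch of a caller pinned to a
test branch is below; `my-test-branch` is a placeholder, not a real branch,
and the surrounding keys are abbreviated from the actual workflow:

    jobs:
      build-and-run-model:
        permissions:
          # Allow the reusable workflow to push to GitHub Container Registry
          packages: write
        # Pin the workflow definition itself to the test branch
        uses: ccao-data/actions/.github/workflows/build-and-run-batch-job.yaml@my-test-branch
        with:
          # Also pass the branch as an input so the workflow checks out
          # matching files from its own repo
          ref: my-test-branch
          backend: "ec2"
          vcpu: "8"
          memory: "32000"

Patch 5 shows the revert once testing is done: point `uses:` back at `@main`
and drop the `ref` input.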