From 0602f97d1ea8972a153dd83a677852beec5e8eb4 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 27 Oct 2023 16:36:18 -0500 Subject: [PATCH] Add deploy.yaml workflow job and Terraform config for running the model on Batch --- .github/workflows/deploy.yaml | 187 +++++++++++++++++++++++++++++++++- Dockerfile | 3 + terraform/.gitignore | 1 + terraform/main.tf | 175 +++++++++++++++++++++++++++++++ terraform/outputs.tf | 9 ++ 5 files changed, 370 insertions(+), 5 deletions(-) create mode 100644 terraform/.gitignore create mode 100644 terraform/main.tf create mode 100644 terraform/outputs.tf diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml index 2693c157..65b778d8 100644 --- a/.github/workflows/deploy.yaml +++ b/.github/workflows/deploy.yaml @@ -1,13 +1,32 @@ +# Workflow that builds a Docker image containing the model code, +# pushes it to the GitHub Container Registry, and then optionally uses +# that container image to run the model using an AWS Batch job. +# +# Images are built on every commit to a PR or main branch in order to ensure +# that the build continues to work properly, but Batch jobs are gated behind +# a `deploy` environment that requires manual approval from a codeowner. + name: deploy on: pull_request: push: + # "*-assessment-year" are long-lived branches containing the most up-to-date + # models for a given assessment cycle, and hence we consider them to be + # main branches branches: [master, '*-assessment-year'] env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }} + DOCKER_REGISTRY: ghcr.io + DOCKER_IMAGE_NAME: ${{ github.repository }} + # How long to wait between queries when polling for the status of an + # AWS Batch job when it's running or starting up + BATCH_JOB_POLL_INTERVAL_SECONDS: 10 + # How many times to poll AWS Batch job status while it's starting up before + # deciding to raise an error. 
Multiply by BATCH_JOB_POLL_INTERVAL_SECONDS to + # derive a timeout in second units + BATCH_JOB_POLL_STARTUP_MAX_RETRIES: 60 + BATCH_JOB_LOG_URL_PREFIX: "https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logsV2:log-groups/log-group/%2Faws%2Fbatch%2Fjob/log-events/getting-started-wizard-job-definition%2Fdefault%2F" jobs: publish-docker-image: @@ -24,7 +43,7 @@ jobs: - name: Login to GitHub Container Registry uses: docker/login-action@v3 with: - registry: ${{ env.REGISTRY }} + registry: ${{ env.DOCKER_REGISTRY }} username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} @@ -32,11 +51,11 @@ jobs: id: meta uses: docker/metadata-action@v5 with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + images: ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_IMAGE_NAME }} # Tag the following types of images: # * On a branch, tag with the branch name (e.g. `master`) # * On a PR, tag with the PR number (e.g. `pr-12`) - # * On all events, tag with the short git SHA (e.g. `e956384`) + # * On a tagged commit, tag with the git tag (e.g. 
`2023`)
         tags: |
           type=ref,event=branch
           type=ref,event=pr
@@ -52,4 +71,162 @@ jobs:
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: type=gha
           cache-to: type=gha,mode=max
+          # Fix incorrect container type sidebar display in GitHub Container
+          # Registry
+          provenance: false
+
+  run-model:
+    needs: [publish-docker-image]
+    runs-on: ubuntu-latest
+    # Require manual approval to run this job
+    environment: deploy
+    # These permissions are needed to interact with GitHub's OIDC Token endpoint
+    # so that we can authenticate with AWS
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }}
+          aws-region: us-east-1
+          role-duration-seconds: 14400 # Worst-case time for a full model run
+
+      - name: Set up Terraform
+        uses: hashicorp/setup-terraform@v2
+
+      - name: Initialize Terraform
+        run: terraform init
+        shell: bash
+        working-directory: terraform
+
+      - name: Set Terraform variables
+        id: set-vars
+        run: |
+          # GITHUB_HEAD_REF is only set on pull_request events, so if it's
+          # present, we must be in a PR context
+          if [ -n "$GITHUB_HEAD_REF" ]; then
+            echo "On pull request branch, setting terraform workspace to CI"
+            # Replace slashes and underscores with hyphens in the workspace name
+            # and force it to lowercase, since we use it to name resources and
+            # we want to follow a consistent naming scheme
+            WORKSPACE="$(echo $GITHUB_HEAD_REF | \
+              sed -e 's/\//-/g' -e 's/_/-/g' | \
+              tr '[:upper:]' '[:lower:]')"
+            BATCH_JOB_NAME="ci_${WORKSPACE}_${GITHUB_REPOSITORY//\//-}"
+
+          elif [[ $GITHUB_REF_NAME == 'master' ]]; then
+            echo "On master branch, setting terraform workspace to prod"
+            WORKSPACE="prod"
+            BATCH_JOB_NAME="${GITHUB_REPOSITORY//\//-}"
+
+          else
+            echo "CI context did not match any of the expected environments"
+            exit 1
+          fi
+
+          # Write the Terraform variables to a tfvars file in the terraform
+          # directory, where `terraform apply` will load it automatically
+          printf '%s\n' \
+            "batch_job_name = \"$BATCH_JOB_NAME\"" \
+            "batch_container_image_id = \"$BATCH_CONTAINER_IMAGE_ID\"" \
+            > terraform/terraform.tfvars
+          echo "batch-job-name=$BATCH_JOB_NAME" >> "$GITHUB_OUTPUT"
+          # Export the workspace name so the workspace selection step below
+          # can read it
+          echo "workspace=$WORKSPACE" >> "$GITHUB_OUTPUT"
+        shell: bash
+        env:
+          # Use the `needs` context here; the `jobs` context does not exist
+          # inside a running job
+          BATCH_CONTAINER_IMAGE_ID: ${{ needs.publish-docker-image.outputs.imageid }}
+
+      - name: Select Terraform workspace
+        run: terraform workspace select -or-create "$WORKSPACE"
+        shell: bash
+        working-directory: terraform
+        env:
+          WORKSPACE: ${{ steps.set-vars.outputs.workspace }}
+
+      - name: Validate Terraform config
+        run: terraform validate
+        working-directory: terraform
+        shell: bash
+
+      - name: Apply Terraform changes
+        run: terraform apply -auto-approve
+        working-directory: terraform
+        shell: bash
+
+      - name: Submit new Batch job
+        id: submit-job
+        # Run in the terraform directory so `terraform output` can read the
+        # state for the currently selected workspace. Use `-raw` so the ARNs
+        # are not wrapped in quotes
+        working-directory: terraform
+        run: |
+          BATCH_JOB_ID=$(
+            aws batch submit-job \
+              --job-name "$BATCH_JOB_NAME" \
+              --job-definition "$(terraform output -raw batch_job_definition_arn)" \
+              --job-queue "$(terraform output -raw batch_job_queue_arn)" \
+              | jq -r ".jobId"
+          )
+          echo "batch-job-id=$BATCH_JOB_ID" >> "$GITHUB_OUTPUT"
+        shell: bash
+        env:
+          BATCH_JOB_NAME: ${{ steps.set-vars.outputs.batch-job-name }}
+
+      - name: Wait for Batch job to complete
+        run: |
+          LOOP_COUNTER=0
+          while true; do
+            # The AWS CLI flag is `--jobs`, which accepts a list of job IDs
+            JOB_DESCRIPTIONS=$(aws batch describe-jobs --jobs "$BATCH_JOB_ID")
+
+            JOB_LIST=$(echo "$JOB_DESCRIPTIONS" | jq -r '.jobs')
+            if [[ "$JOB_LIST" == "[]" ]]; then
+              echo "Unexpected empty response from aws batch describe-jobs"
+              exit 1
+            fi
+
+            JOB_STATUS=$(echo "$JOB_DESCRIPTIONS" | jq -r '.jobs[0].status')
+            echo "Job status is $JOB_STATUS"
+
+            JOB_LOG_STREAM_NAME=$(
+              echo "$JOB_DESCRIPTIONS" | \
+                jq -r '.jobs[0].container.logStreamName' \
+            )
+
+            case "$JOB_STATUS" in
+              "RUNNING")
+                if [[ "$LOOP_COUNTER" == "0" ]]; then
+                  # Only print the logs on the first loop, to keep output clean
+                  echo "See logs: ${BATCH_JOB_LOG_URL_PREFIX}${JOB_LOG_STREAM_NAME}"
+                fi
+                echo "Sleeping ${BATCH_JOB_POLL_INTERVAL_SECONDS}s until next status check"
+                sleep "$BATCH_JOB_POLL_INTERVAL_SECONDS"
+                ;;
+
+              "SUCCEEDED")
+                echo "Job
succeeded!"
+                exit 0
+                ;;
+
+              "FAILED")
+                echo "Job failed :( See logs: ${BATCH_JOB_LOG_URL_PREFIX}${JOB_LOG_STREAM_NAME}"
+                exit 1
+                ;;
+
+              *)
+                if [[ "$LOOP_COUNTER" == "$BATCH_JOB_POLL_STARTUP_MAX_RETRIES" ]]; then
+                  echo "Failing workflow due to job startup timeout. This means "
+                  echo "that the job did not enter a RUNNING state within a "
+                  echo "reasonable amount of time. This usually indicates a "
+                  echo "problem in the underlying ECS or EC2 backend that can "
+                  echo "be debugged by checking cluster/instance logs in the "
+                  echo "AWS console."
+                  exit 1
+                fi
+                echo "Sleeping ${BATCH_JOB_POLL_INTERVAL_SECONDS}s until next status check"
+                sleep "$BATCH_JOB_POLL_INTERVAL_SECONDS"
+                ;;
+            esac
+
+            echo "Starting status check #$((LOOP_COUNTER++))"
+
+          done
+        shell: bash
+        env:
+          BATCH_JOB_ID: ${{ steps.submit-job.outputs.batch-job-id }}
diff --git a/Dockerfile b/Dockerfile
index 9395a2a2..a7ef7c9f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -41,3 +41,6 @@ RUN mv renv model-res-avm/
 
 # Set the working directory to the app dir
 WORKDIR model-res-avm/
+
+# TODO: Set this to the full pipeline once testing is complete
+CMD dvc pull && dvc repro train
diff --git a/terraform/.gitignore b/terraform/.gitignore
new file mode 100644
index 00000000..e79eb231
--- /dev/null
+++ b/terraform/.gitignore
@@ -0,0 +1 @@
+.terraform*
diff --git a/terraform/main.tf b/terraform/main.tf
new file mode 100644
index 00000000..2f1c279c
--- /dev/null
+++ b/terraform/main.tf
@@ -0,0 +1,175 @@
+# Terraform configuration for AWS Batch job resources that can run the model.
+#
+# Note that some stable resources that are shared between all Batch environments
+# are either builtin to all AWS accounts or are defined in our core AWS
+# infrastructure repository, which is separate from this module. These resources
+# are thus referenced using Terraform `data` entities rather than `resource`
+# entities, which means Terraform will not attempt to create or update them.
+# These resources include, but are not limited to: +# +# * The VPC, subnets, and security group used for container networking +# * IAM roles for task execution and Batch provisioning + +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.23" + } + } + + required_version = ">= 1.5.7" + + backend "s3" { + bucket = "ccao-terraform-state-us-east-1" + key = "terraform.tfstate" + region = "us-east-1" + workspace_key_prefix = "model-res-avm/workspaces" + } +} + +provider "aws" { + region = "us-east-1" +} + +# This variable defines the name of all of the Batch resources managed by +# this configuration. The name should have appropriate prefixes depending on +# whether it's intended for a staging or prod environment, so we leave the +# naming logic up to the Terraform caller, which we expect to be a CI +# environment with knowledge of the current git branch name +variable "batch_job_name" { + type = string +} + +# Set the ID of the container image that the Batch job should pull to use as +# its execution environment, e.g. "ghcr.io/ccao-data/model-res-avm:master". +# This is defined as a variable so that CI environments can point Batch +# job definitions to freshly built images +variable "batch_container_image_id" { + type = string +} + +# Retrieve the default VPC for this region, which is builtin to AWS. +# Containers in the Batch compute environment will be deployed into this VPC +data "aws_vpc" "default" { + default = true +} + +# Retrieve the default subnets in the default VPC, which are builtin to AWS. +# Containers in the Batch compute environment are connected to the Internet +# using these subnets. 
Note that these subnets are public by default, but the +# use of an additional security group ensures that we block all ingress to the +# compute environment +data "aws_subnets" "default" { + filter { + name = "vpc-id" + values = [data.aws_vpc.default.id] + } + filter { + name = "default-for-az" + values = [true] + } + filter { + name = "state" + values = ["available"] + } +} + +# Retrieve the security group that blocks all ingress and allows egress over +# HTTPS only +data "aws_security_group" "outbound_https" { + name = "outbound-https" + vpc_id = data.aws_vpc.default.id +} + +# Retrieve the IAM role that the Batch compute environment uses to manage +# EC2/ECS resources +data "aws_iam_role" "batch_service_role" { + name = "AWSServiceRoleForBatch" +} + +# Retrieve the IAM role that the Batch job definition uses to execute ECS +# operations like pulling Docker images and pushing logs to CloudWatch +data "aws_iam_role" "ecs_task_execution_role" { + name = "ecsTaskExecutionRole" +} + +# Retrieve the IAM role that the Batch job passes on to the containers, allowing +# those containers to access AWS resources like data stored in S3 +data "aws_iam_role" "ecs_job_role" { + name = "ccao-ecs-dvc-full-access" +} + +# Create a Batch compute environment to run containers. Compute environments +# define the underlying ECS or EC2 resources that will be provisioned to +# use for running jobs in containers. Docs here: +# https://docs.aws.amazon.com/batch/latest/userguide/compute_environments.html +resource "aws_batch_compute_environment" "main" { + compute_environment_name = var.batch_job_name + service_role = data.aws_iam_role.batch_service_role.arn + state = "ENABLED" + type = "MANAGED" + + compute_resources { + type = "FARGATE" + min_vcpus = 0 + max_vcpus = 64 # Max across all jobs, not within one job + security_group_ids = [data.aws_security_group.outbound_https.id] + subnets = data.aws_subnets.default.ids + } +} + +# Create a Batch job queue to run jobs. 
Job queues keep track of which jobs +# are waiting to run and in what order they should be prioritized in cases +# where its associated compute environment has reached max capacity. Docs here: +# https://docs.aws.amazon.com/batch/latest/userguide/job_queues.html +resource "aws_batch_job_queue" "main" { + name = var.batch_job_name + compute_environments = [aws_batch_compute_environment.main.arn] + priority = 0 + state = "ENABLED" +} + +# Create a Batch job definition to define jobs. Job definitions provide the +# information about how containers should be configured in a compute +# environment and what those containers should do. When combined with a +# job queue, they provide enough information to submit a job. Docs here: +# https://docs.aws.amazon.com/batch/latest/userguide/job_definitions.html +# +# Note that jobs using this job definition cannot be provisioned by Terraform, +# and are instead submitted via calls to `aws batch submit-job` in the same CI +# workflow that provisions these resources +resource "aws_batch_job_definition" "main" { + name = var.batch_job_name + platform_capabilities = ["FARGATE"] + type = "container" + + container_properties = jsonencode({ + executionRoleArn = data.aws_iam_role.ecs_task_execution_role.arn + fargatePlatformConfiguration = { + platformVersion = "LATEST" + } + image = var.batch_container_image_id + jobRoleArn = data.aws_iam_role.ecs_job_role.arn + logConfiguration = { + logDriver = "awslogs" + } + networkConfiguration = { + assignPublicIp = "ENABLED" + } + resourceRequirements = [ + { + type = "VCPU" + value = "16.0" + }, + { + type = "MEMORY" + value = "65536" + } + ] + runtimePlatform = { + cpuArchitecture = "X86_64" + operatingSystemFamily = "LINUX" + } + }) +} diff --git a/terraform/outputs.tf b/terraform/outputs.tf new file mode 100644 index 00000000..27e838c7 --- /dev/null +++ b/terraform/outputs.tf @@ -0,0 +1,9 @@ +output "batch_job_definition_arn" { + description = "ARN of the Batch job definition" + value = 
aws_batch_job_definition.main.arn +} + +output "batch_job_queue_arn" { + description = "ARN of the Batch job queue" + value = aws_batch_job_queue.main.arn +}