From 0602f97d1ea8972a153dd83a677852beec5e8eb4 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 27 Oct 2023 16:36:18 -0500 Subject: [PATCH] Add deploy.yaml workflow job and Terraform config for running the model on Batch --- .github/workflows/deploy.yaml | 187 +++++++++++++++++++++++++++++++++- Dockerfile | 3 + terraform/.gitignore | 1 + terraform/main.tf | 175 +++++++++++++++++++++++++++++++ terraform/outputs.tf | 9 ++ 5 files changed, 370 insertions(+), 5 deletions(-) create mode 100644 terraform/.gitignore create mode 100644 terraform/main.tf create mode 100644 terraform/outputs.tf diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml index 2693c157..65b778d8 100644 --- a/.github/workflows/deploy.yaml +++ b/.github/workflows/deploy.yaml @@ -1,13 +1,32 @@ +# Workflow that builds a Docker image containing the model code, +# pushes it to the GitHub Container Registry, and then optionally uses +# that container image to run the model using an AWS Batch job. +# +# Images are built on every commit to a PR or main branch in order to ensure +# that the build continues to work properly, but Batch jobs are gated behind +# a `deploy` environment that requires manual approval from a codeowner. + name: deploy on: pull_request: push: + # "*-assessment-year" are long-lived branches containing the most up-to-date + # models for a given assessment cycle, and hence we consider them to be + # main branches branches: [master, '*-assessment-year'] env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }} + DOCKER_REGISTRY: ghcr.io + DOCKER_IMAGE_NAME: ${{ github.repository }} + # How long to wait between queries when polling for the status of an + # AWS Batch job when it's running or starting up + BATCH_JOB_POLL_INTERVAL_SECONDS: 10 + # How many times to poll AWS Batch job status while it's starting up before + # deciding to raise an error. 
Multiply by BATCH_JOB_POLL_INTERVAL_SECONDS to + # derive a timeout in second units + BATCH_JOB_POLL_STARTUP_MAX_RETRIES: 60 + BATCH_JOB_LOG_URL_PREFIX: "https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logsV2:log-groups/log-group/%2Faws%2Fbatch%2Fjob/log-events/getting-started-wizard-job-definition%2Fdefault%2F" jobs: publish-docker-image: @@ -24,7 +43,7 @@ jobs: - name: Login to GitHub Container Registry uses: docker/login-action@v3 with: - registry: ${{ env.REGISTRY }} + registry: ${{ env.DOCKER_REGISTRY }} username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} @@ -32,11 +51,11 @@ jobs: id: meta uses: docker/metadata-action@v5 with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + images: ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_IMAGE_NAME }} # Tag the following types of images: # * On a branch, tag with the branch name (e.g. `master`) # * On a PR, tag with the PR number (e.g. `pr-12`) - # * On all events, tag with the short git SHA (e.g. `e956384`) + # * On a tagged commit, tag with the git tag (e.g. 
`2023`)
         tags: |
           type=ref,event=branch
           type=ref,event=pr
@@ -52,4 +71,162 @@ jobs:
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: type=gha
           cache-to: type=gha,mode=max
+          # Fix incorrect container type sidebar display in GitHub Container
+          # Registry
+          provenance: false
+
+  run-model:
+    needs: [publish-docker-image]
+    runs-on: ubuntu-latest
+    # Require manual approval to run this job
+    environment: deploy
+    # These permissions are needed to interact with GitHub's OIDC Token endpoint
+    # so that we can authenticate with AWS
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }}
+          aws-region: us-east-1
+          role-duration-seconds: 14400 # Worst-case time for a full model run
+
+      - name: Set up Terraform
+        uses: hashicorp/setup-terraform@v2
+
+      - name: Initialize Terraform
+        run: terraform init
+        shell: bash
+        working-directory: terraform
+
+      - name: Set Terraform variables
+        id: set-vars
+        run: |
+          # GITHUB_HEAD_REF is only set on pull_request events, so if it's
+          # present, we must be in a PR context
+          if [ -n "$GITHUB_HEAD_REF" ]; then
+            echo "On pull request branch, setting terraform workspace to CI"
+            # Replace slashes and underscores with hyphens in the workspace name
+            # and force it to lowercase, since we use it to name resources and
+            # we want to follow a consistent naming scheme
+            WORKSPACE="$(echo $GITHUB_HEAD_REF | \
+              sed -e 's/\//-/g' -e 's/_/-/g' | \
+              tr '[:upper:]' '[:lower:]')"
+            BATCH_JOB_NAME="ci_${WORKSPACE}_${GITHUB_REPOSITORY//\//-}"
+
+          elif [[ $GITHUB_REF_NAME == 'master' ]]; then
+            echo "On master branch, setting terraform workspace to prod"
+            WORKSPACE="prod"
+            BATCH_JOB_NAME="${GITHUB_REPOSITORY//\//-}"
+
+          else
+            echo "CI context did not match any of the expected environments"
+            exit 1
+          fi
+
+          # Write the Terraform variables to a tfvars file in the terraform
+          # directory, where `terraform apply` will load it automatically
+          printf '%s\n' \
+            "batch_job_name = \"$BATCH_JOB_NAME\"" \
+            "batch_container_image_id = \"$BATCH_CONTAINER_IMAGE_ID\"" \
+            > terraform/terraform.tfvars
+          echo "batch-job-name=$BATCH_JOB_NAME" >> "$GITHUB_OUTPUT"
+          # Export the workspace name so the workspace selection step below
+          # can read it
+          echo "workspace=$WORKSPACE" >> "$GITHUB_OUTPUT"
+        shell: bash
+        env:
+          # Use the `needs` context here; the `jobs` context does not exist
+          # inside a running job
+          BATCH_CONTAINER_IMAGE_ID: ${{ needs.publish-docker-image.outputs.imageid }}
+
+      - name: Select Terraform workspace
+        run: terraform workspace select -or-create "$WORKSPACE"
+        shell: bash
+        working-directory: terraform
+        env:
+          WORKSPACE: ${{ steps.set-vars.outputs.workspace }}
+
+      - name: Validate Terraform config
+        run: terraform validate
+        working-directory: terraform
+        shell: bash
+
+      - name: Apply Terraform changes
+        run: terraform apply -auto-approve
+        working-directory: terraform
+        shell: bash
+
+      - name: Submit new Batch job
+        id: submit-job
+        # Run in the terraform directory so `terraform output` can read the
+        # state for the currently selected workspace. Use `-raw` so the ARNs
+        # are not wrapped in quotes
+        working-directory: terraform
+        run: |
+          BATCH_JOB_ID=$(
+            aws batch submit-job \
+              --job-name "$BATCH_JOB_NAME" \
+              --job-definition "$(terraform output -raw batch_job_definition_arn)" \
+              --job-queue "$(terraform output -raw batch_job_queue_arn)" \
+              | jq -r ".jobId"
+          )
+          echo "batch-job-id=$BATCH_JOB_ID" >> "$GITHUB_OUTPUT"
+        shell: bash
+        env:
+          BATCH_JOB_NAME: ${{ steps.set-vars.outputs.batch-job-name }}
+
+      - name: Wait for Batch job to complete
+        run: |
+          LOOP_COUNTER=0
+          while true; do
+            # The AWS CLI flag is `--jobs`, which accepts a list of job IDs
+            JOB_DESCRIPTIONS=$(aws batch describe-jobs --jobs "$BATCH_JOB_ID")
+
+            JOB_LIST=$(echo "$JOB_DESCRIPTIONS" | jq -r '.jobs')
+            if [[ "$JOB_LIST" == "[]" ]]; then
+              echo "Unexpected empty response from aws batch describe-jobs"
+              exit 1
+            fi
+
+            JOB_STATUS=$(echo "$JOB_DESCRIPTIONS" | jq -r '.jobs[0].status')
+            echo "Job status is $JOB_STATUS"
+
+            JOB_LOG_STREAM_NAME=$(
+              echo "$JOB_DESCRIPTIONS" | \
+                jq -r '.jobs[0].container.logStreamName' \
+            )
+
+            case "$JOB_STATUS" in
+              "RUNNING")
+                if [[ "$LOOP_COUNTER" == "0" ]]; then
+                  # Only print the logs on the first loop, to keep output clean
+                  echo "See logs: ${BATCH_JOB_LOG_URL_PREFIX}${JOB_LOG_STREAM_NAME}"
+                fi
+                echo "Sleeping ${BATCH_JOB_POLL_INTERVAL_SECONDS}s until next status check"
+                sleep "$BATCH_JOB_POLL_INTERVAL_SECONDS"
+                ;;
+
+              "SUCCEEDED")
+                echo "Job
succeeded!"
+                exit 0
+                ;;
+
+              "FAILED")
+                echo "Job failed :( See logs: ${BATCH_JOB_LOG_URL_PREFIX}${JOB_LOG_STREAM_NAME}"
+                exit 1
+                ;;
+
+              *)
+                if [[ "$LOOP_COUNTER" == "$BATCH_JOB_POLL_STARTUP_MAX_RETRIES" ]]; then
+                  echo "Failing workflow due to job startup timeout. This means "
+                  echo "that the job did not enter a RUNNING state within a "
+                  echo "reasonable amount of time. This usually indicates a "
+                  echo "problem in the underlying ECS or EC2 backend that can "
+                  echo "be debugged by checking cluster/instance logs in the "
+                  echo "AWS console."
+                  exit 1
+                fi
+                echo "Sleeping ${BATCH_JOB_POLL_INTERVAL_SECONDS}s until next status check"
+                sleep "$BATCH_JOB_POLL_INTERVAL_SECONDS"
+                ;;
+            esac
+
+            echo "Starting status check #$((LOOP_COUNTER++))"
+
+          done
+        shell: bash
+        env:
+          BATCH_JOB_ID: ${{ steps.submit-job.outputs.batch-job-id }}
diff --git a/Dockerfile b/Dockerfile
index 9395a2a2..a7ef7c9f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -41,3 +41,6 @@ RUN mv renv model-res-avm/
 
 # Set the working directory to the app dir
 WORKDIR model-res-avm/
+
+# TODO: Set this to the full pipeline once testing is complete
+CMD dvc pull && dvc repro train
diff --git a/terraform/.gitignore b/terraform/.gitignore
new file mode 100644
index 00000000..e79eb231
--- /dev/null
+++ b/terraform/.gitignore
@@ -0,0 +1 @@
+.terraform*
diff --git a/terraform/main.tf b/terraform/main.tf
new file mode 100644
index 00000000..2f1c279c
--- /dev/null
+++ b/terraform/main.tf
@@ -0,0 +1,175 @@
+# Terraform configuration for AWS Batch job resources that can run the model.
+#
+# Note that some stable resources that are shared between all Batch environments
+# are either builtin to all AWS accounts or are defined in our core AWS
+# infrastructure repository, which is separate from this module. These resources
+# are thus referenced using Terraform `data` entities rather than `resource`
+# entities, which means Terraform will not attempt to create or update them.
+# These resources include, but are not limited to: +# +# * The VPC, subnets, and security group used for container networking +# * IAM roles for task execution and Batch provisioning + +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.23" + } + } + + required_version = ">= 1.5.7" + + backend "s3" { + bucket = "ccao-terraform-state-us-east-1" + key = "terraform.tfstate" + region = "us-east-1" + workspace_key_prefix = "model-res-avm/workspaces" + } +} + +provider "aws" { + region = "us-east-1" +} + +# This variable defines the name of all of the Batch resources managed by +# this configuration. The name should have appropriate prefixes depending on +# whether it's intended for a staging or prod environment, so we leave the +# naming logic up to the Terraform caller, which we expect to be a CI +# environment with knowledge of the current git branch name +variable "batch_job_name" { + type = string +} + +# Set the ID of the container image that the Batch job should pull to use as +# its execution environment, e.g. "ghcr.io/ccao-data/model-res-avm:master". +# This is defined as a variable so that CI environments can point Batch +# job definitions to freshly built images +variable "batch_container_image_id" { + type = string +} + +# Retrieve the default VPC for this region, which is builtin to AWS. +# Containers in the Batch compute environment will be deployed into this VPC +data "aws_vpc" "default" { + default = true +} + +# Retrieve the default subnets in the default VPC, which are builtin to AWS. +# Containers in the Batch compute environment are connected to the Internet +# using these subnets. 
Note that these subnets are public by default, but the +# use of an additional security group ensures that we block all ingress to the +# compute environment +data "aws_subnets" "default" { + filter { + name = "vpc-id" + values = [data.aws_vpc.default.id] + } + filter { + name = "default-for-az" + values = [true] + } + filter { + name = "state" + values = ["available"] + } +} + +# Retrieve the security group that blocks all ingress and allows egress over +# HTTPS only +data "aws_security_group" "outbound_https" { + name = "outbound-https" + vpc_id = data.aws_vpc.default.id +} + +# Retrieve the IAM role that the Batch compute environment uses to manage +# EC2/ECS resources +data "aws_iam_role" "batch_service_role" { + name = "AWSServiceRoleForBatch" +} + +# Retrieve the IAM role that the Batch job definition uses to execute ECS +# operations like pulling Docker images and pushing logs to CloudWatch +data "aws_iam_role" "ecs_task_execution_role" { + name = "ecsTaskExecutionRole" +} + +# Retrieve the IAM role that the Batch job passes on to the containers, allowing +# those containers to access AWS resources like data stored in S3 +data "aws_iam_role" "ecs_job_role" { + name = "ccao-ecs-dvc-full-access" +} + +# Create a Batch compute environment to run containers. Compute environments +# define the underlying ECS or EC2 resources that will be provisioned to +# use for running jobs in containers. Docs here: +# https://docs.aws.amazon.com/batch/latest/userguide/compute_environments.html +resource "aws_batch_compute_environment" "main" { + compute_environment_name = var.batch_job_name + service_role = data.aws_iam_role.batch_service_role.arn + state = "ENABLED" + type = "MANAGED" + + compute_resources { + type = "FARGATE" + min_vcpus = 0 + max_vcpus = 64 # Max across all jobs, not within one job + security_group_ids = [data.aws_security_group.outbound_https.id] + subnets = data.aws_subnets.default.ids + } +} + +# Create a Batch job queue to run jobs. 
Job queues keep track of which jobs +# are waiting to run and in what order they should be prioritized in cases +# where its associated compute environment has reached max capacity. Docs here: +# https://docs.aws.amazon.com/batch/latest/userguide/job_queues.html +resource "aws_batch_job_queue" "main" { + name = var.batch_job_name + compute_environments = [aws_batch_compute_environment.main.arn] + priority = 0 + state = "ENABLED" +} + +# Create a Batch job definition to define jobs. Job definitions provide the +# information about how containers should be configured in a compute +# environment and what those containers should do. When combined with a +# job queue, they provide enough information to submit a job. Docs here: +# https://docs.aws.amazon.com/batch/latest/userguide/job_definitions.html +# +# Note that jobs using this job definition cannot be provisioned by Terraform, +# and are instead submitted via calls to `aws batch submit-job` in the same CI +# workflow that provisions these resources +resource "aws_batch_job_definition" "main" { + name = var.batch_job_name + platform_capabilities = ["FARGATE"] + type = "container" + + container_properties = jsonencode({ + executionRoleArn = data.aws_iam_role.ecs_task_execution_role.arn + fargatePlatformConfiguration = { + platformVersion = "LATEST" + } + image = var.batch_container_image_id + jobRoleArn = data.aws_iam_role.ecs_job_role.arn + logConfiguration = { + logDriver = "awslogs" + } + networkConfiguration = { + assignPublicIp = "ENABLED" + } + resourceRequirements = [ + { + type = "VCPU" + value = "16.0" + }, + { + type = "MEMORY" + value = "65536" + } + ] + runtimePlatform = { + cpuArchitecture = "X86_64" + operatingSystemFamily = "LINUX" + } + }) +} diff --git a/terraform/outputs.tf b/terraform/outputs.tf new file mode 100644 index 00000000..27e838c7 --- /dev/null +++ b/terraform/outputs.tf @@ -0,0 +1,9 @@ +output "batch_job_definition_arn" { + description = "ARN of the Batch job definition" + value = 
aws_batch_job_definition.main.arn +} + +output "batch_job_queue_arn" { + description = "ARN of the Batch job queue" + value = aws_batch_job_queue.main.arn +}