diff --git a/.github/actions/setup-terraform/action.yaml b/.github/actions/setup-terraform/action.yaml deleted file mode 100644 index 6a10c623..00000000 --- a/.github/actions/setup-terraform/action.yaml +++ /dev/null @@ -1,111 +0,0 @@ -name: Setup Terraform -description: Install and configure Terraform and AWS for the correct environment -inputs: - role-to-assume: - description: AWS IAM role to assume when running Terraform operations. - required: true - aws-account-id: - description: AWS account ID to use to create resources. - required: true - batch-container-image-name: - description: The name of the container image to use for the Batch job. - required: true - batch-job-definition-vcpu: - description: > - Count of cVPUs to provision for the container. Per AWS requirements, - must be formatted as a float, e.g. 1.0 for 1 vCPU. The minimum - is 1 vCPU and values must be specified in increments of 0.25. - required: true - batch-job-definition-memory: - description: Count of megabytes of RAM to provision for the container. - required: true - role-duration-seconds: - description: How long the role specified by role-to-assume should be valid. - required: false - default: 3600 - tfvars-file: - description: File to store Terraform variables. - required: false - default: terraform.tfvars - working-directory: - description: Directory where the Terraform configuration is stored. - required: false - default: . -runs: - using: composite - steps: - - name: Mask sensitive AWS IDs from Terraform logs - run: | - echo "::add-mask::${{ inputs.role-to-assume }}" - echo "::add-mask::${{ inputs.aws-account-id }}" - shell: bash - - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: ${{ inputs.role-to-assume }} - aws-region: us-east-1 - role-duration-seconds: ${{ inputs.role-duration-seconds }} - - - name: Set up Terraform - uses: hashicorp/setup-terraform@v2 - - - name: Initialize Terraform - run: | - terraform init \ - -backend-config "bucket=ccao-terraform-state-us-east-1" \ - -backend-config "key=terraform.tfstate" \ - -backend-config "region=us-east-1" \ - -backend-config "workspace_key_prefix=$REPO/workspaces" - shell: bash - working-directory: ${{ inputs.working-directory }} - env: - REPO: ${{ github.event.repository.name }} - - - name: Set Terraform variables - id: set-vars - run: | - # GITHUB_HEAD_REF is only set on pull_request events, so if it's - # present, we must be in a PR context - if [ -n "$GITHUB_HEAD_REF" ]; then - echo "On pull request branch, setting terraform workspace to CI" - # Replace slashes and underscores with hyphens in the workspace name - # and force it to lowercase, since we use it to name resources and - # we want to follow a consistent naming scheme - WORKSPACE="$(echo $GITHUB_HEAD_REF | \ - sed -e 's/\//-/g' -e 's/_/-/g' | \ - tr '[:upper:]' '[:lower:]')" - BATCH_JOB_NAME="ci_${WORKSPACE}_${GITHUB_REPOSITORY//\//-}" - - elif [[ $GITHUB_REF_NAME == 'master' ]]; then - echo "On master branch, setting terraform workspace to prod" - WORKSPACE="prod" - BATCH_JOB_NAME="${GITHUB_REPOSITORY//\//-}" - - else - echo "CI context did not match any of the expected environments" - exit 1 - fi - - { - echo "batch_job_name = \"$BATCH_JOB_NAME\""; - echo "batch_container_image_name = \"$BATCH_CONTAINER_IMAGE_NAME\""; - echo "batch_job_definition_vcpu = \"$BATCH_JOB_DEFINITION_VCPU\""; - echo "batch_job_definition_memory = \"$BATCH_JOB_DEFINITION_MEMORY\""; - } > "$TFVARS_FILE" - - echo "workspace=$WORKSPACE" >> "$GITHUB_OUTPUT" - shell: bash - working-directory: ${{ inputs.working-directory }} - env: - TFVARS_FILE: ${{ inputs.tfvars-file }} - BATCH_CONTAINER_IMAGE_NAME: ${{ inputs.batch-container-image-name }} - BATCH_JOB_DEFINITION_VCPU: ${{ inputs.batch-job-definition-vcpu }} - BATCH_JOB_DEFINITION_MEMORY: ${{ inputs.batch-job-definition-memory }} - - - name: Select Terraform workspace - run: terraform workspace select -or-create "$WORKSPACE" - shell: bash - working-directory: ${{ inputs.working-directory }} - env: - WORKSPACE: ${{ steps.set-vars.outputs.workspace }} diff --git a/.github/scripts/poll_batch_job_status.sh b/.github/scripts/poll_batch_job_status.sh deleted file mode 100755 index a719c87a..00000000 --- a/.github/scripts/poll_batch_job_status.sh +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env bash -# Poll an AWS Batch job to check its status. -# -# Takes two positional arguments: -# -# 1. (required) The ID of the Batch job to poll -# 2. (optional) An enum indicating which type of poll to run: `startup` or -# `completion`. Defaults to `completion`. If the value of the argument is -# `startup`, the script will treat the `RUNNING` status as a terminal -# success status. Otherwise, only `SUCCESS` will count as a terminal -# success status, and a status of `RUNNING` will cause the script to -# continue polling. -# -# Example usage: -# -# ./.github/scripts/poll_batch_job_status.sh 12345 startup -set -euo pipefail - -BATCH_JOB_LOG_URL_PREFIX="https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logsV2:log-groups/log-group/%2Faws%2Fbatch%2Fjob/log-events" -# How many times to poll AWS Batch job status while it's starting up before -# deciding to raise an error. Multiply by BATCH_JOB_POLL_INTERVAL_SECONDS to -# derive a timeout in second units. There is no equivalent timeout for running -# jobs, because those timeouts can be set on the Batch level, whereas startup -# timeouts are not controllable by Batch -BATCH_JOB_POLL_STARTUP_MAX_RETRIES=60 - -if [ -z ${1+x} ]; then - echo "Missing Batch job ID" - exit 1 -fi - -BATCH_JOB_ID="$1" - -POLL_TYPE="completion" - # How long to wait between queries when polling -BATCH_JOB_POLL_INTERVAL_SECONDS=300 # 5 minutes - -if [[ -n ${2+x} ]]; then - case "$2" in - "startup") - POLL_TYPE="startup" - BATCH_JOB_POLL_INTERVAL_SECONDS=60 - ;; - - "completion") - ;; - - *) - echo "Positional argument must be one of 'startup' or 'completion', " - echo "got: '$2'" - exit 1 - ;; - esac -fi - -echo "Polling for status of Batch job $BATCH_JOB_ID, waiting for $POLL_TYPE" - - -LOOP_COUNTER=0 -while true; do - echo "Getting status of job $BATCH_JOB_ID" - JOB_DESCRIPTIONS=$(aws batch describe-jobs --jobs "$BATCH_JOB_ID") - - JOB_LIST=$(echo "$JOB_DESCRIPTIONS" | jq -r '.jobs') - if [[ "$JOB_LIST" == "[]" ]]; then - echo "Unexpected empty response from aws batch describe-jobs" - exit 1 - fi - - JOB_STATUS=$(echo "$JOB_DESCRIPTIONS" | jq -r '.jobs[0].status') - echo "Job status is $JOB_STATUS" - - JOB_LOG_STREAM_NAME=$(\ - echo "$JOB_DESCRIPTIONS" | \ - jq -r '.jobs[0].container.logStreamName' \ - ) - # Any slashes in the log stream name need to be urlencoded - JOB_LOG_URL="${BATCH_JOB_LOG_URL_PREFIX}/${JOB_LOG_STREAM_NAME//\//%2F}" - - case "$JOB_STATUS" in - "RUNNING") - if [[ "$POLL_TYPE" == "startup" ]]; then - echo "Job has started! See logs: $JOB_LOG_URL" - exit 0 - fi - ;; - - "SUCCEEDED") - echo "Job succeeded!" - exit 0 - ;; - - "FAILED") - echo "Job failed :( See logs: $JOB_LOG_URL" - echo "More logs and container metrics can also be found on the " - echo "job detail page in the AWS Batch console" - exit 1 - ;; - - *) - if [[ "$LOOP_COUNTER" == "$BATCH_JOB_POLL_STARTUP_MAX_RETRIES" ]]; then - echo "Failing workflow due to job startup timeout. This means " - echo "that the job did not enter a RUNNING state within a " - echo "reasonable amount of time. This usually indicates a " - echo "problem in the underlying ECS or EC2 backend that can " - echo "be debugged by checking cluster/instance logs in the " - echo "AWS console." - exit 1 - fi - ;; - esac - - echo "Sleeping ${BATCH_JOB_POLL_INTERVAL_SECONDS}s until next status check" - sleep "$BATCH_JOB_POLL_INTERVAL_SECONDS" - - LOOP_COUNTER=$((LOOP_COUNTER + 1)) - echo "Starting status check #$LOOP_COUNTER" - -done diff --git a/.github/workflows/build-and-run-batch-job.yaml b/.github/workflows/build-and-run-batch-job.yaml deleted file mode 100644 index 0d1cf994..00000000 --- a/.github/workflows/build-and-run-batch-job.yaml +++ /dev/null @@ -1,179 +0,0 @@ -# Reusable workflow that builds a Docker image, pushes it to the GitHub -# Container Registry, and then optionally uses that container image to run -# an AWS Batch job. -# -# Requirements: -# -# * A Dockerfile must be defined in the root of the repo whose workflow is -# calling this one. -# * A `deploy` environment must be configured in the calling repo. This -# environment is used to protect the `run` job, which must be approved by -# a core team member before it will run. -# * The calling workflow must grant the following permissions to the job -# that calls this workflow: -# * contents: read -# * id-token: write -# * packages: write -# * Various required inputs and secrets, documented below, must be passed in -# by the calling workflow. - -name: build-and-run-batch-job - -on: - workflow_call: - inputs: - role-duration-seconds: - description: How long IAM role used to auth with AWS can be valid. - required: false - type: string - default: 3600 - vcpu: - description: > - Count of cVPUs to provision for the container. Per AWS requirements, - must be formatted as a float, e.g. 1.0 for 1 vCPU. The minimum - is 1 vCPU and values must be specified in increments of 0.25. - required: false - type: string - default: "1.0" - memory: - description: Count of megabytes of RAM to provision for the container. - required: false - type: string - default: "4096" - terraform-working-directory: - description: Directory where the Terraform configuration is stored. - required: false - type: string - default: . - - secrets: - AWS_IAM_ROLE_TO_ASSUME_ARN: - required: true - AWS_ACCOUNT_ID: - required: true - -env: - DOCKER_REGISTRY: ghcr.io - DOCKER_IMAGE_NAME: ${{ github.repository }} - -jobs: - build: - runs-on: ubuntu-latest - outputs: - image-name: ${{ steps.save-image-name.outputs.image-name }} - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ${{ env.DOCKER_REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_IMAGE_NAME }} - # Tag the following types of images: - # * On a branch, tag with the branch name (e.g. `master`) - # * On a PR, tag with the PR number (e.g. `pr-12`) - # * On a tagged commit, tag with the git tag (e.g. `2023`) - tags: | - type=ref,event=branch - type=ref,event=pr - type=ref,event=tag - - - name: Build and push Docker image - id: build-and-push - uses: docker/build-push-action@v5 - with: - context: . - push: true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max - # Fix incorrect container type sidebar display in GitHub Container - # Registry - provenance: false - - - name: Save image name to output - id: save-image-name - run: | - IMAGE_NAME=$(echo "$METADATA" | jq -r '."image.name"') - echo "image-name=$IMAGE_NAME" >> "$GITHUB_OUTPUT" - shell: bash - env: - METADATA: ${{ steps.build-and-push.outputs.metadata }} - - run: - # Don't automatically run the model on push, since we prefer to use workflow - # dispatch for prod runs instead - if: github.event_name != 'push' - needs: [build] - runs-on: ubuntu-latest - # Require manual approval to run this job - environment: deploy - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Setup Terraform - uses: ./.github/actions/setup-terraform - with: - role-to-assume: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }} - aws-account-id: ${{ secrets.AWS_ACCOUNT_ID }} - batch-container-image-name: ${{ needs.build.outputs.image-name }} - batch-job-definition-vcpu: ${{ inputs.vcpu }} - batch-job-definition-memory: ${{ inputs.memory }} - role-duration-seconds: ${{ inputs.role-duration-seconds}} - working-directory: ${{ inputs.terraform-working-directory }} - - - name: Validate Terraform config - run: terraform validate - working-directory: ${{ inputs.terraform-working-directory }} - shell: bash - - - name: Apply Terraform changes - run: terraform apply -auto-approve - working-directory: ${{ inputs.terraform-working-directory }} - shell: bash - - - name: Submit new Batch job - id: submit-job - run: | - # Use terraform-bin to disable the wrapper script installed by - # the setup-terraform action, which adds extra context to - # `terraform output` calls - BATCH_JOB_NAME="$(terraform-bin output -raw batch_job_name)" - BATCH_JOB_DEFINITION="$(terraform-bin output -raw batch_job_definition_arn)" - BATCH_JOB_QUEUE="$(terraform-bin output -raw batch_job_queue_arn)" - - BATCH_JOB_DETAILS=$(\ - aws batch submit-job \ - --job-name "$BATCH_JOB_NAME" \ - --job-definition "$BATCH_JOB_DEFINITION" \ - --job-queue "$BATCH_JOB_QUEUE" \ - ) - BATCH_JOB_ID=$(echo $BATCH_JOB_DETAILS | jq -r ".jobId") - echo "batch-job-id=$BATCH_JOB_ID" >> "$GITHUB_OUTPUT" - shell: bash - working-directory: ${{ inputs.terraform-working-directory }} - - - name: Wait for Batch job to start and print link to AWS logs - run: ./.github/scripts/poll_batch_job_status.sh "$BATCH_JOB_ID" startup - shell: bash - env: - BATCH_JOB_ID: ${{ steps.submit-job.outputs.batch-job-id }} - - - name: Wait for Batch job to complete - run: ./.github/scripts/poll_batch_job_status.sh "$BATCH_JOB_ID" - shell: bash - env: - BATCH_JOB_ID: ${{ steps.submit-job.outputs.batch-job-id }} diff --git a/.github/workflows/build-and-run-model.yaml b/.github/workflows/build-and-run-model.yaml index fd0a6d84..42fc7ad1 100644 --- a/.github/workflows/build-and-run-model.yaml +++ b/.github/workflows/build-and-run-model.yaml @@ -26,12 +26,11 @@ jobs: # required in order to allow the reusable called workflow to push to # GitHub Container Registry packages: write - uses: ./.github/workflows/build-and-run-batch-job.yaml + uses: ccao-data/actions/.github/workflows/batch-job-deploy.yaml@jeancochrane/add-batch-and-terraform-workflows-and-actions with: vcpu: "16.0" memory: "65536" role-duration-seconds: 14400 # Worst-case time for a full model run - terraform-working-directory: terraform secrets: AWS_IAM_ROLE_TO_ASSUME_ARN: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }} AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }} diff --git a/.github/workflows/cleanup-model.yaml b/.github/workflows/cleanup-model.yaml index 5ba3eaa3..f68f0096 100644 --- a/.github/workflows/cleanup-model.yaml +++ b/.github/workflows/cleanup-model.yaml @@ -14,9 +14,7 @@ jobs: # so that we can authenticate with AWS id-token: write contents: read - uses: ./.github/workflows/cleanup-terraform.yaml - with: - terraform-working-directory: terraform + uses: ccao-data/actions/.github/workflows/batch-job-cleanup.yaml@jeancochrane/add-batch-and-terraform-workflows-and-actions secrets: AWS_IAM_ROLE_TO_ASSUME_ARN: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }} AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }} diff --git a/.github/workflows/cleanup-terraform.yaml b/.github/workflows/cleanup-terraform.yaml deleted file mode 100644 index baacd6dd..00000000 --- a/.github/workflows/cleanup-terraform.yaml +++ /dev/null @@ -1,44 +0,0 @@ -# Reusable workflow that deletes any AWS resources created by Terraform -# for the pull request. -# -# Assumes at least one Terraform (*.tf) configuration file - -name: cleanup-terraform - -on: - workflow_call: - inputs: - terraform-working-directory: - description: Directory where the Terraform configuration is stored. - required: false - type: string - default: . - secrets: - AWS_IAM_ROLE_TO_ASSUME_ARN: - required: true - AWS_ACCOUNT_ID: - required: true - -jobs: - cleanup-terraform: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Setup Terraform - uses: ./.github/actions/setup-terraform - with: - role-to-assume: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }} - aws-account-id: ${{ secrets.AWS_ACCOUNT_ID }} - working-directory: ${{ inputs.terraform-working-directory }} - # These values can be anything, since Terraform doesn't need correct - # values for variables in order to destroy resources - batch-container-image-name: foo - batch-job-definition-vcpu: bar - batch-job-definition-memory: baz - - - name: Delete resources using Terraform - run: terraform destroy -auto-approve - working-directory: ${{ inputs.terraform-working-directory }} - shell: bash