Skip to content

Commit

Permalink
Factor out deploy and cleanup-terraform into reusable workflows (#58
Browse files Browse the repository at this point in the history
)

* Factor deploy.yaml out into a reusable workflow

* Try specifying packages:write permission in build-and-run-model.yaml

* Simplify names of build/run steps in deploy.yaml

* Clean up docs in workflows

* Temporarily simplify Dockerfile CMD to speed up CI test

* Rename deploy.yaml to build-and-run-batch-job.yaml

* Factor out vCPU and memory allocations to workflow input variables

* Revert "Temporarily simplify Dockerfile CMD to speed up CI test"

This reverts commit a86e0f1.

* Add terraform-working-directory arg to workflows

* Factor out cleanup-terraform into a reusable workflow

* Temporarily run cleanup-model on PRs for testing purposes

* Revert "Temporarily run cleanup-model on PRs for testing purposes"

This reverts commit 51813b2.

* Fix indentation error in cleanup-model.yaml

* Document empty backend block in main.tf

* Use github.event.repository.name for setup-terraform repo arg
  • Loading branch information
jeancochrane authored Nov 8, 2023
1 parent f51c35b commit 90b6e4c
Show file tree
Hide file tree
Showing 6 changed files with 187 additions and 50 deletions.
32 changes: 28 additions & 4 deletions .github/actions/setup-terraform/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,15 @@ inputs:
batch-container-image-name:
description: The name of the container image to use for the Batch job.
required: true
batch-job-definition-vcpu:
description: >
Count of vCPUs to provision for the container. Per AWS requirements,
must be formatted as a float, e.g. 1.0 for 1 vCPU. The minimum
is 1 vCPU and values must be specified in increments of 0.25.
required: true
batch-job-definition-memory:
description: Count of megabytes of RAM to provision for the container.
required: true
role-duration-seconds:
description: How long the role specified by role-to-assume should be valid.
required: false
Expand All @@ -18,6 +27,10 @@ inputs:
description: File to store Terraform variables.
required: false
default: terraform.tfvars
working-directory:
description: Directory where the Terraform configuration is stored.
required: false
default: .
runs:
using: composite
steps:
Expand All @@ -38,9 +51,16 @@ runs:
uses: hashicorp/setup-terraform@v2

- name: Initialize Terraform
run: terraform init
run: |
terraform init \
-backend-config "bucket=ccao-terraform-state-us-east-1" \
-backend-config "key=terraform.tfstate" \
-backend-config "region=us-east-1" \
-backend-config "workspace_key_prefix=$REPO/workspaces"
shell: bash
working-directory: terraform
working-directory: ${{ inputs.working-directory }}
env:
REPO: ${{ github.event.repository.name }}

- name: Set Terraform variables
id: set-vars
Expand Down Expand Up @@ -70,18 +90,22 @@ runs:
{
echo "batch_job_name = \"$BATCH_JOB_NAME\"";
echo "batch_container_image_name = \"$BATCH_CONTAINER_IMAGE_NAME\"";
echo "batch_job_definition_vcpu = \"$BATCH_JOB_DEFINITION_VCPU\"";
echo "batch_job_definition_memory = \"$BATCH_JOB_DEFINITION_MEMORY\"";
} > "$TFVARS_FILE"
echo "workspace=$WORKSPACE" >> "$GITHUB_OUTPUT"
shell: bash
working-directory: terraform
working-directory: ${{ inputs.working-directory }}
env:
TFVARS_FILE: ${{ inputs.tfvars-file }}
BATCH_CONTAINER_IMAGE_NAME: ${{ inputs.batch-container-image-name }}
BATCH_JOB_DEFINITION_VCPU: ${{ inputs.batch-job-definition-vcpu }}
BATCH_JOB_DEFINITION_MEMORY: ${{ inputs.batch-job-definition-memory }}

- name: Select Terraform workspace
run: terraform workspace select -or-create "$WORKSPACE"
shell: bash
working-directory: terraform
working-directory: ${{ inputs.working-directory }}
env:
WORKSPACE: ${{ steps.set-vars.outputs.workspace }}
Original file line number Diff line number Diff line change
@@ -1,26 +1,63 @@
# Workflow that builds a Docker image containing the model code,
# pushes it to the GitHub Container Registry, and then optionally uses
# that container image to run the model using an AWS Batch job.
# Reusable workflow that builds a Docker image, pushes it to the GitHub
# Container Registry, and then optionally uses that container image to run
# an AWS Batch job.
#
# Images are built on every commit to a PR or main branch in order to ensure
# that the build continues to work properly, but Batch jobs are gated behind
# a `deploy` environment that requires manual approval from a
# @ccao-data/core-team member.

name: deploy
# Requirements:
#
# * A Dockerfile must be defined in the root of the repo whose workflow is
# calling this one.
# * A `deploy` environment must be configured in the calling repo. This
# environment is used to protect the `run` job, which must be approved by
# a core team member before it will run.
# * The calling workflow must grant the following permissions to the job
# that calls this workflow:
# * contents: read
# * id-token: write
# * packages: write
# * Various required inputs and secrets, documented below, must be passed in
# by the calling workflow.

name: build-and-run-batch-job

on:
pull_request:
workflow_dispatch:
push:
branches: [master]
workflow_call:
inputs:
role-duration-seconds:
description: How long, in seconds, the IAM role used to auth with AWS stays valid.
required: false
type: string
default: 3600
vcpu:
description: >
Count of vCPUs to provision for the container. Per AWS requirements,
must be formatted as a float, e.g. 1.0 for 1 vCPU. The minimum
is 1 vCPU and values must be specified in increments of 0.25.
required: false
type: string
default: "1.0"
memory:
description: Count of megabytes of RAM to provision for the container.
required: false
type: string
default: "4096"
terraform-working-directory:
description: Directory where the Terraform configuration is stored.
required: false
type: string
default: .

secrets:
AWS_IAM_ROLE_TO_ASSUME_ARN:
required: true
AWS_ACCOUNT_ID:
required: true

env:
DOCKER_REGISTRY: ghcr.io
DOCKER_IMAGE_NAME: ${{ github.repository }}

jobs:
publish-docker-image:
build:
runs-on: ubuntu-latest
outputs:
image-name: ${{ steps.save-image-name.outputs.image-name }}
Expand Down Expand Up @@ -75,19 +112,14 @@ jobs:
env:
METADATA: ${{ steps.build-and-push.outputs.metadata }}

run-model:
run:
# Don't automatically run the model on push, since we prefer to use workflow
# dispatch for prod runs instead
if: github.event_name != 'push'
needs: [publish-docker-image]
needs: [build]
runs-on: ubuntu-latest
# Require manual approval to run this job
environment: deploy
# These permissions are needed to interact with GitHub's OIDC Token endpoint
# so that we can authenticate with AWS
permissions:
id-token: write
contents: read
steps:
- name: Checkout
uses: actions/checkout@v4
Expand All @@ -97,17 +129,20 @@ jobs:
with:
role-to-assume: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }}
aws-account-id: ${{ secrets.AWS_ACCOUNT_ID }}
batch-container-image-name: ${{ needs.publish-docker-image.outputs.image-name }}
role-duration-seconds: 14400 # Worst-case time for a full model run
batch-container-image-name: ${{ needs.build.outputs.image-name }}
batch-job-definition-vcpu: ${{ inputs.vcpu }}
batch-job-definition-memory: ${{ inputs.memory }}
role-duration-seconds: ${{ inputs.role-duration-seconds}}
working-directory: ${{ inputs.terraform-working-directory }}

- name: Validate Terraform config
run: terraform validate
working-directory: terraform
working-directory: ${{ inputs.terraform-working-directory }}
shell: bash

- name: Apply Terraform changes
run: terraform apply -auto-approve
working-directory: terraform
working-directory: ${{ inputs.terraform-working-directory }}
shell: bash

- name: Submit new Batch job
Expand All @@ -129,7 +164,7 @@ jobs:
BATCH_JOB_ID=$(echo $BATCH_JOB_DETAILS | jq -r ".jobId")
echo "batch-job-id=$BATCH_JOB_ID" >> "$GITHUB_OUTPUT"
shell: bash
working-directory: terraform
working-directory: ${{ inputs.terraform-working-directory }}

- name: Wait for Batch job to start and print link to AWS logs
run: ./.github/scripts/poll_batch_job_status.sh "$BATCH_JOB_ID" startup
Expand Down
37 changes: 37 additions & 0 deletions .github/workflows/build-and-run-model.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Workflow that builds a Docker image containing the model code,
# pushes it to the GitHub Container Registry, and then optionally uses
# that container image to run the model using an AWS Batch job.
#
# Images are built on every commit to a PR or main branch in order to ensure
# that the build continues to work properly, but Batch jobs are gated behind
# a `deploy` environment that requires manual approval from a
# @ccao-data/core-team member.
#
name: build-and-run-model

# Triggered on every pull request commit, on manual dispatch, and on pushes
# to master. The called workflow decides which of its jobs run per event.
on:
  pull_request:
  workflow_dispatch:
  push:
    branches: [master]

jobs:
  build-and-run-model:
    permissions:
      # contents:read and id-token:write permissions are needed to interact
      # with GitHub's OIDC Token endpoint so that we can authenticate with AWS
      contents: read
      id-token: write
      # While packages:write is usually not required for workflows, it is
      # required in order to allow the reusable called workflow to push to
      # GitHub Container Registry
      packages: write
    uses: ./.github/workflows/build-and-run-batch-job.yaml
    with:
      # All inputs of the called workflow are declared as `type: string`, so
      # numeric-looking values are quoted to keep them strings.
      vcpu: "16.0"
      memory: "65536"
      # Worst-case time for a full model run
      role-duration-seconds: "14400"
      terraform-working-directory: terraform
    secrets:
      AWS_IAM_ROLE_TO_ASSUME_ARN: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }}
      AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
22 changes: 22 additions & 0 deletions .github/workflows/cleanup-model.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Workflow that runs whenever a PR is closed or merged and deletes any
# AWS resources created for the pull request.

name: cleanup-model

# Fires once a pull request is closed, whether or not it was merged.
on:
  pull_request:
    types:
      - closed

jobs:
  cleanup-model:
    # Teardown itself is delegated to the reusable cleanup-terraform workflow.
    uses: ./.github/workflows/cleanup-terraform.yaml
    permissions:
      # Both scopes are required to talk to GitHub's OIDC Token endpoint,
      # which is how the called workflow authenticates with AWS
      contents: read
      id-token: write
    with:
      terraform-working-directory: terraform
    secrets:
      AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
      AWS_IAM_ROLE_TO_ASSUME_ARN: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }}
35 changes: 23 additions & 12 deletions .github/workflows/cleanup-terraform.yaml
Original file line number Diff line number Diff line change
@@ -1,19 +1,27 @@
# Workflow that runs whenever a PR is closed or merged and deletes any
# AWS resources created by the `deploy` workflow.
# Reusable workflow that deletes any AWS resources created by Terraform
# for the pull request.
#
# Assumes at least one Terraform (*.tf) configuration file exists in the
# directory given by the `terraform-working-directory` input.

name: cleanup-terraform

on:
pull_request:
types: [closed]
workflow_call:
inputs:
terraform-working-directory:
description: Directory where the Terraform configuration is stored.
required: false
type: string
default: .
secrets:
AWS_IAM_ROLE_TO_ASSUME_ARN:
required: true
AWS_ACCOUNT_ID:
required: true

jobs:
cleanup-terraform:
runs-on: ubuntu-latest
# These permissions are needed to interact with GitHub's OIDC Token endpoint
# so that we can authenticate with AWS
permissions:
id-token: write
contents: read
steps:
- name: Checkout
uses: actions/checkout@v4
Expand All @@ -23,11 +31,14 @@ jobs:
with:
role-to-assume: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }}
aws-account-id: ${{ secrets.AWS_ACCOUNT_ID }}
# This value can be anything, since Terraform doesn't need correct
working-directory: ${{ inputs.terraform-working-directory }}
# These values can be anything, since Terraform doesn't need correct
# values for variables in order to destroy resources
batch-container-image-name: foobar
batch-container-image-name: foo
batch-job-definition-vcpu: bar
batch-job-definition-memory: baz

- name: Delete resources using Terraform
run: terraform destroy -auto-approve
working-directory: terraform
working-directory: ${{ inputs.terraform-working-directory }}
shell: bash
24 changes: 16 additions & 8 deletions terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,10 @@ terraform {

required_version = ">= 1.5.7"

backend "s3" {
bucket = "ccao-terraform-state-us-east-1"
key = "terraform.tfstate"
region = "us-east-1"
workspace_key_prefix = "model-res-avm/workspaces"
}
# Backend configs change based on the calling repo, so we leave it empty here
# and then leave it up to the caller of `terraform init` to pass the required
# S3 backend config attributes in via `-backend-config` flags.
backend "s3" {}
}

provider "aws" {
Expand All @@ -49,6 +47,16 @@ variable "batch_container_image_name" {
type = string
}

# How many vCPUs should be provisioned for Batch jobs
variable "batch_job_definition_vcpu" {
type = string
}

# How much memory should be provisioned for Batch jobs
variable "batch_job_definition_memory" {
type = string
}

# Retrieve the default VPC for this region, which is builtin to AWS.
# Containers in the Batch compute environment will be deployed into this VPC
data "aws_vpc" "default" {
Expand Down Expand Up @@ -160,11 +168,11 @@ resource "aws_batch_job_definition" "main" {
resourceRequirements = [
{
type = "VCPU"
value = "16.0"
value = var.batch_job_definition_vcpu
},
{
type = "MEMORY"
value = "65536"
value = var.batch_job_definition_memory
}
]
runtimePlatform = {
Expand Down

0 comments on commit 90b6e4c

Please sign in to comment.