# .github/workflows/cml.yml

name: CML

# Triggers: manual dispatch (with the boolean toggles below) and pushes to
# branches matching `exp*` that touch one of the listed paths.
on:
  workflow_dispatch:
    inputs:
      # --- tmate.io debugging toggles ---
      deploy_debug_enabled:
        type: boolean
        default: false
        required: true
        description: 'Run the deployment with tmate.io debugging enabled'
      model_debug_enabled:
        type: boolean
        default: false
        required: true
        description: 'Run the model with tmate.io debugging enabled'
      end_debug_enabled:
        type: boolean
        default: false
        required: true
        description: 'Run tmate.io debugging at the end of the workflow'
      # --- "force" toggles, consumed by the pipeline job's experiment step ---
      force_retraining:
        type: boolean
        default: false
        required: true
        description: 'Force retraining in model execution'
      force_postprocessing:
        type: boolean
        default: false
        required: true
        description: 'Force postprocessing in model execution'
      force_summarize:
        type: boolean
        default: false
        required: true
        description: 'Force summarization in model execution'
      force_all:
        type: boolean
        default: false
        required: true
        description: 'Force all steps in model execution'
  push:
    branches:
      - 'exp*'
    paths:
      - 'pyrovelocity/**'
      - 'reproducibility/figures/*.py'
      - 'reproducibility/figures/dvc.*'
      - 'reproducibility/figures/config.yaml'

# Review/set secrets via gh CLI:
#
#   gh secret list --repo=$(GH_REPO)
#   gh secret set PERSONAL_ACCESS_TOKEN --repo="$(GH_REPO)" --body="$(GH_PAT)"
#
# see also:
#   https://cml.dev/doc/self-hosted-runners?tab=GitHub#personal-access-token
#   test machine: n1-standard-32+nvidia-tesla-t4*1
#   large machine: n1-standard-64+nvidia-tesla-t4*4
#   large machine: a2-highgpu-2g+nvidia-tesla-a100*2
# Workflow-level environment, visible to every step of every job.
env:
  # Repository token (a personal access token, per the note above).
  REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
  # Google Cloud credentials and service account.
  GCP_SERVICE_ACCOUNT: ${{ secrets.GCP_SERVICE_ACCOUNT }}
  GOOGLE_APPLICATION_CREDENTIALS_DATA: ${{ secrets.GOOGLE_APPLICATION_CREDENTIALS_DATA }}
  # MLflow tracking server location and login.
  MLFLOW_TRACKING_URI: ${{ secrets.MLFLOW_TRACKING_URI }}
  MLFLOW_TRACKING_USERNAME: ${{ secrets.MLFLOW_TRACKING_USERNAME }}
  MLFLOW_TRACKING_PASSWORD: ${{ secrets.MLFLOW_TRACKING_PASSWORD }}
  # Logging level setting.
  TF_LOG_PROVIDER: 'DEBUG'

jobs:
  # Launches a `cml-gpu`-labelled runner on GCP with `cml runner launch`,
  # trying each candidate zone in turn for up to three rounds.
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: iterative/setup-cml@v1
      # Runs only when the `deploy_debug_enabled` dispatch input is true.
      - name: Setup tmate debug session
        uses: mxschmitt/action-tmate@v3
        if: ${{ inputs.deploy_debug_enabled }}
      - name: Deploy cloud runner
        # The script relies on bash arrays and brace expansion, so pin the shell.
        shell: bash
        run: |
          GCP_ZONES=( "us-central1-b" "us-central1-f"
                      "us-central1-a" "us-central1-c"
                      "us-east1-b"
                      "us-west1-b" "us-west4-b" )
          launched=false
          for trial in {1..3}; do
            for zone in "${GCP_ZONES[@]}"; do
              echo "Trial $trial in zone $zone"
              # --cloud-type is quoted so the `*` can never be glob-expanded
              # against files in the working directory.
              if cml runner launch \
                --labels=cml-gpu \
                --reuse-idle \
                --cloud=gcp \
                --cloud-type="n1-highmem-96+nvidia-tesla-t4*4" \
                --cloud-gpu=tesla \
                --cloud-hdd-size=500 \
                --cloud-region="$zone" \
                --cloud-permission-set="${GCP_SERVICE_ACCOUNT},scopes=storage-rw"
              then
                launched=true
                # Stop both loops as soon as one launch succeeds.
                break 2
              fi
            done
          done
          # Fail the step explicitly (instead of relying on the exit status of
          # the last loop iteration) when no zone could provide a runner.
          if [ "$launched" != "true" ]; then
            echo "::error::cml runner launch failed in every zone after 3 trials"
            exit 1
          fi
  # Runs the experiment pipeline inside the project container on the
  # self-hosted `cml-gpu` runner, once the `deploy` job has finished.
  pipeline:
    needs: deploy
    runs-on: [self-hosted, cml-gpu]
    # 1440 minutes = 24 hours.
    timeout-minutes: 1440
    container:
      image: ghcr.io/pinellolab/pyrovelocity:master
      options: --gpus all
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Set up python
        id: setup-python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"
      - name: Install Poetry
        uses: snok/install-poetry@v1
        with:
          virtualenvs-create: true
          virtualenvs-in-project: true
          installer-parallel: true
      # The in-project virtualenv is cached per OS, Python version and lock file.
      - name: Load cached venv
        id: cached-poetry-dependencies
        uses: actions/cache@v3
        with:
          path: .venv
          key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
      # Runs only when the `model_debug_enabled` dispatch input is true.
      - name: Setup tmate debug session
        uses: mxschmitt/action-tmate@v3
        if: ${{ inputs.model_debug_enabled }}
      # Dependencies are installed only when the cached .venv was not restored.
      - name: Install dependencies
        if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
        run: poetry install -E dev -E plotting --no-interaction --no-root
      - name: Install project
        run: poetry install -E dev -E plotting --no-interaction
      - name: CML setup
        run: cml ci
      - name: Setup rclone
        env:
          # Handed over through the environment rather than templated into the
          # script, so quotes or shell metacharacters inside the secret cannot
          # break (or inject into) the command line.
          RCLONE_CONF: ${{ secrets.RCLONE_CONF }}
        run: |
          sudo -v ; curl https://rclone.org/install.sh | sudo bash
          which rclone
          # -y keeps apt-get from stopping at a confirmation prompt.
          sudo apt-get update && sudo apt-get install -y tree
          which tree
          mkdir -p "$HOME/.config/rclone/"
          # printf writes the value verbatim (no echo escape processing).
          printf '%s\n' "$RCLONE_CONF" > "$HOME/.config/rclone/rclone.conf"
      - name: Run experiment and submit report
        env:
          # Dispatch inputs are exposed as environment variables; on push
          # events they are empty, so no force flag is added.
          FORCE_RETRAINING: ${{ inputs.force_retraining }}
          FORCE_POSTPROCESSING: ${{ inputs.force_postprocessing }}
          FORCE_SUMMARIZE: ${{ inputs.force_summarize }}
          FORCE_ALL: ${{ inputs.force_all }}
        run: |
          source .venv/bin/activate

          # Collect the pipeline.sh options selected by the dispatch inputs.
          flags=()

          if [ "${FORCE_RETRAINING}" = "true" ]; then
            flags+=(-ft)
          fi

          if [ "${FORCE_POSTPROCESSING}" = "true" ]; then
            flags+=(-fp)
          fi

          if [ "${FORCE_SUMMARIZE}" = "true" ]; then
            flags+=(-fs)
          fi

          if [ "${FORCE_ALL}" = "true" ]; then
            flags+=(-f)
          fi

          ./.github/pipeline.sh "${flags[@]}"
        shell: bash
      # Runs only when the `end_debug_enabled` dispatch input is true.
      - name: Setup tmate debug session
        uses: mxschmitt/action-tmate@v3
        if: ${{ inputs.end_debug_enabled }}