.github/workflows/calculate-times.yaml

---

# Workflow name, plus a per-run title built from the five required dispatch
# inputs so that individual runs can be told apart
name: calculate-times
run-name: 'calculate-times-${{ inputs.mode }}-${{ inputs.year }}-${{ inputs.geography }}-${{ inputs.state }}-${{ inputs.centroid_type }}'

# Manual trigger only: the workflow runs when dispatched with the inputs below
on:
  workflow_dispatch:
    inputs:
      # Input values match those in params.yaml
      mode:
        description: Mode of travel
        type: choice
        required: true
        default: auto
        options:
          - auto
          - bicycle
          - pedestrian

      # Years are quoted so they are passed around as strings, not integers
      year:
        description: Census/OSM data year
        type: choice
        required: true
        default: '2020'
        options:
          - '2020'
          - '2021'
          - '2022'
          - '2023'
          - '2024'

      geography:
        description: Census data geography
        type: choice
        required: true
        default: county
        options:
          - state
          - county
          - county_subdivision
          - tract
          - block_group
          - zcta

      # Two-digit state codes, quoted so that leading zeros survive YAML
      # parsing (an unquoted 01 would not stay the string '01')
      state:
        description: Target Census state
        type: choice
        required: true
        default: '01'
        options:
          - '01'
          - '02'
          - '04'
          - '05'
          - '06'
          - '08'
          - '09'
          - '10'
          - '11'
          - '12'
          - '13'
          - '15'
          - '16'
          - '17'
          - '18'
          - '19'
          - '20'
          - '21'
          - '22'
          - '23'
          - '24'
          - '25'
          - '26'
          - '27'
          - '28'
          - '29'
          - '30'
          - '31'
          - '32'
          - '33'
          - '34'
          - '35'
          - '36'
          - '37'
          - '38'
          - '39'
          - '40'
          - '41'
          - '42'
          - '44'
          - '45'
          - '46'
          - '47'
          - '48'
          - '49'
          - '50'
          - '51'
          - '53'
          - '54'
          - '55'
          - '56'

      centroid_type:
        description: Whether or not to use population-weighted locations
        type: choice
        required: true
        default: weighted
        options:
          - weighted
          - unweighted

      # Optional free-form subset of chunks; the setup-jobs job checks each
      # entry against the generated chunk list before using it
      override_chunks:
        description: |
          Comma-separated chunks to run e.g. 0-5,6-11.
          Will run all chunks if null
        type: string
        required: false

# Workflow-level environment, shared by every step of every job. Values are
# quoted uniformly because environment variables are always strings
env:
  AWS_DEFAULT_REGION: us-east-1
  # See: https://github.com/aws/aws-cli/issues/5262#issuecomment-705832151
  AWS_EC2_METADATA_DISABLED: 'true'
  # Keep Python output unbuffered so it shows up in the job log immediately
  PYTHONUNBUFFERED: '1'
  UV_SYSTEM_PYTHON: '1'

jobs:
  # Using the location data, split the origins into N jobs (max 256). The
  # resulting chunk list is published as the `chunks` job output, which
  # run-job expands into its matrix
  setup-jobs:
    runs-on: ubuntu-24.04
    outputs:
      chunks: ${{ steps.create-job-chunks.outputs.chunks }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Cloudflare credentials
        uses: ./.github/actions/setup-cloudflare-s3
        with:
          CLOUDFLARE_S3_API_ACCESS_KEY_ID: ${{ secrets.CLOUDFLARE_S3_API_ACCESS_KEY_ID }}
          CLOUDFLARE_S3_API_SECRET_ACCESS_KEY: ${{ secrets.CLOUDFLARE_S3_API_SECRET_ACCESS_KEY }}

      # Record the runner's UID/GID in the job environment; the "Create job
      # chunks" step exports them before invoking docker compose
      - name: Fetch GitHub user and group ID
        shell: bash
        id: fetch-ids
        run: |
          echo "USER_ID=$(id -u)" >> "$GITHUB_ENV"
          echo "GROUP_ID=$(id -g)" >> "$GITHUB_ENV"

      - name: Build Dockerized dependencies
        uses: ./.github/actions/build-docker
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      # Assumes the build step above leaves the image at /tmp/opentimes.tar
      - name: Load Docker image
        run: |
          docker load --input /tmp/opentimes.tar
          docker image ls -a

      - name: Fetch locations data
        uses: ./.github/actions/fetch-locations

      - name: Create job chunks
        id: create-job-chunks
        shell: bash
        env:
          # Free-form user input is handed over via the environment instead
          # of being interpolated into the script, so it can never be parsed
          # as shell code
          OVERRIDE_CHUNKS: ${{ inputs.override_chunks }}
        run: |
          export USER_ID=${{ env.USER_ID }}
          export GROUP_ID=${{ env.GROUP_ID }}

          # split_origin.py is expected to print a JSON array of chunk names
          chunks=$(docker compose run --rm --quiet-pull \
            --entrypoint=python valhalla-run /data/src/split_origin.py \
            --year ${{ inputs.year }} --geography ${{ inputs.geography }} \
            --state ${{ inputs.state }})

          # If override chunks are set, use those instead
          chunks_parsed=($(echo "$chunks" | jq -r '.[]'))
          if [ -n "$OVERRIDE_CHUNKS" ]; then
            override_chunks_parsed=($(echo "$OVERRIDE_CHUNKS" | tr -d ' ' | tr ',' ' '))

            # Input made only of spaces/commas would otherwise yield [""]
            if [ "${#override_chunks_parsed[@]}" -eq 0 ]; then
              echo "Error: Override chunks were set but contain no chunk names"
              exit 1
            fi

            # Every override must be one of the chunks generated above
            for chunk in "${override_chunks_parsed[@]}"; do
              if [[ ! " ${chunks_parsed[*]} " =~ " ${chunk} " ]]; then
                echo "Error: Override chunk ${chunk} is not in the chunks for this origin"
                echo "Chunks include: ${chunks_parsed[*]}"
                exit 1
              fi
            done
            chunks=$(printf '%s\n' "${override_chunks_parsed[@]}" | jq -c -R . | jq -c -s .)
            echo "Creating jobs for chunks: ${override_chunks_parsed[*]}"
          else
            echo "Creating jobs for chunks: ${chunks_parsed[*]}"
          fi

          # Write the output exactly once, appending so that nothing else in
          # the step output file is clobbered
          echo "chunks=$chunks" >> "$GITHUB_OUTPUT"

  # Run the travel time calculation for one chunk of origins. One matrix job
  # is created per entry in the `chunks` output of setup-jobs
  run-job:
    runs-on: ubuntu-24.04
    needs: setup-jobs
    strategy:
      # Don't fail all chunks if one fails
      fail-fast: false
      matrix:
        chunk: ${{ fromJSON(needs.setup-jobs.outputs.chunks) }}

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Cloudflare credentials
        uses: ./.github/actions/setup-cloudflare-s3
        with:
          CLOUDFLARE_S3_API_ACCESS_KEY_ID: ${{ secrets.CLOUDFLARE_S3_API_ACCESS_KEY_ID }}
          CLOUDFLARE_S3_API_SECRET_ACCESS_KEY: ${{ secrets.CLOUDFLARE_S3_API_SECRET_ACCESS_KEY }}

      - name: Fetch locations data
        uses: ./.github/actions/fetch-locations

      # Don't fetch tile data in setup-jobs because they're very large and
      # will churn the Actions cache. We want to wait to fetch it until jobs
      # have actually been picked up
      - name: Fetch Valhalla tile data
        uses: ./.github/actions/fetch-valhalla-tiles
        with:
          year: ${{ inputs.year }}
          state: ${{ inputs.state }}

      # Record the runner's UID/GID in the job environment; the "Run job
      # chunk" step exports them before invoking docker compose
      - name: Fetch GitHub user and group ID
        shell: bash
        id: fetch-ids
        run: |
          echo "USER_ID=$(id -u)" >> "$GITHUB_ENV"
          echo "GROUP_ID=$(id -g)" >> "$GITHUB_ENV"

      # The artifact name is keyed on the Dockerfile and pyproject.toml
      # hashes; presumably it is uploaded by the build-docker action that
      # runs in setup-jobs (confirm against that action)
      - name: Fetch Docker image
        uses: actions/download-artifact@v4
        with:
          name: opentimes-docker-${{ hashFiles('./data/Dockerfile', './pyproject.toml') }}
          path: /tmp

      - name: Load Docker image
        run: |
          docker load --input /tmp/opentimes.tar
          docker image ls -a

      # Hard-link the tile archive into ./build, unpack it there, then drop
      # the link so only the extracted tiles remain in ./build
      - name: Extract tiles
        shell: bash
        working-directory: 'data'
        run: |
          tile_path="year=${{ inputs.year }}/geography=state/state=${{ inputs.state }}"
          ln ./intermediate/valhalla_tiles/"$tile_path"/valhalla_tiles.tar.zst ./build/
          tar -xf ./build/valhalla_tiles.tar.zst -C ./build
          rm -f ./build/valhalla_tiles.tar.zst

      - name: Run job chunk
        shell: bash
        working-directory: 'data'
        run: |
          export USER_ID=${{ env.USER_ID }}
          export GROUP_ID=${{ env.GROUP_ID }}
          docker compose run --rm --quiet-pull --entrypoint=python \
            valhalla-run /data/src/calculate_times.py \
            --mode ${{ inputs.mode }} --year ${{ inputs.year }} \
            --geography ${{ inputs.geography }} --state ${{ inputs.state }} \
            --centroid-type ${{ inputs.centroid_type }} \
            --chunk ${{ matrix.chunk }} --write-to-s3

      # Clear the cache if we're one of the last running jobs. Using this instead
      # of a separate workflow step because the steps often come last in the job
      # queue and then won't run when many workflows are queued at the same time
      - name: Clear workflow cache
        if: always()
        continue-on-error: true
        shell: bash
        run: |
          endpoint='repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs'

          # total_count is repeated on every page of a paginated response, so
          # read it from a single un-paginated request to get one number
          total_jobs=$(gh api "$endpoint" --jq '.total_count')

          # Emit one job ID per line so that wc -l is an exact job count
          complete_jobs=$(gh api "$endpoint" --paginate \
            --jq '.jobs[] | select(.status == "completed") | .id' | wc -l)
          in_progress_jobs=$(gh api "$endpoint" --paginate \
            --jq '.jobs[] | select(.status == "in_progress") | .id' | wc -l)

          n_remaining=$((total_jobs - complete_jobs))
          all_statuses=$((complete_jobs + in_progress_jobs))
          echo "Total number of jobs: $total_jobs"
          echo "Number of jobs complete: $complete_jobs"
          echo "Number of jobs remaining: $n_remaining"
          echo "Number of jobs run/running: $all_statuses"

          # Only clear when this is the last unfinished job and nothing is
          # still queued (every job is either completed or in progress)
          if [ "$n_remaining" -lt 2 ] && [ "$total_jobs" -eq "$all_statuses" ]; then
            echo "Fewer than 2 jobs remaining. Clearing workflow cache!"
            gh cache delete \
              valhalla-tiles-${{ inputs.year }}-${{ inputs.state }}-${{ hashFiles('./data/dvc.lock') }} || true
          fi
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}