Skip to content

Commit

Permalink
Add new "choose your own pipeline" workflow (#13230)
Browse files Browse the repository at this point in the history
* #0: split perf-models and nightly workflows into internal impl workflows

* #0: add choose your own pipeline workflow

* #0: fix typo in nightly workflow impl

* #0: use profiler build for device perf tests

* #0: create separate T3K frequent tests impl and add it to the "Choose your pipeline" workflow

* #0: add t3k nightly tests to "Choose your pipeline" workflow

* #0: fix name of new internal workflow impls

* #0: add T3K unit tests to pipeline select

* #0: fix name

* #0: add T3K model perf tests to pipeline select workflow

* #0: add tgg unit tests

* #0: add tg unit tests
  • Loading branch information
TT-billteng authored Sep 28, 2024
1 parent fc3cec8 commit 849b3e6
Show file tree
Hide file tree
Showing 20 changed files with 684 additions and 541 deletions.
169 changes: 169 additions & 0 deletions .github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
# Reusable implementation of the nightly fast-dispatch test suite.
#
# `workflow_call` is the only trigger, so this workflow never runs on its own;
# it runs when another workflow references it with `uses:`.
#
# NOTE(review): both jobs download a `TTMetal_build_<arch>` artifact that is
# not produced anywhere in this file. Presumably the calling workflow runs a
# build job first - confirm against the caller.
name: "[internal] Nightly fast dispatch tests impl"

on:
  workflow_call:

jobs:
  # Script-driven nightly suites. Each matrix entry selects the runner labels,
  # the architecture, the command to execute and its timeout.
  fd-nightly:
    strategy:
      # Do not fail-fast because we need to ensure all tests go to completion
      # so we try not to get hanging machines
      fail-fast: false
      matrix:
        # Fields of every entry:
        #   name    - display name, used in the job name below
        #   arch    - exported as ARCH_NAME and used to pick the build artifact
        #   runs-on - runner labels for the job
        #   cmd     - command line executed by the test step
        #   timeout - limit for the test step, in minutes
        test-group:
          - name: "Common models GS"
            arch: grayskull
            runs-on: ["cloud-virtual-machine", "E150", "in-service"]
            cmd: tests/scripts/single_card/nightly/run_common_models.sh
            timeout: 40
          - name: "GS ttnn nightly"
            arch: grayskull
            runs-on: ["cloud-virtual-machine", "E150", "in-service"]
            cmd: tests/scripts/single_card/nightly/run_ttnn.sh
            timeout: 40
          - name: "WH N150 ttnn nightly"
            arch: wormhole_b0
            runs-on: ["cloud-virtual-machine", "N150", "in-service"]
            cmd: tests/scripts/single_card/nightly/run_ttnn.sh
            timeout: 70
          - name: "WH N300 ttnn nightly"
            arch: wormhole_b0
            runs-on: ["cloud-virtual-machine", "N300", "in-service"]
            cmd: tests/scripts/single_card/nightly/run_ttnn.sh
            timeout: 70
          - name: "GS-only models"
            arch: grayskull
            runs-on: ["cloud-virtual-machine", "E150", "in-service"]
            cmd: tests/scripts/single_card/nightly/run_gs_only.sh
            timeout: 40
          - name: "API tests GS"
            arch: grayskull
            runs-on: ["cloud-virtual-machine", "E150", "in-service"]
            cmd: ./tests/scripts/run_tests.sh --tt-arch grayskull --pipeline-type frequent_api --dispatch-mode fast
            timeout: 10
          - name: "API tests N300 WH B0"
            arch: wormhole_b0
            runs-on: ["cloud-virtual-machine", "N300", "in-service"]
            cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast
            timeout: 10
          - name: "API tests N150 WH B0"
            arch: wormhole_b0
            runs-on: ["cloud-virtual-machine", "N150", "in-service"]
            cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast
            timeout: 10
          - name: "[Unstable] N150 models"
            arch: wormhole_b0
            runs-on: ["cloud-virtual-machine", "N150", "in-service"]
            cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh
            timeout: 55
          - name: "[Unstable] N300 models"
            arch: wormhole_b0
            runs-on: ["cloud-virtual-machine", "N300", "in-service"]
            cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh
            timeout: 55
    name: FD ${{ matrix.test-group.name }} ${{ matrix.test-group.arch }}
    env:
      ARCH_NAME: ${{ matrix.test-group.arch }}
      LOGURU_LEVEL: INFO
      LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
    runs-on: ${{ matrix.test-group.runs-on }}
    steps:
      - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
      # Run the mount script through the repository's retry-command action.
      - uses: ./.github/actions/retry-command
        with:
          timeout-seconds: 100
          max-retries: 10
          backoff-seconds: 60
          command: ./.github/scripts/cloud_utils/mount_weka.sh
      # Values appended to $GITHUB_ENV become environment variables for the
      # steps that follow; TT_METAL_HOME is the checkout directory.
      - name: Set up dynamic env vars for build
        run: |
          echo "TT_METAL_HOME=$(pwd)" >> "$GITHUB_ENV"
      - uses: actions/download-artifact@v4
        with:
          name: TTMetal_build_${{ matrix.test-group.arch }}
      - name: Extract files
        run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
      - uses: ./.github/actions/install-python-deps
      - name: Run frequent reg tests scripts
        timeout-minutes: ${{ matrix.test-group.timeout }}
        run: |
          source ${{ github.workspace }}/python_env/bin/activate
          cd $TT_METAL_HOME
          export PYTHONPATH=$TT_METAL_HOME
          ${{ matrix.test-group.cmd }}
      # Upload reports unless the run was cancelled, i.e. also after a
      # failed test step.
      - uses: ./.github/actions/upload-artifact-with-job-uuid
        if: ${{ !cancelled() }}
        with:
          path: |
            generated/test_reports/
          prefix: "test_reports_"

  # Per-model nightly pytest runs on Wormhole cards: one job for every
  # (card, model) combination of the matrix.
  nightly-wh-models:
    strategy:
      # Do not fail-fast because we need to ensure all tests go to completion
      # so we try not to get hanging machines
      fail-fast: false
      matrix:
        card: [N150, N300]
        # Each value is a directory name under tests/nightly/single_card/.
        model:
          - common_models
          - functional_unet
          - llama31_8b
          - mamba
          - mistral7b
          - mistral7b_eth
          - resnet50
    name: Nightly ${{ matrix.card }} ${{ matrix.model }}
    env:
      ARCH_NAME: wormhole_b0
      LOGURU_LEVEL: INFO
      LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
    runs-on: ["cloud-virtual-machine", "in-service", "${{ matrix.card }}"]
    steps:
      - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
      # Run the mount script through the repository's retry-command action.
      - uses: ./.github/actions/retry-command
        with:
          timeout-seconds: 100
          max-retries: 10
          backoff-seconds: 60
          command: ./.github/scripts/cloud_utils/mount_weka.sh
      - name: Set up dynamic env vars for build
        run: |
          echo "TT_METAL_HOME=$(pwd)" >> "$GITHUB_ENV"
      # The condition is a negative match: WH_ARCH_YAML is exported for every
      # model except `mistral7b` (so `mistral7b_eth` and all other models
      # get it).
      # NOTE(review): confirm that the exclusion list is still just
      # `mistral7b` whenever a model is added to the matrix.
      - name: Set up WH_ARCH_YAML for eth-enabled models
        if: ${{ matrix.model != 'mistral7b' }}
        run: |
          echo "WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml" >> "$GITHUB_ENV"
      - uses: actions/download-artifact@v4
        with:
          name: TTMetal_build_wormhole_b0
      - name: Extract files
        run: tar -xvf ttm_wormhole_b0.tar
      - uses: ./.github/actions/install-python-deps
      - name: Run frequent reg tests scripts
        timeout-minutes: 50
        run: |
          source ${{ github.workspace }}/python_env/bin/activate
          cd $TT_METAL_HOME
          export PYTHONPATH=$TT_METAL_HOME
          pytest -n auto tests/nightly/single_card/${{ matrix.model }}
      # Upload reports unless the run was cancelled, i.e. also after a
      # failed test step.
      - uses: ./.github/actions/upload-artifact-with-job-uuid
        if: ${{ !cancelled() }}
        with:
          path: |
            generated/test_reports/
          prefix: "test_reports_"
165 changes: 2 additions & 163 deletions .github/workflows/fast-dispatch-full-regressions-and-models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,166 +12,5 @@ jobs:
secrets: inherit
fd-nightly:
needs: build-artifact
strategy:
# Do not fail-fast because we need to ensure all tests go to completion
# so we try not to get hanging machines
fail-fast: false
matrix:
test-group:
[
{
name: "Common models GS",
arch: grayskull,
runs-on: ["cloud-virtual-machine", "E150", "in-service"],
cmd: tests/scripts/single_card/nightly/run_common_models.sh,
timeout: 40
},
{
name: "GS ttnn nightly",
arch: grayskull,
runs-on: ["cloud-virtual-machine", "E150", "in-service"],
cmd: tests/scripts/single_card/nightly/run_ttnn.sh,
timeout: 40
},
{
name: "WH N150 ttnn nightly",
arch: wormhole_b0,
runs-on: ["cloud-virtual-machine", "N150", "in-service"],
cmd: tests/scripts/single_card/nightly/run_ttnn.sh,
timeout: 70
},
{
name: "WH N300 ttnn nightly",
arch: wormhole_b0,
runs-on: ["cloud-virtual-machine", "N300", "in-service"],
cmd: tests/scripts/single_card/nightly/run_ttnn.sh,
timeout: 70
},
{
name: "GS-only models",
arch: grayskull,
runs-on: ["cloud-virtual-machine", "E150", "in-service"],
cmd: tests/scripts/single_card/nightly/run_gs_only.sh,
timeout: 40
},
{
name: "API tests GS",
arch: grayskull,
runs-on: ["cloud-virtual-machine", "E150", "in-service"],
cmd: ./tests/scripts/run_tests.sh --tt-arch grayskull --pipeline-type frequent_api --dispatch-mode fast,
timeout: 10
},
{
name: "API tests N300 WH B0",
arch: wormhole_b0,
runs-on: ["cloud-virtual-machine", "N300", "in-service"],
cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast,
timeout: 10
},
{
name: "API tests N150 WH B0",
arch: wormhole_b0,
runs-on: ["cloud-virtual-machine", "N150", "in-service"],
cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast,
timeout: 10
},
{
name: "[Unstable] N150 models",
arch: wormhole_b0,
runs-on: ["cloud-virtual-machine", "N150", "in-service"],
cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh,
timeout: 55
},
{
name: "[Unstable] N300 models",
arch: wormhole_b0,
runs-on: ["cloud-virtual-machine", "N300", "in-service"],
cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh,
timeout: 55
},
]
name: FD ${{ matrix.test-group.name }} ${{ matrix.test-group.arch }}
env:
ARCH_NAME: ${{ matrix.test-group.arch }}
LOGURU_LEVEL: INFO
LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
runs-on: ${{ matrix.test-group.runs-on }}
steps:
- uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
- uses: ./.github/actions/retry-command
with:
timeout-seconds: 100
max-retries: 10
backoff-seconds: 60
command: ./.github/scripts/cloud_utils/mount_weka.sh
- name: Set up dyanmic env vars for build
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
name: TTMetal_build_${{ matrix.test-group.arch }}
- name: Extract files
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run frequent reg tests scripts
timeout-minutes: ${{ matrix.test-group.timeout }}
run: |
source ${{ github.workspace }}/python_env/bin/activate
cd $TT_METAL_HOME
export PYTHONPATH=$TT_METAL_HOME
${{ matrix.test-group.cmd }}
- uses: ./.github/actions/upload-artifact-with-job-uuid
if: ${{ !cancelled() }}
with:
path: |
generated/test_reports/
prefix: "test_reports_"
nightly-wh-models:
needs: build-artifact
strategy:
# Do not fail-fast because we need to ensure all tests go to completion
# so we try not to get hanging machines
fail-fast: false
matrix:
card: [N150, N300]
model: [common_models, functional_unet, llama31_8b, mamba, mistral7b, mistral7b_eth, resnet50]
name: Nightly ${{ matrix.card }} ${{ matrix.model }}
env:
ARCH_NAME: wormhole_b0
LOGURU_LEVEL: INFO
LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
runs-on: ["cloud-virtual-machine", "in-service", "${{ matrix.card }}"]
steps:
- uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
- uses: ./.github/actions/retry-command
with:
timeout-seconds: 100
max-retries: 10
backoff-seconds: 60
command: ./.github/scripts/cloud_utils/mount_weka.sh
- name: Set up dyanmic env vars for build
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- name: Set up WH_ARCH_YAML for eth-enabled models
if: ${{ matrix.model != 'mistral7b' }}
run: |
echo "WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
name: TTMetal_build_wormhole_b0
- name: Extract files
run: tar -xvf ttm_wormhole_b0.tar
- uses: ./.github/actions/install-python-deps
- name: Run frequent reg tests scripts
timeout-minutes: 50
run: |
source ${{ github.workspace }}/python_env/bin/activate
cd $TT_METAL_HOME
export PYTHONPATH=$TT_METAL_HOME
pytest -n auto tests/nightly/single_card/${{ matrix.model }}
- uses: ./.github/actions/upload-artifact-with-job-uuid
if: ${{ !cancelled() }}
with:
path: |
generated/test_reports/
prefix: "test_reports_"
uses: ./.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml
secrets: inherit
2 changes: 1 addition & 1 deletion .github/workflows/perf-device-models-impl.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: "(internal) Single-card Device perf regressions impl"
name: "[internal] Single-card Device perf regressions impl"

on:
workflow_call:
Expand Down
Loading

0 comments on commit 849b3e6

Please sign in to comment.