Add new "choose your own pipeline" workflow (#13230)

* #0: split perf-models and nightly workflows into internal impl workflows * #0: add choose your own pipeline workflow * #0: fix typo in nightly workflow impl * #0: use profiler build for device perf tests * #0: create separate T3K frequent tests impl and add it to the "Choose your pipeline" workflow * #0: add t3k nightly tests to "Choose your pipeline" workflow * #0: fix name of new internal workflow impls * #0: add T3K unit tests to pipeline select * #0: fix name * #0: add T3K model perf tests to pipeline select workflow * #0: add tgg unit tests * #0: add tg unit tests
tenstorrent · Sep 28, 2024 · 849b3e6 · 849b3e6
1 parent fc3cec8
commit 849b3e6
Show file tree

Hide file tree

Showing 20 changed files with 684 additions and 541 deletions.
diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml
@@ -0,0 +1,169 @@
+name: "[internal] Nightly fast dispatch tests impl"
+
+on:
+  workflow_call:
+
+jobs:
+  fd-nightly:
+    strategy:
+      # Do not fail-fast because we need to ensure all tests go to completion
+      # so we try not to get hanging machines
+      fail-fast: false
+      matrix:
+        test-group:
+          [
+            {
+              name: "Common models GS",
+              arch: grayskull,
+              runs-on: ["cloud-virtual-machine", "E150", "in-service"],
+              cmd: tests/scripts/single_card/nightly/run_common_models.sh,
+              timeout: 40
+            },
+            {
+              name: "GS ttnn nightly",
+              arch: grayskull,
+              runs-on: ["cloud-virtual-machine", "E150", "in-service"],
+              cmd: tests/scripts/single_card/nightly/run_ttnn.sh,
+              timeout: 40
+            },
+            {
+              name: "WH N150 ttnn nightly",
+              arch: wormhole_b0,
+              runs-on: ["cloud-virtual-machine", "N150", "in-service"],
+              cmd: tests/scripts/single_card/nightly/run_ttnn.sh,
+              timeout: 70
+            },
+            {
+              name: "WH N300 ttnn nightly",
+              arch: wormhole_b0,
+              runs-on: ["cloud-virtual-machine", "N300", "in-service"],
+              cmd: tests/scripts/single_card/nightly/run_ttnn.sh,
+              timeout: 70
+            },
+            {
+              name: "GS-only models",
+              arch: grayskull,
+              runs-on: ["cloud-virtual-machine", "E150", "in-service"],
+              cmd: tests/scripts/single_card/nightly/run_gs_only.sh,
+              timeout: 40
+            },
+            {
+              name: "API tests GS",
+              arch: grayskull,
+              runs-on: ["cloud-virtual-machine", "E150", "in-service"],
+              cmd: ./tests/scripts/run_tests.sh --tt-arch grayskull --pipeline-type frequent_api --dispatch-mode fast,
+              timeout: 10
+            },
+            {
+              name: "API tests N300 WH B0",
+              arch: wormhole_b0,
+              runs-on: ["cloud-virtual-machine", "N300", "in-service"],
+              cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast,
+              timeout: 10
+            },
+            {
+              name: "API tests N150 WH B0",
+              arch: wormhole_b0,
+              runs-on: ["cloud-virtual-machine", "N150", "in-service"],
+              cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast,
+              timeout: 10
+            },
+            {
+              name: "[Unstable] N150 models",
+              arch: wormhole_b0,
+              runs-on: ["cloud-virtual-machine", "N150", "in-service"],
+              cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh,
+              timeout: 55
+            },
+            {
+              name: "[Unstable] N300 models",
+              arch: wormhole_b0,
+              runs-on: ["cloud-virtual-machine", "N300", "in-service"],
+              cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh,
+              timeout: 55
+            },
+          ]
+    name: FD ${{ matrix.test-group.name }} ${{ matrix.test-group.arch }}
+    env:
+      ARCH_NAME: ${{ matrix.test-group.arch }}
+      LOGURU_LEVEL: INFO
+      LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
+    runs-on: ${{ matrix.test-group.runs-on }}
+    steps:
+      - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
+      - uses: ./.github/actions/retry-command
+        with:
+          timeout-seconds: 100
+          max-retries: 10
+          backoff-seconds: 60
+          command: ./.github/scripts/cloud_utils/mount_weka.sh
+      - name: Set up dyanmic env vars for build
+        run: |
+          echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
+      - uses: actions/download-artifact@v4
+        with:
+          name: TTMetal_build_${{ matrix.test-group.arch }}
+      - name: Extract files
+        run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+      - uses: ./.github/actions/install-python-deps
+      - name: Run frequent reg tests scripts
+        timeout-minutes: ${{ matrix.test-group.timeout }}
+        run: |
+          source ${{ github.workspace }}/python_env/bin/activate
+          cd $TT_METAL_HOME
+          export PYTHONPATH=$TT_METAL_HOME
+          ${{ matrix.test-group.cmd }}
+      - uses: ./.github/actions/upload-artifact-with-job-uuid
+        if: ${{ !cancelled() }}
+        with:
+          path: |
+            generated/test_reports/
+          prefix: "test_reports_"
+  nightly-wh-models:
+    strategy:
+      # Do not fail-fast because we need to ensure all tests go to completion
+      # so we try not to get hanging machines
+      fail-fast: false
+      matrix:
+        card: [N150, N300]
+        model: [common_models, functional_unet, llama31_8b, mamba, mistral7b, mistral7b_eth, resnet50]
+    name: Nightly ${{ matrix.card }} ${{ matrix.model }}
+    env:
+      ARCH_NAME: wormhole_b0
+      LOGURU_LEVEL: INFO
+      LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
+    runs-on: ["cloud-virtual-machine", "in-service", "${{ matrix.card }}"]
+    steps:
+      - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
+      - uses: ./.github/actions/retry-command
+        with:
+          timeout-seconds: 100
+          max-retries: 10
+          backoff-seconds: 60
+          command: ./.github/scripts/cloud_utils/mount_weka.sh
+      - name: Set up dyanmic env vars for build
+        run: |
+          echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
+      - name: Set up WH_ARCH_YAML for eth-enabled models
+        if: ${{ matrix.model != 'mistral7b' }}
+        run: |
+          echo "WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml" >> $GITHUB_ENV
+      - uses: actions/download-artifact@v4
+        with:
+          name: TTMetal_build_wormhole_b0
+      - name: Extract files
+        run: tar -xvf ttm_wormhole_b0.tar
+      - uses: ./.github/actions/install-python-deps
+      - name: Run frequent reg tests scripts
+        timeout-minutes: 50
+        run: |
+          source ${{ github.workspace }}/python_env/bin/activate
+          cd $TT_METAL_HOME
+          export PYTHONPATH=$TT_METAL_HOME
+          pytest -n auto tests/nightly/single_card/${{ matrix.model }}
+      - uses: ./.github/actions/upload-artifact-with-job-uuid
+        if: ${{ !cancelled() }}
+        with:
+          path: |
+            generated/test_reports/
+          prefix: "test_reports_"
diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models.yaml
@@ -12,166 +12,5 @@ jobs:
     secrets: inherit
   fd-nightly:
     needs: build-artifact
-    strategy:
-      # Do not fail-fast because we need to ensure all tests go to completion
-      # so we try not to get hanging machines
-      fail-fast: false
-      matrix:
-        test-group:
-          [
-            {
-              name: "Common models GS",
-              arch: grayskull,
-              runs-on: ["cloud-virtual-machine", "E150", "in-service"],
-              cmd: tests/scripts/single_card/nightly/run_common_models.sh,
-              timeout: 40
-            },
-            {
-              name: "GS ttnn nightly",
-              arch: grayskull,
-              runs-on: ["cloud-virtual-machine", "E150", "in-service"],
-              cmd: tests/scripts/single_card/nightly/run_ttnn.sh,
-              timeout: 40
-            },
-            {
-              name: "WH N150 ttnn nightly",
-              arch: wormhole_b0,
-              runs-on: ["cloud-virtual-machine", "N150", "in-service"],
-              cmd: tests/scripts/single_card/nightly/run_ttnn.sh,
-              timeout: 70
-            },
-            {
-              name: "WH N300 ttnn nightly",
-              arch: wormhole_b0,
-              runs-on: ["cloud-virtual-machine", "N300", "in-service"],
-              cmd: tests/scripts/single_card/nightly/run_ttnn.sh,
-              timeout: 70
-            },
-            {
-              name: "GS-only models",
-              arch: grayskull,
-              runs-on: ["cloud-virtual-machine", "E150", "in-service"],
-              cmd: tests/scripts/single_card/nightly/run_gs_only.sh,
-              timeout: 40
-            },
-            {
-              name: "API tests GS",
-              arch: grayskull,
-              runs-on: ["cloud-virtual-machine", "E150", "in-service"],
-              cmd: ./tests/scripts/run_tests.sh --tt-arch grayskull --pipeline-type frequent_api --dispatch-mode fast,
-              timeout: 10
-            },
-            {
-              name: "API tests N300 WH B0",
-              arch: wormhole_b0,
-              runs-on: ["cloud-virtual-machine", "N300", "in-service"],
-              cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast,
-              timeout: 10
-            },
-            {
-              name: "API tests N150 WH B0",
-              arch: wormhole_b0,
-              runs-on: ["cloud-virtual-machine", "N150", "in-service"],
-              cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast,
-              timeout: 10
-            },
-            {
-              name: "[Unstable] N150 models",
-              arch: wormhole_b0,
-              runs-on: ["cloud-virtual-machine", "N150", "in-service"],
-              cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh,
-              timeout: 55
-            },
-            {
-              name: "[Unstable] N300 models",
-              arch: wormhole_b0,
-              runs-on: ["cloud-virtual-machine", "N300", "in-service"],
-              cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh,
-              timeout: 55
-            },
-          ]
-    name: FD ${{ matrix.test-group.name }} ${{ matrix.test-group.arch }}
-    env:
-      ARCH_NAME: ${{ matrix.test-group.arch }}
-      LOGURU_LEVEL: INFO
-      LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
-    runs-on: ${{ matrix.test-group.runs-on }}
-    steps:
-      - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
-      - uses: ./.github/actions/retry-command
-        with:
-          timeout-seconds: 100
-          max-retries: 10
-          backoff-seconds: 60
-          command: ./.github/scripts/cloud_utils/mount_weka.sh
-      - name: Set up dyanmic env vars for build
-        run: |
-          echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
-      - uses: actions/download-artifact@v4
-        with:
-          name: TTMetal_build_${{ matrix.test-group.arch }}
-      - name: Extract files
-        run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
-      - uses: ./.github/actions/install-python-deps
-      - name: Run frequent reg tests scripts
-        timeout-minutes: ${{ matrix.test-group.timeout }}
-        run: |
-          source ${{ github.workspace }}/python_env/bin/activate
-          cd $TT_METAL_HOME
-          export PYTHONPATH=$TT_METAL_HOME
-          ${{ matrix.test-group.cmd }}
-      - uses: ./.github/actions/upload-artifact-with-job-uuid
-        if: ${{ !cancelled() }}
-        with:
-          path: |
-            generated/test_reports/
-          prefix: "test_reports_"
-  nightly-wh-models:
-    needs: build-artifact
-    strategy:
-      # Do not fail-fast because we need to ensure all tests go to completion
-      # so we try not to get hanging machines
-      fail-fast: false
-      matrix:
-        card: [N150, N300]
-        model: [common_models, functional_unet, llama31_8b, mamba, mistral7b, mistral7b_eth, resnet50]
-    name: Nightly ${{ matrix.card }} ${{ matrix.model }}
-    env:
-      ARCH_NAME: wormhole_b0
-      LOGURU_LEVEL: INFO
-      LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
-    runs-on: ["cloud-virtual-machine", "in-service", "${{ matrix.card }}"]
-    steps:
-      - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
-      - uses: ./.github/actions/retry-command
-        with:
-          timeout-seconds: 100
-          max-retries: 10
-          backoff-seconds: 60
-          command: ./.github/scripts/cloud_utils/mount_weka.sh
-      - name: Set up dyanmic env vars for build
-        run: |
-          echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
-      - name: Set up WH_ARCH_YAML for eth-enabled models
-        if: ${{ matrix.model != 'mistral7b' }}
-        run: |
-          echo "WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml" >> $GITHUB_ENV
-      - uses: actions/download-artifact@v4
-        with:
-          name: TTMetal_build_wormhole_b0
-      - name: Extract files
-        run: tar -xvf ttm_wormhole_b0.tar
-      - uses: ./.github/actions/install-python-deps
-      - name: Run frequent reg tests scripts
-        timeout-minutes: 50
-        run: |
-          source ${{ github.workspace }}/python_env/bin/activate
-          cd $TT_METAL_HOME
-          export PYTHONPATH=$TT_METAL_HOME
-          pytest -n auto tests/nightly/single_card/${{ matrix.model }}
-      - uses: ./.github/actions/upload-artifact-with-job-uuid
-        if: ${{ !cancelled() }}
-        with:
-          path: |
-            generated/test_reports/
-          prefix: "test_reports_"
+    uses: ./.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml
+    secrets: inherit
diff --git a/.github/workflows/perf-device-models-impl.yaml b/.github/workflows/perf-device-models-impl.yaml
@@ -1,4 +1,4 @@
-name: "(internal) Single-card Device perf regressions impl"
+name: "[internal] Single-card Device perf regressions impl"
 
 on:
   workflow_call: