From 12137a3f86074c43e88ea05f391893f07d8c1a50 Mon Sep 17 00:00:00 2001 From: Bill Teng Date: Thu, 3 Oct 2024 19:57:01 +0000 Subject: [PATCH] #0: add T3k profiler and TG/TGG model perf workflows to new pipeline select workflows --- .github/workflows/pipeline-select-t3k.yaml | 10 +++ .github/workflows/pipeline-select.yaml | 18 +++++ .../workflows/t3000-profiler-tests-impl.yaml | 46 +++++++++++ .github/workflows/t3000-profiler-tests.yaml | 41 +--------- .../workflows/tg-model-perf-tests-impl.yaml | 79 +++++++++++++++++++ .github/workflows/tg-model-perf-tests.yaml | 74 +---------------- .../workflows/tgg-model-perf-tests-impl.yaml | 79 +++++++++++++++++++ .github/workflows/tgg-model-perf-tests.yaml | 74 +---------------- 8 files changed, 238 insertions(+), 183 deletions(-) create mode 100644 .github/workflows/t3000-profiler-tests-impl.yaml create mode 100644 .github/workflows/tg-model-perf-tests-impl.yaml create mode 100644 .github/workflows/tgg-model-perf-tests-impl.yaml diff --git a/.github/workflows/pipeline-select-t3k.yaml b/.github/workflows/pipeline-select-t3k.yaml index ddd2947319cd..3df726bdd6e9 100644 --- a/.github/workflows/pipeline-select-t3k.yaml +++ b/.github/workflows/pipeline-select-t3k.yaml @@ -36,6 +36,11 @@ on: required: false type: boolean default: false + t3000-profiler: + description: "T3000 profiler tests (requires tracy build)" + required: false + type: boolean + default: false run-name: ${{ inputs.description }} jobs: @@ -70,3 +75,8 @@ jobs: secrets: inherit uses: ./.github/workflows/t3000-model-perf-tests-impl.yaml if: ${{ inputs.t3000-model-perf }} + t3000-profiler-tests: + needs: build-artifact + secrets: inherit + uses: ./.github/workflows/t3000-profiler-tests-impl.yaml + if: ${{ inputs.t3000-profiler }} diff --git a/.github/workflows/pipeline-select.yaml b/.github/workflows/pipeline-select.yaml index 8f991bb0c5c0..62d3ce08cd0e 100644 --- a/.github/workflows/pipeline-select.yaml +++ b/.github/workflows/pipeline-select.yaml @@ -41,6 +41,10 @@ 
on: required: false type: boolean default: false + tgg-model-perf: + required: false + type: boolean + default: false tg-unit: required: false type: boolean @@ -49,6 +53,10 @@ on: required: false type: boolean default: false + tg-model-perf: + required: false + type: boolean + default: false run-name: ${{ inputs.description }} jobs: @@ -88,6 +96,11 @@ jobs: secrets: inherit uses: ./.github/workflows/tgg-frequent-tests-impl.yaml if: ${{ inputs.tgg-frequent }} + tgg-model-perf-tests: + needs: build-artifact + secrets: inherit + uses: ./.github/workflows/tgg-model-perf-tests-impl.yaml + if: ${{ inputs.tgg-model-perf }} tg-unit-tests: needs: build-artifact secrets: inherit @@ -98,3 +111,8 @@ jobs: secrets: inherit uses: ./.github/workflows/tg-frequent-tests-impl.yaml if: ${{ inputs.tg-frequent }} + tg-model-perf-tests: + needs: build-artifact + secrets: inherit + uses: ./.github/workflows/tg-model-perf-tests-impl.yaml + if: ${{ inputs.tg-model-perf }} diff --git a/.github/workflows/t3000-profiler-tests-impl.yaml b/.github/workflows/t3000-profiler-tests-impl.yaml new file mode 100644 index 000000000000..571ac1628e37 --- /dev/null +++ b/.github/workflows/t3000-profiler-tests-impl.yaml @@ -0,0 +1,46 @@ +name: "[internal] T3000 profiler tests impl" + +on: + workflow_call: + +jobs: + t3000-profiler-tests: + strategy: + fail-fast: false + matrix: + test-group: [ + { + name: "T3000 profiler tests", + arch: wormhole_b0, + runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-perf"], + cmd: './tests/scripts/run_profiler_regressions.sh' + }, + ] + name: ${{ matrix.test-group.name }} + env: + TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} + ARCH_NAME: ${{ matrix.test-group.arch }} + LOGURU_LEVEL: INFO + LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib + environment: dev + runs-on: ${{ matrix.test-group.runs-on }} + steps: + - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 + - name: Set up dynamic env vars for build + run: | + 
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV + - uses: actions/download-artifact@v4 + with: + name: TTMetal_build_${{ matrix.test-group.arch }}_profiler + - name: Extract files + run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar + - uses: ./.github/actions/install-python-deps + - name: Run profiler regression tests + timeout-minutes: 30 + run: | + ./tests/scripts/run_profiler_regressions.sh + - uses: ./.github/actions/slack-report + if: ${{ failure() }} + with: + slack_webhook_url: ${{ secrets.SLACK_WEBHOOK_URL }} + owner: U03BJ1L3LUQ # Mo Memarian diff --git a/.github/workflows/t3000-profiler-tests.yaml b/.github/workflows/t3000-profiler-tests.yaml index b63ecedf2131..ccc9dda28762 100644 --- a/.github/workflows/t3000-profiler-tests.yaml +++ b/.github/workflows/t3000-profiler-tests.yaml @@ -15,42 +15,5 @@ jobs: secrets: inherit t3000-profiler-tests: needs: build-artifact-profiler - strategy: - fail-fast: false - matrix: - test-group: [ - { - name: "T3000 profiler tests", - arch: wormhole_b0, - runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-perf"], - cmd: './tests/scripts/run_profiler_regressions.sh' - }, - ] - name: ${{ matrix.test-group.name }} - env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} - ARCH_NAME: ${{ matrix.test-group.arch }} - LOGURU_LEVEL: INFO - LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib - environment: dev - runs-on: ${{ matrix.test-group.runs-on }} - steps: - - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - - name: Set up dynamic env vars for build - run: | - echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV - - uses: actions/download-artifact@v4 - with: - name: TTMetal_build_${{ matrix.test-group.arch }}_profiler - - name: Extract files - run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - - uses: ./.github/actions/install-python-deps - - name: Run profiler regression tests - timeout-minutes: 30 - run: | - ./tests/scripts/run_profiler_regressions.sh - - uses: 
./.github/actions/slack-report - if: ${{ failure() }} - with: - slack_webhook_url: ${{ secrets.SLACK_WEBHOOK_URL }} - owner: U03BJ1L3LUQ # Mo Memarian + secrets: inherit + uses: ./.github/workflows/t3000-profiler-tests-impl.yaml diff --git a/.github/workflows/tg-model-perf-tests-impl.yaml b/.github/workflows/tg-model-perf-tests-impl.yaml new file mode 100644 index 000000000000..dd10b6109a9c --- /dev/null +++ b/.github/workflows/tg-model-perf-tests-impl.yaml @@ -0,0 +1,79 @@ +name: "[internal] TG model perf tests impl" + +on: + workflow_call: + +jobs: + tg-model-perf-tests: + strategy: + fail-fast: false + matrix: + test-group: [ + { + name: "TG LLM model perf tests", + model-type: "LLM", + arch: wormhole_b0, + runs-on: ["arch-wormhole_b0", "config-tg", "in-service", "bare-metal", "pipeline-perf"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type llm_model_perf_tg_device --dispatch-mode ""' + }, + { + name: "TG CNN model perf tests", + model-type: "CNN", + arch: wormhole_b0, + runs-on: ["arch-wormhole_b0", "config-tg", "in-service", "bare-metal", "pipeline-perf"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type cnn_model_perf_tg_device --dispatch-mode ""' + }, + ] + name: ${{ matrix.test-group.name }} + env: + TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} + ARCH_NAME: ${{ matrix.test-group.arch }} + LOGURU_LEVEL: INFO + LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib + environment: dev + runs-on: ${{ matrix.test-group.runs-on }} + steps: + - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 + - name: Enable performance mode + run: | + sudo cpupower frequency-set -g performance + - name: Ensure weka mount is active + run: | + sudo systemctl restart mnt-MLPerf.mount + sudo /etc/rc.local + ls -al /mnt/MLPerf/bit_error_tests + - name: Set up dynamic env vars for build + run: | + echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV + echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV + - uses: 
actions/download-artifact@v4 + with: + name: TTMetal_build_${{ matrix.test-group.arch }} + - name: Extract files + run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar + - uses: ./.github/actions/install-python-deps + - name: Run model perf regression tests + timeout-minutes: 60 + run: | + source ${{ github.workspace }}/python_env/bin/activate + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + ${{ matrix.test-group.cmd }} + - name: Check perf report exists + id: check-perf-report + if: ${{ !cancelled() }} + run: | + ls -hal + export PERF_REPORT_FILENAME=Models_Perf_$(date +%Y_%m_%d).csv + ls -hal $PERF_REPORT_FILENAME + echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT" + - name: Upload perf report + if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }} + uses: actions/upload-artifact@v4 + with: + name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-tg # literal suffix, since this matrix defines no machine-type key and the TG and TGG artifact names must differ when both run in one pipeline-select run + path: "${{ steps.check-perf-report.outputs.perf_report_filename }}" + - name: Disable performance mode + if: always() + run: | + sudo cpupower frequency-set -g ondemand diff --git a/.github/workflows/tg-model-perf-tests.yaml b/.github/workflows/tg-model-perf-tests.yaml index 0dd1580e3715..a813b7636021 100644 --- a/.github/workflows/tg-model-perf-tests.yaml +++ b/.github/workflows/tg-model-perf-tests.yaml @@ -13,75 +13,5 @@ jobs: secrets: inherit tg-model-perf-tests: needs: build-artifact - strategy: - fail-fast: false - matrix: - test-group: [ - { - name: "TG LLM model perf tests", - model-type: "LLM", - arch: wormhole_b0, - runs-on: ["arch-wormhole_b0", "config-tg", "in-service", "bare-metal", "pipeline-perf"], - cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type llm_model_perf_tg_device --dispatch-mode ""' - }, - { - name: "TG CNN model perf tests", - model-type: "CNN", - arch: 
wormhole_b0, - runs-on: ["arch-wormhole_b0", "config-tg", "in-service", "bare-metal", 
"pipeline-perf"], - cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type cnn_model_perf_tg_device --dispatch-mode ""' - }, - ] - name: ${{ matrix.test-group.name }} - env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} - ARCH_NAME: ${{ matrix.test-group.arch }} - LOGURU_LEVEL: INFO - LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib - environment: dev - runs-on: ${{ matrix.test-group.runs-on }} - steps: - - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - - name: Enable performance mode - run: | - sudo cpupower frequency-set -g performance - - name: Ensure weka mount is active - run: | - sudo systemctl restart mnt-MLPerf.mount - sudo /etc/rc.local - ls -al /mnt/MLPerf/bit_error_tests - - name: Set up dynamic env vars for build - run: | - echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV - echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV - - uses: actions/download-artifact@v4 - with: - name: TTMetal_build_${{ matrix.test-group.arch }} - - name: Extract files - run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - - uses: ./.github/actions/install-python-deps - - name: Run model perf regression tests - timeout-minutes: 60 - run: | - source ${{ github.workspace }}/python_env/bin/activate - cd $TT_METAL_HOME - export PYTHONPATH=$TT_METAL_HOME - ${{ matrix.test-group.cmd }} - - name: Check perf report exists - id: check-perf-report - if: ${{ !cancelled() }} - run: | - ls -hal - export PERF_REPORT_FILENAME=Models_Perf_$(date +%Y_%m_%d).csv - ls -hal $PERF_REPORT_FILENAME - echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT" - - name: Upload perf report - if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }} - uses: actions/upload-artifact@v4 - with: - name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-${{ matrix.test-group.machine-type }} - path: "${{ steps.check-perf-report.outputs.perf_report_filename }}" - - name: Disable performance mode - if: 
always() - run: | - sudo cpupower frequency-set -g ondemand + secrets: inherit + uses: ./.github/workflows/tg-model-perf-tests-impl.yaml diff --git a/.github/workflows/tgg-model-perf-tests-impl.yaml b/.github/workflows/tgg-model-perf-tests-impl.yaml new file mode 100644 index 000000000000..f3d44f2e2ba5 --- /dev/null +++ b/.github/workflows/tgg-model-perf-tests-impl.yaml @@ -0,0 +1,79 @@ +name: "[internal] TGG model perf tests impl" + +on: + workflow_call: + +jobs: + tgg-model-perf-tests: + strategy: + fail-fast: false + matrix: + test-group: [ + { + name: "TGG LLM model perf tests", + model-type: "LLM", + arch: wormhole_b0, + runs-on: ["arch-wormhole_b0", "config-tgg", "in-service", "bare-metal", "pipeline-perf"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type llm_model_perf_tgg_device --dispatch-mode ""' + }, + { + name: "TGG CNN model perf tests", + model-type: "CNN", + arch: wormhole_b0, + runs-on: ["arch-wormhole_b0", "config-tgg", "in-service", "bare-metal", "pipeline-perf"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type cnn_model_perf_tgg_device --dispatch-mode ""' + }, + ] + name: ${{ matrix.test-group.name }} + env: + TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} + ARCH_NAME: ${{ matrix.test-group.arch }} + LOGURU_LEVEL: INFO + LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib + environment: dev + runs-on: ${{ matrix.test-group.runs-on }} + steps: + - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 + - name: Enable performance mode + run: | + sudo cpupower frequency-set -g performance + - name: Ensure weka mount is active + run: | + sudo systemctl restart mnt-MLPerf.mount + sudo /etc/rc.local + ls -al /mnt/MLPerf/bit_error_tests + - name: Set up dynamic env vars for build + run: | + echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV + echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV + - uses: actions/download-artifact@v4 + with: + name: TTMetal_build_${{ matrix.test-group.arch }} + 
- name: Extract files + run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar + - uses: ./.github/actions/install-python-deps + - name: Run model perf regression tests + timeout-minutes: 60 + run: | + source ${{ github.workspace }}/python_env/bin/activate + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + ${{ matrix.test-group.cmd }} + - name: Check perf report exists + id: check-perf-report + if: ${{ !cancelled() }} + run: | + ls -hal + export PERF_REPORT_FILENAME=Models_Perf_$(date +%Y_%m_%d).csv + ls -hal $PERF_REPORT_FILENAME + echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT" + - name: Upload perf report + if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }} + uses: actions/upload-artifact@v4 + with: + name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-tgg # literal suffix, since this matrix defines no machine-type key and the TG and TGG artifact names must differ when both run in one pipeline-select run + path: "${{ steps.check-perf-report.outputs.perf_report_filename }}" + - name: Disable performance mode + if: always() + run: | + sudo cpupower frequency-set -g ondemand diff --git a/.github/workflows/tgg-model-perf-tests.yaml b/.github/workflows/tgg-model-perf-tests.yaml index 259fb3fa7b7b..c65fc7408d6b 100644 --- a/.github/workflows/tgg-model-perf-tests.yaml +++ b/.github/workflows/tgg-model-perf-tests.yaml @@ -13,75 +13,5 @@ jobs: secrets: inherit tgg-model-perf-tests: needs: build-artifact - strategy: - fail-fast: false - matrix: - test-group: [ - { - name: "TGG LLM model perf tests", - model-type: "LLM", - arch: wormhole_b0, - runs-on: ["arch-wormhole_b0", "config-tgg", "in-service", "bare-metal", "pipeline-perf"], - cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type llm_model_perf_tgg_device --dispatch-mode ""' - }, - { - name: "TGG CNN model perf tests", - model-type: "CNN", - arch: wormhole_b0, - runs-on: ["arch-wormhole_b0", "config-tgg", "in-service", 
"bare-metal", "pipeline-perf"], - cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type 
cnn_model_perf_tgg_device --dispatch-mode ""' - }, - ] - name: ${{ matrix.test-group.name }} - env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} - ARCH_NAME: ${{ matrix.test-group.arch }} - LOGURU_LEVEL: INFO - LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib - environment: dev - runs-on: ${{ matrix.test-group.runs-on }} - steps: - - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - - name: Enable performance mode - run: | - sudo cpupower frequency-set -g performance - - name: Ensure weka mount is active - run: | - sudo systemctl restart mnt-MLPerf.mount - sudo /etc/rc.local - ls -al /mnt/MLPerf/bit_error_tests - - name: Set up dynamic env vars for build - run: | - echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV - echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV - - uses: actions/download-artifact@v4 - with: - name: TTMetal_build_${{ matrix.test-group.arch }} - - name: Extract files - run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - - uses: ./.github/actions/install-python-deps - - name: Run model perf regression tests - timeout-minutes: 60 - run: | - source ${{ github.workspace }}/python_env/bin/activate - cd $TT_METAL_HOME - export PYTHONPATH=$TT_METAL_HOME - ${{ matrix.test-group.cmd }} - - name: Check perf report exists - id: check-perf-report - if: ${{ !cancelled() }} - run: | - ls -hal - export PERF_REPORT_FILENAME=Models_Perf_$(date +%Y_%m_%d).csv - ls -hal $PERF_REPORT_FILENAME - echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT" - - name: Upload perf report - if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }} - uses: actions/upload-artifact@v4 - with: - name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-${{ matrix.test-group.machine-type }} - path: "${{ steps.check-perf-report.outputs.perf_report_filename }}" - - name: Disable performance mode - if: always() - run: | - sudo cpupower frequency-set -g ondemand + secrets: inherit + uses: 
./.github/workflows/tgg-model-perf-tests-impl.yaml