Skip to content

Commit

Permalink
#0: add T3k profiler and TG/TGG model perf workflows to new pipeline …
Browse files Browse the repository at this point in the history
…select workflows
  • Loading branch information
TT-billteng committed Oct 3, 2024
1 parent b079f8c commit 12137a3
Show file tree
Hide file tree
Showing 8 changed files with 238 additions and 183 deletions.
10 changes: 10 additions & 0 deletions .github/workflows/pipeline-select-t3k.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ on:
required: false
type: boolean
default: false
t3000-profiler:
  # Optional boolean toggle (off by default) that gates the T3000 profiler job.
  type: boolean
  default: false
  required: false
  description: "T3000 profiler tests (requires tracy build)"

run-name: ${{ inputs.description }}
jobs:
Expand Down Expand Up @@ -70,3 +75,8 @@ jobs:
secrets: inherit
uses: ./.github/workflows/t3000-model-perf-tests-impl.yaml
if: ${{ inputs.t3000-model-perf }}
t3000-profiler-tests:
  # Runs only when the `t3000-profiler` input is set; waits for build-artifact
  # and then delegates to the reusable profiler workflow.
  if: ${{ inputs.t3000-profiler }}
  needs: build-artifact
  uses: ./.github/workflows/t3000-profiler-tests-impl.yaml
  secrets: inherit
18 changes: 18 additions & 0 deletions .github/workflows/pipeline-select.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ on:
required: false
type: boolean
default: false
tgg-model-perf:
  # Optional boolean toggle, off by default.
  type: boolean
  default: false
  required: false
tg-unit:
required: false
type: boolean
Expand All @@ -49,6 +53,10 @@ on:
required: false
type: boolean
default: false
tg-model-perf:
  # Optional boolean toggle, off by default.
  type: boolean
  default: false
  required: false

run-name: ${{ inputs.description }}
jobs:
Expand Down Expand Up @@ -88,6 +96,11 @@ jobs:
secrets: inherit
uses: ./.github/workflows/tgg-frequent-tests-impl.yaml
if: ${{ inputs.tgg-frequent }}
tgg-model-perf-tests:
  # Runs only when the `tgg-model-perf` input is set; waits for build-artifact
  # and then delegates to the reusable TGG model perf workflow.
  if: ${{ inputs.tgg-model-perf }}
  needs: build-artifact
  uses: ./.github/workflows/tgg-model-perf-tests-impl.yaml
  secrets: inherit
tg-unit-tests:
needs: build-artifact
secrets: inherit
Expand All @@ -98,3 +111,8 @@ jobs:
secrets: inherit
uses: ./.github/workflows/tg-frequent-tests-impl.yaml
if: ${{ inputs.tg-frequent }}
tg-model-perf-tests:
  # Runs only when the `tg-model-perf` input is set; waits for build-artifact
  # and then delegates to the reusable TG model perf workflow.
  if: ${{ inputs.tg-model-perf }}
  needs: build-artifact
  uses: ./.github/workflows/tg-model-perf-tests-impl.yaml
  secrets: inherit
46 changes: 46 additions & 0 deletions .github/workflows/t3000-profiler-tests-impl.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Reusable workflow (workflow_call only): runs the profiler regression script
# on a T3000 runner against a profiler build artifact.
name: "[internal] T3000 profiler tests impl"

on:
  workflow_call:

jobs:
  t3000-profiler-tests:
    strategy:
      fail-fast: false
      matrix:
        test-group:
          - name: "T3000 profiler tests"
            arch: wormhole_b0
            runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-perf"]
            cmd: './tests/scripts/run_profiler_regressions.sh'
    name: ${{ matrix.test-group.name }}
    env:
      TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
      ARCH_NAME: ${{ matrix.test-group.arch }}
      LOGURU_LEVEL: INFO
      LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
    environment: dev
    runs-on: ${{ matrix.test-group.runs-on }}
    steps:
      - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
      - name: Set up dynamic env vars for build
        run: |
          echo "TT_METAL_HOME=$(pwd)" >> "$GITHUB_ENV"
      # NOTE(review): the "_profiler" artifact is presumably uploaded by the
      # build job the caller lists under `needs` - confirm the caller builds it.
      - uses: actions/download-artifact@v4
        with:
          name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
      - name: Extract files
        run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
      - uses: ./.github/actions/install-python-deps
      - name: Run profiler regression tests
        timeout-minutes: 30
        # Use the matrix-defined command so the script path lives in one place.
        run: |
          ${{ matrix.test-group.cmd }}
      # Notify the owner on Slack only when an earlier step failed.
      - uses: ./.github/actions/slack-report
        if: ${{ failure() }}
        with:
          slack_webhook_url: ${{ secrets.SLACK_WEBHOOK_URL }}
          owner: U03BJ1L3LUQ  # Mo Memarian
41 changes: 2 additions & 39 deletions .github/workflows/t3000-profiler-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,42 +15,5 @@ jobs:
secrets: inherit
# Diff hunk for the `t3000-profiler-tests` job (2 additions, 39 deletions per
# the file header): the inline job body is replaced by a reusable-workflow call.
t3000-profiler-tests:
needs: build-artifact-profiler
# --- Removed by this commit: inline job definition. The same lines appear as
# --- the job body of the new t3000-profiler-tests-impl.yaml.
strategy:
fail-fast: false
matrix:
test-group: [
{
name: "T3000 profiler tests",
arch: wormhole_b0,
runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-perf"],
cmd: './tests/scripts/run_profiler_regressions.sh'
},
]
name: ${{ matrix.test-group.name }}
env:
TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
ARCH_NAME: ${{ matrix.test-group.arch }}
LOGURU_LEVEL: INFO
LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
environment: dev
runs-on: ${{ matrix.test-group.runs-on }}
steps:
- uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
- name: Set up dynamic env vars for build
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
- name: Extract files
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run profiler regression tests
timeout-minutes: 30
run: |
./tests/scripts/run_profiler_regressions.sh
- uses: ./.github/actions/slack-report
if: ${{ failure() }}
with:
slack_webhook_url: ${{ secrets.SLACK_WEBHOOK_URL }}
owner: U03BJ1L3LUQ # Mo Memarian
# --- Added by this commit: delegate to the reusable implementation workflow,
# --- passing the caller's secrets through.
secrets: inherit
uses: ./.github/workflows/t3000-profiler-tests-impl.yaml
79 changes: 79 additions & 0 deletions .github/workflows/tg-model-perf-tests-impl.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Reusable workflow (workflow_call only): runs the TG LLM and CNN model perf
# pipelines on TG runners and uploads the resulting perf report CSV.
name: "[internal] TG model perf tests impl"

on:
  workflow_call:

jobs:
  tg-model-perf-tests:
    strategy:
      fail-fast: false
      matrix:
        test-group:
          - name: "TG LLM model perf tests"
            model-type: "LLM"
            # Used in the perf-report artifact name. Without it the name ended
            # in a bare "-" and matched the TGG workflow's artifact name, which
            # upload-artifact@v4 rejects when both run in the same workflow run.
            machine-type: "tg"
            arch: wormhole_b0
            runs-on: ["arch-wormhole_b0", "config-tg", "in-service", "bare-metal", "pipeline-perf"]
            cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type llm_model_perf_tg_device --dispatch-mode ""'
          - name: "TG CNN model perf tests"
            model-type: "CNN"
            machine-type: "tg"
            arch: wormhole_b0
            runs-on: ["arch-wormhole_b0", "config-tg", "in-service", "bare-metal", "pipeline-perf"]
            cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type cnn_model_perf_tg_device --dispatch-mode ""'
    name: ${{ matrix.test-group.name }}
    env:
      TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
      ARCH_NAME: ${{ matrix.test-group.arch }}
      LOGURU_LEVEL: INFO
      LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
    environment: dev
    runs-on: ${{ matrix.test-group.runs-on }}
    steps:
      - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
      # Switch the CPU frequency governor to "performance" for the run; the
      # final step switches it to "ondemand" again.
      - name: Enable performance mode
        run: |
          sudo cpupower frequency-set -g performance
      - name: Ensure weka mount is active
        run: |
          sudo systemctl restart mnt-MLPerf.mount
          sudo /etc/rc.local
          ls -al /mnt/MLPerf/bit_error_tests
      - name: Set up dynamic env vars for build
        run: |
          echo "TT_METAL_HOME=$(pwd)" >> "$GITHUB_ENV"
          echo "PYTHONPATH=$(pwd)" >> "$GITHUB_ENV"
      - uses: actions/download-artifact@v4
        with:
          name: TTMetal_build_${{ matrix.test-group.arch }}
      - name: Extract files
        run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
      - uses: ./.github/actions/install-python-deps
      - name: Run model perf regression tests
        timeout-minutes: 60
        run: |
          source ${{ github.workspace }}/python_env/bin/activate
          cd "$TT_METAL_HOME"
          export PYTHONPATH="$TT_METAL_HOME"
          ${{ matrix.test-group.cmd }}
      # Runs even if the tests failed (but not if cancelled). `ls` fails the
      # step when today's report file is missing, which skips the upload below.
      - name: Check perf report exists
        id: check-perf-report
        if: ${{ !cancelled() }}
        run: |
          ls -hal
          export PERF_REPORT_FILENAME=Models_Perf_$(date +%Y_%m_%d).csv
          ls -hal "$PERF_REPORT_FILENAME"
          echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT"
      - name: Upload perf report
        if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
        uses: actions/upload-artifact@v4
        with:
          name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-${{ matrix.test-group.machine-type }}
          path: "${{ steps.check-perf-report.outputs.perf_report_filename }}"
      - name: Disable performance mode
        if: always()
        run: |
          sudo cpupower frequency-set -g ondemand
74 changes: 2 additions & 72 deletions .github/workflows/tg-model-perf-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,75 +13,5 @@ jobs:
secrets: inherit
# Diff hunk for the `tg-model-perf-tests` job (2 additions, 72 deletions per
# the file header): the inline job body is replaced by a reusable-workflow call.
tg-model-perf-tests:
needs: build-artifact
# --- Removed by this commit: inline job definition. The same lines appear as
# --- the job body of the new tg-model-perf-tests-impl.yaml.
strategy:
fail-fast: false
matrix:
test-group: [
{
name: "TG LLM model perf tests",
model-type: "LLM",
arch: wormhole_b0,
runs-on: ["arch-wormhole_b0", "config-tg", "in-service", "bare-metal", "pipeline-perf"],
cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type llm_model_perf_tg_device --dispatch-mode ""'
},
{
name: "TG CNN model perf tests",
model-type: "CNN",
arch: wormhole_b0,
runs-on: ["arch-wormhole_b0", "config-tg", "in-service", "bare-metal", "pipeline-perf"],
cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type cnn_model_perf_tg_device --dispatch-mode ""'
},
]
name: ${{ matrix.test-group.name }}
env:
TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
ARCH_NAME: ${{ matrix.test-group.arch }}
LOGURU_LEVEL: INFO
LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
environment: dev
runs-on: ${{ matrix.test-group.runs-on }}
steps:
- uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
- name: Enable performance mode
run: |
sudo cpupower frequency-set -g performance
- name: Ensure weka mount is active
run: |
sudo systemctl restart mnt-MLPerf.mount
sudo /etc/rc.local
ls -al /mnt/MLPerf/bit_error_tests
- name: Set up dynamic env vars for build
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
name: TTMetal_build_${{ matrix.test-group.arch }}
- name: Extract files
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run model perf regression tests
timeout-minutes: 60
run: |
source ${{ github.workspace }}/python_env/bin/activate
cd $TT_METAL_HOME
export PYTHONPATH=$TT_METAL_HOME
${{ matrix.test-group.cmd }}
- name: Check perf report exists
id: check-perf-report
if: ${{ !cancelled() }}
run: |
ls -hal
export PERF_REPORT_FILENAME=Models_Perf_$(date +%Y_%m_%d).csv
ls -hal $PERF_REPORT_FILENAME
echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT"
- name: Upload perf report
if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
uses: actions/upload-artifact@v4
with:
name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-${{ matrix.test-group.machine-type }}
path: "${{ steps.check-perf-report.outputs.perf_report_filename }}"
- name: Disable performance mode
if: always()
# --- Added by this commit: delegate to the reusable implementation workflow,
# --- passing the caller's secrets through.
run: |
sudo cpupower frequency-set -g ondemand
secrets: inherit
uses: ./.github/workflows/tg-model-perf-tests-impl.yaml
79 changes: 79 additions & 0 deletions .github/workflows/tgg-model-perf-tests-impl.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Reusable workflow (workflow_call only): runs the TGG LLM and CNN model perf
# pipelines on TGG runners and uploads the resulting perf report CSV.
name: "[internal] TGG model perf tests impl"

on:
  workflow_call:

jobs:
  tgg-model-perf-tests:
    strategy:
      fail-fast: false
      matrix:
        test-group:
          - name: "TGG LLM model perf tests"
            model-type: "LLM"
            # Used in the perf-report artifact name. Without it the name ended
            # in a bare "-" and matched the TG workflow's artifact name, which
            # upload-artifact@v4 rejects when both run in the same workflow run.
            machine-type: "tgg"
            arch: wormhole_b0
            runs-on: ["arch-wormhole_b0", "config-tgg", "in-service", "bare-metal", "pipeline-perf"]
            cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type llm_model_perf_tgg_device --dispatch-mode ""'
          - name: "TGG CNN model perf tests"
            model-type: "CNN"
            machine-type: "tgg"
            arch: wormhole_b0
            runs-on: ["arch-wormhole_b0", "config-tgg", "in-service", "bare-metal", "pipeline-perf"]
            cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type cnn_model_perf_tgg_device --dispatch-mode ""'
    name: ${{ matrix.test-group.name }}
    env:
      TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
      ARCH_NAME: ${{ matrix.test-group.arch }}
      LOGURU_LEVEL: INFO
      LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
    environment: dev
    runs-on: ${{ matrix.test-group.runs-on }}
    steps:
      - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
      # Switch the CPU frequency governor to "performance" for the run; the
      # final step switches it to "ondemand" again.
      - name: Enable performance mode
        run: |
          sudo cpupower frequency-set -g performance
      - name: Ensure weka mount is active
        run: |
          sudo systemctl restart mnt-MLPerf.mount
          sudo /etc/rc.local
          ls -al /mnt/MLPerf/bit_error_tests
      - name: Set up dynamic env vars for build
        run: |
          echo "TT_METAL_HOME=$(pwd)" >> "$GITHUB_ENV"
          echo "PYTHONPATH=$(pwd)" >> "$GITHUB_ENV"
      - uses: actions/download-artifact@v4
        with:
          name: TTMetal_build_${{ matrix.test-group.arch }}
      - name: Extract files
        run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
      - uses: ./.github/actions/install-python-deps
      - name: Run model perf regression tests
        timeout-minutes: 60
        run: |
          source ${{ github.workspace }}/python_env/bin/activate
          cd "$TT_METAL_HOME"
          export PYTHONPATH="$TT_METAL_HOME"
          ${{ matrix.test-group.cmd }}
      # Runs even if the tests failed (but not if cancelled). `ls` fails the
      # step when today's report file is missing, which skips the upload below.
      - name: Check perf report exists
        id: check-perf-report
        if: ${{ !cancelled() }}
        run: |
          ls -hal
          export PERF_REPORT_FILENAME=Models_Perf_$(date +%Y_%m_%d).csv
          ls -hal "$PERF_REPORT_FILENAME"
          echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT"
      - name: Upload perf report
        if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
        uses: actions/upload-artifact@v4
        with:
          name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-${{ matrix.test-group.machine-type }}
          path: "${{ steps.check-perf-report.outputs.perf_report_filename }}"
      - name: Disable performance mode
        if: always()
        run: |
          sudo cpupower frequency-set -g ondemand
Loading

0 comments on commit 12137a3

Please sign in to comment.