Commit ad15612

Merge branch 'main' into npetrovic/leaky-relu

npetrovic-tenstorrent authored Oct 23, 2024
2 parents d45fe50 + e9272c1
Showing 201 changed files with 5,391 additions and 1,505 deletions.
8 changes: 7 additions & 1 deletion .github/workflows/_produce-data.yaml
@@ -12,6 +12,10 @@ on:
description: "Run attempt of the workflow run"
default: 1
type: number
upload_data:
description: "Upload data to datastore cluster for our dashboard"
default: false
type: boolean
workflow_run:
workflows:
- "All post-commit tests"
@@ -33,6 +37,8 @@ on:
- "(TGG) TGG unit tests"
- "(TGG) TGG demo tests"
- "(TGG) TGG frequent tests"
- "ttnn - Run sweeps"
- "Blackhole post-commit tests"
types:
- completed

@@ -111,7 +117,7 @@ jobs:
run: ls -hal
- name: Upload cicd data
uses: ./.github/actions/upload-data-via-sftp
if: ${{ github.event_name == 'workflow_run' }}
if: ${{ github.event_name == 'workflow_run' || inputs.upload_data }}
with:
ssh-private-key: ${{ secrets.SFTP_CICD_WRITER_KEY }}
sftp-batchfile: .github/actions/upload-data-via-sftp/cicd_data_batchfile.txt
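With the new `upload_data` input, a manually dispatched run can also publish to the datastore, e.g. `gh workflow run _produce-data.yaml -f upload_data=true` (a hypothetical invocation, assuming the workflow's inputs are exposed via `workflow_dispatch`).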
2 changes: 1 addition & 1 deletion .github/workflows/single-card-demo-tests.yaml
@@ -4,7 +4,7 @@ on:
workflow_dispatch:
workflow_call:
schedule:
- cron: "0 0 * * 1,2,3,4,5"
- cron: "0 */6 * * 1,2,3,4,5"
- cron: "0 */4 * * 0,6"
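# The weekday schedule now fires every 6 hours (minute 0 of 00:00/06:00/12:00/18:00, Mon-Fri)
# rather than once daily at midnight; weekend runs remain every 4 hours.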

jobs:
@@ -19,8 +19,6 @@ jobs:
fail-fast: false
matrix:
runner-info: [
{arch: grayskull, runs-on: ["pipeline-stress", "E150", "bare-metal", "in-service"], machine-type: "bare_metal", name: "E150"},
{arch: wormhole_b0, runs-on: ["pipeline-stress", "N300", "bare-metal", "in-service"], machine-type: "bare_metal", name: "N300"},
# E150
{arch: grayskull, runs-on: ["cloud-virtual-machine", "E150", "in-service"], machine-type: "virtual_machine", name: "E150"},
# N150
4 changes: 4 additions & 0 deletions .github/workflows/ttnn-run-sweeps.yaml
@@ -166,6 +166,10 @@ on:
- eltwise.unary.hardtanh.hardtanh_pytorch2
- eltwise.unary.leaky_relu.leaky_relu
- eltwise.unary.reglu.reglu
- eltwise.unary_complex.polar.polar
- eltwise.unary_complex.angle.angle
- eltwise.unary_complex.polar_bw.polar_bw
- eltwise.unary_complex.angle_bw.angle_bw
- eltwise.binary.subtract.subtract
- eltwise.binary.subtract.subtract_tensor_pytorch2
- eltwise.binary.multiply.multiply
3 changes: 2 additions & 1 deletion CODEOWNERS
@@ -22,7 +22,7 @@ third_party/ @tt-rkim @TT-billteng
MANIFEST.in @tt-rkim
setup.py @tt-rkim
pyproject.toml @tt-rkim @TT-billteng
requirements*.txt @tt-rkim @TT-billteng
requirements*.txt @tt-rkim @TT-billteng @ttmchiou
setup_hugepages.py @tt-rkim @TT-billteng

scripts/docker @TT-billteng
@@ -55,6 +55,7 @@ tt_metal/ @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @ubcheema
tt_metal/host_api.hpp @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @davorchap
tt_metal/impl/device/ @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @ubcheema @davorchap @cfjchu
tt_metal/distributed/ @cfjchu @aliuTT @tt-asaigal
tt_metal/**/requirements*.txt @tt-rkim @TT-billteng @ttmchiou

# metal - dispatch
tt_metal/impl/dispatch/kernels/packet_* @ubcheema @aliuTT
26 changes: 14 additions & 12 deletions README.md
@@ -21,21 +21,20 @@
---

## LLMs
| Model | Batch | Hardware | ttft (s) | t/s/u | Target<br>t/s/u | t/s | Release |
| Model | Batch | Hardware | ttft (ms) | t/s/u | Target<br>t/s/u | t/s | Release |
|---------------------------------------------------------------|-------|----------------------------------------------------------|----------|-------|-----------------|--------|---------------------------------------------------------------------------|
| [Falcon7B-decode](./models/demos/ttnn_falcon7b) | 32 | [e150](https://tenstorrent.com/hardware/grayskull) | | 4.2 | 4.4 | 134.4 | |
| [Falcon7B](./models/demos/wormhole/falcon7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.07 | 16.7 | 26 | 534.4 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
| [Falcon7B](./models/demos/wormhole/falcon7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 75 | 17.0 | 26 | 544.0 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) |
| [Mistral-7B](./models/demos/wormhole/mistral7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | | 9.9 | 25 | 316.8 | [v0.51.0-rc28](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc28) |
| [Mamba-2.8B](./models/demos/wormhole/mamba) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.04 | 12.3 | 41 | 393.6 | [v0.51.0-rc26](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc26) |
| [LLaMA-3.1-8B](./models/demos/wormhole/llama31_8b) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.20 | 21.4 | 23 | 21.4 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
| [Falcon7B (DP=8)](./models/demos/t3000/falcon7b) | 256 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 0.10 | 14.4 | 26 | 3686.4 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
| [LLaMA-2-70B - (TP=8)](./models/demos/t3000/llama2_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 0.19 | 15.1 | 20 | 483.2 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
| [LLaMA-3.1-70B (TP=8)](./models/demos/t3000/llama3_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 0.19 | 15.1 | 20 | 483.2 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
| [Falcon40B (TP=8)](./models/demos/t3000/falcon40b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | | 5.3 | 36 | 169.6 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
| [Mixtral7Bx8 (TP=8)](./models/demos/t3000/mixtral8x7b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 0.23 | 14.2 | 33 | 454.4 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
| [Falcon7B (DP=32)](./models/demos/tg/falcon7b) | 1024 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 0.24 | 4.4 | 26 | 4505.6 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
| [LLaMA-3.1-70B (DP=4, TP=8)](./models/demos/t3000/llama3_70b) | 128 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 0.19 | 14.3 | 20 | 1835.5 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
> **Last Update:** October 7, 2024
| [Mamba-2.8B](./models/demos/wormhole/mamba) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 48 | 12.3 | 41 | 393.6 | [v0.51.0-rc26](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc26) |
| [LLaMA-3.1-8B](./models/demos/wormhole/llama31_8b) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 291 | 22.9 | 23 | 22.9 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) |
| [Falcon7B (DP=8)](./models/demos/t3000/falcon7b) | 256 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 101 | 14.4 | 26 | 3686.4 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) |
| [LLaMA-3.1-70B (TP=8)](./models/demos/t3000/llama3_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 190 | 15.1 | 20 | 483.2 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) |
| [Falcon40B (TP=8)](./models/demos/t3000/falcon40b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | | 5.3 | 36 | 169.6 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) |
| [Mixtral7Bx8 (TP=8)](./models/demos/t3000/mixtral8x7b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 235 | 14.2 | 33 | 454.4 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) |
| [Falcon7B (DP=32)](./models/demos/tg/falcon7b) | 1024 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 242 | 4.4 | 26 | 4505.6 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) |
| [LLaMA-3.1-70B (DP=4, TP=8)](./models/demos/t3000/llama3_70b) | 128 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 190 | 14.3 | 20 | 1835.5 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
> **Last Update:** October 21, 2024
> **Notes:**
> - TP = Tensor Parallel, DP = Data Parallel; Defines parallelization factors across multiple devices.
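> - t/s = t/s/u × batch; for example, Falcon7B on n150: 17.0 t/s/u × 32 users = 544.0 t/s, matching the table.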
@@ -54,6 +53,8 @@
| [ViT](./models/demos/grayskull/vit) | 9 | [e150](https://tenstorrent.com/hardware/grayskull) | 1,360 | 2,000 | |
| [ViT](./models/demos/wormhole/vit) | 8 | [n150](https://tenstorrent.com/hardware/wormhole) | 912 | 1,600 | |
| [Stable Diffusion 1.4 (512x512)](./models/demos/wormhole/stable_diffusion) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.167 | 0.3 | |
| [U-Net](./models/experimental/functional_unet) | 2 | [n150](https://tenstorrent.com/hardware/wormhole) | 530 | 1000 | [v0.53.0-rc22](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc22) |


## NLPs
| Model | Batch | Hardware | sen/sec | Target sen/sec | Release |
@@ -70,6 +71,7 @@ For the latest model updates and features, please see [MODEL_UPDATES.md](models/
- [Advanced Performance Optimizations for Models](./tech_reports/AdvancedPerformanceOperationsForModels/AdvancedPerformanceOptimizationsForModels.md) (updated Oct 17th)
- [Programming Mesh of Devices](./tech_reports/Programming%20Mesh%20of%20Devices/Programming%20Mesh%20of%20Devices%20with%20TT-NN.md) (updated Sept 9th)
- [ViT Implementation in TT-NN on GS](./tech_reports/ViT-TTNN/vit.md) (updated Sept 22nd)
- [LLM Bring-up in TT-NN](./tech_reports/LLMs/llms.md) (updated Oct 29th)
---

<div align="center">
5 changes: 5 additions & 0 deletions models/MODEL_UPDATES.md
@@ -4,6 +4,11 @@
>
> Please refer to the front-page [README](../README.md) for the latest verified release for each model.
## October 21, 2024

### [Llama 3/3.1 - 70B](demos/t3000/llama3_70b)
- Enabled prefill workloads to pad to multiples of 1024 instead of powers of 2, improving overall performance for longer sequences

## October 7, 2024

### [Llama 3.1 - 8B](demos/wormhole/llama31_8b)
112 changes: 112 additions & 0 deletions tech_reports/LLMs/llms.md
@@ -0,0 +1,112 @@
# LLMs in TT-NN
Authors:
## Contents
- [LLMs in TT-NN](#llms-in-tt-nn)
  - [Contents](#contents)
  - [1. Overview](#1-overview)
  - [2. Modules](#2-modules)
    - [2.1 Embedding](#21-embedding)
    - [2.2 RoPE](#22-rope)
    - [2.3 Norm](#23-norm)
    - [2.4 Attention](#24-attention)
    - [2.5 MLP](#25-mlp)
    - [2.6 Decoder](#26-decoder)
    - [2.7 LM Head](#27-lm-head)
  - [3. Features](#3-features)
    - [3.1 Generative Decoding](#31-generative-decoding)
    - [3.2 Prefill and Decode](#32-prefill-and-decode)
    - [3.3 Multi-Device](#33-multi-device)
    - [3.4 Continuous Batching](#34-continuous-batching)
    - [3.5 vLLM Integration](#35-vllm-integration)
  - [4. Best Practices and Optimizations](#4-best-practices-and-optimizations)
    - [4.1 Tracing](#41-tracing)
    - [4.2 Async Mode](#42-async-mode)
    - [4.3 Multiple CQs](#43-multiple-cqs)
    - [4.4 Op Configs](#44-op-configs)
    - [4.5 Accuracy](#45-accuracy)
    - [4.6 Performance Analysis](#46-performance-analysis)
    - [4.7 Misc. Performance Optimizations](#47-misc-performance-optimizations)
    - [4.8 Module Tests](#48-module-tests)
    - [4.9 Performance Testing](#49-performance-testing)
    - [4.10 Common Pitfalls](#410-common-pitfalls)
      - [4.10.1 Error Messages](#4101-error-messages)
      - [4.10.2 Shard Spec Mismatches](#4102-shard-spec-mismatches)
      - [4.10.3 Ethernet Dispatch Cores](#4103-ethernet-dispatch-cores)
      - [4.10.4 Hangs](#4104-hangs)
        - [4.10.4.1 Tracing](#41041-tracing)
        - [4.10.4.2 Large Matmuls](#41042-large-matmuls)

## 1. Overview
## 2. Modules
### 2.1 Embedding
### 2.2 RoPE
- Iterative update system
- When to use our fused op
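As a reference for what a fused RoPE op computes, a minimal PyTorch sketch (function names are ours, not the TT-NN API):

```python
import torch

def precompute_rope(head_dim: int, max_seq_len: int, base: float = 10000.0):
    # Llama-style rotary tables: one angle per (position, channel pair).
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
    angles = torch.outer(torch.arange(max_seq_len).float(), inv_freq)
    return angles.cos(), angles.sin()  # each [max_seq_len, head_dim // 2]

def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # x: [batch, n_heads, seq_len, head_dim]; rotate each even/odd channel pair.
    x1, x2 = x[..., 0::2], x[..., 1::2]
    c, s = cos[: x.shape[-2]], sin[: x.shape[-2]]
    out = torch.empty_like(x)
    out[..., 0::2] = x1 * c - x2 * s
    out[..., 1::2] = x1 * s + x2 * c
    return out
```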
### 2.3 Norm
- Replicated layernorm vs distributed layernorm
- Layernorm/rmsnorm weights in row major / wrapped around tile size trick
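For reference, a plain RMSNorm in PyTorch; the distributed variant differs only in that each device holds a shard of the hidden dim, so the sum-of-squares statistic must be combined across devices before normalizing (a sketch, not the TT-NN implementation):

```python
import torch

def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # Normalize by the root-mean-square over the hidden dim, then apply the scale.
    return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) * weight

def partial_rms_stat(x_shard: torch.Tensor, hidden_size: int) -> torch.Tensor:
    # Distributed case: each device computes a partial statistic over its shard;
    # the partials are then summed across devices (e.g. gather + local add).
    return x_shard.pow(2).sum(-1, keepdim=True) / hidden_size
```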
### 2.4 Attention
- Flash Attention and Flash Decode
  - general description
  - limitations
  - which dims are parallelized
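A PyTorch reference for one decode step; flash decode computes the same result but parallelizes over batch, heads, and splits of the cached K/V sequence:

```python
import torch

def decode_attention(q, k_cache, v_cache, cur_len: int) -> torch.Tensor:
    # q: [batch, n_heads, 1, head_dim] -- one new token per user.
    # k_cache / v_cache: [batch, n_heads, max_seq_len, head_dim], valid up to cur_len.
    k, v = k_cache[:, :, :cur_len], v_cache[:, :, :cur_len]
    scores = (q @ k.transpose(-1, -2)) / (q.shape[-1] ** 0.5)
    return torch.softmax(scores, dim=-1) @ v  # [batch, n_heads, 1, head_dim]
```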
### 2.5 MLP
### 2.6 Decoder
### 2.7 LM Head
## 3. Features
### 3.1 Generative Decoding
### 3.2 Prefill and Decode
- submodules, tests
- how to combine prefill and decode
- slicing prefill to fit in L1
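The slicing idea, sketched in Python (`model.prefill` is hypothetical, standing in for a model-specific prefill forward pass):

```python
def sliced_prefill(model, tokens, chunk: int = 2048):
    # Run prefill in fixed-size slices so per-slice activations fit in L1.
    # The KV cache accumulates across slices; only the last slice's logits
    # are needed to sample the first decoded token.
    last_logits = None
    for start in range(0, tokens.shape[-1], chunk):
        last_logits = model.prefill(tokens[..., start:start + chunk], start_pos=start)
    return last_logits
```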
### 3.3 Multi-Device
- device mesh
- column parallel followed by row parallel
- sharding, CCL ops, reducing CCL overheads, etc.
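A column-parallel sketch against the TT-NN mesh APIs; treat the exact signatures as illustrative and check the current docs:

```python
import torch
import ttnn

mesh = ttnn.open_mesh_device(ttnn.MeshShape(1, 8))  # e.g. a T3000 with 8 chips

# Column-parallel: shard the weight's output features across the mesh, replicate the input.
w1 = ttnn.from_torch(torch.randn(4096, 14336), dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT,
                     device=mesh, mesh_mapper=ttnn.ShardTensorToMesh(mesh, dim=-1))
x = ttnn.from_torch(torch.randn(1, 1, 32, 4096), dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT,
                    device=mesh, mesh_mapper=ttnn.ReplicateTensorToMesh(mesh))

h = ttnn.matmul(x, w1)         # each device computes its slice of the hidden dim
h = ttnn.all_gather(h, dim=3)  # CCL op: reassemble the full hidden dim on every device
# A following row-parallel matmul instead yields partial sums, which must be summed
# across devices (an all-reduce, often composed from reduce-scatter + all-gather).
```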
### 3.4 Continuous Batching
- quick intro and how it is implemented in demos.
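The core idea in a toy Python loop (all names hypothetical): decode slots are independent, so a finished user's slot is refilled immediately rather than waiting for the whole batch to drain:

```python
def serve(model, waiting: list, max_batch: int = 32):
    slots = [None] * max_batch                     # one decode slot per user
    while waiting or any(s is not None for s in slots):
        for i, req in enumerate(slots):
            if req is None and waiting:
                slots[i] = waiting.pop(0)          # prefill the newcomer into slot i
        tokens = model.decode_step(slots)          # one batched decode step
        for i, req in enumerate(slots):
            if req is not None and req.done(tokens[i]):
                slots[i] = None                    # free the slot right away
```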
### 3.5 vLLM Integration
- Our vLLM repo and what's needed to integrate with it.
## 4. Best Practices and Optimizations
### 4.1 Tracing
- link to existing doc, why it helps decode more
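The capture/replay pattern, sketched with TT-NN's trace APIs as of this writing (verify names against the tracing doc):

```python
import ttnn

out = model.decode_step(inp)   # warm-up run: compiles kernels, allocates buffers
trace_id = ttnn.begin_trace_capture(device, cq_id=0)
out = model.decode_step(inp)   # ops are recorded into the trace, not executed eagerly
ttnn.end_trace_capture(device, trace_id, cq_id=0)

for _ in range(num_tokens):
    # Update inp's device buffer in place, then replay the captured graph. Replay
    # skips per-op host dispatch, which is what dominates decode-mode latency.
    ttnn.execute_trace(device, trace_id, cq_id=0, blocking=True)
ttnn.release_trace(device, trace_id)
```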
### 4.2 Async Mode
### 4.3 Multiple CQs
- how to feed output back to input and read output asynchronously
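A sketch of the event-ordered pattern, with input writes on CQ 1 overlapping op execution on CQ 0 (API names follow TT-NN's event support at the time of writing and may differ by version):

```python
import ttnn

op_event = ttnn.create_event(device)     # record once up front in a real loop,
write_event = ttnn.create_event(device)  # so the first wait does not block forever

ttnn.wait_for_event(1, op_event)                         # CQ1: wait until ops consumed the input
ttnn.copy_host_to_device_tensor(host_x, dev_x, cq_id=1)  # write next input on CQ1
ttnn.record_event(1, write_event)

ttnn.wait_for_event(0, write_event)                      # CQ0: wait for the fresh input
out = model.decode_step(dev_x)                           # run ops on CQ0
ttnn.record_event(0, op_event)                           # let CQ1 start the next write
```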
### 4.4 Op Configs
- Writing correct program configs and shard specs
- Deciding how many cores to run an op on
- Why did we use 16 cores for MLP
- Which matmul to use when (@Colman Glagovich)
  - 1d, 2d, dram-sharded, ...
- Implicitly padding weights in program config for matmuls
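For illustration, a DRAM-sharded matmul program config of the kind used in decode mode (field names follow TT-NN's matmul configs; the block sizes are placeholders that depend on your shapes and core grid):

```python
import ttnn

pc = ttnn.MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig(
    in0_block_w=4,    # K blocking per core, in tiles
    per_core_M=1,     # output rows per core, in tiles (batch 32 -> one tile row)
    per_core_N=4,     # output columns per core, in tiles
    fused_activation=None,
)
y = ttnn.matmul(x, w, program_config=pc)
```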
### 4.5 Accuracy
- How we measure it (PCC, perplexity, top-1/top-5, end-user tests, benchmarking)
- How much PCC is enough? Rules of thumb.
- Accuracy tests
- Debugging PCC issues
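PCC is the usual first-line check; a self-contained PyTorch version (the threshold in the comment is a common rule of thumb, not a hard spec):

```python
import torch

def pcc(a: torch.Tensor, b: torch.Tensor) -> float:
    # Pearson correlation between a device output and a fp32 reference.
    # Rule of thumb: investigate any full-layer PCC noticeably below ~0.99,
    # and bisect layer by layer to find where correlation first drops.
    a = a.flatten().float()
    b = b.flatten().float()
    a, b = a - a.mean(), b - b.mean()
    return float((a @ b) / (a.norm() * b.norm()))
```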
### 4.6 Performance Analysis
- Performance tooling, tracy
### 4.7 Misc. Performance Optimizations
- Which dim to shard matmuls on
- DRAM-sharding
- Avoiding sharded to interleaved calls
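On the last point, a sketch of keeping tensors sharded across consecutive ops instead of bouncing through interleaved layouts (memory and program configs elided):

```python
# Each sharded_to_interleaved / interleaved_to_sharded hop costs extra data movement,
# so shard once and keep consecutive ops consuming and producing sharded tensors.
x = ttnn.interleaved_to_sharded(x, sharded_memory_config)
x = ttnn.matmul(x, w1, program_config=pc1)  # sharded in, sharded out
x = ttnn.matmul(x, w2, program_config=pc2)  # still no interleaved round trip
x = ttnn.sharded_to_interleaved(x, ttnn.DRAM_MEMORY_CONFIG)  # convert once at the end
```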
### 4.8 Module Tests
### 4.9 Performance Testing
### 4.10 Common Pitfalls
#### 4.10.1 Error Messages
- Running out of L1
- Shard spec and program config mismatches
- Some TTNN ops (e.g. ttnn.all_gather) do not support passing -1 as the dim argument; this surfaces as an op-invocation error where the arguments fail to match. See the snippet below for a fix.
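For example (illustrative):

```python
# ttnn.all_gather rejects negative dims, so resolve them against the rank first.
dim = dim if dim >= 0 else dim + len(x.shape)
x = ttnn.all_gather(x, dim=dim)
```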
#### 4.10.2 Shard Spec Mismatches
#### 4.10.3 Ethernet Dispatch Cores
- link to any other description, and mention it is needed for N300 and T3K
#### 4.10.4 Hangs
##### 4.10.4.1 Tracing
- Host communications cause tracing to hang
- Running without async mode enabled causes tracing to hang
- Be careful with host-side prints inside traced code
##### 4.10.4.2 Large Matmuls
- Large matmuls hanging? Link to appropriate ticket with workaround
- The issue is being investigated; the current workaround is to set the output subblock to 1x1 and the grid size to 8x7
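Expressed as a program config, the workaround looks roughly like this (field names follow TT-NN's 2D matmul config; the blocking values other than the subblock and grid are placeholders):

```python
import ttnn

pc = ttnn.MatmulMultiCoreReuseMultiCastProgramConfig(
    compute_with_storage_grid_size=(8, 7),  # workaround: 8x7 core grid
    in0_block_w=2,                          # placeholder; depends on K
    out_subblock_h=1,                       # workaround: 1x1 output subblock
    out_subblock_w=1,
    per_core_M=4,                           # placeholder; depends on M and the grid
    per_core_N=4,                           # placeholder; depends on N and the grid
    transpose_mcast=False,
    fused_activation=None,
)
```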
