diff --git a/.github/workflows/_produce-data.yaml b/.github/workflows/_produce-data.yaml
index 6f35a04f007..fd547b44aa9 100644
--- a/.github/workflows/_produce-data.yaml
+++ b/.github/workflows/_produce-data.yaml
@@ -12,6 +12,10 @@ on:
description: "Run attempt of the workflow run"
default: 1
type: number
+ upload_data:
+ description: "Upload data to datastore cluster for our dashboard"
+ default: false
+ type: boolean
workflow_run:
workflows:
- "All post-commit tests"
@@ -33,6 +37,8 @@ on:
- "(TGG) TGG unit tests"
- "(TGG) TGG demo tests"
- "(TGG) TGG frequent tests"
+ - "ttnn - Run sweeps"
+ - "Blackhole post-commit tests"
types:
- completed
@@ -111,7 +117,7 @@ jobs:
run: ls -hal
- name: Upload cicd data
uses: ./.github/actions/upload-data-via-sftp
- if: ${{ github.event_name == 'workflow_run' }}
+ if: ${{ github.event_name == 'workflow_run' || inputs.upload_data }}
with:
ssh-private-key: ${{ secrets.SFTP_CICD_WRITER_KEY }}
sftp-batchfile: .github/actions/upload-data-via-sftp/cicd_data_batchfile.txt
diff --git a/.github/workflows/single-card-demo-tests.yaml b/.github/workflows/single-card-demo-tests.yaml
index 4918c0faa56..ef7c101d8fb 100644
--- a/.github/workflows/single-card-demo-tests.yaml
+++ b/.github/workflows/single-card-demo-tests.yaml
@@ -4,7 +4,7 @@ on:
workflow_dispatch:
workflow_call:
schedule:
- - cron: "0 0 * * 1,2,3,4,5"
+ - cron: "0 */6 * * 1,2,3,4,5"
- cron: "0 */4 * * 0,6"
jobs:
diff --git a/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml b/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml
index 81d24a83835..d2e1d8b63a3 100644
--- a/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml
+++ b/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml
@@ -19,8 +19,6 @@ jobs:
fail-fast: false
matrix:
runner-info: [
- {arch: grayskull, runs-on: ["pipeline-stress", "E150", "bare-metal", "in-service"], machine-type: "bare_metal", name: "E150"},
- {arch: wormhole_b0, runs-on: ["pipeline-stress", "N300", "bare-metal", "in-service"], machine-type: "bare_metal", name: "N300"},
# E150
{arch: grayskull, runs-on: ["cloud-virtual-machine", "E150", "in-service"], machine-type: "virtual_machine", name: "E150"},
# N150
diff --git a/.github/workflows/ttnn-run-sweeps.yaml b/.github/workflows/ttnn-run-sweeps.yaml
index 99cd17e0f01..1e8ed77978f 100644
--- a/.github/workflows/ttnn-run-sweeps.yaml
+++ b/.github/workflows/ttnn-run-sweeps.yaml
@@ -166,6 +166,10 @@ on:
- eltwise.unary.hardtanh.hardtanh_pytorch2
- eltwise.unary.leaky_relu.leaky_relu
- eltwise.unary.reglu.reglu
+ - eltwise.unary_complex.polar.polar
+ - eltwise.unary_complex.angle.angle
+ - eltwise.unary_complex.polar_bw.polar_bw
+ - eltwise.unary_complex.angle_bw.angle_bw
- eltwise.binary.subtract.subtract
- eltwise.binary.subtract.subtract_tensor_pytorch2
- eltwise.binary.multiply.multiply
diff --git a/CODEOWNERS b/CODEOWNERS
index e87255b0cb4..1a4b8906716 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -22,7 +22,7 @@ third_party/ @tt-rkim @TT-billteng
MANIFEST.in @tt-rkim
setup.py @tt-rkim
pyproject.toml @tt-rkim @TT-billteng
-requirements*.txt @tt-rkim @TT-billteng
+requirements*.txt @tt-rkim @TT-billteng @ttmchiou
setup_hugepages.py @tt-rkim @TT-billteng
scripts/docker @TT-billteng
@@ -55,6 +55,7 @@ tt_metal/ @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @ubcheema
tt_metal/host_api.hpp @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @davorchap
tt_metal/impl/device/ @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @ubcheema @davorchap @cfjchu
tt_metal/distributed/ @cfjchu @aliuTT @tt-asaigal
+tt_metal/**/requirements*.txt @tt-rkim @TT-billteng @ttmchiou
# metal - dispatch
tt_metal/impl/dispatch/kernels/packet_* @ubcheema @aliuTT
diff --git a/README.md b/README.md
index 5ffc2a53f70..00cf1dad925 100644
--- a/README.md
+++ b/README.md
@@ -21,21 +21,20 @@
---
## LLMs
-| Model | Batch | Hardware | ttft (s) | t/s/u | Target t/s/u | t/s | Release |
+| Model | Batch | Hardware | ttft (ms) | t/s/u | Target t/s/u | t/s | Release |
|---------------------------------------------------------------|-------|----------------------------------------------------------|----------|-------|-----------------|--------|---------------------------------------------------------------------------|
| [Falcon7B-decode](./models/demos/ttnn_falcon7b) | 32 | [e150](https://tenstorrent.com/hardware/grayskull) | | 4.2 | 4.4 | 134.4 | |
-| [Falcon7B](./models/demos/wormhole/falcon7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.07 | 16.7 | 26 | 534.4 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
+| [Falcon7B](./models/demos/wormhole/falcon7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 75 | 17.0 | 26 | 544.0 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) |
| [Mistral-7B](./models/demos/wormhole/mistral7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | | 9.9 | 25 | 316.8 | [v0.51.0-rc28](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc28) |
-| [Mamba-2.8B](./models/demos/wormhole/mamba) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.04 | 12.3 | 41 | 393.6 | [v0.51.0-rc26](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc26) |
-| [LLaMA-3.1-8B](./models/demos/wormhole/llama31_8b) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.20 | 21.4 | 23 | 21.4 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
-| [Falcon7B (DP=8)](./models/demos/t3000/falcon7b) | 256 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 0.10 | 14.4 | 26 | 3686.4 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
-| [LLaMA-2-70B - (TP=8)](./models/demos/t3000/llama2_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 0.19 | 15.1 | 20 | 483.2 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
-| [LLaMA-3.1-70B (TP=8)](./models/demos/t3000/llama3_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 0.19 | 15.1 | 20 | 483.2 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
-| [Falcon40B (TP=8)](./models/demos/t3000/falcon40b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | | 5.3 | 36 | 169.6 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
-| [Mixtral7Bx8 (TP=8)](./models/demos/t3000/mixtral8x7b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 0.23 | 14.2 | 33 | 454.4 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
-| [Falcon7B (DP=32)](./models/demos/tg/falcon7b) | 1024 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 0.24 | 4.4 | 26 | 4505.6 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
-| [LLaMA-3.1-70B (DP=4, TP=8)](./models/demos/t3000/llama3_70b) | 128 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 0.19 | 14.3 | 20 | 1835.5 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
-> **Last Update:** October 7, 2024
+| [Mamba-2.8B](./models/demos/wormhole/mamba) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 48 | 12.3 | 41 | 393.6 | [v0.51.0-rc26](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc26) |
+| [LLaMA-3.1-8B](./models/demos/wormhole/llama31_8b) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 291 | 22.9 | 23 | 22.9 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) |
+| [Falcon7B (DP=8)](./models/demos/t3000/falcon7b) | 256 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 101 | 14.4 | 26 | 3686.4 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) |
+| [LLaMA-3.1-70B (TP=8)](./models/demos/t3000/llama3_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 190 | 15.1 | 20 | 483.2 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) |
+| [Falcon40B (TP=8)](./models/demos/t3000/falcon40b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | | 5.3 | 36 | 169.6 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) |
+| [Mixtral7Bx8 (TP=8)](./models/demos/t3000/mixtral8x7b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 235 | 14.2 | 33 | 454.4 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) |
+| [Falcon7B (DP=32)](./models/demos/tg/falcon7b) | 1024 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 242 | 4.4 | 26 | 4505.6 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) |
+| [LLaMA-3.1-70B (DP=4, TP=8)](./models/demos/t3000/llama3_70b) | 128 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 190 | 14.3 | 20 | 1835.5 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
+> **Last Update:** October 21, 2024
> **Notes:**
> - TP = Tensor Parallel, DP = Data Parallel; Defines parallelization factors across multiple devices.
@@ -54,6 +53,8 @@
| [ViT](./models/demos/grayskull/vit) | 9 | [e150](https://tenstorrent.com/hardware/grayskull) | 1,360 | 2,000 | |
| [ViT](./models/demos/wormhole/vit) | 8 | [n150](https://tenstorrent.com/hardware/wormhole) | 912 | 1,600 | |
| [Stable Diffusion 1.4 (512x512)](./models/demos/wormhole/stable_diffusion) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.167 | 0.3 | |
+| [U-Net](./models/experimental/functional_unet) | 2 | [n150](https://tenstorrent.com/hardware/wormhole) | 530 | 1000 | [v0.53.0-rc22](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc22) |
+
## NLPs
| Model | Batch | Hardware | sen/sec | Target sen/sec | Release |
@@ -70,6 +71,7 @@ For the latest model updates and features, please see [MODEL_UPDATES.md](models/
- [Advanced Performance Optimizations for Models](./tech_reports/AdvancedPerformanceOperationsForModels/AdvancedPerformanceOptimizationsForModels.md) (updated Oct 17th)
- [Programming Mesh of Devices](./tech_reports/Programming%20Mesh%20of%20Devices/Programming%20Mesh%20of%20Devices%20with%20TT-NN.md) (updated Sept 9th)
- [ViT Implementation in TT-NN on GS](./tech_reports/ViT-TTNN/vit.md) (updated Sept 22nd)
+- [LLMs Bring up in TT-NN](./tech_reports/LLMs/llms.md) (updated Oct 29th)
---
diff --git a/models/MODEL_UPDATES.md b/models/MODEL_UPDATES.md
index feaa61cc031..aa8fc1d9232 100644
--- a/models/MODEL_UPDATES.md
+++ b/models/MODEL_UPDATES.md
@@ -4,6 +4,11 @@
>
> Please refer to the front-page [README](../README.md) for the latest verified release for each model.
+## October 21, 2024
+
+### [Llama 3/3.1 - 70B](demos/t3000/llama3_70b)
+- Enabled prefill workloads to pad to multiples of 1024 instead of powers of 2, improving overall performance for longer sequences
+
## October 7, 2024
### [Llama 3.1 - 8B](demos/wormhole/llama31_8b)
diff --git a/tech_reports/LLMs/llms.md b/tech_reports/LLMs/llms.md
new file mode 100644
index 00000000000..4b4a34f6a7c
--- /dev/null
+++ b/tech_reports/LLMs/llms.md
@@ -0,0 +1,112 @@
+# LLMs in TT-NN
+Authors:
+## Contents
+- [LLMs in TT-NN](#llms-in-tt-nn)
+ - [Contents](#contents)
+ - [1. Overview](#1-overview)
+ - [2. Modules](#2-modules)
+ - [2.1 Embedding](#21-embedding)
+ - [2.2 RoPE](#22-rope)
+ - [2.3 Norm](#23-norm)
+ - [2.4 Attention](#24-attention)
+ - [2.5 MLP](#25-mlp)
+ - [2.6 Decoder](#26-decoder)
+ - [2.7 LM Head](#27-lm-head)
+ - [3. Features](#3-features)
+ - [3.1 Generative Decoding](#31-generative-decoding)
+ - [3.2 Prefill and Decode](#32-prefill-and-decode)
+ - [3.3 Multi-Device](#33-multi-device)
+ - [3.4 Continuous Batching](#34-continuous-batching)
+  - [3.5 vLLM Integration](#35-vllm-integration)
+ - [4. Best Practices and Optimizations](#4-best-practices-and-optimizations)
+ - [4.1 Tracing](#41-tracing)
+ - [4.2 Async Mode](#42-async-mode)
+ - [4.3 Multiple CQs](#43-multiple-cqs)
+ - [4.4 Op Configs](#44-op-configs)
+ - [4.5 Accuracy](#45-accuracy)
+ - [4.6 Performance Analysis](#46-performance-analysis)
+ - [4.7 Misc. Performance Optimizations](#47-misc-performance-optimizations)
+ - [4.8 Module Tests](#48-module-tests)
+ - [4.9 Performance Testing](#49-performance-testing)
+ - [4.10 Common Pitfalls](#410-common-pitfalls)
+ - [4.10.1 Error Messages](#4101-error-messages)
+ - [4.10.2 Shard Spec Mismatches](#4102-shard-spec-mismatches)
+ - [4.10.3 Ethernet Dispatch Cores](#4103-ethernet-dispatch-cores)
+ - [4.10.4 Hangs](#4104-hangs)
+ - [4.10.4.1 Tracing](#41041-tracing)
+ - [4.10.4.2 Large Matmuls](#41042-large-matmuls)
+
+## 1. Overview
+## 2. Modules
+### 2.1 Embedding
+### 2.2 RoPE
+ - Iterative update system
+ - When to use our fused op
+### 2.3 Norm
+ - Replicated layernorm vs distributed layernorm
+ - Layernorm/rmsnorm weights in row major / wrapped around tile size trick
+### 2.4 Attention
+ - Flash Attention and Flash Decode
+ - general description
+ - limitations
+ - which dims are parallelized
+### 2.5 MLP
+### 2.6 Decoder
+### 2.7 LM Head
+## 3. Features
+### 3.1 Generative Decoding
+### 3.2 Prefill and Decode
+ - submodules, tests
+ - how to combine prefill and decode,
+ - slicing prefill to fit in L1
+### 3.3 Multi-Device
+ - device mesh
+ - column parallel followed by row parallel
+ - sharding, CCL ops, reducing CCL overheads, etc.
+### 3.4 Continuous Batching
+ - quick intro and how it is implemented in demos.
+### 3.5 vLLM Integration
+ - Our vLLM repo and what's needed to integrate with it.
+## 4. Best Practices and Optimizations
+### 4.1 Tracing
+ - link to existing doc, why it helps decode more
+### 4.2 Async Mode
+### 4.3 Multiple CQs
+  - how to feed back output to input and read output asynchronously
+### 4.4 Op Configs
+ - Writing correct program configs and shard specs
+ - Deciding how many cores to run an op on
+ - Why did we use 16 cores for MLP
+ - Which matmul to use when @Colman Glagovich
+ - 1d, 2d, dram-sharded, ...
+ - Implicitly padding weights in program config for matmuls
+### 4.5 Accuracy
+ - How we measure it (PCC, perplexity, top-1/top-5, end-user tests, benchmarking)
+ - How much PCC is enough? Rules of thumb.
+ - Accuracy tests
+ - Debugging PCC issues
+### 4.6 Performance Analysis
+ - Performance tooling, tracy
+### 4.7 Misc. Performance Optimizations
+ - Which dim to shard matmuls on
+ - DRAM-sharding
+ - Avoiding sharded to interleaved calls
+### 4.8 Module Tests
+### 4.9 Performance Testing
+### 4.10 Common Pitfalls
+#### 4.10.1 Error Messages
+ - Running out of L1
+ - Shard spec and program config mismatches
+ - For some TTNN ops (e.g. ttnn.all_gather) it's not supported to pass -1 in the dim argument.
+ - You'll see an error related to op invocation where the arguments don't match
+#### 4.10.2 Shard Spec Mismatches
+#### 4.10.3 Ethernet Dispatch Cores
+ - link to any other description, and mention it is needed for N300 and T3K
+#### 4.10.4 Hangs
+##### 4.10.4.1 Tracing
+ - Host communications cause tracing to hang
+ - Running without async mode enabled causes tracing to hang
+ - Careful with print in tracing
+##### 4.10.4.2 Large Matmuls
+ - Large matmuls hanging? Link to appropriate ticket with workaround
+ - Issue is being investigated with a workaround of setting the output subblock to 1,1 and grid size to 8x7
diff --git a/tech_reports/MetalProfiler/media/profiler-diagram.png b/tech_reports/MetalProfiler/media/profiler-diagram.png
new file mode 100644
index 00000000000..855f051d86a
Binary files /dev/null and b/tech_reports/MetalProfiler/media/profiler-diagram.png differ
diff --git a/tech_reports/MetalProfiler/metal-profiler.md b/tech_reports/MetalProfiler/metal-profiler.md
new file mode 100644
index 00000000000..44b844612c4
--- /dev/null
+++ b/tech_reports/MetalProfiler/metal-profiler.md
@@ -0,0 +1,331 @@
+# Metal Profiler
+
+## Quick Links
+- Tracy Profiler Repo: https://github.com/wolfpld/tracy
+- Tracy Documentation: https://github.com/wolfpld/tracy/releases/latest/download/tracy.pdf
+- Metal Fork of Tracy: https://github.com/tenstorrent-metal/tracy/tree/71d4c8d378b52af7da7012b9b595a61e9304f0bb
+
+## Introduction
+Tracy is an open-source C++ profiling tool with sampling and code instrumentation profiling capabilities. The profiled application is a client, and the profiler itself is a server (by default runs on port 8086). It was named this way because the client is a thin layer that just collects events and sends them for processing and long-term storage on the server. The fact that the server needs to connect to the client to begin the profiling session may be a bit confusing at first.
+
+## Things built from Tracy that are needed in tt-metal
+tt-metal is still on v0.10 of tracy. tt-metal has forked the tracy repo and added specific functionality to support running tracy on tenstorrent devices. Repo located here: https://github.com/tenstorrent-metal/tracy/tree/71d4c8d378b52af7da7012b9b595a61e9304f0bb. The key differences between v0.10 and v0.11.1 are how the tools within Tracy are built, with the former being from Makefile and the latter upgrading to CMake. Metal plans to uplift to this version in the near future.
+
+For instructional purposes, the following section describes metal's fork of v0.10.
+
+### tracy-client
+tracy-client is a library that you link to your application. It will act as a thin wrapper between your application and the server. All macro calls that you insert into your application via Tracy APIs will interface with the tracy-client.
+```
+cmake -B build -S . -DCMAKE_BUILD_TYPE=Release -DTRACY_ENABLE=ON
+cmake --build build --config Release --parallel
+libTracyClient.a
+```
+
+### tracy-capture
+tracy-capture is a command line executable that acts as the tracy server to capture events from tracy-client. It will dump a .tracy file which you can feed into tracy-profiler GUI.
+```
+cd capture/build/unix
+make all
+./tracy-capture -o test.tracy
+```
+
+### tracy-profiler
+tracy-profiler is a gui application that also acts as the tracy server. It can consume events from tracy-client live or can ingest a .tracy file computed offline. Typically you would run this on your local macbook while running tracy client + application on some remote machine. This is usually built on a machine in the same network as your host machine.
+```
+cd profiler/build/unix
+make all
+./tracy-profiler
+```
+
+### tracy-csvexport
+tracy-csvexport is a command line executable that consumes .tracy file and outputs a csv file with all the data within the .tracy file. It is meant for an easier way to view the data.
+```
+cd csvexport/build/unix
+make all
+./tracy-csvexport test.tracy
+```
+
+## Basic Tracy Application
+The following section describes how to integrate Tracy into your project. It is meant for devs to understand the flow of Tracy before using it in tt-metal or extending tt-metal within their own application.
+
+### 1. Add Tracy
+Add the Tracy repository to your project directory (as a third_party submodule)
+```
+mkdir third_party
+cd third_party
+git clone https://github.com/tenstorrent-metal/tracy/tree/71d4c8d378b52af7da7012b9b595a61e9304f0bb
+cd tracy
+```
+
+### 2. Build tracy-client
+Build tracy-client as a static lib and link to your executable target (or use Tracy::TracyClient)
+```
+add_executable(runner main.cpp)
+add_subdirectory(third_party/tracy)
+target_link_libraries(runner PUBLIC hellolib Tracy::TracyClient)
+target_include_directories(runner PUBLIC third_party/tracy/public)
+```
+
+### 3. Add Tracy includes
+Add tracy/Tracy.hpp as an include file to any file that will use tracy apis. Tracy source files are located in tracy/public directory
+```
+#include "tracy/Tracy.hpp"
+```
+
+### 4. Define compile options
+Define TRACY_ENABLE=ON for the WHOLE project (otherwise, won't be able to collect any data)
+```
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTRACY_ENABLE=ON")
+```
+
+### 5. Insert macros
+Insert tracy related macro calls into your code
+eg. Zones in Tracy are marked sections of code that users are interested in profiling. Tracy provides macros such as ZoneScoped; to accomplish this. Please refer to section 3 of Tracy’s documentation for further information on zones and available macros.
+```
+TracyMessageL("hello");
+ZoneScoped;
+FrameMarkNamed("main");
+```
+
+### 6. Build tracy-capture
+
+### 7. Build tracy-csvexport
+
+### 8. Build tracy-profiler
+
+## Developer Flow for using Tracy
+
+### 1. Start tracy-capture
+This will start listening on the default port and address that the tracy-client will interact with. It will dump all its results into a .tracy file (which can be uploaded into tracy-profiler after)
+```
+./tracy-capture -o hello.tracy -f
+```
+
+### 2. (Optional) Start tracy-profiler
+Instead of starting tracy-capture via command line, you can start tracy-profiler from your macbook. The tracy-profiler is a gui that will communicate in real-time with the tracy-client and display the results on the gui.
+```
+./tracy-profiler
+```
+
+### 3. Start application
+Start your application in a different terminal. This is the application that has been compiled with all the stuff mentioned in Basic Tracy Integration. As your application runs, you will see tracy-capture capturing events/tracy-profiler capturing events.
+```
+./runner
+```
+
+### 4. (Only if did 1.) Feed .tracy into tracy-profiler
+If you used tracy-capture, it will dump a .tracy file once this is complete. You can then feed this .tracy file into the tracy-profiler to view the results.
+
+### 5. (Only if did 1.) View .tracy contents
+You can also view the contents of the .tracy file as a csv file using tracy-csvexport. This will dump the results in csv format which you can pipe into a file and view the results. Optionally, you can also save the .tracy file via the GUI itself and then feed it into the tracy-csvexport tool.
+```
+./tracy-csvexport hello.tracy
+```
+
+## Tracy Example
+The following section will provide an example of how to use Tracy in a sample app, step by step.
+
+### 1. Setup project directory structure
+```
+- project/
+ - third_party/
+ - tracy/
+ - include/
+ - hellolib.hpp
+ - CMakeLists.txt
+ - main.cpp
+ - hellolib.cpp
+```
+
+### 2. Fill in contents of each file found below
+// third_party/tracy
+```
+mkdir third_party
+cd third_party
+git clone https://github.com/tenstorrent-metal/tracy/tree/71d4c8d378b52af7da7012b9b595a61e9304f0bb
+cd tracy
+```
+
+// hellolib.hpp
+```
+#include "tracy/Tracy.hpp"
+#include <iostream>
+
+int add(int a, int b);
+int subtract(int a, int b);
+int multiply(int a, int b);
+int divide(int a, int b);
+```
+
+// CMakeLists.txt
+```
+cmake_minimum_required(VERSION 3.10)
+
+project(TracyTest LANGUAGES CXX)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTRACY_ENABLE=ON")
+
+add_library(hellolib SHARED hellolib.cpp)
+
+target_include_directories(hellolib
+ PUBLIC
+ ${CMAKE_SOURCE_DIR}/include
+ ${CMAKE_SOURCE_DIR}/third_party/tracy/public
+)
+
+add_executable(runner main.cpp)
+
+add_subdirectory(third_party/tracy)
+
+# Link the executable against the shared library
+target_link_libraries(runner PUBLIC hellolib Tracy::TracyClient)
+```
+
+// main.cpp
+```
+#include "tracy/Tracy.hpp"
+#include <iostream>
+#include "hellolib.hpp"
+#include <unistd.h> // for sleep()
+
+int main() {
+ sleep(5); // need to add short sleep call so that tracy client can establish connection
+ FrameMarkNamed("main");
+ TracyMessageL("Hello");
+ ZoneScoped;
+ int c = 0;
+ std::cout << "Hello, World = " << c << std::endl;
+ c = add(5, 5);
+ c = subtract(5, 5);
+ c = multiply(5, 5);
+ c = divide(5, 5);
+ return 0;
+}
+```
+
+// hellolib.cpp
+```
+#include "tracy/Tracy.hpp"
+#include "hellolib.hpp"
+#include <iostream>
+#include <unistd.h>
+
+int add(int a, int b) {
+ TracyMessageL("add");
+ ZoneScoped;
+ int c = a + b;
+ return c;
+}
+
+int subtract(int a, int b) {
+ TracyMessageL("subtract");
+ ZoneScoped;
+ return a - b;
+}
+
+int multiply(int a, int b) {
+ TracyMessageL("multiply");
+ ZoneScoped;
+ return a * b;
+}
+
+int divide(int a, int b) {
+ TracyMessageL("divide");
+ ZoneScoped;
+ return a / b;
+}
+```
+
+### 3. Build Project
+This will build the executable and binaries in the `build/` folder.
+```
+mkdir -p build
+cd build/
+cmake -G Ninja -DTRACY_ENABLE=ON ..
+ninja
+```
+
+### 4. Build tracy-capture
+
+### 5. Optional: Build tracy-profiler (on macbook)
+
+### 6. Build tracy-csvexport
+
+### 7. Start tracy-capture OR tracy-profiler
+Start this in a separate terminal. This will dump all events into hello.tracy.
+```
+./tracy-capture -o hello.tracy
+```
+
+Start this on your macbook. This will collect all events live. You need to make sure you port forward from your remote machine to your macbook.
+```
+./tracy-profiler
+```
+
+### 8. Run project
+This will run the executable and the results will be collected by tracy-capture and stored into the `hello.tracy` file or displayed live on tracy-profiler.
+```
+cd build
+./runner
+```
+
+### (Optional) Run tracy-csvexport
+If you used tracy-capture and want to view the results, you can pass them through tracy-csvexport. This will dump out all the results in csv format which you can then pipe to a file. You can also save a .tracy file via the tracy-profiler GUI and view them using this tool.
+```
+./csvexport hello.tracy
+```
+
+### (Optional) Upload .tracy file into tracy-profiler
+If you used tracy-capture to get the .tracy file, you can upload it into tracy-profiler GUI offline on your macbook. Follow instructions on GUI widget.
+
+## Tracy + Metal
+The following sections relate to tt-metal's usage of Tracy. tt-metal uses v0.10 version of Tracy. They have also built on top of tracy with custom files to support device side profiling. Repo found here: https://github.com/tenstorrent-metal/tracy/tree/71d4c8d378b52af7da7012b9b595a61e9304f0bb. There are several components regarding how tt-metal integrates Tracy and provides profiler support.
+
+### Building in profiler mode
+You can build metal in profiler mode using the following
+```
+./build_metal -p
+```
+All of the tools that are needed by metal are generated under `build/tools/profiler/bin/`.
+
+### tt_metal.so
+tt_metal shared library is generated, which has all the low level implementation details. This library can be used standalone if calling tt_metal APIs and it is linked against ttnn.so if using ttnn APIs. This library is also what Tracy links against.
+```
+location: tt_metal/CMakeLists.txt
+eg: target_link_libraries(tt_metal PUBLIC compiler_flags $<$<BOOL:${ENABLE_TRACY}>:TracyClient>)
+```
+
+### profiler.o
+A profiler object gets generated with various low level API calls within tt-metal. This object is linked against tt_metal.so.
+```
+location: tt-metal/tt_metal/tools/profiler
+eg: profiler.cpp
+```
+
+### Tracy module tool for dev convenience
+```
+location: tt-metal/ttnn/tracy
+eg: __main__.py
+```
+
+Developers can use the tracy module tool that will handle everything internally for them (such as tracy-capture, tracy-csvexport, etc.). This is provided for convenience. Profiling python code with tracy requires running your python code with the python tracy module. For profiling your entire python program, run your program as follows.
+```
+python -m tracy {test_script}.py
+```
+
+### Tracy post-processing scripts
+Metal will dump out various information about kernel profiling data. All this information gets cleaned and presented in a visible format through various post-processing scripts. If you are using the tracy module script infrastructure provided by metal, it will handle all of this for you.
+```
+location: tt-metal/tt_metal/tools/profiler
+eg: process_ops_logs.py
+```
+
+### Tracy + Metal Architecture
+The following image depicts the architectural diagram and program flow for how metal integrates Tracy and how the internal flow works. Everything is handled internally by the Tracy module tool that metal provides for dev convenience.
+
+![Tracy + Metal architecture diagram](media/profiler-diagram.png)
+
+## Extending Tracy to External Applications
+For an example of how metal Tracy is used in another application, please refer to https://github.com/tenstorrent/tt-mlir, specifically https://github.com/tenstorrent/tt-mlir/tree/main/runtime/tools/python (ttrt) - an independent runtime wrapper around metal APIs.
diff --git a/tests/scripts/run_moreh_microbenchmark.sh b/tests/scripts/run_moreh_microbenchmark.sh
index cdccd2f8302..2b7107bb7df 100755
--- a/tests/scripts/run_moreh_microbenchmark.sh
+++ b/tests/scripts/run_moreh_microbenchmark.sh
@@ -35,6 +35,7 @@ run_profiling_test() {
if [[ "$ARCH_NAME" == "wormhole_b0" ]]; then
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_single_core_sharded -k $ARCH_NAME
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_12_core -k $ARCH_NAME
+ pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_remote_cb_sync -k $ARCH_NAME
fi
# bypass wh_b0 for now until we can move FD cores to last col
if [[ "$ARCH_NAME" != "wormhole_b0" ]]; then
diff --git a/tests/scripts/test_moreh_microbenchmark.py b/tests/scripts/test_moreh_microbenchmark.py
index dc1e3b9b4c9..c93b82c45d6 100755
--- a/tests/scripts/test_moreh_microbenchmark.py
+++ b/tests/scripts/test_moreh_microbenchmark.py
@@ -287,6 +287,33 @@ def run_dram_read_l1_write_cmd(k, n, num_blocks, df, num_banks, bank_start_id):
run_moreh_single_test("DRAM BW test multi-core", command)
+def run_dram_read_remote_cb_sync_cmd(
+ k, n, num_blocks, cb_num_blocks, cb_padding, df, num_receivers, num_mixed_df_layers
+):
+ command = (
+ "TT_METAL_DEVICE_PROFILER=1 ./build/test/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb "
+ + " --k "
+ + str(k)
+ + " --n "
+ + str(n)
+ + " --num-blocks "
+ + str(num_blocks)
+ + " --cb-num-blocks "
+ + str(cb_num_blocks)
+ + " --cb-padding "
+ + str(cb_padding)
+ + " --num-tests "
+ + str(1)
+ + " --data-type "
+ + str(df)
+ + " --num-receivers "
+ + str(num_receivers)
+ + " --num-mixed-df-layers "
+ + str(num_mixed_df_layers)
+ )
+ run_moreh_single_test("DRAM read remote CB sync single-core ", command)
+
+
# noc
def test_noc_local(r=9, c=12, nt=256, cb=1):
command = (
@@ -739,6 +766,64 @@ def test_dram_read_l1_write_core(arch, freq, test_vector, num_tests, nblock, dat
assert bw_bound <= throughput
+@pytest.mark.parametrize(
+ "arch, freq, test_vector, num_tests, nblock, cb_nblock, cb_padding, data_format, num_receivers, num_mixed_df_layers",
+ [
+ # single layer single receiver test
+ ("wormhole_b0", 1000, np.array([32768, 128]), 1, 64, 5, 256, 1, 1, 1),
+ # single layer multi receiver test
+ ("wormhole_b0", 1000, np.array([32768, 128]), 1, 64, 3, 256, 1, 2, 1),
+ # multi layer multi receiver test
+ ("wormhole_b0", 1000, np.array([32768, 256]), 1, 64, 5, 256, 1, 4, 15),
+ ],
+)
+def test_dram_read_remote_cb_sync(
+ arch, freq, test_vector, num_tests, nblock, cb_nblock, cb_padding, data_format, num_receivers, num_mixed_df_layers
+):
+ data = []
+ cycle_list = []
+ time_list = []
+ throughput_list = []
+ for _ in range(num_tests):
+ k = int(test_vector[0])
+ n = int(test_vector[1])
+ input_size = 0
+ if data_format == 0:
+ input_size += k * n * 1088 // 1024
+ elif data_format == 1:
+ input_size += k * n * 2048 // 1024
+ for i in range(num_mixed_df_layers - 1):
+ if i % 2 == 0:
+ input_size += k * n * 1088 // 1024
+ else:
+ input_size += k * n * 2048 // 1024
+ run_dram_read_remote_cb_sync_cmd(
+ k, n, nblock, cb_nblock, cb_padding, data_format, num_receivers, num_mixed_df_layers
+ )
+ cycle = profile_results_kernel_duration()
+ time = cycle / freq / 1000.0 / 1000.0
+ throughput = input_size / cycle * freq / 1000.0
+ cycle_list.append(cycle)
+ time_list.append(time)
+ throughput_list.append(throughput)
+ cycle = sum(cycle_list) / len(cycle_list)
+ time = sum(time_list) / len(time_list)
+ throughput = sum(throughput_list) / len(throughput_list)
+ logger.info("DRAM read cycle: " + str(cycle))
+ logger.info("DRAM read time: " + str(time))
+ logger.info("DRAM read throughput: " + str(throughput))
+ data.append([throughput])
+ # check within range
+ dev_freq = get_device_freq()
+ if arch == "grayskull":
+ bw_bound = 100.0
+ elif arch == "wormhole_b0":
+ bw_bound = 22.0
+ elif arch == "blackhole":
+ bw_bound = 340.0
+ assert bw_bound <= throughput
+
+
@pytest.mark.parametrize(
"arch, freq, r, c, test_vector_global, test_vector_local",
[
diff --git a/tests/scripts/tg/run_tg_nightly_tests.sh b/tests/scripts/tg/run_tg_nightly_tests.sh
index e56038e0ca6..6810e5b58d4 100755
--- a/tests/scripts/tg/run_tg_nightly_tests.sh
+++ b/tests/scripts/tg/run_tg_nightly_tests.sh
@@ -7,6 +7,8 @@ run_tg_llama3_70b_tests() {
echo "LOG_METAL: Running run_tg_llama3_70b_tests"
+ pytest tests/ttnn/unit_tests/operations/test_all_gather_TG_nightly.py ; fail+=$?
+
# Falcon40B prefill 60 layer end to end with 10 loops; we need 8x8 grid size
pytest tests/nightly/tg/models/demos/tg/llama3_70b ; fail+=$?
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_complex/angle/angle.py b/tests/sweep_framework/sweeps/eltwise/unary_complex/angle/angle.py
new file mode 100644
index 00000000000..e1e872d0585
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_complex/angle/angle.py
@@ -0,0 +1,103 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "xfail": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 16)
+ + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 16)
+ + gen_shapes([1, 1], [256, 256], [1, 1], 16),
+ "input_a_dtype": [ttnn.bfloat16],
+ "input_layout": [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ if test_vector["input_layout"] == ttnn.ROW_MAJOR_LAYOUT:
+ return True, "Inputs to eltwise binary must be tilized"
+ if test_vector["input_layout"] == ttnn.ROW_MAJOR_LAYOUT and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ return False, None
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ torch_real = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype)(
+ input_shape
+ ).to(torch.float32)
+ torch_imag = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype)(
+ input_shape
+ ).to(torch.float32)
+ torch_input_tensor_a = torch.complex(torch_real, torch_imag)
+
+ golden_function = ttnn.get_golden_function(ttnn.angle)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a_real = ttnn.from_torch(
+ torch_real,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+ input_tensor_a_imag = ttnn.from_torch(
+ torch_imag,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+ input_tensor_a = ttnn.complex_tensor(input_tensor_a_real, input_tensor_a_imag)
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.angle(input_tensor_a, memory_config=output_memory_config)
+ e2e_perf = stop_measuring_time(start_time)
+
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_complex/angle_bw/angle_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_complex/angle_bw/angle_bw.py
new file mode 100644
index 00000000000..ce3dd28f636
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_complex/angle_bw/angle_bw.py
@@ -0,0 +1,125 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 8)
+ + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 8)
+ + gen_shapes([1, 1], [256, 256], [1, 1], 8),
+ "grad_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_layout": [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT],
+ "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ if test_vector["input_layout"] == ttnn.ROW_MAJOR_LAYOUT:
+ return True, "Inputs to eltwise binary must be tilized"
+ if test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is not supported on input_tensor_a"
+ if test_vector["input_layout"] == ttnn.ROW_MAJOR_LAYOUT and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ return False, None
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ grad_dtype,
+ input_a_dtype,
+ input_layout,
+ grad_memory_config,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ torch_grad_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype
+ )(input_shape).to(torch.float32)
+
+ torch_real = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype)(
+ input_shape
+ ).to(torch.float32)
+ torch_imag = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype)(
+ input_shape
+ ).to(torch.float32)
+
+ torch_input_tensor_a = torch.complex(torch_real, torch_imag)
+
+ torch_input_tensor_a.requires_grad = True
+
+ golden_function = ttnn.get_golden_function(ttnn.angle_bw)
+ torch_output_tensor = golden_function(torch_grad_tensor, torch_input_tensor_a)[0]
+
+ grad_tensor = ttnn.from_torch(
+ torch_grad_tensor,
+ dtype=grad_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=grad_memory_config,
+ )
+
+ input_tensor_a_real = ttnn.from_torch(
+ torch_real,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+ input_tensor_a_imag = ttnn.from_torch(
+ torch_imag,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ input_tensor_a = ttnn.complex_tensor(input_tensor_a_real, input_tensor_a_imag)
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.angle_bw(grad_tensor, input_tensor_a, memory_config=output_memory_config)[0]
+ e2e_perf = stop_measuring_time(start_time)
+
+ output_tensor = torch.cat((ttnn.to_torch(output_tensor.real), ttnn.to_torch(output_tensor.imag)), dim=-1)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_complex/polar/polar.py b/tests/sweep_framework/sweeps/eltwise/unary_complex/polar/polar.py
new file mode 100644
index 00000000000..857f4d533fd
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_complex/polar/polar.py
@@ -0,0 +1,109 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 16)
+ + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 16)
+ + gen_shapes([1, 1], [256, 256], [1, 1], 16),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_layout": [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ if test_vector["input_layout"] == ttnn.ROW_MAJOR_LAYOUT:
+ return True, "Inputs to eltwise binary must be tilized"
+ if test_vector["input_layout"] == ttnn.ROW_MAJOR_LAYOUT and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ return False, None
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ torch_real = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype)(
+ input_shape
+ ).to(torch.float32)
+ torch_imag = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype)(
+ input_shape
+ ).to(torch.float32)
+
+ golden_function = torch.polar
+ torch_output_tensor = golden_function(torch_real, torch_imag)
+
+ input_tensor_a_real = ttnn.from_torch(
+ torch_real,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+ input_tensor_a_imag = ttnn.from_torch(
+ torch_imag,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+ input_tensor_a = ttnn.complex_tensor(input_tensor_a_real, input_tensor_a_imag)
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.polar(input_tensor_a, memory_config=output_memory_config)
+ e2e_perf = stop_measuring_time(start_time)
+
+ output_tensor = torch.complex(
+ ttnn.to_torch(output_tensor.real).to(torch.float32), ttnn.to_torch(output_tensor.imag).to(torch.float32)
+ )
+
+ return [
+ check_with_pcc(
+ torch.view_as_real(torch_output_tensor.clone()), torch.view_as_real(output_tensor.clone()), 0.999
+ ),
+ e2e_perf,
+ ]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_complex/polar_bw/polar_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_complex/polar_bw/polar_bw.py
new file mode 100644
index 00000000000..2ac0d2dec36
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_complex/polar_bw/polar_bw.py
@@ -0,0 +1,137 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 8)
+ + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 8)
+ + gen_shapes([1, 1], [256, 256], [1, 1], 8),
+ "grad_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_layout": [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT],
+ "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ if test_vector["input_layout"] == ttnn.ROW_MAJOR_LAYOUT:
+ return True, "Inputs to eltwise binary must be tilized"
+ if test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is not supported on input_tensor_a"
+ if test_vector["input_layout"] == ttnn.ROW_MAJOR_LAYOUT and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ return False, None
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ grad_dtype,
+ input_a_dtype,
+ input_layout,
+ grad_memory_config,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ torch_grad_real = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype)(
+ input_shape
+ ).to(torch.float32)
+ torch_grad_imag = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype)(
+ input_shape
+ ).to(torch.float32)
+
+ torch_real = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype)(
+ input_shape
+ ).to(torch.float32)
+ torch_imag = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype)(
+ input_shape
+ ).to(torch.float32)
+
+ torch_grad_tensor = torch.complex(torch_grad_real, torch_grad_imag)
+ torch_input_tensor_a = torch.complex(torch_real, torch_imag)
+
+ torch_input_tensor_a.requires_grad = True
+
+ golden_function = ttnn.get_golden_function(ttnn.polar_bw)
+ torch_output_tensor = golden_function(torch_grad_tensor, torch_input_tensor_a)[0]
+
+ grad_real = ttnn.from_torch(
+ torch_grad_real,
+ dtype=grad_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=grad_memory_config,
+ )
+ grad_imag = ttnn.from_torch(
+ torch_grad_imag,
+ dtype=grad_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=grad_memory_config,
+ )
+
+ input_tensor_a_real = ttnn.from_torch(
+ torch_real,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+ input_tensor_a_imag = ttnn.from_torch(
+ torch_imag,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ grad_tensor = ttnn.complex_tensor(grad_real, grad_imag)
+ input_tensor_a = ttnn.complex_tensor(input_tensor_a_real, input_tensor_a_imag)
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.polar_bw(grad_tensor, input_tensor_a, memory_config=output_memory_config)[0]
+ e2e_perf = stop_measuring_time(start_time)
+
+ output_tensor = torch.cat((ttnn.to_torch(output_tensor.real), ttnn.to_torch(output_tensor.imag)), dim=-1)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/reader_dram.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/reader_dram.cpp
new file mode 100644
index 00000000000..9b5988f4e63
--- /dev/null
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/reader_dram.cpp
@@ -0,0 +1,145 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <stdint.h>
+
+#include "dataflow_api.h"
+#include "ttnn/cpp/ttnn/operations/ccl/kernel_common/worker_sync_utils.hpp"
+
+#include "debug/dprint.h"
+
+template <uint32_t bank_base_address, uint32_t page_size, bool use_vc>
+FORCE_INLINE
+void noc_async_read_tile_dram_sharded(uint32_t src_addr, uint32_t dest_addr, uint32_t bank_id = 0, const uint32_t vc = 0) {
+ uint32_t src_addr_;
+ uint32_t src_noc_xy;
+
+ src_addr_ = src_addr + bank_base_address;
+ src_addr_ += bank_to_dram_offset[bank_id];
+ src_noc_xy = dram_bank_to_noc_xy[noc_index][bank_id];
+
+ WAYPOINT("NRTW");
+ DEBUG_SANITIZE_NOC_READ_TRANSACTION(noc_index, get_noc_addr_helper(src_noc_xy, src_addr_), dest_addr, page_size);
+ while (!noc_cmd_buf_ready(noc_index, NCRISC_RD_CMD_BUF));
+ WAYPOINT("NRTD");
+
+ if constexpr(use_vc) {
+ uint32_t noc_rd_cmd_field = NOC_CMD_CPY | NOC_CMD_RD | NOC_CMD_RESP_MARKED | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(vc);
+ NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_CTRL, noc_rd_cmd_field);
+ }
+
+ NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_RET_ADDR_LO, dest_addr);
+ NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_TARG_ADDR_LO, src_addr_); // (uint32_t)src_addr
+ NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_TARG_ADDR_COORDINATE, src_noc_xy); // src_addr >> 32
+ NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_AT_LEN_BE, page_size); // len_bytes
+ NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ);
+ noc_reads_num_issued[noc_index] += 1;
+}
+
+void kernel_main() {
+ constexpr uint32_t input_addr = get_compile_time_arg_val(0);
+ constexpr uint32_t input_start_tile_id = get_compile_time_arg_val(1);
+ constexpr uint32_t noc = get_compile_time_arg_val(2);
+ constexpr uint32_t num_layers = get_compile_time_arg_val(3);
+
+ uint32_t rt_args_idx = 0;
+ const uint32_t bank_id = get_arg_val<uint32_t>(rt_args_idx++);
+ const uint32_t vc = get_arg_val<uint32_t>(rt_args_idx++);
+ tt_l1_ptr uint32_t* page_size = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+ tt_l1_ptr uint32_t* num_pages = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+ tt_l1_ptr uint32_t* num_blocks = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+ tt_l1_ptr uint32_t* block_num_tiles = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+
+ constexpr uint32_t cb_id = 0;
+ constexpr uint32_t total_num_blocks_in_buffer = 3;
+
+ uint32_t block_size_bytes = num_pages[0] * page_size[0];
+ uint32_t l1_buffer_start_addr = get_write_ptr(cb_id);
+ uint32_t l1_buffer_end_addr = get_write_ptr(cb_id) + block_size_bytes * total_num_blocks_in_buffer;
+
+ uint32_t src_read_addr = 0;
+ uint32_t src_read_addr_offset_bytes = 0;
+
+ for (uint32_t l = 0; l < num_layers; ++l) {
+ uint32_t curr_page_size = page_size[l];
+ uint32_t curr_num_pages = num_pages[l];
+ uint32_t curr_num_blocks = num_blocks[l];
+ uint32_t curr_block_num_tiles = block_num_tiles[l];
+
+ uint32_t curr_block_size_bytes = curr_num_pages * curr_page_size;
+ uint32_t curr_layer_size_bytes = curr_num_blocks * curr_block_size_bytes;
+
+ uint32_t src_base_addr = noc_async_read_tile_dram_sharded_set_state(input_addr, curr_page_size, bank_id, vc);
+ src_read_addr = src_read_addr_offset_bytes;
+
+ // For debug purpose, use trivial DRAM read method
+ // for (uint32_t block = 0; block < curr_num_blocks; ++block) {
+ // // Operand 1
+ // cb_reserve_back(cb_id, curr_block_num_tiles);
+ // auto l1_write_addr = get_write_ptr(cb_id);
+
+ // for (uint32_t h = 0; h < curr_num_pages; ++h) {
+ // noc_async_read_tile_dram_sharded_with_state(src_base_addr, src_read_addr, l1_write_addr);
+ // src_read_addr += curr_page_size;
+ // l1_write_addr += curr_page_size;
+ // }
+
+ // noc_async_read_barrier();
+
+ // cb_push_back(cb_id, curr_block_num_tiles);
+ // }
+
+ uint32_t num_free_blocks_in_buffer = total_num_blocks_in_buffer;
+ uint32_t curr_block_trid = 1;
+ uint32_t block_trid_to_wait = 1;
+
+ cb_reserve_back(cb_id, curr_block_num_tiles);
+ uint32_t l1_write_addr_offset = 0;
+ uint32_t l1_write_addr_start = get_write_ptr(cb_id);
+ if (l1_write_addr_start >= l1_buffer_end_addr) {
+ l1_write_addr_start = l1_buffer_start_addr;
+ }
+ uint32_t l1_write_addr = l1_write_addr_start;
+ for (uint32_t block = 0; block < curr_num_blocks; ++block) {
+ noc_async_read_tile_dram_sharded_set_trid(curr_block_trid);
+
+ uint32_t temp_l1_write_addr = l1_write_addr;
+ for (uint32_t h = 0; h < curr_num_pages; ++h) {
+ noc_async_read_tile_dram_sharded_with_state_with_trid(
+ src_base_addr, src_read_addr, temp_l1_write_addr, curr_block_trid);
+ src_read_addr += curr_page_size;
+ temp_l1_write_addr += curr_page_size;
+ }
+
+ if (num_free_blocks_in_buffer == 2) {
+ noc_async_read_barrier_with_trid(block_trid_to_wait);
+ cb_push_back(cb_id, curr_block_num_tiles);
+ // wait for next block trid
+ block_trid_to_wait = block_trid_to_wait == 3 ? 1 : (block_trid_to_wait + 1);
+ // reserve for next block
+ cb_reserve_back(cb_id, curr_block_num_tiles * 2);
+ } else {
+ num_free_blocks_in_buffer -= 1;
+ }
+
+ if (curr_block_trid == total_num_blocks_in_buffer) {
+ curr_block_trid = 1;
+ } else {
+ curr_block_trid += 1;
+ }
+
+ l1_write_addr += block_size_bytes;
+ if (l1_write_addr >= l1_buffer_end_addr) {
+ l1_write_addr = l1_buffer_start_addr;
+ }
+ }
+ // last block to wait
+ noc_async_read_barrier_with_trid(block_trid_to_wait);
+ cb_push_back(cb_id, curr_block_num_tiles);
+
+ src_read_addr_offset_bytes += curr_layer_size_bytes;
+
+ }
+
+}
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/receiver_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/receiver_l1.cpp
new file mode 100644
index 00000000000..7e702916608
--- /dev/null
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/receiver_l1.cpp
@@ -0,0 +1,164 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <stdint.h>
+
+#include "dataflow_api.h"
+#include "ttnn/cpp/ttnn/operations/ccl/kernel_common/worker_sync_utils.hpp"
+
+#include "debug/dprint.h"
+
+constexpr uint32_t ALIGNED_PAGE_SIZE = 16;
+
+constexpr uint32_t cb_start_addr = get_compile_time_arg_val(0);
+constexpr uint32_t cb_rd_ptr = get_compile_time_arg_val(0);
+constexpr uint32_t cb_size = get_compile_time_arg_val(1);
+constexpr uint32_t num_layers = get_compile_time_arg_val(2);
+
+uint32_t rt_args_idx = 0;
+uint32_t vc;
+uint32_t noc_x;
+uint32_t noc_y;
+uint32_t pages_acked_semaphore_addr;
+uint32_t pages_sent_semaphore_addr;
+tt_l1_ptr uint32_t* page_size;
+tt_l1_ptr uint32_t* num_blocks;
+tt_l1_ptr uint32_t* block_num_tiles;
+
+uint32_t start_page_size;
+
+struct RemoteReceiverCBInterface {
+ volatile tt_l1_ptr uint32_t* pages_acked;
+ volatile tt_l1_ptr uint32_t* pages_sent;
+
+ uint32_t fifo_size;
+ uint32_t fifo_limit;
+ uint32_t fifo_limit_page_aligned;
+
+ uint32_t fifo_page_size;
+ uint32_t fifo_aligned_num_pages;
+
+ uint32_t fifo_rd_ptr;
+
+ uint32_t fifo_start_addr;
+
+ uint32_t aligned_page_size;
+};
+
+RemoteReceiverCBInterface remote_cb_interface;
+
+template <uint32_t aligned_page_size>
+FORCE_INLINE void setup_remote_receiver_cb_interface() {
+ uint32_t num_pages = cb_size / start_page_size;
+ uint32_t cb_size_page_aligned = num_pages * start_page_size;
+
+ remote_cb_interface.fifo_size = cb_size;
+ remote_cb_interface.fifo_limit = cb_size + cb_start_addr;
+ remote_cb_interface.fifo_limit_page_aligned = cb_size_page_aligned + cb_start_addr;
+
+
+ remote_cb_interface.fifo_page_size = start_page_size;
+ remote_cb_interface.fifo_aligned_num_pages = num_pages * start_page_size / aligned_page_size;
+
+ remote_cb_interface.fifo_rd_ptr = cb_rd_ptr;
+
+ remote_cb_interface.fifo_start_addr = cb_start_addr;
+
+ remote_cb_interface.pages_acked = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(get_semaphore(pages_acked_semaphore_addr));
+ remote_cb_interface.pages_sent = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(get_semaphore(pages_sent_semaphore_addr));
+
+ remote_cb_interface.aligned_page_size = aligned_page_size;
+}
+
+FORCE_INLINE void setup_remote_cb_page_size(uint32_t page_size, uint32_t remote_noc_x, uint32_t remote_noc_y, uint8_t noc = noc_index) {
+ uint32_t num_pages = remote_cb_interface.fifo_size / page_size;
+ uint32_t cb_size_page_aligned = num_pages * page_size;
+
+ remote_cb_interface.fifo_limit_page_aligned = cb_size_page_aligned + remote_cb_interface.fifo_start_addr;
+ remote_cb_interface.fifo_page_size = page_size;
+ remote_cb_interface.fifo_aligned_num_pages = num_pages * page_size / remote_cb_interface.aligned_page_size;
+
+ uint32_t curr_fifo_rd_ptr = remote_cb_interface.fifo_rd_ptr;
+ bool fifo_rd_ptr_exceed_fifo_limit = curr_fifo_rd_ptr > remote_cb_interface.fifo_limit_page_aligned;
+ uint32_t num_pages_till_fifo_limit = (remote_cb_interface.fifo_limit_page_aligned - curr_fifo_rd_ptr) / page_size;
+
+ if (fifo_rd_ptr_exceed_fifo_limit) {
+ remote_cb_interface.fifo_rd_ptr = remote_cb_interface.fifo_start_addr;
+ } else {
+ uint32_t next_fifo_rd_ptr = remote_cb_interface.fifo_limit_page_aligned - num_pages_till_fifo_limit * page_size;
+ uint32_t pages_acked = (next_fifo_rd_ptr - remote_cb_interface.fifo_rd_ptr) / remote_cb_interface.aligned_page_size;
+ remote_cb_interface.fifo_rd_ptr = next_fifo_rd_ptr;
+
+ // increment the aligned pages acked because we skipped to next aligned page location
+ *remote_cb_interface.pages_acked += pages_acked;
+ uint64_t remote_ack_ptr_addr = get_noc_addr(remote_noc_x, remote_noc_y, (uint32_t)remote_cb_interface.pages_acked, noc);
+ noc_semaphore_inc(remote_ack_ptr_addr, pages_acked, noc);
+ }
+}
+
+FORCE_INLINE void remote_cb_wait_front(uint32_t num_pages) {
+ uint32_t len_bytes = num_pages * remote_cb_interface.fifo_page_size;
+ uint32_t num_pages_wait = len_bytes / remote_cb_interface.aligned_page_size;
+ volatile uint32_t num_pages_recv = 0;
+ uint32_t pages_acked = 0;
+ uint32_t pages_sent = 0;
+
+ do {
+
+ pages_acked = (uint32_t)reg_read((uint32_t)remote_cb_interface.pages_acked);
+ pages_sent = (uint32_t)reg_read((uint32_t)remote_cb_interface.pages_sent);
+ num_pages_recv = pages_sent - pages_acked;
+ } while (num_pages_recv < num_pages_wait);
+}
+
+FORCE_INLINE void remote_cb_pop_front(uint32_t num_pages, uint32_t remote_noc_x, uint32_t remote_noc_y, uint8_t noc = noc_index) {
+ uint32_t len_bytes = num_pages * remote_cb_interface.fifo_page_size;
+ uint32_t num_aligned_pages = len_bytes / remote_cb_interface.aligned_page_size;
+
+ *remote_cb_interface.pages_acked += num_aligned_pages;
+ remote_cb_interface.fifo_rd_ptr += len_bytes;
+
+ if (remote_cb_interface.fifo_rd_ptr >= remote_cb_interface.fifo_limit_page_aligned) {
+ remote_cb_interface.fifo_rd_ptr = remote_cb_interface.fifo_start_addr;
+ }
+
+ uint64_t remote_ack_ptr_addr = get_noc_addr(remote_noc_x, remote_noc_y, (uint32_t)remote_cb_interface.pages_acked, noc);
+ noc_semaphore_inc(remote_ack_ptr_addr, num_aligned_pages, noc);
+}
+
+
+void kernel_main() {
+
+ uint32_t rt_args_idx = 0;
+ vc = get_arg_val<uint32_t>(rt_args_idx++);
+ noc_x = get_arg_val<uint32_t>(rt_args_idx++);
+ noc_y = get_arg_val<uint32_t>(rt_args_idx++);
+ pages_acked_semaphore_addr = get_arg_val<uint32_t>(rt_args_idx++);
+ pages_sent_semaphore_addr = get_arg_val<uint32_t>(rt_args_idx++);
+
+ page_size = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+ num_blocks = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+ block_num_tiles = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+
+ start_page_size = page_size[0];
+
+ constexpr uint32_t cb_id = 0;
+
+ setup_remote_receiver_cb_interface<ALIGNED_PAGE_SIZE>();
+
+ for (uint32_t l = 0; l < num_layers; ++l) {
+ uint32_t curr_page_size = page_size[l];
+ uint32_t curr_num_blocks = num_blocks[l];
+ uint32_t curr_block_num_tiles = block_num_tiles[l];
+
+ setup_remote_cb_page_size(curr_page_size, noc_x, noc_y);
+
+ for (uint32_t block = 0; block < curr_num_blocks; ++block) {
+ remote_cb_wait_front(curr_block_num_tiles);
+
+ remote_cb_pop_front(curr_block_num_tiles, noc_x, noc_y);
+ }
+ }
+
+}
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/writer_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/writer_l1.cpp
new file mode 100644
index 00000000000..0fefcfbf9b1
--- /dev/null
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/writer_l1.cpp
@@ -0,0 +1,331 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <stdint.h>
+
+#include "dataflow_api.h"
+#include "ttnn/cpp/ttnn/operations/ccl/kernel_common/worker_sync_utils.hpp"
+
+#include "debug/dprint.h"
+
+constexpr uint32_t ALIGNED_PAGE_SIZE = 16;
+
+constexpr uint32_t noc = get_compile_time_arg_val(0);
+constexpr uint32_t cb_start_addr = get_compile_time_arg_val(1);
+constexpr uint32_t cb_wr_ptr = get_compile_time_arg_val(1);
+constexpr uint32_t cb_size = get_compile_time_arg_val(2);
+constexpr uint32_t num_receivers = get_compile_time_arg_val(3);
+constexpr uint32_t num_layers = get_compile_time_arg_val(4);
+
+tt_l1_ptr uint32_t* noc_x;
+tt_l1_ptr uint32_t* noc_y;
+tt_l1_ptr uint32_t* pages_acked_semaphore_addr;
+tt_l1_ptr uint32_t* pages_sent_semaphore_addr;
+tt_l1_ptr uint32_t* coalesced_page_size;
+tt_l1_ptr uint32_t* coalesced_num_pages;
+tt_l1_ptr uint32_t* num_blocks;
+tt_l1_ptr uint32_t* block_num_tiles;
+tt_l1_ptr uint32_t* page_size;
+tt_l1_ptr uint32_t* num_tile_rows;
+
+uint32_t start_page_size;
+uint32_t layer = 0;
+
+template <uint32_t num_recv_cbs>
+struct RemoteSenderCBInterface {
+ uint32_t num_receivers;
+
+ volatile tt_l1_ptr uint32_t* pages_acked[num_recv_cbs];
+ volatile tt_l1_ptr uint32_t* pages_sent[num_recv_cbs];
+
+ uint32_t fifo_size;
+ uint32_t fifo_limit;
+ uint32_t fifo_limit_page_aligned;
+
+ uint32_t fifo_page_size;
+ uint32_t fifo_aligned_num_pages;
+
+ uint32_t fifo_wr_ptr;
+
+ uint32_t fifo_start_addr;
+
+ uint32_t aligned_page_size;
+};
+
+RemoteSenderCBInterface<num_receivers> remote_cb_interface;
+
+template <uint32_t aligned_page_size = ALIGNED_PAGE_SIZE>
+FORCE_INLINE void setup_remote_sender_cb_interface() {
+ uint32_t num_pages = cb_size / start_page_size;
+ uint32_t cb_size_page_aligned = num_pages * start_page_size;
+
+ remote_cb_interface.fifo_size = cb_size;
+ remote_cb_interface.fifo_limit = cb_size + cb_start_addr;
+ remote_cb_interface.fifo_limit_page_aligned = cb_size_page_aligned + cb_start_addr;
+
+ remote_cb_interface.fifo_page_size = start_page_size;
+ remote_cb_interface.fifo_aligned_num_pages = num_pages * start_page_size / aligned_page_size;
+
+ remote_cb_interface.fifo_wr_ptr = cb_wr_ptr;
+
+ remote_cb_interface.fifo_start_addr = cb_start_addr;
+
+ remote_cb_interface.num_receivers = num_receivers;
+
+ for (uint32_t i=0; i < num_receivers; ++i) {
+ remote_cb_interface.pages_acked[i] = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(get_semaphore(pages_acked_semaphore_addr[i]));
+ remote_cb_interface.pages_sent[i] = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(get_semaphore(pages_sent_semaphore_addr[i]));
+ }
+
+ remote_cb_interface.aligned_page_size = aligned_page_size;
+
+}
+
+FORCE_INLINE void setup_remote_cb_page_size(uint32_t page_size, uint32_t* remote_noc_x, uint32_t* remote_noc_y, uint8_t noc = noc_index) {
+ uint32_t num_pages = remote_cb_interface.fifo_size / page_size;
+ uint32_t cb_size_page_aligned = num_pages * page_size;
+
+ remote_cb_interface.fifo_limit_page_aligned = cb_size_page_aligned + remote_cb_interface.fifo_start_addr;
+ remote_cb_interface.fifo_page_size = page_size;
+ remote_cb_interface.fifo_aligned_num_pages = num_pages * page_size / remote_cb_interface.aligned_page_size;
+
+ uint32_t curr_fifo_wr_ptr = remote_cb_interface.fifo_wr_ptr;
+ bool fifo_wr_ptr_exceed_fifo_limit = curr_fifo_wr_ptr > remote_cb_interface.fifo_limit_page_aligned;
+ uint32_t num_pages_till_fifo_limit = (remote_cb_interface.fifo_limit_page_aligned - curr_fifo_wr_ptr) / page_size;
+
+ if (fifo_wr_ptr_exceed_fifo_limit) {
+ remote_cb_interface.fifo_wr_ptr = remote_cb_interface.fifo_start_addr;
+ } else {
+ uint32_t next_fifo_wr_ptr = remote_cb_interface.fifo_limit_page_aligned - num_pages_till_fifo_limit * page_size;
+ uint32_t pages_sent = (next_fifo_wr_ptr - remote_cb_interface.fifo_wr_ptr) / remote_cb_interface.aligned_page_size;
+ remote_cb_interface.fifo_wr_ptr = next_fifo_wr_ptr;
+
+ // increment the aligned pages sent because we skipped to next aligned page location
+ for (uint32_t i=0; i < remote_cb_interface.num_receivers; ++i) {
+ uint32_t remote_noc_xy = uint32_t(NOC_XY_ENCODING(DYNAMIC_NOC_X(noc, remote_noc_x[i]), DYNAMIC_NOC_Y(noc, remote_noc_y[i])));
+ *remote_cb_interface.pages_sent[i] += pages_sent;
+ uint64_t remote_ack_ptr_addr = get_noc_addr_helper(remote_noc_xy, (uint32_t)remote_cb_interface.pages_sent[i]);
+ noc_semaphore_inc(remote_ack_ptr_addr, pages_sent, noc);
+ }
+ }
+}
+
+FORCE_INLINE void remote_cb_reserve_back(uint32_t num_pages) {
+ uint32_t len_bytes = num_pages * remote_cb_interface.fifo_page_size;
+ uint32_t num_pages_wait = len_bytes / remote_cb_interface.aligned_page_size;
+ uint32_t free_pages;
+
+ for (uint32_t i=0; i < remote_cb_interface.num_receivers; ++i) {
+ do {
+ uint32_t pages_acked = (uint32_t)reg_read((uint32_t)remote_cb_interface.pages_acked[i]);
+ uint32_t pages_sent = (uint32_t)reg_read((uint32_t)remote_cb_interface.pages_sent[i]);
+ free_pages = remote_cb_interface.fifo_aligned_num_pages - (pages_sent - pages_acked);
+ } while (free_pages < num_pages_wait);
+ }
+}
+
+// unused for now, but we might need to use this one if we want to transfer the maximum noc packet
+FORCE_INLINE void remote_cb_push_back_and_write_pages_(uint32_t local_cb_addr, uint32_t num_pages, uint32_t remote_noc_x, uint32_t remote_noc_y, uint8_t noc = noc_index) {
+ uint32_t len_bytes = num_pages * remote_cb_interface.fifo_page_size;
+ uint32_t pages_sent = len_bytes / remote_cb_interface.aligned_page_size;
+
+ uint32_t local_fifo_rd_ptr = local_cb_addr;
+ uint32_t remote_fifo_wr_ptr = remote_cb_interface.fifo_wr_ptr;
+
+ uint32_t src_addr = local_cb_addr;
+ uint32_t dest_addr = remote_cb_interface.fifo_wr_ptr;
+ uint32_t remote_noc_xy = uint32_t(NOC_XY_ENCODING(DYNAMIC_NOC_X(noc, remote_noc_x), DYNAMIC_NOC_Y(noc, remote_noc_y)));
+ uint64_t dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+
+ while (len_bytes > NOC_MAX_BURST_SIZE) {
+
+ src_addr = local_fifo_rd_ptr;
+ dest_addr = remote_fifo_wr_ptr;
+ dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+ // split one write to two chunks
+ if ((dest_addr + NOC_MAX_BURST_SIZE) >= remote_cb_interface.fifo_limit_page_aligned) {
+ uint32_t first_len_bytes = remote_cb_interface.fifo_limit_page_aligned - dest_addr;
+ uint32_t second_len_bytes = NOC_MAX_BURST_SIZE - first_len_bytes;
+
+ // issue first write transfer
+ while (!noc_cmd_buf_ready(noc, write_cmd_buf));
+ ncrisc_noc_fast_write(noc, write_cmd_buf, src_addr, dest_noc_addr, first_len_bytes, NOC_UNICAST_WRITE_VC, false, false, 1, true);
+ src_addr += first_len_bytes;
+ dest_addr = remote_cb_interface.fifo_start_addr;
+ dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+ if (second_len_bytes != 0) {
+ // issue second write transfer
+ while (!noc_cmd_buf_ready(noc, write_cmd_buf));
+ ncrisc_noc_fast_write(noc, write_cmd_buf, src_addr, dest_noc_addr, second_len_bytes, NOC_UNICAST_WRITE_VC, false, false, 1, true);
+ src_addr += second_len_bytes;
+ dest_addr += second_len_bytes;
+ }
+
+ } else { // issue write in one request
+ while (!noc_cmd_buf_ready(noc, write_cmd_buf));
+ ncrisc_noc_fast_write(noc, write_cmd_buf, src_addr, dest_noc_addr, NOC_MAX_BURST_SIZE, NOC_UNICAST_WRITE_VC, false, false, 1, true);
+ src_addr += NOC_MAX_BURST_SIZE;
+ dest_addr += NOC_MAX_BURST_SIZE;
+ }
+
+ // update local and remote pointers
+ local_fifo_rd_ptr = src_addr;
+ remote_fifo_wr_ptr = dest_addr;
+
+ len_bytes -= NOC_MAX_BURST_SIZE;
+ }
+
+ dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+ // split one write to two chunks for last write
+ if ((dest_addr + len_bytes) >= remote_cb_interface.fifo_limit_page_aligned) {
+
+ uint32_t first_len_bytes = remote_cb_interface.fifo_limit_page_aligned - dest_addr;
+ uint32_t second_len_bytes = len_bytes - first_len_bytes;
+
+ // issue first write transfer
+ while (!noc_cmd_buf_ready(noc, write_cmd_buf));
+ ncrisc_noc_fast_write(noc, write_cmd_buf, src_addr, dest_noc_addr, first_len_bytes, NOC_UNICAST_WRITE_VC, false, false, 1, true);
+ src_addr += first_len_bytes;
+ dest_addr = remote_cb_interface.fifo_start_addr;
+ dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+ if (second_len_bytes != 0) {
+ // issue second write transfer
+ while (!noc_cmd_buf_ready(noc, write_cmd_buf));
+ ncrisc_noc_fast_write(noc, write_cmd_buf, src_addr, dest_noc_addr, second_len_bytes, NOC_UNICAST_WRITE_VC, false, false, 1, true);
+ src_addr += second_len_bytes;
+ dest_addr += second_len_bytes;
+ }
+
+ } else { // issue write in one request
+ while (!noc_cmd_buf_ready(noc, write_cmd_buf));
+ ncrisc_noc_fast_write(noc, write_cmd_buf, src_addr, dest_noc_addr, len_bytes, NOC_UNICAST_WRITE_VC, false, false, 1, true);
+ src_addr += len_bytes;
+ dest_addr += len_bytes;
+ }
+
+ *remote_cb_interface.pages_sent += pages_sent;
+ remote_cb_interface.fifo_wr_ptr = dest_addr;
+
+ uint64_t remote_ack_ptr_addr = get_noc_addr_helper(remote_noc_xy, (uint32_t)remote_cb_interface.pages_sent);
+ noc_semaphore_inc(remote_ack_ptr_addr, pages_sent, noc);
+}
+
+FORCE_INLINE void remote_cb_push_back_and_write_pages(uint32_t local_cb_addr, uint32_t num_pages, uint32_t num_rows, uint32_t coalesced_num_pages_per_row, uint32_t coalesced_page_size, uint32_t* remote_noc_x, uint32_t* remote_noc_y, uint8_t noc = noc_index) {
+ uint32_t len_bytes = num_pages * remote_cb_interface.fifo_page_size;
+ uint32_t pages_sent = len_bytes / remote_cb_interface.aligned_page_size;
+
+ uint32_t next_receiver_start_addr_stride = coalesced_num_pages_per_row * coalesced_page_size;
+ uint32_t next_block_row_stride = next_receiver_start_addr_stride * remote_cb_interface.num_receivers;
+
+ uint32_t dest_addr;
+
+ uint32_t next_receiver_start_addr_offset = 0;
+ for (uint32_t i=0; i < remote_cb_interface.num_receivers; ++i) {
+
+ uint32_t src_addr = local_cb_addr + next_receiver_start_addr_offset;
+ dest_addr = remote_cb_interface.fifo_wr_ptr;
+
+ uint32_t remote_noc_xy = uint32_t(NOC_XY_ENCODING(DYNAMIC_NOC_X(noc, remote_noc_x[i]), DYNAMIC_NOC_Y(noc, remote_noc_y[i])));
+ uint64_t dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+ noc_async_write_one_packet_set_state(dest_noc_addr, coalesced_page_size, noc);
+
+ for (uint32_t h = 0; h < num_rows; ++h) {
+ uint32_t prev_src_addr = src_addr;
+ for (uint32_t w = 0; w < coalesced_num_pages_per_row; ++w) {
+ dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+ if ((dest_addr + coalesced_page_size) > remote_cb_interface.fifo_limit_page_aligned) {
+
+ uint32_t first_len_bytes = remote_cb_interface.fifo_limit_page_aligned - dest_addr;
+ uint32_t second_len_bytes = coalesced_page_size - first_len_bytes;
+
+ if (first_len_bytes != 0) {
+ noc_async_write_one_packet(src_addr, dest_noc_addr, first_len_bytes, noc);
+ src_addr += first_len_bytes;
+ }
+
+ dest_addr = remote_cb_interface.fifo_start_addr;
+ dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+ noc_async_write_one_packet(src_addr, dest_noc_addr, second_len_bytes, noc);
+
+ src_addr += second_len_bytes;
+ dest_addr += second_len_bytes;
+ dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+ noc_async_write_one_packet_set_state(dest_noc_addr, coalesced_page_size, noc);
+
+ } else {
+ noc_async_write_one_packet_with_state(src_addr, dest_noc_addr, noc);
+
+ src_addr += coalesced_page_size;
+ dest_addr += coalesced_page_size;
+ }
+ }
+ src_addr = prev_src_addr + next_block_row_stride;
+ }
+ next_receiver_start_addr_offset += next_receiver_start_addr_stride;
+
+ *remote_cb_interface.pages_sent[i] += pages_sent;
+
+ uint64_t remote_ack_ptr_addr = get_noc_addr_helper(remote_noc_xy, (uint32_t)remote_cb_interface.pages_sent[i]);
+ noc_semaphore_inc(remote_ack_ptr_addr, pages_sent, noc);
+ }
+
+ remote_cb_interface.fifo_wr_ptr = dest_addr;
+
+}
+
+void kernel_main() {
+
+ uint32_t rt_args_idx = 0;
+ noc_x = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_receivers)));
+ noc_y = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_receivers)));
+ pages_acked_semaphore_addr = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_receivers)));
+ pages_sent_semaphore_addr = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_receivers)));
+
+ coalesced_page_size = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+ coalesced_num_pages = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+ num_blocks = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+ block_num_tiles = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+ page_size = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+ num_tile_rows = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+
+ start_page_size = page_size[0];
+
+ constexpr uint32_t cb_id = 0;
+
+ setup_remote_sender_cb_interface();
+
+ for (uint32_t l = 0; l < num_layers; ++l) {
+ uint32_t curr_coalesced_page_size = coalesced_page_size[l];
+ uint32_t curr_coalesced_num_pages = coalesced_num_pages[l];
+ uint32_t curr_num_blocks = num_blocks[l];
+ uint32_t curr_block_num_tiles = block_num_tiles[l];
+ uint32_t curr_page_size = page_size[l];
+ uint32_t curr_num_tile_rows = num_tile_rows[l];
+ uint32_t curr_receiver_block_num_tiles = curr_block_num_tiles / num_receivers;
+
+ setup_remote_cb_page_size(curr_page_size, noc_x, noc_y, noc);
+
+ for (uint32_t block = 0; block < curr_num_blocks; ++block) {
+
+ cb_wait_front(cb_id, curr_block_num_tiles);
+
+ uint32_t local_cb_addr = get_read_ptr(cb_id);
+ remote_cb_reserve_back(curr_receiver_block_num_tiles);
+ remote_cb_push_back_and_write_pages(local_cb_addr, curr_receiver_block_num_tiles, curr_num_tile_rows, curr_coalesced_num_pages, curr_coalesced_page_size, noc_x, noc_y, noc);
+
+ cb_pop_front(cb_id, curr_block_num_tiles);
+
+ }
+ layer++;
+ }
+
+}
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp
new file mode 100644
index 00000000000..5bbf0ca25b0
--- /dev/null
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp
@@ -0,0 +1,832 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "common/bfloat8.hpp"
+#include "common/bfloat16.hpp"
+#include "common/tt_backend_api_types.hpp"
+#include "tt_metal/detail/tt_metal.hpp"
+#include "tt_metal/detail/util.hpp"
+#include "tt_metal/host_api.hpp"
+#include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp"
+#include "tt_metal/common/work_split.hpp"
+#include "tests/tt_metal/test_utils/tilization.hpp"
+#include "tt_metal/test_utils/deprecated/tensor.hpp"
+#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp"
+#include
+
+using namespace tt;
+using std::chrono::duration_cast;
+using std::chrono::microseconds;
+
+////////////////////////////////////////////////////////////////////////////////
+// A tensix core that's next to a DRAM bank reads from the bank, and writes to
+// the neighbour receiver tensix core. It creates a bfloat16/bfloat8_b format
+// DRAM buffer of a given input size, and write it to the DRAM banks in the round
+// robin style.
+//
+// Disclaimer:
+// - This benchmark is designed to support an input size larger than 4GB. But
+// current tt-metal does not seem to support buffer allocation larger than 4GB
+// yet.
+// - Also, detail::ReadFromBuffer API used in DRAM write test may take a long time if
+// the input size is large.
+//
+// Usage example:
+// ./test_dram_offchip
+// --k
+// --n
+// --num-blocks
+// --k
+// --k
+// --num-tests
+// --data-type
+// --num-banks
+// --bank-start-id
+// --bypass-check (set to bypass checking performance criteria fulfillment)
+////////////////////////////////////////////////////////////////////////////////
+
+
+
+template <typename T>
+std::vector<T> slice_vec(std::vector<T> const &v, int m, int n) {
+ auto first = v.cbegin() + m;
+ auto last = v.cbegin() + n + 1;
+
+ std::vector<T> vec(first, last);
+ return vec;
+}
+
+void get_max_page_size_and_num_pages(uint32_t num_tiles, uint32_t num_datums_per_tile, uint32_t& page_size, uint32_t& num_pages) {
+ uint64_t total_size = static_cast(num_tiles) * num_datums_per_tile;
+
+ page_size = (8192 / num_datums_per_tile) * num_datums_per_tile;
+ while (total_size % page_size != 0 && page_size >= num_datums_per_tile) {
+ page_size -= num_datums_per_tile;
+ }
+ num_pages = total_size / page_size;
+}
+
+std::tuple<tt_metal::Program, tt_metal::KernelHandle, uint32_t> create_program(
+ tt_metal::Device *device,
+ const CoreRangeSet &dram_reader_core,
+ const CoreRangeSet &l1_receiver_cores,
+ const uint32_t &single_tile_size,
+ const tt::DataFormat &tile_format,
+ uint32_t k,
+ uint32_t n,
+ uint32_t num_blocks,
+ uint32_t cb_num_blocks,
+ uint32_t num_receivers,
+ uint32_t num_mixed_df_layers,
+ uint32_t cb_padding,
+ std::shared_ptr<tt_metal::Buffer> input_buffer,
+ std::shared_ptr<tt_metal::Buffer> output_buffer
+ ) {
+
+ log_info("created program");
+
+ tt_metal::Program program = tt_metal::Program();
+
+ auto all_cores = dram_reader_core.merge(l1_receiver_cores);
+
+ uint32_t start_tile_id = 0;
+ uint32_t kt = k / 32;
+ uint32_t nt = n / 32;
+ uint32_t block_h = kt / num_blocks;
+ uint32_t num_tile_rows_write = block_h;
+ uint32_t block_w = nt;
+ uint32_t block_num_tiles = block_h * block_w;
+
+ // DRAM reader CB
+ uint32_t reader_cb_index = 0;
+ uint32_t reader_cb_size = block_h * block_w * single_tile_size * 3;
+ // For debug purpose
+ // uint32_t reader_cb_size = block_h * block_w * single_tile_size;
+ uint32_t reader_page_size, reader_num_pages;
+ get_max_page_size_and_num_pages(block_num_tiles, single_tile_size, reader_page_size, reader_num_pages);
+
+ uint32_t receiver_block_num_tile = block_h * block_w / num_receivers;
+ uint32_t writer_page_size, writer_num_pages;
+ get_max_page_size_and_num_pages(block_w / num_receivers, single_tile_size, writer_page_size, writer_num_pages);
+
+ log_info("writer_page_size: {}", writer_page_size);
+ log_info("writer_num_pages: {}", writer_num_pages);
+
+ uint32_t reader_cb_addr = device->get_base_allocator_addr(HalMemType::L1);
+ tt_metal::CircularBufferConfig reader_cb_config =
+ tt_metal::CircularBufferConfig(reader_cb_size, {{reader_cb_index, tile_format}})
+ .set_page_size(reader_cb_index, single_tile_size);
+ auto reader_cb = tt_metal::CreateCircularBuffer(program, dram_reader_core, reader_cb_config);
+
+ // mixed cb dataformat
+ uint32_t next_layer_num_blocks = num_blocks * 2;
+ uint32_t next_layer_block_h = kt / next_layer_num_blocks;
+ uint32_t next_layer_block_num_tiles = next_layer_block_h * block_w;
+ uint32_t next_layer_num_tile_rows_write = next_layer_block_h;
+ uint32_t next_layer_receiver_block_num_tile = next_layer_block_num_tiles / num_receivers;
+
+ uint32_t next_layer_single_tile_size = single_tile_size;
+ if (tile_format == tt::DataFormat::Float16_b) {
+ next_layer_single_tile_size = 1088;
+ } else {
+ next_layer_single_tile_size = 2048;
+ }
+ uint32_t next_layer_reader_page_size, next_layer_reader_num_pages;
+ get_max_page_size_and_num_pages(next_layer_block_num_tiles, next_layer_single_tile_size, next_layer_reader_page_size, next_layer_reader_num_pages);
+
+ uint32_t next_layer_writer_page_size, next_layer_writer_num_pages;
+ get_max_page_size_and_num_pages(block_w / num_receivers, next_layer_single_tile_size, next_layer_writer_page_size, next_layer_writer_num_pages);
+
+ // L1 receiver CB
+ uint32_t receiver_cb_index = 0;
+ uint32_t receiver_cb_size = block_h * block_w * single_tile_size * cb_num_blocks / num_receivers + cb_padding;
+ uint32_t receiver_page_size = 32;
+ uint32_t receiver_cb_addr = output_buffer->address();
+ tt_metal::CircularBufferConfig receiver_cb_config =
+ tt_metal::CircularBufferConfig(receiver_cb_size, {{receiver_cb_index, tile_format}})
+ .set_page_size(receiver_cb_index, receiver_page_size).set_globally_allocated_address(*output_buffer);
+ auto receiver_cb = tt_metal::CreateCircularBuffer(program, l1_receiver_cores, receiver_cb_config);
+
+ log_info("reader_cb_size: {}", reader_cb_size);
+ log_info("receiver_cb_size: {}", receiver_cb_size);
+
+ // semaphore
+ std::vector<uint32_t> pages_acked_semaphore_ids(num_receivers);
+ std::vector<uint32_t> pages_sent_semaphore_ids(num_receivers);
+ for (uint32_t i=0; i < num_receivers; ++i) {
+ pages_acked_semaphore_ids[i] = tt_metal::CreateSemaphore(program, all_cores, INVALID);
+ pages_sent_semaphore_ids[i] = tt_metal::CreateSemaphore(program, all_cores, INVALID);
+ }
+
+ std::vector<uint32_t> reader_compile_time_args = {
+ (std::uint32_t) input_buffer->address(),
+ (std::uint32_t) start_tile_id,
+ (std::uint32_t) tt_metal::NOC::RISCV_0_default,
+ (std::uint32_t) num_mixed_df_layers
+ };
+
+ auto reader_kernel = tt_metal::CreateKernel(
+ program,
+ "tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/reader_dram.cpp",
+ dram_reader_core,
+ tt_metal::DataMovementConfig{
+ .processor = tt_metal::DataMovementProcessor::RISCV_0,
+ .noc = tt_metal::NOC::RISCV_0_default,
+ .noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC,
+ .compile_args = reader_compile_time_args});
+
+ std::vector<uint32_t> writer_compile_time_args = {
+ (std::uint32_t) tt_metal::NOC::RISCV_0_default,
+ (std::uint32_t) receiver_cb_addr,
+ (std::uint32_t) receiver_cb_size,
+ (std::uint32_t) num_receivers,
+ (std::uint32_t) num_mixed_df_layers
+ };
+
+ auto writer_kernel = tt_metal::CreateKernel(
+ program,
+ "tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/writer_l1.cpp",
+ dram_reader_core,
+ tt_metal::DataMovementConfig{
+ .processor = tt_metal::DataMovementProcessor::RISCV_1,
+ .noc = tt_metal::NOC::RISCV_1_default,
+ .noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC,
+ .compile_args = writer_compile_time_args});
+
+ std::vector<uint32_t> receiver_compile_time_args = {
+ (std::uint32_t) reader_cb_addr,
+ (std::uint32_t) receiver_cb_size,
+ (std::uint32_t) num_mixed_df_layers,
+ };
+
+ auto receiver_kernel = tt_metal::CreateKernel(
+ program,
+ "tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/receiver_l1.cpp",
+ l1_receiver_cores,
+ tt_metal::DataMovementConfig{
+ .processor = tt_metal::DataMovementProcessor::RISCV_1,
+ .noc = tt_metal::NOC::RISCV_1_default,
+ .compile_args = receiver_compile_time_args});
+
+ // reader rt
+ auto dram_reader_core_coord = dram_reader_core.ranges().begin()->start_coord;
+ log_info("dram_reader_core_coord: {}", dram_reader_core_coord);
+ auto dram_reader_core_coord_physical = device->worker_core_from_logical_core(dram_reader_core_coord);
+ uint32_t bank_id = 0;
+ uint32_t vc = bank_id & 0x1;
+ std::vector<uint32_t> reader_rt_args = {
+ (std::uint32_t) bank_id,
+ (std::uint32_t) vc
+ };
+ for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+ reader_rt_args.push_back(i%2 == 0 ? reader_page_size : next_layer_reader_page_size);
+ }
+ for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+ reader_rt_args.push_back(i%2 == 0 ? reader_num_pages : next_layer_reader_num_pages);
+ }
+ for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+ reader_rt_args.push_back(i%2 == 0 ? num_blocks : next_layer_num_blocks);
+ }
+ for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+ reader_rt_args.push_back(i%2 == 0 ? block_num_tiles : next_layer_block_num_tiles);
+ }
+ tt_metal::SetRuntimeArgs(program, reader_kernel, dram_reader_core_coord, reader_rt_args);
+
+ // writer rt
+ std::vector<CoreCoord> l1_receiver_core_coords;
+ for (auto l1_receiver_core_coord : *l1_receiver_cores.ranges().begin()) {
+ l1_receiver_core_coords.push_back(l1_receiver_core_coord);
+ }
+ std::vector<uint32_t> writer_rt_args;
+ for (uint32_t i=0; i < num_receivers; ++i) {
+ auto l1_receiver_core_coord_physical = device->worker_core_from_logical_core(l1_receiver_core_coords[i]);
+ writer_rt_args.push_back(l1_receiver_core_coord_physical.x);
+ }
+ for (uint32_t i=0; i < num_receivers; ++i) {
+ auto l1_receiver_core_coord_physical = device->worker_core_from_logical_core(l1_receiver_core_coords[i]);
+ writer_rt_args.push_back(l1_receiver_core_coord_physical.y);
+ }
+ for (uint32_t i=0; i < num_receivers; ++i) {
+ writer_rt_args.push_back(pages_acked_semaphore_ids[i]);
+ }
+ for (uint32_t i=0; i < num_receivers; ++i) {
+ writer_rt_args.push_back(pages_sent_semaphore_ids[i]);
+ }
+ for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+ writer_rt_args.push_back(i%2 == 0 ? writer_page_size : next_layer_writer_page_size);
+ }
+ for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+ writer_rt_args.push_back(i%2 == 0 ? writer_num_pages : next_layer_writer_num_pages);
+ }
+ for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+ writer_rt_args.push_back(i%2 == 0 ? num_blocks : next_layer_num_blocks);
+ }
+ for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+ writer_rt_args.push_back(i%2 == 0 ? block_num_tiles : next_layer_block_num_tiles);
+ }
+ for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+ writer_rt_args.push_back(i%2 == 0 ? single_tile_size : next_layer_single_tile_size);
+ }
+ for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+ writer_rt_args.push_back(i%2 == 0 ? num_tile_rows_write : next_layer_num_tile_rows_write);
+ }
+ tt_metal::SetRuntimeArgs(program, writer_kernel, dram_reader_core_coord, writer_rt_args);
+
+ // reciever rt
+ for (uint32_t i=0; i < num_receivers; ++i) {
+ std::vector<uint32_t> receiver_rt_args = {
+ (std::uint32_t) vc & 0x3,
+ (std::uint32_t) dram_reader_core_coord_physical.x,
+ (std::uint32_t) dram_reader_core_coord_physical.y
+ };
+ vc ++;
+
+ receiver_rt_args.push_back(pages_acked_semaphore_ids[i]);
+ receiver_rt_args.push_back(pages_sent_semaphore_ids[i]);
+
+ for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+ receiver_rt_args.push_back(i%2 == 0 ? single_tile_size : next_layer_single_tile_size);
+ }
+ for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+ receiver_rt_args.push_back(i%2 == 0 ? num_blocks : next_layer_num_blocks);
+ }
+ for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+ receiver_rt_args.push_back(i%2 == 0 ? receiver_block_num_tile : next_layer_receiver_block_num_tile);
+ }
+
+ log_info("l1_receiver_core_coords: {}", l1_receiver_core_coords[i]);
+
+ tt_metal::SetRuntimeArgs(program, receiver_kernel, l1_receiver_core_coords[i], receiver_rt_args);
+ }
+
+ return {std::move(program), reader_kernel, reader_cb_addr};
+}
+
+float to_float(bfloat16 bfloat16_num) {
+ return bfloat16_num.to_float();
+}
+
+float pcc(const std::vector& x, const std::vector& y) {
+ if (x.size() != y.size()) {
+ throw std::invalid_argument("Vectors must be of the same length.");
+ }
+
+ int n = x.size();
+ float mean_x = 0, mean_y = 0;
+ for (int i = 0; i < n; ++i) {
+ mean_x += x[i];
+ mean_y += y[i];
+ }
+ mean_x /= n;
+ mean_y /= n;
+
+ float numerator = 0, sum_sq_x = 0, sum_sq_y = 0;
+ for (int i = 0; i < n; ++i) {
+ float diff_x = x[i] - mean_x;
+ float diff_y = y[i] - mean_y;
+ numerator += diff_x * diff_y;
+ sum_sq_x += diff_x * diff_x;
+ sum_sq_y += diff_y * diff_y;
+ }
+
+ float denominator = std::sqrt(sum_sq_x * sum_sq_y);
+ if (denominator == 0) {
+ return 0;
+ }
+
+ return numerator / denominator;
+}
+
+bool validation_bfp8_b(
+ tt::deprecated::Tensor input_tensor,
+ const tt::DataFormat &data_format,
+ uint32_t num_blocks,
+ uint32_t cb_num_blocks,
+ uint32_t kt,
+ uint32_t nt,
+ std::shared_ptr<tt_metal::Buffer> out_buffer
+) {
+ bool pass = true;
+ std::vector<float> golden_vec(kt * nt * 32 * 32 / num_blocks * cb_num_blocks, 0); // Initialize with zeros
+ std::vector<float> result_vec(kt * nt * 32 * 32 / num_blocks * cb_num_blocks, 0);
+ auto num_datums_per_cb = kt * nt * 32 * 32 / num_blocks * cb_num_blocks;
+
+ std::vector<float> result_untilized;
+ std::vector<uint32_t> result;
+ tt::tt_metal::detail::ReadFromBuffer(out_buffer, result);
+ auto result_bfp8 = unpack_bfp8_tiles_into_float_vec(result, true, false);
+ result_untilized = tt::test_utils::untilize(result_bfp8, kt*32 / num_blocks * cb_num_blocks, nt*32);
+
+ const auto& values = input_tensor.get_values();
+
+ int index = 0;
+ for (int i = 0; i < kt * nt * 32 * 32; ++i) {
+ golden_vec[index] = float(values[i]);
+ index++;
+
+ if (index == num_datums_per_cb) {
+ index = 0;
+ }
+ }
+
+ for (int i = 0; i < (int)result_vec.size(); ++i) {
+ result_vec[i] = result_untilized[i];
+ }
+
+ pass &= pcc(golden_vec, result_vec) >= 0.9999;
+ if (!pass) {
+ log_error(LogTest, "validation single core failed");
+ }
+ return pass;
+}
+
+
+bool validation_fp16(
+ tt::deprecated::Tensor input_tensor,
+ const tt::DataFormat &data_format,
+ uint32_t num_blocks,
+ uint32_t cb_num_blocks,
+ uint32_t kt,
+ uint32_t nt,
+ std::shared_ptr<tt_metal::Buffer> out_buffer
+) {
+ bool pass = true;
+ std::vector<float> golden_vec(kt * nt * 32 * 32 / num_blocks * cb_num_blocks, 0); // Initialize with zeros
+ std::vector<float> result_vec(kt * nt * 32 * 32 / num_blocks * cb_num_blocks, 0);
+ auto num_datums_per_cb = kt * nt * 32 * 32 / num_blocks * cb_num_blocks;
+
+ std::vector<uint32_t> result;
+ tt::tt_metal::detail::ReadFromBuffer(out_buffer, result);
+ auto result_bfp16 = unpack_uint32_vec_into_bfloat16_vec(result);
+ auto result_flat_layout = convert_to_flat_layout(result_bfp16);
+ auto result_untilized = tt::test_utils::untilize(result_flat_layout, kt*32 / num_blocks * cb_num_blocks, nt*32);
+
+ const auto& values = input_tensor.get_values();
+
+ int index = 0;
+ for (int i = 0; i < kt * nt * 32 * 32; ++i) {
+ golden_vec[index] = to_float(values[i]);
+ index++;
+
+ if (index == num_datums_per_cb) {
+ index = 0;
+ }
+ }
+
+ for (int i = 0; i < (int)result_vec.size(); ++i) {
+ result_vec[i] = to_float(static_cast<bfloat16>(result_untilized[i]));
+ }
+
+ pass &= (golden_vec == result_vec);
+ if (!pass) {
+ log_error(LogTest, "validation single core failed");
+ }
+ return pass;
+}
+
+bool validation_mixed_df(
+ tt::deprecated::Tensor input_tensor_fp16,
+ tt::deprecated::Tensor input_tensor_fp8,
+ const tt::DataFormat &data_format,
+ uint32_t num_blocks,
+ uint32_t cb_num_blocks,
+ uint32_t kt,
+ uint32_t nt,
+ std::shared_ptr<tt_metal::Buffer> out_buffer,
+ uint32_t num_mixed_df_layers,
+ uint32_t num_receivers
+) {
+ bool pass = true;
+
+ std::vector<uint32_t> result;
+ tt::tt_metal::detail::ReadFromBuffer(out_buffer, result);
+
+ auto result_bfp16 = unpack_uint32_vec_into_bfloat16_vec(result);
+ auto result_untilized_fp16 = convert_to_flat_layout(result_bfp16);
+
+ std::vector<float> golden_vec(kt*32 / num_blocks * cb_num_blocks * nt*32);
+ std::vector<float> result_vec_fp16(kt*32 / num_blocks * cb_num_blocks * nt*32);
+
+ // compare with the result tilized with tilized
+ auto values_fp16 = tt::test_utils::tilize(input_tensor_fp16.get_values(), kt*32, nt*32);
+
+ auto num_datums_per_cb = kt * nt * 32 * 32 / num_blocks * cb_num_blocks / num_receivers;
+ int start_index = 0;
+ int fifo_size = kt*32 / num_blocks * cb_num_blocks * nt*32 * 2 / num_receivers;
+ int fifo_size_page_aligned, page_size, num_pages, layer_transfer_size, fifo_wr_ptr = 0;
+ for (int l = 0; l < num_mixed_df_layers; ++l) {
+ if (l % 2 == 0) { // fp16
+ page_size = 2048;
+ } else {
+ page_size = 1088;
+ }
+ layer_transfer_size = page_size * kt * nt / num_receivers;
+ num_pages = fifo_size / page_size;
+ fifo_size_page_aligned = page_size * num_pages;
+
+ bool fifo_wr_ptr_exceed_fifo_limit = fifo_wr_ptr > fifo_size_page_aligned;
+ uint32_t num_pages_till_fifo_limit = (fifo_size_page_aligned - fifo_wr_ptr) / page_size;
+ // start pointer addr of current layer
+ fifo_wr_ptr = fifo_wr_ptr_exceed_fifo_limit ? 0 : fifo_size_page_aligned - num_pages_till_fifo_limit * page_size;
+ // start index to read, fifo_wr_ptr / 2 because fp16 format
+ start_index = fifo_wr_ptr == fifo_size_page_aligned ? 0 : fifo_wr_ptr / 2;
+ // end pointer addr of current layer
+ fifo_wr_ptr = (fifo_wr_ptr + layer_transfer_size) % fifo_size_page_aligned;
+ }
+
+ std::vector<std::vector<float> > values_fp16_split(num_receivers, std::vector<float>(values_fp16.size() / num_receivers));
+
+ int index = 0;
+ for (int k = 0; k < kt; ++k) {
+ for (int n = 0; n < num_receivers; ++n) {
+ for (int i = 0; i < nt * 32 * 32 / num_receivers; ++i) {
+ values_fp16_split[n][i + k * nt * 32 * 32 / num_receivers] = to_float(values_fp16[index]);
+ index ++;
+ }
+ }
+ }
+
+ std::vector<std::vector<float> > golden_vec_split(num_receivers, std::vector<float>(golden_vec.size() / num_receivers));
+
+ for (int n = 0; n < num_receivers; ++n) {
+ index = start_index;
+ for (int i = 0; i < kt * nt * 32 * 32 / num_receivers; ++i) {
+ golden_vec_split[n][index] = values_fp16_split[n][i];
+ index ++;
+
+ if (index == num_datums_per_cb) {
+ index = 0;
+ }
+ }
+ }
+
+ index = 0;
+ for (int k = 0; k < kt / num_blocks * cb_num_blocks; ++k) {
+ for (int n = 0; n < num_receivers; ++n) {
+ for (int i = 0; i < nt * 32 * 32 / num_receivers; ++i) {
+ golden_vec[index] = golden_vec_split[n][i + k * nt * 32 * 32 / num_receivers];
+ index ++;
+ }
+ }
+ }
+
+ for (int i=0; i(result_untilized_fp16[i]));
+ }
+
+ // For debugging purposes
+ // for (int i = 0; i < golden_vec.size(); ++i) {
+ // std::cout << golden_vec[i] << " ";
+ // if ((i+1) % 32 == 0) {
+ // std::cout << std::endl;
+ // }
+ // }
+ // std::cout << std::endl;
+ // std::cout << std::endl;
+ // for (int i = 0; i < result_vec_fp16.size(); ++i) {
+ // std::cout << result_vec_fp16[i] << " ";
+ // if ((i+1) % 32 == 0) {
+ // std::cout << std::endl;
+ // }
+ // }
+
+ pass &= pcc(golden_vec, result_vec_fp16) == 1.0;
+
+ if (!pass) {
+ log_error(LogTest, "validation single core failed");
+ }
+ return pass;
+}
+
+std::shared_ptr create_and_transfer_data_sharded_cb(
+ tt_metal::Device* device,
+ vector input_vec,
+ uint32_t ht,
+ uint32_t wt,
+ BufferType buffer_type,
+ tt::DataFormat data_format,
+ CoreRangeSet cores,
+ uint32_t num_receivers
+) {
+
+ uint32_t size_bytes;
+ uint32_t page_size_bytes;
+ if (data_format == tt::DataFormat::Bfp8_b) {
+ size_bytes = ht * wt * 1088;
+ page_size_bytes = 1088;
+ } else {
+ size_bytes = ht * tt::constants::TILE_HEIGHT * wt * tt::constants::TILE_WIDTH * 2;
+ page_size_bytes = tt::constants::TILE_HW * 2;
+ }
+
+ ShardSpecBuffer shard_spec = ShardSpecBuffer(
+ cores,
+ {ht * tt::constants::TILE_HEIGHT, wt * tt::constants::TILE_WIDTH / num_receivers},
+ ShardOrientation::ROW_MAJOR,
+ false,
+ {tt::constants::TILE_HEIGHT, tt::constants::TILE_WIDTH},
+ {ht, wt});
+
+ log_info("cores: {}", cores);
+ log_info("size_bytes: {}", size_bytes);
+ log_info("page_size_bytes: {}", page_size_bytes);
+
+ auto input_buffer = CreateBuffer(tt::tt_metal::ShardedBufferConfig{
+ .device = device,
+ .size = size_bytes,
+ .page_size = page_size_bytes,
+ .buffer_type = buffer_type,
+ .buffer_layout = TensorMemoryLayout::WIDTH_SHARDED,
+ .shard_parameters = shard_spec});
+ tt::tt_metal::detail::WriteToBuffer(input_buffer, input_vec);
+
+ log_info("created sharded tensor");
+
+ return input_buffer;
+}
+
+int main(int argc, char **argv) {
+ if (getenv("TT_METAL_SLOW_DISPATCH_MODE") != nullptr) {
+ log_error("Test not supported w/ slow dispatch, exiting");
+ }
+
+ bool pass = true;
+ bool use_device_profiler = false;
+ uint32_t df = 0;
+ std::vector dram_bandwidth;
+ uint32_t num_tests = 1;
+ uint32_t num_blocks = 8;
+ uint32_t cb_num_blocks = 8;
+ uint32_t cb_padding = 16;
+ uint32_t num_receivers = 1;
+ uint32_t num_mixed_df_layers = 1;
+ uint64_t k = 8192, n = 128;
+
+ try {
+ ////////////////////////////////////////////////////////////////////////////
+ // Initial Runtime Args Parse
+ ////////////////////////////////////////////////////////////////////////////
+ std::vector input_args(argv, argv + argc);
+ try {
+ std::tie(k, input_args) =
+ test_args::get_command_option_uint64_and_remaining_args(input_args, "--k", 8192);
+ std::tie(n, input_args) =
+ test_args::get_command_option_uint64_and_remaining_args(input_args, "--n", 12*128);
+ std::tie(num_blocks, input_args) =
+ test_args::get_command_option_uint64_and_remaining_args(input_args, "--num-blocks", 8);
+ std::tie(cb_num_blocks, input_args) =
+ test_args::get_command_option_uint64_and_remaining_args(input_args, "--cb-num-blocks", 8);
+ std::tie(cb_padding, input_args) =
+ test_args::get_command_option_uint64_and_remaining_args(input_args, "--cb-padding", 16);
+ std::tie(num_tests, input_args) =
+ test_args::get_command_option_uint32_and_remaining_args(input_args, "--num-tests", 1);
+ std::tie(use_device_profiler, input_args) =
+ test_args::has_command_option_and_remaining_args(input_args, "--use-device-profiler");
+ std::tie(df, input_args) =
+ test_args::get_command_option_uint32_and_remaining_args(input_args, "--data-type", 0);
+ std::tie(num_receivers, input_args) =
+ test_args::get_command_option_uint64_and_remaining_args(input_args, "--num-receivers", 1);
+ std::tie(num_mixed_df_layers, input_args) =
+ test_args::get_command_option_uint64_and_remaining_args(input_args, "--num-mixed-df-layers", 1);
+
+
+ test_args::validate_remaining_args(input_args);
+ } catch (const std::exception &e) {
+ log_error(tt::LogTest, "Command line arguments found exception", e.what());
+ TT_ASSERT(false);
+ }
+
+ log_info("num_mixed_df_layers: {} ", num_mixed_df_layers);
+ log_info("num_receivers: {} ", num_receivers);
+
+ TT_FATAL(num_mixed_df_layers % 2 == 1, "currently only support odd number of layers testing, due to issue with validation");
+ if (num_mixed_df_layers > 1) {
+ TT_FATAL(df == 1, "must start with bfloat16 format for mix_df test");
+ }
+
+ if (use_device_profiler) {
+ #if !defined(TRACY_ENABLE)
+ log_error(
+ LogTest,
+ "Metal library and test code should be build with "
+ "profiler option using ./scripts/build_scripts/build_with_profiler_opt.sh");
+ #endif
+ auto device_profiler = getenv("TT_METAL_DEVICE_PROFILER");
+ TT_FATAL(
+ device_profiler,
+ "Before running the program, do one of the following in a shell: "
+ "either export the environment variable by executing export TT_METAL_DEVICE_PROFILER=1, "
+ "or run the program with TT_METAL_DEVICE_PROFILER=1 prefixed to the command");
+ }
+
+ ////////////////////////////////////////////////////////////////////////////
+ // Parameters Setup
+ ////////////////////////////////////////////////////////////////////////////
+ uint32_t num_banks = 1;
+ uint32_t input_size = 0;
+ tt::DataFormat tile_format = tt::DataFormat::Bfp8_b;
+ if (df == 0) {
+ input_size = k * n * 1088 / 1024;
+ tile_format = tt::DataFormat::Bfp8_b;
+ } else if (df == 1) {
+ input_size = k * n * 2;
+ tile_format = tt::DataFormat::Float16_b;
+ } else {
+ TT_THROW("Input data format {} is invalid. Please change.", df);
+ }
+ uint32_t output_size = input_size / num_blocks * cb_num_blocks;
+ uint32_t kt = k / 32;
+ uint32_t nt = n / 32;
+ uint32_t block_h = kt / num_blocks;
+ uint32_t block_w = nt;
+ uint32_t num_datums_per_tile = 32 * 32;
+
+ uint32_t single_tile_size = tt_metal::detail::TileSize(tile_format);
+
+ TT_FATAL(input_size % single_tile_size == 0, "input size is not aligned to tile size");
+ ////////////////////////////////////////////////////////////////////////////
+ // Device Setup
+ ////////////////////////////////////////////////////////////////////////////
+ int device_id = 0;
+ tt_metal::Device *device = tt_metal::CreateDevice(device_id);
+
+ CoreCoord dram_bank_coord = CoreCoord{0, 0};
+ CoreCoord dram_reader_core_coord = CoreCoord{0, 0};
+ CoreRange dram_reader_core_coord_range = CoreRange(dram_reader_core_coord);
+ CoreRangeSet dram_reader_core{std::set{CoreRange{dram_reader_core_coord}}};
+ CoreRange l1_receiver_core_coord_range = CoreRange(CoreCoord{0, 0});
+ if (device->arch() == tt::ARCH::GRAYSKULL) {
+ l1_receiver_core_coord_range = CoreRange{CoreCoord{0, 1}, CoreCoord{0, num_receivers}};
+ } else {
+ l1_receiver_core_coord_range = CoreRange{CoreCoord{1, 0}, CoreCoord{num_receivers, 0}};
+ }
+ CoreRangeSet l1_receiver_core{std::set{l1_receiver_core_coord_range}};
+
+ ////////////////////////////////////////////////////////////////////////////
+ // Input Setup
+ ////////////////////////////////////////////////////////////////////////////
+ std::vector > input_buffers(num_mixed_df_layers);
+ std::shared_ptr output_buffer;
+ auto input_shape = SHAPE{1, 1, k, n};
+ tt::deprecated::Tensor tensor_fp16 = tt::deprecated::initialize_tensor(input_shape, tt::deprecated::Initialize::INCREMENT, 100, std::chrono::system_clock::now().time_since_epoch().count());
+ tt::deprecated::Tensor tensor_fp8 = tt::deprecated::initialize_tensor(input_shape, tt::deprecated::Initialize::INCREMENT, 100, std::chrono::system_clock::now().time_since_epoch().count());
+ if (tile_format == tt::DataFormat::Bfp8_b) {
+ for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+ if (i%2 == 0) { // even layers
+ auto input_vec_tilized = tt::test_utils::tilize(tensor_fp8.get_values(), k, n);
+ std::vector packed_input_vec_tile_layout = pack_fp32_vec_as_bfp8_tiles(input_vec_tilized, true, false);
+ input_buffers[i] = create_and_transfer_data_sharded_cb(device, packed_input_vec_tile_layout, kt, nt, tt_metal::BufferType::DRAM, tt::DataFormat::Bfp8_b, dram_reader_core, num_banks);
+ } else { // odd layers
+ auto input_vec_tilized = tt::test_utils::tilize(tensor_fp16.get_values(), k, n);
+ auto input_vec_tile_layout = convert_to_tile_layout(input_vec_tilized);
+ vector packed_input_vec_tile_layout = pack_bfloat16_vec_into_uint32_vec(input_vec_tile_layout);
+ input_buffers[i] = create_and_transfer_data_sharded_cb(device, packed_input_vec_tile_layout, kt, nt, tt_metal::BufferType::DRAM, tt::DataFormat::Float16_b, dram_reader_core, num_banks);
+ }
+ }
+
+ // output
+ vector outputs = create_constant_vector_of_bfp8(output_size, 0, true);
+ output_buffer = create_and_transfer_data_sharded_cb(device, outputs, kt / num_blocks * cb_num_blocks, nt, tt_metal::BufferType::L1, tt::DataFormat::Bfp8_b, l1_receiver_core, num_receivers);
+
+ } else {
+ for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+ if (i%2 == 0) { // even layers
+ auto input_vec_tilized = tt::test_utils::tilize(tensor_fp16.get_values(), k, n);
+ auto input_vec_tile_layout = convert_to_tile_layout(input_vec_tilized);
+ vector packed_input_vec_tile_layout = pack_bfloat16_vec_into_uint32_vec(input_vec_tile_layout);
+ input_buffers[i] = create_and_transfer_data_sharded_cb(device, packed_input_vec_tile_layout, kt, nt, tt_metal::BufferType::DRAM, tt::DataFormat::Float16_b, dram_reader_core, num_banks);
+ } else {
+ auto input_vec_tilized = tt::test_utils::tilize(tensor_fp8.get_values(), k, n);
+ std::vector packed_input_vec_tile_layout = pack_fp32_vec_as_bfp8_tiles(input_vec_tilized, true, false);
+ input_buffers[i] = create_and_transfer_data_sharded_cb(device, packed_input_vec_tile_layout, kt, nt, tt_metal::BufferType::DRAM, tt::DataFormat::Bfp8_b, dram_reader_core, num_banks);
+ }
+ }
+
+ // output
+ vector outputs = create_constant_vector_of_bfloat16(output_size, 0);
+ output_buffer = create_and_transfer_data_sharded_cb(device, outputs, kt / num_blocks * cb_num_blocks, nt, tt_metal::BufferType::L1, tt::DataFormat::Float16_b, l1_receiver_core, num_receivers);
+ }
+
+ for (uint32_t i=0; i < num_mixed_df_layers; ++i) {
+ log_info("input_buffers addr: {}", input_buffers[i]->address());
+ }
+
+ ////////////////////////////////////////////////////////////////////////////
+ // Application Setup
+ ////////////////////////////////////////////////////////////////////////////
+ auto [program, kernel, output_cb_addr] = create_program(device, dram_reader_core, l1_receiver_core, single_tile_size, tile_format, k, n, num_blocks, cb_num_blocks, num_receivers, num_mixed_df_layers, cb_padding, input_buffers[0], output_buffer);
+
+ ////////////////////////////////////////////////////////////////////////////
+ // Execution Application
+ ////////////////////////////////////////////////////////////////////////////
+ tt_metal::detail::CompileProgram(device, program);
+
+ log_info(LogTest, "Num tests {}", num_tests);
+ for (uint32_t i = 0; i < num_tests; ++i) {
+ EnqueueProgram(device->command_queue(), program, false);
+ Finish(device->command_queue());
+ tt_metal::DumpDeviceProfileResults(device, program);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////
+ // Validation & Teardown
+ ////////////////////////////////////////////////////////////////////////////
+ if (num_mixed_df_layers == 1) {
+ if (tile_format == tt::DataFormat::Bfp8_b) {
+ pass = validation_bfp8_b(
+ tensor_fp8,
+ tile_format,
+ num_blocks,
+ cb_num_blocks,
+ kt,
+ nt,
+ output_buffer);
+ } else {
+ pass = validation_fp16(
+ tensor_fp16,
+ tile_format,
+ num_blocks,
+ cb_num_blocks,
+ kt,
+ nt,
+ output_buffer);
+ }
+ } else {
+ pass = validation_mixed_df(
+ tensor_fp16,
+ tensor_fp8,
+ tile_format,
+ num_blocks,
+ cb_num_blocks,
+ kt,
+ nt,
+ output_buffer,
+ num_mixed_df_layers,
+ num_receivers);
+ }
+
+ pass &= tt_metal::CloseDevice(device);
+ } catch (const std::exception &e) {
+ pass = false;
+ log_error(LogTest, "{}", e.what());
+ log_error(LogTest, "System error message: {}", std::strerror(errno));
+ }
+
+ if (pass) {
+ log_info(LogTest, "Test Passed");
+ } else {
+ log_error(LogTest, "Test Failed");
+ }
+
+ return 0;
+}
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/kernels/reader_dram.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/kernels/reader_dram.cpp
index e42ab99525a..17509788f2e 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/kernels/reader_dram.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/kernels/reader_dram.cpp
@@ -49,7 +49,7 @@ void kernel_main() {
constexpr uint32_t cb_id = 0;
- uint32_t src_base_addr = noc_async_read_tile_dram_sharded_set_state(input_addr, bank_id, vc);
+ uint32_t src_base_addr = noc_async_read_tile_dram_sharded_set_state(input_addr, page_size, bank_id, vc);
uint32_t l1_read_addr = 0;
constexpr uint32_t total_num_blocks_in_buffer = 3;
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp
index facfd0ab019..773380ebee9 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp
@@ -632,7 +632,7 @@ int main(int argc, char **argv) {
uint32_t num_cores = num_banks; // number of DRAM banks
// uint32_t num_banks_all = 12;
- CoreRangeSet all_cores = CoreRangeSet{{}};
+ CoreRangeSet all_cores;
std::vector all_cores_list;
if (device->arch() == tt::ARCH::WORMHOLE_B0) {
get_dram_reader_core_coords_wormhole_b0(device, all_cores, all_cores_list);
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/reader_dram.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/reader_dram.cpp
index 48c659c54ce..479dec38ec1 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/reader_dram.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/reader_dram.cpp
@@ -51,7 +51,7 @@ void kernel_main() {
constexpr uint32_t cb_id = 0;
- uint32_t src_base_addr = noc_async_read_tile_dram_sharded_set_state(input_addr, bank_id, vc);
+ uint32_t src_base_addr = noc_async_read_tile_dram_sharded_set_state(input_addr, page_size, bank_id, vc);
uint32_t src_read_addr = 0;
#ifdef ARCH_GRAYSKULL
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp
index a3d62706327..814b28abe02 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp
@@ -815,9 +815,9 @@ int main(int argc, char **argv) {
uint32_t num_tiles = static_cast((input_size + single_tile_size - 1) / single_tile_size);
uint32_t num_cores = num_banks; // number of DRAM banks
- CoreRangeSet all_dram_reader_cores = CoreRangeSet{{}};
+ CoreRangeSet all_dram_reader_cores;
std::vector all_dram_reader_cores_ordered;
- CoreRangeSet all_l1_receiver_cores = CoreRangeSet{{}};
+ CoreRangeSet all_l1_receiver_cores;
std::vector all_l1_writer_cores_ordered;
if (device->arch() == tt::ARCH::BLACKHOLE) {
get_dram_reader_core_coords_blackhole(device, all_dram_reader_cores, all_dram_reader_cores_ordered);
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
index 94875c6114f..5d839ed65ba 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
@@ -38,6 +38,7 @@ set(PERF_MICROBENCH_TESTS_SRCS
7_kernel_launch/test_kernel_launch.cpp
8_dram_adjacent_core_read/test_dram_read.cpp
9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp
+ 10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp
)
foreach (TEST_SRC ${PERF_MICROBENCH_TESTS_SRCS})
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h
index 7382029c62f..0959bd24c98 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h
@@ -6,7 +6,7 @@
#include
#include
-#include "core_coord.h"
+#include "core_coord.hpp"
#include "tt_metal/common/logger.hpp"
#include "tt_metal/host_api.hpp"
#include "tt_metal/impl/device/device.hpp"
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp
index 1128d5d7809..b660e49d921 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp
@@ -8,7 +8,7 @@
#include
#include
-#include "core_coord.h"
+#include "core_coord.hpp"
#include "logger.hpp"
#include "tt_metal/host_api.hpp"
#include "tt_metal/detail/tt_metal.hpp"
@@ -263,7 +263,7 @@ int main(int argc, char **argv) {
{"MCAST_NOC_END_ADDR_Y", std::to_string(mcast_noc_addr_end_y)}
};
if (!page_size_as_runtime_arg_g) {
- defines.insert(pair("PAGE_SIZE", std::to_string(page_size_g)));
+ defines.insert(std::pair("PAGE_SIZE", std::to_string(page_size_g)));
}
tt_metal::CircularBufferConfig cb_config = tt_metal::CircularBufferConfig(page_size_g * page_count_g, {{0, tt::DataFormat::Float32}})
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_bidirectional_bandwidth_no_edm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_bidirectional_bandwidth_no_edm.cpp
index 981e7b56dd1..7d5d555ab82 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_bidirectional_bandwidth_no_edm.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_bidirectional_bandwidth_no_edm.cpp
@@ -13,7 +13,7 @@
#include "impl/device/device.hpp"
#include "impl/kernels/kernel_types.hpp"
#include "tt_backend_api_types.hpp"
-#include "tt_metal/common/core_coord.h"
+#include "tt_metal/common/core_coord.hpp"
#include "tt_metal/common/math.hpp"
#include "tt_metal/detail/tt_metal.hpp"
#include "tt_metal/host_api.hpp"
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp
index 5a3deb226f2..11a8d230a1f 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp
@@ -15,7 +15,7 @@
#include "impl/kernels/data_types.hpp"
#include "impl/kernels/kernel_types.hpp"
#include "tt_backend_api_types.hpp"
-#include "tt_metal/common/core_coord.h"
+#include "tt_metal/common/core_coord.hpp"
#include "tt_metal/common/math.hpp"
#include "tt_metal/detail/tt_metal.hpp"
#include "tt_metal/host_api.hpp"
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_link_ping_latency_no_edm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_link_ping_latency_no_edm.cpp
index 8aae6595809..636bd6bfa48 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_link_ping_latency_no_edm.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_link_ping_latency_no_edm.cpp
@@ -14,7 +14,7 @@
#include "impl/device/device.hpp"
#include "impl/kernels/kernel_types.hpp"
#include "tt_backend_api_types.hpp"
-#include "tt_metal/common/core_coord.h"
+#include "tt_metal/common/core_coord.hpp"
#include "tt_metal/common/math.hpp"
#include "tt_metal/detail/tt_metal.hpp"
#include "tt_metal/host_api.hpp"
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp
index 32177a45799..cea16f0d0c1 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp
@@ -8,7 +8,7 @@
#include
#include
-#include "tt_metal/common/core_coord.h"
+#include "tt_metal/common/core_coord.hpp"
#include "tt_metal/common/math.hpp"
#include "tt_metal/detail/tt_metal.hpp"
#include "tt_metal/host_api.hpp"
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_send_data_looping.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_send_data_looping.cpp
index 2283cf458a9..41dda7d647f 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_send_data_looping.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_send_data_looping.cpp
@@ -8,7 +8,7 @@
#include
#include
-#include "tt_metal/common/core_coord.h"
+#include "tt_metal/common/core_coord.hpp"
#include "tt_metal/common/math.hpp"
#include "tt_metal/detail/tt_metal.hpp"
#include "tt_metal/host_api.hpp"
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp
index 626d6ed7668..ca52fc83771 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp
@@ -10,7 +10,7 @@
#include "device/tt_arch_types.h"
#include "tt_backend_api_types.hpp"
-#include "tt_metal/common/core_coord.h"
+#include "tt_metal/common/core_coord.hpp"
#include "tt_metal/common/math.hpp"
#include "tt_metal/detail/tt_metal.hpp"
#include "tt_metal/host_api.hpp"
diff --git a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp
index 6626b1ee418..eaf18c1aada 100644
--- a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp
+++ b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp
@@ -180,6 +180,8 @@ int main(int argc, char **argv) {
std::vector ths;
ths.reserve(num_devices);
+ uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM);
+ uint32_t compute_class_idx = magic_enum::enum_integer(HalProcessorClassType::COMPUTE);
for (int i = 0; i < num_devices; i++) {
auto& device = devices[i];
auto& program = new_programs[i];
@@ -197,28 +199,35 @@ int main(int argc, char **argv) {
TT_FATAL(riscv1_kernel->binaries(mask) == ncrisc_binaries.at(mask), "Error");
std::string brisc_hex_path = device->build_kernel_target_path(
- JitBuildProcessorType::DATA_MOVEMENT,
+ programmable_core_index,
+ dm_class_idx,
0,
get_latest_kernel_binary_path(mask, riscv0_kernel));
- ll_api::memory brisc_binary = llrt::get_risc_binary(brisc_hex_path, 0, llrt::PackSpans::PACK);
+ ll_api::memory brisc_binary = llrt::get_risc_binary(brisc_hex_path, 0, ll_api::memory::PackSpans::PACK, ll_api::memory::Relocate::XIP);
TT_FATAL(
brisc_binary == brisc_binaries.at(mask).at(0),
"Expected saved BRISC binary to be the same as binary in persistent cache");
std::string ncrisc_hex_path = device->build_kernel_target_path(
- JitBuildProcessorType::DATA_MOVEMENT,
+ programmable_core_index,
+ dm_class_idx,
1,
get_latest_kernel_binary_path(mask, riscv1_kernel));
- ll_api::memory ncrisc_binary = llrt::get_risc_binary(ncrisc_hex_path, 1, llrt::PackSpans::PACK);
+ ll_api::memory::Relocate relo_type =
+ (device->arch() == tt::ARCH::GRAYSKULL || device->arch() == tt::ARCH::WORMHOLE_B0) ?
+ ll_api::memory::Relocate::NONE : ll_api::memory::Relocate::XIP;
+
+ ll_api::memory ncrisc_binary = llrt::get_risc_binary(ncrisc_hex_path, 1, ll_api::memory::PackSpans::PACK, relo_type);
TT_FATAL(
ncrisc_binary == ncrisc_binaries.at(mask).at(0),
"Expected saved NCRISC binary to be the same as binary in persistent cache");
for (int trisc_id = 0; trisc_id <= 2; trisc_id++) {
std::string trisc_id_str = std::to_string(trisc_id);
std::string trisc_hex_path = device->build_kernel_target_path(
- JitBuildProcessorType::COMPUTE,
+ programmable_core_index,
+ compute_class_idx,
trisc_id,
get_latest_kernel_binary_path(mask, compute_kernel));
- ll_api::memory trisc_binary = llrt::get_risc_binary(trisc_hex_path, 2, llrt::PackSpans::PACK);
+ ll_api::memory trisc_binary = llrt::get_risc_binary(trisc_hex_path, 2, ll_api::memory::PackSpans::PACK, ll_api::memory::Relocate::XIP);
TT_FATAL(
trisc_binary == compute_binaries.at(mask).at(trisc_id),
"Expected saved TRISC binary for {} to be the same as binary in persistent cache", trisc_id_str);
diff --git a/tests/tt_metal/tt_metal/test_core_range_set.cpp b/tests/tt_metal/tt_metal/test_core_range_set.cpp
index 0c6cbb21ff0..d40d2516128 100644
--- a/tests/tt_metal/tt_metal/test_core_range_set.cpp
+++ b/tests/tt_metal/tt_metal/test_core_range_set.cpp
@@ -223,7 +223,7 @@ int main(int argc, char **argv) {
tt_metal::Program program = tt_metal::CreateProgram();
CoreRange core_range_one({0, 0}, {1, 1});
CoreRange core_range_two({2, 2}, {3, 3});
- CoreRangeSet core_ranges = CoreRangeSet({core_range_one, core_range_two});
+ CoreRangeSet core_ranges = CoreRangeSet(std::vector{core_range_one, core_range_two});
pass &= test_program_specified_with_core_range_set(device, program, core_ranges);
diff --git a/tests/tt_metal/tt_metal/test_create_kernel_from_string.cpp b/tests/tt_metal/tt_metal/test_create_kernel_from_string.cpp
index be751bcd70b..fcf79a112f0 100644
--- a/tests/tt_metal/tt_metal/test_create_kernel_from_string.cpp
+++ b/tests/tt_metal/tt_metal/test_create_kernel_from_string.cpp
@@ -4,7 +4,7 @@
#include
-#include "core_coord.h"
+#include "core_coord.hpp"
#include "detail/tt_metal.hpp"
#include "host_api.hpp"
#include "impl/device/device.hpp"
diff --git a/tests/tt_metal/tt_metal/test_kernel_path_env_var.cpp b/tests/tt_metal/tt_metal/test_kernel_path_env_var.cpp
index 7d23b4a302a..aceb624577e 100644
--- a/tests/tt_metal/tt_metal/test_kernel_path_env_var.cpp
+++ b/tests/tt_metal/tt_metal/test_kernel_path_env_var.cpp
@@ -8,7 +8,7 @@
#include
#include "assert.hpp"
-#include "core_coord.h"
+#include "core_coord.hpp"
#include "detail/tt_metal.hpp"
#include "host_api.hpp"
#include "impl/kernels/data_types.hpp"
diff --git a/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp b/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp
index 601ce80f696..5a1e242b314 100644
--- a/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp
+++ b/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp
@@ -9,7 +9,7 @@
#include "tt_metal/host_api.hpp"
#include "tt_metal/detail/tt_metal.hpp"
#include "common/bfloat16.hpp"
-#include "common/core_coord.h"
+#include "common/core_coord.hpp"
// #include "tt_gdb/tt_gdb.hpp"
@@ -251,7 +251,7 @@ bool test_multi_core_kernel_unique_runtime_args(tt_metal::Device *device) {
CoreRange core_group({0, 1}, {1, 1});
CoreRange single_core({1, 0}, {1, 0});
CoreRange all_cores(start_core, end_core);
- CoreRangeSet core_blocks = CoreRangeSet({start_core_range, single_core, core_group});
+ CoreRangeSet core_blocks = CoreRangeSet(std::vector{start_core_range, single_core, core_group});
uint32_t single_tile_size = 2 * 1024;
int32_t num_tiles = 2048;
diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp b/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp
index 0865725e13f..d43254f7c37 100644
--- a/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp
@@ -132,7 +132,7 @@ TEST_F(DeviceFixture, CreateMultipleSemaphoresOnSameCore) {
CoreRange core_range({1, 0}, {3, 0});
CoreRangeSet core_range_set({core_range});
- CoreRangeSet core_range_set2 = core_range_set.merge({core1});
+ CoreRangeSet core_range_set2 = core_range_set.merge(std::set{core1});
std::set set_of_cores({CoreRange({2,0}, {2,0}), CoreRange({3,0}, {3,0}), CoreRange({5,0}, {5,0})});
CoreRangeSet core_range_set3(set_of_cores);
CoreRangeSet core_range_set4({CoreRange({5,0}, {6,0})});
diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp b/tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp
index ec54d27d5d6..520d04986d2 100644
--- a/tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp
@@ -173,7 +173,7 @@ TEST_F(DeviceFixture, LegallyModifyRTArgsDataMovement) {
// First run the program with the initial runtime args
CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1));
CoreRange second_core_range(CoreCoord(3, 3), CoreCoord(5, 5));
- CoreRangeSet core_range_set({first_core_range, second_core_range});
+ CoreRangeSet core_range_set(std::vector{first_core_range, second_core_range});
auto program = unit_tests::runtime_args::initialize_program_data_movement_rta(this->devices_.at(id), core_range_set, 2);
ASSERT_TRUE(program.num_kernels() == 1);
std::vector initial_runtime_args = {101, 202};
@@ -219,7 +219,7 @@ TEST_F(DeviceFixture, LegallyModifyRTArgsCompute) {
// First run the program with the initial runtime args
CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1));
CoreRange second_core_range(CoreCoord(3, 3), CoreCoord(5, 5));
- CoreRangeSet core_range_set({first_core_range, second_core_range});
+ CoreRangeSet core_range_set(std::vector{first_core_range, second_core_range});
std::vector initial_runtime_args = {101, 202};
std::vector common_runtime_args = {11, 22, 33, 44};
auto program = unit_tests::runtime_args::initialize_program_compute(this->devices_.at(id), core_range_set, initial_runtime_args.size(), common_runtime_args.size());
@@ -249,7 +249,7 @@ TEST_F(DeviceFixture, SetRuntimeArgsSubsetOfCoresCompute) {
// First run the program with the initial runtime args
CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1));
CoreRange second_core_range(CoreCoord(3, 3), CoreCoord(5, 5));
- CoreRangeSet core_range_set({first_core_range, second_core_range});
+ CoreRangeSet core_range_set(std::vector{first_core_range, second_core_range});
std::vector initial_runtime_args = {101, 202};
std::vector common_runtime_args = {11, 22, 33, 44};
@@ -277,7 +277,7 @@ TEST_F(DeviceFixture, SetRuntimeArgsUniqueValuesCompute) {
// First run the program with the initial runtime args
CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1));
CoreRange second_core_range(CoreCoord(3, 3), CoreCoord(5, 5));
- CoreRangeSet core_range_set({first_core_range, second_core_range});
+ CoreRangeSet core_range_set(std::vector{first_core_range, second_core_range});
std::vector common_runtime_args = {11, 22, 33, 44};
auto program = unit_tests::runtime_args::initialize_program_compute(this->devices_.at(id), core_range_set, 2, common_runtime_args.size());
@@ -311,7 +311,7 @@ TEST_F(DeviceFixture, SetRuntimeArgsVaryingLengthPerCore) {
// First run the program with the initial runtime args
CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1));
CoreRange second_core_range(CoreCoord(3, 3), CoreCoord(5, 5));
- CoreRangeSet core_range_set({first_core_range, second_core_range});
+ CoreRangeSet core_range_set(std::vector{first_core_range, second_core_range});
std::vector common_runtime_args = {11, 22, 33, 44};
// Figure out max number of unique runtime args across all cores, so kernel
@@ -359,7 +359,7 @@ TEST_F(DeviceFixture, SetRuntimeArgsVaryingLengthPerCore) {
TEST_F(DeviceFixture, IllegalTooManyRuntimeArgs) {
for (unsigned int id = 0; id < num_devices_; id++) {
CoreRange first_core_range(CoreCoord(1, 1), CoreCoord(2, 2));
- CoreRangeSet core_range_set({first_core_range});
+ CoreRangeSet core_range_set(first_core_range);
auto program = unit_tests::runtime_args::initialize_program_compute(this->devices_.at(id), core_range_set, 0, 0); // Kernel isn't run here.
// Set 100 unique args, then try to set 300 common args and fail.
@@ -381,7 +381,7 @@ TEST_F(DeviceFixture, IllegallyModifyRTArgs) {
// First run the program with the initial runtime args
CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1));
CoreRange second_core_range(CoreCoord(3, 3), CoreCoord(5, 5));
- CoreRangeSet core_range_set({first_core_range, second_core_range});
+ CoreRangeSet core_range_set(std::vector{first_core_range, second_core_range});
auto program = unit_tests::runtime_args::initialize_program_data_movement_rta(this->devices_.at(id), core_range_set, 2);
ASSERT_TRUE(program.num_kernels() == 1);
std::vector initial_runtime_args = {101, 202};
diff --git a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp
index 199aa429f88..3b6173fb45d 100644
--- a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp
@@ -75,6 +75,7 @@ TEST_F(DeviceFixture, TestCreateCircularBufferAtValidIndices) {
auto cb = CreateCircularBuffer(program, cr_set, config);
for (unsigned int id = 0; id < num_devices_; id++) {
+ detail::CompileProgram(devices_.at(id), program);
program.finalize(devices_.at(id));
EXPECT_TRUE(test_cb_config_written_to_core(program, this->devices_.at(id), cr_set, golden_cb_config));
}
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp
index 532276b9e39..56a9dbc37e0 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp
@@ -105,7 +105,7 @@ struct SfpuConfig {
size_t tile_byte_size = 0;
tt::DataFormat l1_input_data_format = tt::DataFormat::Invalid;
tt::DataFormat l1_output_data_format = tt::DataFormat::Invalid;
- CoreRangeSet cores = {{}};
+ CoreRangeSet cores = CoreRangeSet();
std::string sfpu_op = "";
bool approx_mode = true;
};
@@ -398,7 +398,7 @@ TEST_F(DeviceFixture, DISABLED_AllCoreSingleTileSfpuApproxCompute) {
.tile_byte_size = 2 * 32 * 32,
.l1_input_data_format = tt::DataFormat::Float16_b,
.l1_output_data_format = tt::DataFormat::Float16_b,
- .cores = {{}},
+ .cores = CoreRangeSet(),
.approx_mode = true};
auto arch = this->arch_;
@@ -437,7 +437,7 @@ TEST_F(DeviceFixture, DISABLED_AllCoreMultiTileSfpuApproxCompute) {
.tile_byte_size = 2 * 32 * 32,
.l1_input_data_format = tt::DataFormat::Float16_b,
.l1_output_data_format = tt::DataFormat::Float16_b,
- .cores = {{}},
+ .cores = CoreRangeSet(),
.approx_mode = true};
auto arch = this->arch_;
diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_construct.cpp b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_construct.cpp
index c1ba749cab4..04cdc9f15c2 100644
--- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_construct.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_construct.cpp
@@ -3,25 +3,25 @@
// SPDX-License-Identifier: Apache-2.0
#include "gtest/gtest.h"
-#include "tt_metal/common/core_coord.h"
+#include "tt_metal/common/core_coord.hpp"
#include "core_coord_fixture.hpp"
namespace basic_tests::CoreRangeSet{
TEST_F(CoreCoordHarness, TestCoreRangeSetValidConstruct)
{
- EXPECT_NO_THROW ( ::CoreRangeSet({this->sc1, this->cr2}));
- EXPECT_NO_THROW ( ::CoreRangeSet({this->cr1, this->cr2}) );
+ EXPECT_NO_THROW(::CoreRangeSet(std::vector{this->sc1, this->cr2}));
+ EXPECT_NO_THROW(::CoreRangeSet(std::vector{this->cr1, this->cr2}));
- ::CoreRangeSet valid_ranges = ::CoreRangeSet({this->cr1, this->cr2});
+ ::CoreRangeSet valid_ranges = ::CoreRangeSet(std::vector{this->cr1, this->cr2});
EXPECT_EQ(valid_ranges.ranges().size(), 2);
}
TEST_F(CoreCoordHarness, TestCoreRangeSetInvalidConstruct)
{
::CoreRange overlapping_range({1, 2}, {3, 3});
- EXPECT_ANY_THROW( ::CoreRangeSet({this->cr1, this->cr2, overlapping_range}) );
- EXPECT_ANY_THROW( ::CoreRangeSet({this->sc1, this->cr1}) );
+ EXPECT_ANY_THROW(::CoreRangeSet(std::vector{this->cr1, this->cr2, overlapping_range}));
+ EXPECT_ANY_THROW(::CoreRangeSet(std::vector{this->sc1, this->cr1}));
}
diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_merge.cpp b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_merge.cpp
index 32c0092741f..d8bd37fa6c7 100644
--- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_merge.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_merge.cpp
@@ -4,7 +4,7 @@
#include "gtest/gtest.h"
-#include "tt_metal/common/core_coord.h"
+#include "tt_metal/common/core_coord.hpp"
#include "core_coord_fixture.hpp"
#include
@@ -12,18 +12,19 @@ namespace basic_tests::CoreRangeSet{
TEST_F(CoreCoordHarness, TestCoreRangeSetMergeNoSolution)
{
- EXPECT_EQ ( ::CoreRangeSet({sc1}).merge({sc3}).ranges() , std::set<::CoreRange>( {sc1,sc3}) );
- EXPECT_EQ ( ::CoreRangeSet({cr1}).merge({cr2}).ranges() , std::set<::CoreRange>( {cr1,cr2}) );
- EXPECT_EQ ( ::CoreRangeSet({cr1}).merge({cr1,cr2}).ranges() , std::set<::CoreRange>( {cr1,cr2}) );
- EXPECT_EQ ( ::CoreRangeSet({cr1}).merge({cr2}).merge({cr3}).ranges() , std::set<::CoreRange>( {cr1,cr2,cr3}) );
+ EXPECT_EQ(::CoreRangeSet(sc1).merge(std::set{sc3}).ranges(), std::set<::CoreRange>({sc1, sc3}));
+ EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{cr2}).ranges(), std::set<::CoreRange>({cr1, cr2}));
+ EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{cr1, cr2}).ranges(), std::set<::CoreRange>({cr1, cr2}));
+ EXPECT_EQ(
+ ::CoreRangeSet(cr1).merge(std::set{cr2}).merge(std::set{cr3}).ranges(), std::set<::CoreRange>({cr1, cr2, cr3}));
}
TEST_F(CoreCoordHarness, TestCoreRangeSetMergeCoreCoord)
{
- ::CoreRangeSet empty_crs({});
- EXPECT_EQ ( empty_crs.merge({this->sc1}).ranges().size(), 1);
- EXPECT_EQ ( ::CoreRangeSet({cr1}).merge({sc3, sc4}).ranges() , std::set<::CoreRange>( {cr16}) );
- EXPECT_EQ ( ::CoreRangeSet({cr1}).merge({sc3}).merge({sc4}).ranges() , std::set<::CoreRange>( {cr16}) );
+ ::CoreRangeSet empty_crs;
+ EXPECT_EQ(empty_crs.merge(std::set{this->sc1}).ranges().size(), 1);
+ EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{sc3, sc4}).ranges(), std::set<::CoreRange>({cr16}));
+ EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{sc3}).merge(std::set{sc4}).ranges(), std::set<::CoreRange>({cr16}));
CoreRange rect ( {0,0}, {4,2});
std::set rect_pts;
for (unsigned y = rect.start_coord.y; y <= rect.end_coord.y; y++){
@@ -38,18 +39,22 @@ TEST_F(CoreCoordHarness, TestCoreRangeSetMergeCoreCoord)
EXPECT_EQ ( empty_crs.merge(rect_pts).ranges(), std::set<::CoreRange>( {rect, CoreRange( {2,3}, {3,5} ) } ));
// "H", sub-optimal currently, should be reduced down to 3 CRs instead of 5
- EXPECT_EQ ( empty_crs.merge( { CoreRange { {0,0}, {1,5} }, CoreRange { {3,0}, {4,5}}, CoreRange { {0,2} , {4,3} } } ).ranges(),
+ EXPECT_EQ ( empty_crs.merge( std::vector{ CoreRange { {0,0}, {1,5} }, CoreRange { {3,0}, {4,5}}, CoreRange { {0,2} , {4,3} } } ).ranges(),
std::set<::CoreRange>( { CoreRange { {0,0}, {1,1} }, CoreRange { {0,2}, {4,3}}, CoreRange{ {0,4}, {1,5}},
CoreRange { {3,0}, {4,1} }, CoreRange{ {3,4}, {4,5} } } ));
}
TEST_F(CoreCoordHarness, TestCoreRangeSetMergeCoreRange)
{
- EXPECT_EQ ( ::CoreRangeSet({cr1}).merge({cr1}).ranges() , std::set<::CoreRange>( {cr1}) );
- EXPECT_EQ ( ::CoreRangeSet({cr7}).merge({cr6}).merge({cr4}).ranges() , std::set<::CoreRange>( {cr8} ) );
- EXPECT_EQ ( ::CoreRangeSet({cr8}).merge({cr7}).merge({cr6}).merge({cr4}).ranges() , std::set<::CoreRange>( {cr8} ) );
- EXPECT_EQ ( ::CoreRangeSet({cr1, cr2, cr3}).merge({cr4}).ranges() , std::set<::CoreRange>( {cr4}) );
- EXPECT_EQ ( ::CoreRangeSet({cr1, cr2}).merge({cr4}).merge({cr6}).ranges() , std::set<::CoreRange>( {cr6}) );
+ EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{cr1}).ranges(), std::set<::CoreRange>({cr1}));
+ EXPECT_EQ(::CoreRangeSet(cr7).merge(std::set{cr6}).merge(std::set{cr4}).ranges(), std::set<::CoreRange>({cr8}));
+ EXPECT_EQ(
+ ::CoreRangeSet(cr8).merge(std::set{cr7}).merge(std::set{cr6}).merge(std::set{cr4}).ranges(),
+ std::set<::CoreRange>({cr8}));
+ EXPECT_EQ(::CoreRangeSet(std::vector{cr1, cr2, cr3}).merge(std::set{cr4}).ranges(), std::set<::CoreRange>({cr4}));
+ EXPECT_EQ(
+ ::CoreRangeSet(std::vector{cr1, cr2}).merge(std::set{cr4}).merge(std::set{cr6}).ranges(),
+ std::set<::CoreRange>({cr6}));
}
}
diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_adjacent.cpp b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_adjacent.cpp
index 45b44f6e7bc..f08976402d6 100644
--- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_adjacent.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_adjacent.cpp
@@ -5,7 +5,7 @@
#include "gtest/gtest.h"
-#include "tt_metal/common/core_coord.h"
+#include "tt_metal/common/core_coord.hpp"
#include "core_coord_fixture.hpp"
namespace basic_tests::CoreRange{
diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_contains.cpp b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_contains.cpp
index 32828a50539..c9080e08da3 100644
--- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_contains.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_contains.cpp
@@ -4,7 +4,7 @@
#include "gtest/gtest.h"
-#include "tt_metal/common/core_coord.h"
+#include "tt_metal/common/core_coord.hpp"
#include "core_coord_fixture.hpp"
namespace basic_tests::CoreRange{
diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_intersects.cpp b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_intersects.cpp
index df1b90e40ac..0ee1fe3608e 100644
--- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_intersects.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_intersects.cpp
@@ -4,7 +4,7 @@
#include "gtest/gtest.h"
-#include "tt_metal/common/core_coord.h"
+#include "tt_metal/common/core_coord.hpp"
#include "core_coord_fixture.hpp"
namespace basic_tests::CoreRange{
diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp
index 6ba11cc71eb..d475d3c897b 100644
--- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp
@@ -4,7 +4,7 @@
#include "gtest/gtest.h"
-#include "tt_metal/common/core_coord.h"
+#include "tt_metal/common/core_coord.hpp"
#include "core_coord_fixture.hpp"
namespace basic_tests::CoreRange {
diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_merge.cpp b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_merge.cpp
index d23cd67188a..db8a1b2c7ad 100644
--- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_merge.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_merge.cpp
@@ -5,7 +5,7 @@
#include "gtest/gtest.h"
-#include "tt_metal/common/core_coord.h"
+#include "tt_metal/common/core_coord.hpp"
#include "core_coord_fixture.hpp"
namespace basic_tests::CoreRange{
diff --git a/tests/tt_metal/tt_metal/unit_tests/multichip/erisc_app_direct_send.cpp b/tests/tt_metal/tt_metal/unit_tests/multichip/erisc_app_direct_send.cpp
index 72fcf923b11..de55e3f17f5 100644
--- a/tests/tt_metal/tt_metal/unit_tests/multichip/erisc_app_direct_send.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/multichip/erisc_app_direct_send.cpp
@@ -98,8 +98,9 @@ bool send_over_eth(
llrt::write_hex_vec_to_core(receiver_device->id(), receiver_core, args_1, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE);
// TODO: this should be updated to use kernel api
- ll_api::memory binary_mem_send = llrt::get_risc_binary(sender_device->build_firmware_target_path(JitBuildProcessorType::ETHERNET, 0));
- ll_api::memory binary_mem_receive = llrt::get_risc_binary(receiver_device->build_firmware_target_path(JitBuildProcessorType::ETHERNET, 0));
+ uint32_t active_eth_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH);
+ ll_api::memory binary_mem_send = llrt::get_risc_binary(sender_device->build_firmware_target_path(active_eth_index, 0, 0));
+ ll_api::memory binary_mem_receive = llrt::get_risc_binary(receiver_device->build_firmware_target_path(active_eth_index, 0, 0));
for (const auto& eth_core : eth_cores) {
llrt::write_hex_vec_to_core(
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp
index 2223519bc63..3b1a12c88ba 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp
@@ -12,7 +12,7 @@ struct TestBufferConfig {
tt::tt_metal::BufferType buftype;
};
-inline pair> EnqueueWriteBuffer_prior_to_wrap(tt::tt_metal::Device* device, tt::tt_metal::CommandQueue& cq, const TestBufferConfig& config) {
+inline std::pair> EnqueueWriteBuffer_prior_to_wrap(tt::tt_metal::Device* device, tt::tt_metal::CommandQueue& cq, const TestBufferConfig& config) {
// This function just enqueues a buffer (which should be large in the config)
// write as a precursor to testing the wrap mechanism
size_t buf_size = config.num_pages * config.page_size;
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp
index 3194e16e35c..461f07c2825 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp
@@ -980,7 +980,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllCbConfigsCorrectlySentMultipleCoreR
CoreCoord worker_grid_size = device->compute_with_storage_grid_size();
CoreRange cr1({worker_grid_size.x - 2, worker_grid_size.y - 2}, {worker_grid_size.x - 1, worker_grid_size.y - 1});
- CoreRangeSet core_ranges({cr0, cr1});
+ CoreRangeSet core_ranges(std::vector{cr0, cr1});
DummyProgramMultiCBConfig config = {.cr_set = core_ranges, .cb_config_vector = cb_config_vector};
@@ -1001,7 +1001,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllCbConfigsCorrectlySentUpdateSizeMul
CoreCoord worker_grid_size = device->compute_with_storage_grid_size();
CoreRange cr1({worker_grid_size.x - 2, worker_grid_size.y - 2}, {worker_grid_size.x - 1, worker_grid_size.y - 1});
- CoreRangeSet core_ranges({cr0, cr1});
+ CoreRangeSet core_ranges(std::vector{cr0, cr1});
DummyProgramMultiCBConfig config = {.cr_set = core_ranges, .cb_config_vector = cb_config_vector};
@@ -1023,7 +1023,7 @@ TEST_F(CommandQueueSingleCardFixture, TestMultiCbConfigsCorrectlySentUpdateSizeM
CoreCoord worker_grid_size = device->compute_with_storage_grid_size();
CoreRange cr1({worker_grid_size.x - 2, worker_grid_size.y - 2}, {worker_grid_size.x - 1, worker_grid_size.y - 1});
- CoreRangeSet core_ranges({cr0, cr1});
+ CoreRangeSet core_ranges(std::vector{cr0, cr1});
DummyProgramMultiCBConfig config = {.cr_set = core_ranges, .cb_config_vector = cb_config_vector};
@@ -1036,7 +1036,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllSemConfigsCorrectlySentMultiCore) {
CoreCoord worker_grid_size = device->compute_with_storage_grid_size();
CoreRange cr({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1});
- CoreRangeSet cr_set({cr});
+ CoreRangeSet cr_set(cr);
DummyProgramConfig config = {.cr_set = cr_set, .num_sems = NUM_SEMAPHORES};
@@ -1052,7 +1052,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllSemaphoreConfigsCorrectlySentMultip
CoreCoord worker_grid_size = device->compute_with_storage_grid_size();
CoreRange second_cr({worker_grid_size.x - 2, worker_grid_size.y - 2}, {worker_grid_size.x - 1, worker_grid_size.y - 1});
- CoreRangeSet cr_set({first_cr, second_cr});
+ CoreRangeSet cr_set(std::vector{first_cr, second_cr});
Program program;
DummyProgramConfig config = {.cr_set = cr_set, .num_sems = NUM_SEMAPHORES};
@@ -1089,7 +1089,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllRuntimeArgsCorrectlySentMultiCore)
CoreCoord worker_grid_size = device->compute_with_storage_grid_size();
CoreRange cr({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1});
- CoreRangeSet cr_set({cr});
+ CoreRangeSet cr_set(cr);
DummyProgramConfig dummy_program_config = {.cr_set = cr_set};
EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_runtime_args(device, device->command_queue(), dummy_program_config, 13, 17, 19, 1));
@@ -1101,7 +1101,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllRuntimeArgsCorrectlySentMultiCore_2
CoreCoord worker_grid_size = device->compute_with_storage_grid_size();
CoreRange cr({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1});
- CoreRangeSet cr_set({cr});
+ CoreRangeSet cr_set(cr);
DummyProgramConfig dummy_program_config = {.cr_set = cr_set};
EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_runtime_args(device, device->command_queue(), dummy_program_config, 255, 255, 255, 1));
@@ -1114,7 +1114,7 @@ TEST_F(CommandQueueSingleCardFixture, TestSendRuntimeArgsMultiCoreRange) {
CoreRange cr0({0, 0}, {worker_grid_size.x - 1, 3});
CoreRange cr1({0, 4}, {worker_grid_size.x - 1, worker_grid_size.y - 1});
- CoreRangeSet cr_set({cr0, cr1});
+ CoreRangeSet cr_set(std::vector{cr0, cr1});
DummyProgramConfig dummy_program_config = {.cr_set = cr_set};
EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_runtime_args_multi_crs(
@@ -1129,7 +1129,7 @@ TEST_F(CommandQueueSingleCardFixture, TestSendRuntimeArgsMultiNonOverlappingCore
CoreRange cr0({0, 0}, {worker_grid_size.x - 1, 3});
CoreRange cr1({0, 5}, {worker_grid_size.x - 1, worker_grid_size.y - 1});
- CoreRangeSet cr_set({cr0, cr1});
+ CoreRangeSet cr_set(std::vector{cr0, cr1});
DummyProgramConfig dummy_program_config = {.cr_set = cr_set};
EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_runtime_args_multi_crs(
@@ -1143,7 +1143,7 @@ TEST_F(CommandQueueSingleCardFixture, TestUpdateRuntimeArgsMultiCoreRange) {
CoreRange cr0({0, 0}, {worker_grid_size.x - 1, 3});
CoreRange cr1({0, 5}, {worker_grid_size.x - 1, worker_grid_size.y - 1});
- CoreRangeSet cr_set({cr0, cr1});
+ CoreRangeSet cr_set(std::vector{cr0, cr1});
DummyProgramConfig dummy_program_config = {.cr_set = cr_set};
EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_runtime_args_multi_crs(
@@ -1155,7 +1155,7 @@ TEST_F(CommandQueueSingleCardFixture, TestUpdateRuntimeArgsMultiCoreRange) {
TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute) {
CoreRange cr0({1, 1}, {2, 2});
CoreRange cr1({3, 3}, {4, 4});
- CoreRangeSet cr_set({cr0, cr1});
+ CoreRangeSet cr_set(std::vector{cr0, cr1});
DummyProgramConfig dummy_program_config = {.cr_set = cr_set};
for (Device *device : devices_) {
EXPECT_TRUE(local_test_functions::test_increment_runtime_args_sanity(device, dummy_program_config, 16, 16, tt::RISCV::COMPUTE));
@@ -1166,7 +1166,7 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute
TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute_255_UniqueArgs) {
CoreRange cr0({1, 1}, {2, 2});
CoreRange cr1({3, 3}, {4, 4});
- CoreRangeSet cr_set({cr0, cr1});
+ CoreRangeSet cr_set(std::vector{cr0, cr1});
DummyProgramConfig dummy_program_config = {.cr_set = cr_set};
for (Device *device : devices_) {
EXPECT_TRUE(local_test_functions::test_increment_runtime_args_sanity(device, dummy_program_config, 255, 0, tt::RISCV::COMPUTE));
@@ -1177,7 +1177,7 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute
TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute_255_CommonArgs) {
CoreRange cr0({1, 1}, {2, 2});
CoreRange cr1({3, 3}, {4, 4});
- CoreRangeSet cr_set({cr0, cr1});
+ CoreRangeSet cr_set(std::vector{cr0, cr1});
DummyProgramConfig dummy_program_config = {.cr_set = cr_set};
for (Device *device : devices_) {
EXPECT_TRUE(local_test_functions::test_increment_runtime_args_sanity(device, dummy_program_config, 0, 255, tt::RISCV::COMPUTE));
@@ -1188,7 +1188,7 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute
TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreDataMovementBrisc) {
CoreRange cr0({1, 1}, {2, 2});
CoreRange cr1({3, 3}, {4, 4});
- CoreRangeSet cr_set({cr0, cr1});
+ CoreRangeSet cr_set(std::vector{cr0, cr1});
DummyProgramConfig dummy_program_config = {.cr_set = cr_set};
for (Device *device : devices_) {
EXPECT_TRUE(local_test_functions::test_increment_runtime_args_sanity(device, dummy_program_config, 16, 16, tt::RISCV::BRISC));
@@ -1199,7 +1199,7 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreDataMov
TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreDataMovementNcrisc) {
CoreRange cr0({1, 1}, {2, 2});
CoreRange cr1({3, 3}, {4, 4});
- CoreRangeSet cr_set({cr0, cr1});
+ CoreRangeSet cr_set(std::vector{cr0, cr1});
DummyProgramConfig dummy_program_config = {.cr_set = cr_set};
for (Device *device : devices_) {
EXPECT_TRUE(local_test_functions::test_increment_runtime_args_sanity(device, dummy_program_config, 16, 16, tt::RISCV::NCRISC));
@@ -1219,7 +1219,7 @@ TEST_F(CommandQueueSingleCardFixture, DISABLED_TestFillDispatchCoreBuffer) {
CoreCoord worker_grid_size = device->compute_with_storage_grid_size();
CoreRange cr({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1});
- CoreRangeSet cr_set({cr});
+ CoreRangeSet cr_set(cr);
DummyProgramConfig dummy_program_config = {.cr_set = cr_set};
@@ -1240,7 +1240,7 @@ TEST_F(CommandQueueFixture, TestRandomizedProgram) {
CoreCoord worker_grid_size = this->device_->compute_with_storage_grid_size();
CoreRange cr({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1});
- CoreRangeSet cr_set({cr});
+ CoreRangeSet cr_set(cr);
log_info(tt::LogTest, "Starting compile of {} programs now.", NUM_PROGRAMS);
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp
index 1e6e8c54362..8cb072266de 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp
@@ -104,7 +104,7 @@ struct SfpuConfig {
size_t tile_byte_size = 0;
tt::DataFormat l1_input_data_format = tt::DataFormat::Invalid;
tt::DataFormat l1_output_data_format = tt::DataFormat::Invalid;
- CoreRangeSet cores = {{}};
+ CoreRangeSet cores = CoreRangeSet();
std::string sfpu_op = "";
bool approx_mode = true;
};
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/streams/test_autonomous_relay_streams.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/streams/test_autonomous_relay_streams.cpp
index 9a07d724462..74080be0bb8 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/streams/test_autonomous_relay_streams.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/streams/test_autonomous_relay_streams.cpp
@@ -18,7 +18,7 @@
#include "impl/buffers/circular_buffer.hpp"
#include "impl/kernels/data_types.hpp"
#include "impl/kernels/kernel_types.hpp"
-#include "tt_metal/common/core_coord.h"
+#include "tt_metal/common/core_coord.hpp"
#include "tt_metal/common/math.hpp"
#include "tt_metal/detail/tt_metal.hpp"
#include "tt_metal/host_api.hpp"
diff --git a/tests/ttnn/unit_tests/gtests/ccl/test_erisc_data_mover_with_workers.cpp b/tests/ttnn/unit_tests/gtests/ccl/test_erisc_data_mover_with_workers.cpp
index 4c1d15083e0..a62985f8bd3 100644
--- a/tests/ttnn/unit_tests/gtests/ccl/test_erisc_data_mover_with_workers.cpp
+++ b/tests/ttnn/unit_tests/gtests/ccl/test_erisc_data_mover_with_workers.cpp
@@ -12,7 +12,7 @@
#include "device/tt_arch_types.h"
// #include "tt_backend_api_types.hpp"
-#include "tt_metal/common/core_coord.h"
+#include "tt_metal/common/core_coord.hpp"
#include "tt_metal/common/math.hpp"
#include "tt_metal/detail/tt_metal.hpp"
#include "tt_metal/host_api.hpp"
diff --git a/tests/ttnn/unit_tests/operations/test_all_gather_TG_nightly.py b/tests/ttnn/unit_tests/operations/test_all_gather_TG_nightly.py
new file mode 100644
index 00000000000..c982be2bd9b
--- /dev/null
+++ b/tests/ttnn/unit_tests/operations/test_all_gather_TG_nightly.py
@@ -0,0 +1,291 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import pytest
+from loguru import logger
+import ttnn
+from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_equal, comp_pcc
+from tests.ttnn.unit_tests.operations.test_all_gather_TG_post_commit import (
+ run_line_all_gather_on_TG_with_mesh_tensor_along_rows,
+)
+from models.utility_functions import skip_for_grayskull
+
+from ttnn import ShardTensor2dMesh, ConcatMesh2dToTensor
+
+
+# Enumerate the post-commit cases explicitly
+@skip_for_grayskull("Requires eth connected devices to run")
+@pytest.mark.parametrize(
+ "num_devices, num_links",
+ [(4, 3)],
+)
+@pytest.mark.parametrize(
+ "input_dtype",
+ [
+ ttnn.bfloat16,
+ ttnn.bfloat8_b,
+ ],
+)
+@pytest.mark.parametrize("shard_grid_orientation", [ttnn.ShardOrientation.ROW_MAJOR])
+@pytest.mark.parametrize(
+ "tensor_mem_layout,per_chip_output_shape, dim, input_shard_shape,shard_grid,layout",
+ (
+ # LLama
+ (
+ ttnn.TensorMemoryLayout.WIDTH_SHARDED,
+ (1, 1, 32, 1024 * 4),
+ 3,
+ (32, 32),
+ ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
+ ttnn.TILE_LAYOUT,
+ ),
+ (
+ ttnn.TensorMemoryLayout.WIDTH_SHARDED,
+ (4, 1, 32, 1280),
+ 0,
+ (32, 32),
+ ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 4))}),
+ ttnn.TILE_LAYOUT,
+ ),
+ ),
+)
+@pytest.mark.parametrize("replication_factor", [8])
+@pytest.mark.parametrize("enable_async", [True])
+@pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True)
+def test_line_all_gather_sharded_on_TG_rows_post_commit(
+ mesh_device,
+ num_devices,
+ per_chip_output_shape,
+ input_shard_shape,
+ shard_grid,
+ shard_grid_orientation,
+ tensor_mem_layout,
+ dim,
+ num_links,
+ input_dtype,
+ layout,
+ use_program_cache,
+ function_level_defaults,
+ enable_async,
+ replication_factor,
+ num_iters=1,
+):
+ input_shard_spec = ttnn.ShardSpec(
+ shard_grid,
+ input_shard_shape,
+ shard_grid_orientation,
+ False,
+ )
+ run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
+ mesh_device,
+ num_devices,
+ per_chip_output_shape,
+ tensor_mem_layout,
+ dim,
+ num_links,
+ input_dtype,
+ layout,
+ ttnn.BufferType.L1,
+ use_program_cache,
+ function_level_defaults,
+ enable_async=enable_async,
+ input_shard_spec=input_shard_spec,
+ num_iters=num_iters,
+ num_all_gather_instances=replication_factor,
+ cluster_axis=1,
+ )
+
+
+# Enumerate the post-commit cases explicitly
+@skip_for_grayskull("Requires eth connected devices to run")
+@pytest.mark.parametrize(
+ "num_devices, num_links",
+ [(8, 4), (8, 3), (8, 2)],
+)
+@pytest.mark.parametrize(
+ "input_dtype",
+ [
+ ttnn.bfloat16,
+ ttnn.bfloat8_b,
+ ],
+)
+@pytest.mark.parametrize("shard_grid_orientation", [ttnn.ShardOrientation.ROW_MAJOR])
+@pytest.mark.parametrize(
+ "tensor_mem_layout, input_shape, dim, input_shard_shape,shard_grid,layout",
+ (
+ (
+ ttnn.TensorMemoryLayout.WIDTH_SHARDED,
+ (8, 1, 32, 2048),
+ 0,
+ (32, 64),
+ ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
+ ttnn.TILE_LAYOUT,
+ ),
+ (
+ ttnn.TensorMemoryLayout.WIDTH_SHARDED,
+ (1, 8, 32, 2048),
+ 1,
+ (32, 64),
+ ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
+ ttnn.TILE_LAYOUT,
+ ),
+ (
+ ttnn.TensorMemoryLayout.WIDTH_SHARDED,
+ (1, 1, 256, 2048),
+ 2,
+ (32, 64),
+ ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
+ ttnn.TILE_LAYOUT,
+ ),
+ (
+ ttnn.TensorMemoryLayout.WIDTH_SHARDED,
+ (1, 1, 32, 16384),
+ 3,
+ (32, 64),
+ ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
+ ttnn.TILE_LAYOUT,
+ ),
+ (
+ ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
+ (8, 1, 2048, 32),
+ 0,
+ (64, 32),
+ ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
+ ttnn.TILE_LAYOUT,
+ ),
+ (
+ ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
+ (1, 8, 2048, 32),
+ 1,
+ (64, 32),
+ ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
+ ttnn.TILE_LAYOUT,
+ ),
+ (
+ ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
+ (1, 1, 16384, 32),
+ 2,
+ (64, 32),
+ ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
+ ttnn.TILE_LAYOUT,
+ ),
+ (
+ ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
+ (1, 1, 2048, 256),
+ 3,
+ (64, 32),
+ ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
+ ttnn.TILE_LAYOUT,
+ ),
+ ),
+)
+@pytest.mark.parametrize("replication_factor", [4])
+@pytest.mark.parametrize("enable_async", [True])
+@pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True)
+def test_line_all_gather_sharded_on_TG_cols_post_commit(
+ mesh_device,
+ num_devices,
+ input_shape,
+ input_shard_shape,
+ shard_grid,
+ shard_grid_orientation,
+ tensor_mem_layout,
+ dim,
+ num_links,
+ input_dtype,
+ layout,
+ use_program_cache,
+ function_level_defaults,
+ enable_async,
+ replication_factor,
+ num_iters=1,
+):
+ input_shard_spec = ttnn.ShardSpec(
+ shard_grid,
+ input_shard_shape,
+ shard_grid_orientation,
+ False,
+ )
+
+ run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
+ mesh_device,
+ num_devices,
+ input_shape,
+ tensor_mem_layout,
+ dim,
+ num_links,
+ input_dtype,
+ layout,
+ ttnn.BufferType.L1,
+ use_program_cache,
+ function_level_defaults,
+ enable_async=enable_async,
+ num_iters=num_iters,
+ input_shard_spec=input_shard_spec,
+ num_all_gather_instances=replication_factor,
+ cluster_axis=0,
+ )
+
+
+@skip_for_grayskull("Requires eth connected devices to run")
+@pytest.mark.parametrize(
+ "num_devices, num_links, per_chip_output_shape, dim, layout",
+ [
+ (8, 4, [1, 8, 32, 1280], 1, ttnn.TILE_LAYOUT),
+ (8, 4, [8, 1, 32, 1280], 0, ttnn.TILE_LAYOUT),
+ (8, 4, [1, 8, 32, 2048], 1, ttnn.TILE_LAYOUT),
+ (8, 4, [1, 8, 32, 2304], 1, ttnn.TILE_LAYOUT),
+ (8, 4, [1, 8, 32, 4096], 1, ttnn.TILE_LAYOUT),
+ ],
+)
+@pytest.mark.parametrize(
+ "input_dtype",
+ [
+ ttnn.bfloat16,
+ ttnn.bfloat8_b,
+ ],
+)
+@pytest.mark.parametrize(
+ "buffer_type",
+ [
+ ttnn.BufferType.DRAM,
+ ttnn.BufferType.L1,
+ ],
+)
+@pytest.mark.parametrize("enable_async", [True])
+@pytest.mark.parametrize("replication_factor", [4]) # 1, 4])
+@pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True)
+def test_line_all_gather_on_TG_cols_nightly(
+ mesh_device,
+ num_devices,
+ per_chip_output_shape,
+ dim,
+ num_links,
+ input_dtype,
+ layout,
+ buffer_type,
+ use_program_cache,
+ function_level_defaults,
+ enable_async,
+ replication_factor,
+ num_iters=1,
+):
+ run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
+ mesh_device,
+ num_devices,
+ per_chip_output_shape,
+ ttnn.TensorMemoryLayout.INTERLEAVED,
+ dim,
+ num_links,
+ input_dtype,
+ layout,
+ buffer_type,
+ use_program_cache,
+ function_level_defaults,
+ enable_async=enable_async,
+ num_iters=num_iters,
+ num_all_gather_instances=replication_factor,
+ cluster_axis=0,
+ )
diff --git a/tests/ttnn/unit_tests/operations/test_all_gather_TG_post_commit.py b/tests/ttnn/unit_tests/operations/test_all_gather_TG_post_commit.py
index 2f940250eba..be6572f75e6 100644
--- a/tests/ttnn/unit_tests/operations/test_all_gather_TG_post_commit.py
+++ b/tests/ttnn/unit_tests/operations/test_all_gather_TG_post_commit.py
@@ -23,8 +23,8 @@ def report_mismatches(golden, actual, max_printable=None):
]
if print_it:
printed += 1
- logger.error(
- f"output mismatch for tensor at [{w}, {z}, {y}, {x}]: expected {int(golden[w, z, y, x])} != actual {int(actual[w, z, y, x])}"
+ print(
+ f"output mismatch for tensor at [{w}, {z}, {y}, {x}]: expected {golden[w, z, y, x]} != actual {actual[w, z, y, x]}"
)
@@ -40,92 +40,79 @@ def print_tile_corners_of_tensor(t):
for x in range(0, t.shape[3], 32):
yy = 0
xx = 0
- str_vals += f"{int(t[w, z, y + yy, x + xx]):<5} "[:5]
+ val = int(t[w, z, y + yy, x + xx].item())
+ str_vals += f"{val:<5} "[:5]
print(f"{str_vals}")
def run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
mesh_device,
num_devices_per_line,
- input_shape_per_all_gather,
+ per_chip_output_shape,
+ tensor_memory_layout,
dim,
num_links,
input_dtype,
layout,
- mem_config,
+ buffer_type: ttnn.BufferType,
use_program_cache,
function_level_defaults,
enable_async,
- num_all_gather_instances=1,
- num_iters=1,
- cluster_axis=0,
+ input_shard_spec: ttnn.ShardSpec = None,
+ num_all_gather_instances: int = 1,
+ num_iters: int = 1,
+ cluster_axis: int = 0,
):
if len(mesh_device.get_devices()) != 32:
pytest.skip("Not TG!")
mesh_device.enable_async(enable_async)
- input_shape_per_chip = list(input_shape_per_all_gather)
- input_shape_per_chip[2 if cluster_axis == 0 else 3] //= num_devices_per_line
- tensor_height_per_all_gather = input_shape_per_all_gather[-2]
-
- full_mesh_input_shape = list(input_shape_per_all_gather)
- full_mesh_input_shape[-2] *= num_all_gather_instances
- logger.info(f"tensor_height_per_all_gather: {tensor_height_per_all_gather}")
- logger.info(f"input_shape_per_all_gather: {input_shape_per_all_gather}")
- logger.info(f"input_shape_per_chip: {input_shape_per_chip}")
- logger.info(f"full_mesh_input_shape: {full_mesh_input_shape}")
- logger.info(f"input_shape_per_all_gather: {input_shape_per_all_gather}")
-
- full_tensor = torch.zeros(full_mesh_input_shape, dtype=torch.bfloat16)
+ input_shape_per_chip = list(per_chip_output_shape)
+ input_shape_per_chip[dim] //= num_devices_per_line
+ tensor_height_per_all_gather = per_chip_output_shape[-2]
- for i in range(num_all_gather_instances):
- full_tensor[0, 0, i * tensor_height_per_all_gather : (i + 1) * tensor_height_per_all_gather, :] = torch.rand(
- input_shape_per_all_gather
- ).bfloat16()
+ full_mesh_input_shape = list(per_chip_output_shape)
+ ## The `all_gather_instances_concat_dim` is the dimension we will split the cluster spanning tensor along in order to split it
+ ## off into per-all-gather tensors
+ all_gather_instances_concat_dim = 1 if dim == 0 else 0
+ full_mesh_input_shape[all_gather_instances_concat_dim] *= num_all_gather_instances
+ logger.info(
+ f"per_chip_output_shape: {full_mesh_input_shape}, dim: {dim}, all_gather_instances_concat_dim: {all_gather_instances_concat_dim}, num_devices_per_line: {num_devices_per_line}"
+ )
- logger.info(f"full_tensor.shape: {full_tensor.shape}")
- debug = False
- if debug:
- tile_id = 0
- for w in range(full_tensor.shape[0]):
- for z in range(full_tensor.shape[1]):
- for y in range(0, full_tensor.shape[2], 32):
- for x in range(0, full_tensor.shape[3], 32):
- yy_max = 32 if y + 32 < full_tensor.shape[2] else full_tensor.shape[2] - y
- xx_max = 32 if x + 32 < full_tensor.shape[3] else full_tensor.shape[3] - x
- full_tensor[w, z, y : y + yy_max, x : x + xx_max] = tile_id
- tile_id += 1
+ all_gather_instances_goldens = []
+ full_input_tensor_unfractured = torch.rand(full_mesh_input_shape, dtype=torch.bfloat16)
- #
- # assemble the golden output tensor
- #
- inner_dim_concat_axis = 2
- outer_dim_concat_axis = 3
- full_tensor_chunks_per_allgather = torch.chunk(full_tensor, num_all_gather_instances, dim=inner_dim_concat_axis)
- output_chunks_per_allgather = []
- for i, chunk in enumerate(full_tensor_chunks_per_allgather):
- width_chunks = torch.chunk(chunk, num_devices_per_line, dim=outer_dim_concat_axis)
- output_chunk = torch.cat(width_chunks, dim=dim)
- output_chunks_per_allgather.append(output_chunk)
- full_mesh_output_golden_per_chip = torch.cat(output_chunks_per_allgather, dim=inner_dim_concat_axis)
- logger.info(f"full_mesh_output_golden_per_chip.shape: {full_mesh_output_golden_per_chip.shape}")
- non_replicated_output_golden_tensors = [full_mesh_output_golden_per_chip] * num_devices_per_line
- full_mesh_output_golden = torch.cat(non_replicated_output_golden_tensors, dim=outer_dim_concat_axis)
- logger.info(f"full_mesh_output_golden.shape: {full_mesh_output_golden.shape}")
+ input_mem_config = ttnn.MemoryConfig(tensor_memory_layout, buffer_type=buffer_type, shard_spec=input_shard_spec)
+ shard_dims = (dim, all_gather_instances_concat_dim) if cluster_axis == 0 else (all_gather_instances_concat_dim, dim)
+ concat_dims = shard_dims
- shard_dims = (-1, -2) if cluster_axis == 0 else (-2, -1)
mesh_shape = (
(num_devices_per_line, num_all_gather_instances)
if cluster_axis == 0
else (num_all_gather_instances, num_devices_per_line)
)
- logger.info(f"mesh_shape: {mesh_shape}")
+
+ output_shard_spec = None
+ if input_shard_spec is not None:
+ output_shard_shape = list(input_shard_spec.shape)
+ if dim == 3:
+ output_shard_shape[1] *= num_devices_per_line
+ else:
+ output_shard_shape[0] *= num_devices_per_line
+ output_shard_spec = ttnn.ShardSpec(
+ input_shard_spec.grid,
+ output_shard_shape,
+ input_shard_spec.orientation,
+ False,
+ )
+ output_mem_config = ttnn.MemoryConfig(tensor_memory_layout, buffer_type=buffer_type, shard_spec=output_shard_spec)
ttnn_tensor = ttnn.from_torch(
- full_tensor,
+ full_input_tensor_unfractured,
dtype=input_dtype,
device=mesh_device,
layout=layout,
- memory_config=mem_config,
+ memory_config=input_mem_config,
mesh_mapper=ShardTensor2dMesh(mesh_device, mesh_shape=mesh_shape, dims=shard_dims),
)
ttnn_tensor = ttnn.to_device(ttnn_tensor, mesh_device)
@@ -138,61 +125,48 @@ def run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
cluster_axis=cluster_axis,
mesh_device=mesh_device,
num_links=num_links,
+ memory_config=output_mem_config,
topology=ttnn.Topology.Linear,
)
- concat_dims = (3, 2) if cluster_axis == 0 else (2, 3)
- if debug:
- readback_input_tensor = ttnn.to_torch(
- ttnn_tensor, mesh_composer=ConcatMesh2dToTensor(mesh_device, mesh_shape=mesh_shape, dims=concat_dims)
- )
- print(f"readback_input_tensor")
- print_tile_corners_of_tensor(readback_input_tensor)
-
- if debug:
- for i, t in enumerate(ttnn.get_device_tensors(ttnn_tensor)):
- print(f"readback_input_tensor {i}")
- print_tile_corners_of_tensor(t)
-
- if debug:
- for i, t in enumerate(ttnn.get_device_tensors(ttnn_tensor_out)):
- t = t.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch()
- print(f"OUTPUT TENSOR {i}")
- print_tile_corners_of_tensor(t)
-
# ttnn.visualize_mesh_device(mesh_device, tensor=ttnn_tensor_out)
- logger.info(f"concat_dims: {concat_dims}")
tt_output_tensor = ttnn.to_torch(
ttnn_tensor_out, mesh_composer=ConcatMesh2dToTensor(mesh_device, mesh_shape=mesh_shape, dims=concat_dims)
)
- logger.info(f"tt_output_tensor.shape: {tt_output_tensor.shape}")
+ output_tensors_list = torch.chunk(tt_output_tensor, num_all_gather_instances, dim=all_gather_instances_concat_dim)
+ output_golden = torch.zeros(tt_output_tensor.shape)
- if debug:
- print(f"tt_output_tensor")
- print_tile_corners_of_tensor(tt_output_tensor)
+ # Repeat the input tensor to represent the fact that the full concatenated input tensor lives across every
+ # device in the line
+ repeat_factor = [1] * len(output_golden.shape)
+ repeat_factor[dim] = num_devices_per_line
+ output_golden[:, :, :, :] = full_input_tensor_unfractured.repeat(repeat_factor)
- ## This full_tensor will only be 1/num_devices_per_line of the tt_output_tensor. We should just be able to concatenate it along the
+ eq = True
if input_dtype == ttnn.bfloat16:
- eq, output = comp_equal(tt_output_tensor, full_mesh_output_golden)
- if not eq and debug:
- report_mismatches(full_mesh_output_golden, tt_output_tensor)
+ eq, output = comp_equal(tt_output_tensor, output_golden)
+ if not eq and debug is True:
+ logger.error(f"found mismatches")
+ report_mismatches(tt_output_tensor, output_golden, 100)
+ print_tile_corners_of_tensor(output_tensor)
else:
- eq, output = comp_pcc(tt_output_tensor, full_mesh_output_golden)
+ eq, output = comp_pcc(tt_output_tensor, output_golden)
if not eq:
- logger.error(f"output mismatch for tensor")
+ logger.error(f"output mismatch for tensor: {output}")
+
assert eq, f"FAILED: {output}"
# Enumerate the post-commit cases explicitly
@skip_for_grayskull("Requires eth connected devices to run")
@pytest.mark.parametrize(
- "num_devices, num_links, input_shape, dim, layout",
+ "num_devices, num_links, per_chip_output_shape, dim, layout",
[
- (4, 3, [1, 1, 32, 1280], 0, ttnn.TILE_LAYOUT),
- (4, 3, [1, 1, 32, 16384], 3, ttnn.TILE_LAYOUT),
- (4, 3, [1, 1, 32, 2304], 1, ttnn.TILE_LAYOUT),
- (4, 3, [1, 1, 32, 4096], 1, ttnn.TILE_LAYOUT),
- (4, 3, [1, 1, 32, 6656], 1, ttnn.TILE_LAYOUT),
+ (4, 3, [4, 1, 32, 1280], 0, ttnn.TILE_LAYOUT),
+ (4, 3, [1, 1, 32, 16384 * 4], 3, ttnn.TILE_LAYOUT),
+ (4, 3, [1, 4, 32, 2304], 1, ttnn.TILE_LAYOUT),
+ (4, 3, [1, 4, 32, 4096], 1, ttnn.TILE_LAYOUT),
+ (4, 3, [1, 4, 32, 6656], 1, ttnn.TILE_LAYOUT),
],
)
@pytest.mark.parametrize(
@@ -203,10 +177,10 @@ def run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
],
)
@pytest.mark.parametrize(
- "mem_config",
+ "buffer_type",
[
- ttnn.MemoryConfig(buffer_type=ttnn.BufferType.DRAM),
- ttnn.MemoryConfig(buffer_type=ttnn.BufferType.L1),
+ ttnn.BufferType.DRAM,
+ ttnn.BufferType.L1,
],
)
@pytest.mark.parametrize("replication_factor", [8]) # 1, 8])
@@ -215,12 +189,12 @@ def run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
def test_line_all_gather_on_TG_rows_post_commit(
mesh_device,
num_devices,
- input_shape,
+ per_chip_output_shape,
dim,
num_links,
input_dtype,
layout,
- mem_config,
+ buffer_type,
use_program_cache,
function_level_defaults,
enable_async,
@@ -230,12 +204,13 @@ def test_line_all_gather_on_TG_rows_post_commit(
run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
mesh_device,
num_devices,
- input_shape,
+ per_chip_output_shape,
+ ttnn.TensorMemoryLayout.INTERLEAVED,
dim,
num_links,
input_dtype,
layout,
- mem_config,
+ buffer_type,
use_program_cache,
function_level_defaults,
enable_async=enable_async,
@@ -247,40 +222,39 @@ def test_line_all_gather_on_TG_rows_post_commit(
@skip_for_grayskull("Requires eth connected devices to run")
@pytest.mark.parametrize(
- "num_devices, num_links, input_shape, dim, layout",
+ "num_devices, num_links, per_chip_output_shape, dim, layout",
[
- # (8, 4, [1, 1, 32, 1280], 1, ttnn.TILE_LAYOUT), # Rightmost column of tiles per input not copied to final output
- (8, 4, [1, 1, 32, 2048], 1, ttnn.TILE_LAYOUT), # passes
- (8, 4, [1, 1, 32, 2304], 1, ttnn.TILE_LAYOUT), # passes
- (8, 4, [1, 1, 32, 4096], 1, ttnn.TILE_LAYOUT), # passes
+ (8, 4, [1, 8, 32, 1280], 1, ttnn.TILE_LAYOUT),
+ (8, 4, [8, 1, 32, 1280], 0, ttnn.TILE_LAYOUT),
+ (8, 4, [1, 8, 32, 2048], 1, ttnn.TILE_LAYOUT),
+ (8, 4, [1, 8, 32, 2304], 1, ttnn.TILE_LAYOUT),
+ (8, 4, [1, 8, 32, 4096], 1, ttnn.TILE_LAYOUT),
],
)
@pytest.mark.parametrize(
"input_dtype",
[
ttnn.bfloat16,
- # ttnn.bfloat8_b,
],
)
@pytest.mark.parametrize(
- "mem_config",
+ "buffer_type",
[
- ttnn.MemoryConfig(buffer_type=ttnn.BufferType.DRAM),
- # ttnn.MemoryConfig(buffer_type=ttnn.BufferType.L1),
+ ttnn.BufferType.DRAM,
],
)
-@pytest.mark.parametrize("enable_async", [False])
-@pytest.mark.parametrize("replication_factor", [4]) # 1, 4])
+@pytest.mark.parametrize("enable_async", [True])
+@pytest.mark.parametrize("replication_factor", [4])
@pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True)
def test_line_all_gather_on_TG_cols_post_commit(
mesh_device,
num_devices,
- input_shape,
+ per_chip_output_shape,
dim,
num_links,
input_dtype,
layout,
- mem_config,
+ buffer_type,
use_program_cache,
function_level_defaults,
enable_async,
@@ -290,12 +264,13 @@ def test_line_all_gather_on_TG_cols_post_commit(
run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
mesh_device,
num_devices,
- input_shape,
+ per_chip_output_shape,
+ ttnn.TensorMemoryLayout.INTERLEAVED,
dim,
num_links,
input_dtype,
layout,
- mem_config,
+ buffer_type,
use_program_cache,
function_level_defaults,
enable_async=enable_async,
diff --git a/tests/ttnn/unit_tests/operations/test_maxpool2d.py b/tests/ttnn/unit_tests/operations/test_maxpool2d.py
index 192d47e2a78..b90fd026c96 100644
--- a/tests/ttnn/unit_tests/operations/test_maxpool2d.py
+++ b/tests/ttnn/unit_tests/operations/test_maxpool2d.py
@@ -23,6 +23,7 @@ def run_max_pool(
device,
dtype,
memory_config=None,
+ shard_scheme=None,
):
in_n, in_c, in_h, in_w = act_shape
kernel_h, kernel_w = kernel_size
@@ -30,22 +31,37 @@ def run_max_pool(
stride_h, stride_w = stride
dilation_h, dilation_w = dilation
- if 2 * pad_h > kernel_h or 2 * pad_w > kernel_w:
- pytest.skip("Invalid case")
-
- if (kernel_h == 3 and pad_h != 1) or (kernel_h == 2 and pad_h != 0):
- pytest.skip("kernel size and padding combination not supported")
+ if shard_scheme != ttnn.TensorMemoryLayout.WIDTH_SHARDED:
+ if 2 * pad_h > kernel_h or 2 * pad_w > kernel_w:
+ pytest.skip("Invalid case")
+ if (kernel_h == 3 and pad_h != 1) or (kernel_h == 2 and pad_h != 0):
+ pytest.skip("kernel size and padding combination not supported")
out_h = math.floor((in_h + 2 * pad_h - (dilation_h * kernel_h - 1) - 1) / stride_h) + 1
out_w = math.floor((in_w + 2 * pad_w - (dilation_w * kernel_w - 1) - 1) / stride_w) + 1
- if in_c % 16 != 0:
- pytest.skip("Current maxpool writer needs nchannels to be multiple of 16!")
-
- if in_c == 16 and dtype == ttnn.bfloat8_b and in_n * in_h * in_w > 600000:
- pytest.skip("This case runs out of memory on Grayskull")
-
- if in_n > 16 and in_c > 64 and dtype == ttnn.bfloat8_b and is_wormhole_b0():
- pytest.skip("This case runs out of memory on Wormhole b0")
+ cores_x = device.core_grid.x
+ cores_y = device.core_grid.y
+ max_cores = cores_x * cores_y
+
+ if shard_scheme == ttnn.TensorMemoryLayout.HEIGHT_SHARDED or shard_scheme is None:
+ if in_c % 16 != 0:
+ pytest.skip("Current maxpool writer needs nchannels to be multiple of 16!")
+ if in_c == 16 and dtype == ttnn.bfloat8_b and in_n * in_h * in_w > 600000:
+ pytest.skip("This case runs out of memory on Grayskull")
+ if in_n > 16 and in_c > 64 and dtype == ttnn.bfloat8_b and is_wormhole_b0():
+ pytest.skip("This case runs out of memory on Wormhole b0")
+
+ if shard_scheme == ttnn.TensorMemoryLayout.WIDTH_SHARDED:
+ if in_c < max_cores:
+ pytest.skip("Width sharding requires channles >= cores")
+ if in_c / max_cores < 16:
+ pytest.skip("Width sharding requires large enough channels to shard (at least 16 per core)")
+
+ if shard_scheme == ttnn.TensorMemoryLayout.BLOCK_SHARDED:
+ if in_c < cores_x:
+ pytest.skip("Block sharding requires channles >= cores")
+ if in_c / cores_x < 16:
+ pytest.skip("Block sharding requires large enough channels to shard (at least 16 per core)")
torch.manual_seed(0)
torch.set_printoptions(precision=3, sci_mode=False, linewidth=500, threshold=10000, edgeitems=32)
@@ -72,12 +88,15 @@ def run_max_pool(
if dtype == ttnn.bfloat8_b:
if (in_h * in_w) % 32 != 0:
pytest.skip("For BFP8_B datatype, input height * width should be multiple of 32")
+ if shard_scheme == ttnn.TensorMemoryLayout.WIDTH_SHARDED and (in_c / max_cores) % 32 != 0:
+ pytest.skip("For BFP8_B datatype, input channels / max_cores should be multiple of 32")
+ if shard_scheme == ttnn.TensorMemoryLayout.BLOCK_SHARDED and (in_c / cores_x) % 32 != 0:
+ pytest.skip("For BFP8_B datatype, input channels / cores_x should be multiple of 32")
ttact = ttnn.from_torch(act_reshaped, dtype, layout=ttnn.TILE_LAYOUT)
else:
ttact = ttnn.from_torch(act_reshaped, dtype)
- pre_shard = True
- # pre_shard = False
+ pre_shard = shard_scheme == None
ttact_device = ttnn.to_device(ttact, device)
if pre_shard:
@@ -109,6 +128,7 @@ def run_max_pool(
padding=[pad_h, pad_w],
dilation=[dilation_h, dilation_w],
memory_config=memory_config,
+ applied_shard_scheme=shard_scheme,
)
output_host = output.cpu()
@@ -249,6 +269,141 @@ def test_run_max_pool_mem_config(
run_max_pool(act_shape, (3, 3), (1, 1), (2, 2), (1, 1), device, ttnn.bfloat16, memory_config=memory_config)
+@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True)
+@pytest.mark.parametrize(
+ "act_shape", ## NCHW
+ (
+ (
+ [1, 512, 28, 28],
+ [1, 512, 14, 14],
+ [1, 1024, 6, 6],
+ [1, 2048, 6, 6],
+ [1, 4096, 6, 6],
+ [4, 1024, 40, 40],
+ [2, 2048, 40, 40],
+ [8, 4096, 10, 16],
+ )
+ ),
+)
+@pytest.mark.parametrize(
+ "kernel_size",
+ (
+ (2, 2),
+ (3, 3),
+ ),
+)
+@pytest.mark.parametrize(
+ "padding",
+ (
+ (0, 0),
+ (1, 1),
+ ),
+)
+@pytest.mark.parametrize(
+ "stride",
+ ((2, 2),),
+)
+@pytest.mark.parametrize("dilation", ((1, 1),)) ## default
+@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.bfloat8_b])
+def test_run_max_pool_width_shard(
+ act_shape,
+ kernel_size,
+ padding,
+ stride,
+ dilation,
+ device,
+ dtype,
+ use_program_cache,
+):
+ run_max_pool(
+ act_shape,
+ kernel_size,
+ padding,
+ stride,
+ dilation,
+ device,
+ dtype,
+ shard_scheme=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
+ )
+
+
+@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True)
+@pytest.mark.parametrize(
+ "act_shape", ## NCHW
+ (
+ (
+ [1, 256, 56, 56],
+ [1, 256, 28, 28],
+ [1, 256, 14, 14],
+ [1, 256, 10, 14],
+ [1, 512, 8, 6],
+ [1, 1024, 6, 6],
+ [1, 2048, 4, 6],
+ [4, 512, 40, 40],
+ [2, 1024, 40, 40],
+ [8, 2048, 10, 16],
+ ## resnet shapes
+ [1, 64, 112, 112],
+ [4, 64, 112, 112],
+ [8, 64, 112, 112],
+ [16, 64, 112, 112],
+ ## hpr shapes
+ [8, 32, 132, 20],
+ [16, 32, 132, 20],
+ [32, 32, 132, 20],
+ [64, 32, 132, 20],
+ [128, 32, 132, 20],
+ [8, 32, 264, 40],
+ [16, 32, 264, 40],
+ [32, 32, 264, 40],
+ [4, 16, 1056, 160],
+ [8, 16, 528, 80],
+ [16, 16, 528, 80],
+ )
+ ),
+)
+@pytest.mark.parametrize(
+ "kernel_size",
+ (
+ (2, 2),
+ (3, 3),
+ ),
+)
+@pytest.mark.parametrize(
+ "padding",
+ (
+ (0, 0),
+ (1, 1),
+ ),
+)
+@pytest.mark.parametrize(
+ "stride",
+ ((2, 2),),
+)
+@pytest.mark.parametrize("dilation", ((1, 1),)) ## default
+@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.bfloat8_b])
+def test_run_max_pool_block_shard(
+ act_shape,
+ kernel_size,
+ padding,
+ stride,
+ dilation,
+ device,
+ dtype,
+ use_program_cache,
+):
+ run_max_pool(
+ act_shape,
+ kernel_size,
+ padding,
+ stride,
+ dilation,
+ device,
+ dtype,
+ shard_scheme=ttnn.TensorMemoryLayout.BLOCK_SHARDED,
+ )
+
+
@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True)
@pytest.mark.parametrize(
"act_shape", ## NCHW
diff --git a/tests/ttnn/unit_tests/operations/test_softmax.py b/tests/ttnn/unit_tests/operations/test_softmax.py
index bf8e285cd5d..ff45493669c 100644
--- a/tests/ttnn/unit_tests/operations/test_softmax.py
+++ b/tests/ttnn/unit_tests/operations/test_softmax.py
@@ -15,22 +15,46 @@
@pytest.mark.parametrize(
"input_vector",
- [[100.0, 101.0], [100.0, 1000.0], [-100.0, -101.0], [-1000.0, -100.0], [-100, -108, -99, -100, -101, -98]],
+ [
+ [100.0, 101.0],
+ [100.0, 1000.0],
+ [-100.0, -99.0],
+ [-100.0, -101.0],
+ [-1000.0, -100.0],
+ [-100, -108, -99, -100, -101, -98],
+ ],
)
-def test_softmax_stable_neg_values(device, input_vector):
+@pytest.mark.parametrize("math_approx", [True, False])
+@pytest.mark.parametrize("fp32_acc_en", [True, False])
+def test_softmax_stable_neg_values(device, input_vector, math_approx, fp32_acc_en):
torch.manual_seed(0)
torch_input_tensor = torch.tensor([[[input_vector]]], dtype=torch.bfloat16)
torch_output_tensor = F.softmax(torch_input_tensor, dim=-1, dtype=torch.bfloat16)
+ if is_grayskull():
+ compute_kernel_config = ttnn.GrayskullComputeKernelConfig(
+ math_fidelity=ttnn.MathFidelity.HiFi4,
+ math_approx_mode=math_approx,
+ )
+ else:
+ compute_kernel_config = ttnn.WormholeComputeKernelConfig(
+ math_fidelity=ttnn.MathFidelity.HiFi4,
+ math_approx_mode=math_approx,
+ fp32_dest_acc_en=fp32_acc_en,
+ packer_l1_acc=False,
+ )
+
input_tensor = ttnn.from_torch(torch_input_tensor, layout=ttnn.TILE_LAYOUT, device=device)
- output_tensor = ttnn.softmax(input_tensor, dim=-1, numeric_stable=True)
+ output_tensor = ttnn.softmax(input_tensor, dim=-1, compute_kernel_config=compute_kernel_config, numeric_stable=True)
output_tensor = ttnn.to_torch(output_tensor)
assert_with_pcc(torch_output_tensor, output_tensor, 0.999)
-def run_softmax_stable_with_program_cache(device, batch_size, h, w, skip_scale_mask, math_approx):
+def run_softmax_stable_with_program_cache(
+ device, batch_size, h, w, skip_scale_mask, math_approx, fp32_acc_en, in_dtype
+):
torch.manual_seed(0)
scale = 1.0
@@ -47,7 +71,7 @@ def run_softmax_stable_with_program_cache(device, batch_size, h, w, skip_scale_m
torch_output_tensor = torch_input_tensor
torch_output_tensor = F.softmax(torch_output_tensor, dim=-1, dtype=torch.bfloat16)
- input_tensor = ttnn.from_torch(torch_input_tensor, layout=ttnn.TILE_LAYOUT, device=device)
+ input_tensor = ttnn.from_torch(torch_input_tensor, dtype=in_dtype, layout=ttnn.TILE_LAYOUT, device=device)
if is_grayskull():
compute_kernel_config = ttnn.GrayskullComputeKernelConfig(
@@ -58,7 +82,7 @@ def run_softmax_stable_with_program_cache(device, batch_size, h, w, skip_scale_m
compute_kernel_config = ttnn.WormholeComputeKernelConfig(
math_fidelity=ttnn.MathFidelity.HiFi4,
math_approx_mode=math_approx,
- fp32_dest_acc_en=False,
+ fp32_dest_acc_en=fp32_acc_en,
packer_l1_acc=False,
)
@@ -80,9 +104,15 @@ def run_softmax_stable_with_program_cache(device, batch_size, h, w, skip_scale_m
@pytest.mark.parametrize("w", [1024, 1500])
@pytest.mark.parametrize("skip_scale_mask", [True, False])
@pytest.mark.parametrize("math_approx", [True, False])
-def test_softmax_stable_with_program_cache(device, batch_size, h, w, skip_scale_mask, math_approx, use_program_cache):
+@pytest.mark.parametrize("fp32_acc_en", [True, False])
+@pytest.mark.parametrize("in_dtype", [ttnn.bfloat8_b, ttnn.bfloat16])
+def test_softmax_stable_with_program_cache(
+ device, batch_size, h, w, skip_scale_mask, math_approx, fp32_acc_en, in_dtype, use_program_cache
+):
for _ in range(2):
- run_softmax_stable_with_program_cache(device, batch_size, h, w, skip_scale_mask, math_approx)
+ run_softmax_stable_with_program_cache(
+ device, batch_size, h, w, skip_scale_mask, math_approx, fp32_acc_en, in_dtype
+ )
# dummy tensor to change tensor alloc
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
@@ -96,7 +126,9 @@ def test_softmax_stable_with_program_cache(device, batch_size, h, w, skip_scale_
assert device.num_program_cache_entries() == 1
-def run_softmax_sharded_stable(device, batch_size, num_heads, h, w, skip_scale_mask):
+def run_softmax_sharded_stable(
+ device, batch_size, num_heads, h, w, skip_scale_mask, math_approx, fp32_acc_en, in_dtype
+):
torch.manual_seed(0)
grid_size = (batch_size, num_heads)
@@ -123,21 +155,41 @@ def run_softmax_sharded_stable(device, batch_size, num_heads, h, w, skip_scale_m
)
program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig(
compute_with_storage_grid_size=grid_size,
- subblock_w=6,
+ subblock_w=6 if not fp32_acc_en else 3,
block_h=h // 32,
block_w=w // 32,
)
+ if is_grayskull():
+ compute_kernel_config = ttnn.GrayskullComputeKernelConfig(
+ math_fidelity=ttnn.MathFidelity.HiFi4,
+ math_approx_mode=math_approx,
+ )
+ else:
+ compute_kernel_config = ttnn.WormholeComputeKernelConfig(
+ math_fidelity=ttnn.MathFidelity.HiFi4,
+ math_approx_mode=math_approx,
+ fp32_dest_acc_en=fp32_acc_en,
+ packer_l1_acc=False,
+ )
input_tensor = ttnn.from_torch(
- torch_input_tensor, layout=ttnn.TILE_LAYOUT, device=device, memory_config=memory_config
+ torch_input_tensor, dtype=in_dtype, layout=ttnn.TILE_LAYOUT, device=device, memory_config=memory_config
)
if not skip_scale_mask:
output_tensor = ttnn.scale_mask_softmax_in_place(
- input_tensor, scale, attention_mask_t, program_config=program_config, numeric_stable=True
+ input_tensor,
+ scale,
+ attention_mask_t,
+ program_config=program_config,
+ compute_kernel_config=compute_kernel_config,
+ numeric_stable=True,
)
else:
output_tensor = ttnn.scale_mask_softmax_in_place(
- input_tensor, program_config=program_config, numeric_stable=True
+ input_tensor,
+ program_config=program_config,
+ compute_kernel_config=compute_kernel_config,
+ numeric_stable=True,
)
output_tensor = ttnn.to_torch(output_tensor)
@@ -149,11 +201,16 @@ def run_softmax_sharded_stable(device, batch_size, num_heads, h, w, skip_scale_m
@pytest.mark.parametrize("h", [384])
@pytest.mark.parametrize("w", [384])
@pytest.mark.parametrize("skip_scale_mask", [True, False])
+@pytest.mark.parametrize("math_approx", [True, False])
+@pytest.mark.parametrize("fp32_acc_en", [True, False])
+@pytest.mark.parametrize("in_dtype", [ttnn.bfloat8_b, ttnn.bfloat16])
def test_softmax_sharded_stable_with_program_cache(
- device, batch_size, num_heads, h, w, skip_scale_mask, use_program_cache
+ device, batch_size, num_heads, h, w, skip_scale_mask, math_approx, fp32_acc_en, in_dtype, use_program_cache
):
for _ in range(2):
- run_softmax_sharded_stable(device, batch_size, num_heads, h, w, skip_scale_mask)
+ run_softmax_sharded_stable(
+ device, batch_size, num_heads, h, w, skip_scale_mask, math_approx, fp32_acc_en, in_dtype
+ )
# dummy tensor to change tensor alloc
dummy_shape = [1, 1, 32, 32]
py_dummy_tensor = torch.randn(dummy_shape)
diff --git a/tt_metal/common/CMakeLists.txt b/tt_metal/common/CMakeLists.txt
index 75433fda8c8..294d5700810 100644
--- a/tt_metal/common/CMakeLists.txt
+++ b/tt_metal/common/CMakeLists.txt
@@ -1,5 +1,6 @@
set(COMMON_SRCS
+ ${CMAKE_CURRENT_SOURCE_DIR}/core_coord.cpp
${CMAKE_CURRENT_SOURCE_DIR}/core_descriptor.cpp
${CMAKE_CURRENT_SOURCE_DIR}/metal_soc_descriptor.cpp
${CMAKE_CURRENT_SOURCE_DIR}/tt_backend_api_types.cpp
diff --git a/tt_metal/common/core_coord.cpp b/tt_metal/common/core_coord.cpp
new file mode 100644
index 00000000000..7ea7ea6a5d7
--- /dev/null
+++ b/tt_metal/common/core_coord.cpp
@@ -0,0 +1,525 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tt_metal/common/core_coord.hpp"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "third_party/umd/device/tt_xy_pair.h"
+#include "tt_metal/common/assert.hpp"
+#include "tt_metal/third_party/tracy/public/tracy/Tracy.hpp"
+#include "tt_metal/tt_stl/reflection.hpp"
+#include "tt_metal/tt_stl/span.hpp"
+
+auto fmt::formatter::format(const CoreCoord &core_coord, format_context &ctx) const
+ -> format_context::iterator {
+ std::stringstream ss;
+ ss << core_coord.str();
+ return fmt::format_to(ctx.out(), "{}", ss.str());
+}
+
+std::string RelativeCoreCoord::str() const { return "(x=" + std::to_string(x) + ",y=" + std::to_string(y) + ")"; }
+
+CoreCoord get_core_coord_from_relative(const RelativeCoreCoord &in, const CoreCoord &grid_size) {
+ CoreCoord coord;
+ coord.x = in.x + ((in.x < 0) ? grid_size.x : 0);
+ coord.y = in.y + ((in.y < 0) ? grid_size.y : 0);
+ return coord;
+}
+
+CoreRange::CoreRange(const CoreCoord &point) : start_coord(point), end_coord(point) {}
+
+CoreRange::CoreRange(const CoreCoord &start_coord, const CoreCoord &end_coord) {
+ TT_FATAL(
+ end_coord.x >= start_coord.x and end_coord.y >= start_coord.y,
+ "Invalid core range for start_coord: {}, end_coord: {}",
+ start_coord.str(),
+ end_coord.str());
+
+ this->start_coord = start_coord;
+ this->end_coord = end_coord;
+}
+
+std::optional CoreRange::intersects(const CoreRange &other) const {
+ std::size_t x1 = std::max(this->start_coord.x, other.start_coord.x);
+ std::size_t y1 = std::max(this->start_coord.y, other.start_coord.y);
+ std::size_t x2 = std::min(this->end_coord.x, other.end_coord.x);
+ std::size_t y2 = std::min(this->end_coord.y, other.end_coord.y);
+ if (x1 <= x2 and y1 <= y2) {
+ return CoreRange({x1, y1}, {x2, y2});
+ }
+
+ return {};
+}
+
+bool CoreRange::adjacent(const CoreRange &other) const {
+ std::size_t x1 = std::max(this->start_coord.x, other.start_coord.x);
+ std::size_t y1 = std::max(this->start_coord.y, other.start_coord.y);
+ std::size_t x2 = std::min(this->end_coord.x, other.end_coord.x);
+ std::size_t y2 = std::min(this->end_coord.y, other.end_coord.y);
+ return ((x2 + 1 == x1 && y1 <= y2) || (y2 + 1 == y1 && x1 <= x2));
+}
+
+bool CoreRange::contains(const CoreRange &other) const {
+ return (other.start_coord.x >= this->start_coord.x) && (other.end_coord.x <= this->end_coord.x) &&
+ (other.start_coord.y >= this->start_coord.y) && (other.end_coord.y <= this->end_coord.y);
+}
+
+bool CoreRange::contains(const CoreCoord &other) const {
+ return (other.x >= this->start_coord.x) && (other.x <= this->end_coord.x) && (other.y >= this->start_coord.y) &&
+ (other.y <= this->end_coord.y);
+}
+
+// Merge lined-up (in x or y dimension) intersecting/adjacent rectangles
+std::optional CoreRange::merge(const CoreRange &cr) const {
+ if (this->intersects(cr) || this->adjacent(cr)) {
+ if (this->start_coord.x == cr.start_coord.x && this->end_coord.x == cr.end_coord.x)
+ return CoreRange(
+ {this->start_coord.x, std::min(this->start_coord.y, cr.start_coord.y)},
+ {this->end_coord.x, std::max(this->end_coord.y, cr.end_coord.y)});
+
+ else if (this->start_coord.y == cr.start_coord.y && this->end_coord.y == cr.end_coord.y)
+ return CoreRange(
+ {std::min(this->start_coord.x, cr.start_coord.x), this->start_coord.y},
+ {std::max(this->end_coord.x, cr.end_coord.x), this->end_coord.y});
+ }
+ return std::nullopt;
+}
+
+std::string CoreRange::str() const { return "[" + this->start_coord.str() + " - " + this->end_coord.str() + "]"; }
+
+size_t CoreRange::size() const {
+ return (this->end_coord.x - this->start_coord.x + 1) * (this->end_coord.y - this->start_coord.y + 1);
+}
+
+CoreCoord CoreRange::grid_size() const {
+ return {this->end_coord.x - this->start_coord.x + 1, this->end_coord.y - this->start_coord.y + 1};
+}
+
+CoreRange::CoreIterator::CoreIterator(const CoreCoord ¤t, const CoreRange &core_range) :
+ current_(current), range_(core_range) {}
+
+CoreCoord &CoreRange::CoreIterator::operator*() { return current_; }
+
+CoreRange::CoreIterator &CoreRange::CoreIterator::operator++() {
+ CoreCoord next;
+
+ const bool is_curr_core_at_end_of_row = current_.x == range_.end_coord.x;
+ if (is_curr_core_at_end_of_row) {
+ // Go to the beginning of the next row
+ next.x = range_.start_coord.x;
+ next.y = current_.y + 1;
+ } else {
+ next.x = current_.x + 1;
+ next.y = current_.y;
+ }
+
+ current_ = next;
+ return *this;
+}
+
+CoreRange::CoreIterator CoreRange::begin() const { return CoreRange::CoreIterator(this->start_coord, *this); }
+
+CoreRange::CoreIterator CoreRange::end() const {
+ const CoreCoord iterator_end(this->start_coord.x, this->end_coord.y + 1);
+ return CoreRange::CoreIterator(iterator_end, *this);
+}
+
+bool CoreRange::CoreIterator::operator==(const CoreIterator &other) const { return current_ == other.current_; }
+
+bool CoreRange::CoreIterator::operator!=(const CoreIterator &other) const { return !(current_ == other.current_); }
+
+auto fmt::formatter::format(const CoreRange &core_range, format_context &ctx) const
+ -> format_context::iterator {
+ std::stringstream ss;
+ ss << core_range.str();
+ return fmt::format_to(ctx.out(), "{}", ss.str());
+}
+
+CoreRangeSet::CoreRangeSet(const std::vector &core_ranges) :
+ ranges_(core_ranges.begin(), core_ranges.end()) {
+ ZoneScoped;
+ this->validate_no_overlap();
+}
+
+CoreRangeSet::CoreRangeSet(const std::set &core_ranges) : ranges_(core_ranges.begin(), core_ranges.end()) {
+ ZoneScoped;
+ this->validate_no_overlap();
+}
+
+CoreRangeSet::CoreRangeSet(const CoreRange &core_range) : ranges_{core_range} {}
+
+void swap(CoreRangeSet &first, CoreRangeSet &second) {
+ std::scoped_lock lock(first.ranges_guard, second.ranges_guard);
+ std::swap(first.ranges_, second.ranges_);
+}
+
+CoreRangeSet::CoreRangeSet(const CoreRangeSet &other) {
+    std::scoped_lock lock(other.ranges_guard);
+    this->ranges_ = other.ranges_;
+}
+
+CoreRangeSet &CoreRangeSet::operator=(const CoreRangeSet &other) {
+    std::scoped_lock lock(other.ranges_guard);  // NOTE(review): only the source is locked; *this is assumed exclusively owned by the assigning thread — confirm
+    this->ranges_ = other.ranges_;
+    return *this;
+}
+
+CoreRangeSet::CoreRangeSet(CoreRangeSet &&other) { swap(*this, other); }
+
+CoreRangeSet &CoreRangeSet::operator=(CoreRangeSet &&other) {
+    swap(*this, other);
+    return *this;
+}
+
+CoreRangeSet::CoreRangeSet(std::vector<CoreRange> &&core_ranges) : ranges_(std::move(core_ranges)) {
+    ZoneScoped;
+    this->validate_no_overlap();
+}
+
+size_t CoreRangeSet::size() const { return ranges_.size(); }
+
+template <typename T>
+CoreRangeSet CoreRangeSet::merge(const T &other) const {
+    size_t min_x = std::numeric_limits<size_t>::max(), max_x = 0, min_y = std::numeric_limits<size_t>::max(), max_y = 0;
+    std::set<CoreRange> crs(this->ranges_.begin(), this->ranges_.end());
+    crs.insert(other.begin(), other.end());
+
+    for (const auto &cr : crs) {
+        min_x = std::min(min_x, cr.start_coord.x);
+        max_x = std::max(max_x, cr.end_coord.x);
+        min_y = std::min(min_y, cr.start_coord.y);
+        max_y = std::max(max_y, cr.end_coord.y);
+    }
+
+    // By overallocating by one x entry, we can avoid needing to check for
+    // boundary conditions when iterating, since there'll always be one
+    // last false entry
+    bool grid[max_y + 1][max_x + 2];  // NOTE(review): variable-length array is a compiler extension, not standard C++
+    memset(grid, 0, sizeof(grid));
+
+    for (const auto &cr : crs)
+        for (unsigned y = cr.start_coord.y; y <= cr.end_coord.y; y++)
+            for (unsigned x = cr.start_coord.x; x <= cr.end_coord.x; x++) grid[y][x] = true;
+
+    crs.clear();
+    for (unsigned y = min_y; y <= max_y; y++) {
+        std::set<CoreRange> filter_set, tmp, new_crs;
+        std::vector<CoreRange> ranges;
+        for (unsigned x = min_x; x <= max_x + 1; x++) {
+            if (grid[y][x]) {
+                unsigned x_start = x;
+                while (grid[y][x]) x++;
+                ranges.push_back(CoreRange({x_start, y}, {x - 1, y}));
+            }
+        }
+
+        for (const auto &cr : ranges) {
+            for (const auto &prev_cr : crs) {
+                if (auto merged = cr.merge(prev_cr)) {
+                    new_crs.insert(merged.value());
+                    filter_set.insert(prev_cr);
+                    filter_set.insert(cr);
+                }
+            }
+            crs.insert(cr);
+        }
+        // Set(A) = Set(A) - Set(B)
+        std::set_difference(
+            std::make_move_iterator(crs.begin()),
+            std::make_move_iterator(crs.end()),
+            filter_set.begin(),
+            filter_set.end(),
+            std::inserter(tmp, tmp.end()));
+        crs.swap(tmp);
+        crs.insert(new_crs.begin(), new_crs.end());
+    }
+    return CoreRangeSet(crs);
+}
+
+template CoreRangeSet CoreRangeSet::merge<std::vector<CoreRange>>(const std::vector<CoreRange> &other) const;
+template CoreRangeSet CoreRangeSet::merge<std::set<CoreRange>>(const std::set<CoreRange> &other) const;
+
+template <>
+CoreRangeSet CoreRangeSet::merge<CoreRangeSet>(const CoreRangeSet &other) const {
+    return this->merge(other.ranges());
+}
+
+bool CoreRangeSet::core_coord_in_core_ranges(const CoreCoord &core_coord) const {
+    ZoneScoped;
+    for (const auto &cr : this->ranges_) {
+        if (cr.contains(core_coord))
+            return true;
+    }
+    return false;
+}
+
+bool CoreRangeSet::intersects(const CoreRange &cr) const {
+    for (const auto &local_cr : this->ranges_) {
+        if (local_cr.intersects(cr))
+            return true;
+    }
+    return false;
+}
+
+const std::vector<CoreRange> &CoreRangeSet::ranges() const { return this->ranges_; }
+
+std::string CoreRangeSet::str() const {
+    if (this->ranges().size() > 0) {
+        std::string result = "{";
+        const char *separator = "";
+        for (const auto &core_range : this->ranges_) {
+            result.append(separator).append(core_range.str());
+            separator = ", ";
+        }
+        return result + "}";
+    } else {
+        return "{}";
+    }
+}
+
+uint32_t CoreRangeSet::num_cores() const {
+    uint32_t total = 0;
+    for (const auto &core_range : this->ranges()) {
+        total += core_range.size();
+    }
+    return total;
+}
+
+CoreRange CoreRangeSet::bounding_box() const {
+    TT_FATAL(this->ranges().size() > 0, "Cannot get bounding_box of an empty CoreRangeSet!");
+    size_t min_x = UINT32_MAX, min_y = UINT32_MAX, max_x = 0, max_y = 0;
+    for (const auto &range : this->ranges()) {
+        min_x = std::min(min_x, range.start_coord.x);
+        min_y = std::min(min_y, range.start_coord.y);
+        max_x = std::max(max_x, range.end_coord.x);
+        max_y = std::max(max_y, range.end_coord.y);
+    }
+    return {{min_x, min_y}, {max_x, max_y}};
+}
+
+void CoreRangeSet::validate_no_overlap() {
+    if (this->ranges_.size() < 2) {
+        return;
+    }
+    // Pairwise check: two axis-aligned rectangles overlap iff their
+    // projections overlap on both the x axis and the y axis.
+    for (auto outer_it = this->ranges_.begin(); outer_it != this->ranges_.end() - 1; outer_it++) {
+        for (auto inner_it = outer_it + 1; inner_it != this->ranges_.end(); inner_it++) {
+            const CoreRange &first_core_range = *outer_it;
+            const CoreRange &second_core_range = *inner_it;
+            const bool overlap_x = first_core_range.start_coord.x <= second_core_range.end_coord.x &&
+                                   second_core_range.start_coord.x <= first_core_range.end_coord.x;
+            const bool overlap_y = first_core_range.start_coord.y <= second_core_range.end_coord.y &&
+                                   second_core_range.start_coord.y <= first_core_range.end_coord.y;
+            if (overlap_x && overlap_y) {
+                TT_THROW(
+                    "Cannot create CoreRangeSet with specified core ranges because core ranges {} and {} overlap!",
+                    first_core_range.str(),
+                    second_core_range.str());
+            }
+        }
+    }
+}
+
+bool operator==(const CoreRangeSet &a, const CoreRangeSet &b) {
+    if (a.ranges().size() == b.ranges().size()) {
+        // Bind by const reference: ranges() returns a reference, and copying
+        // both vectors on every comparison was a needless deep copy.
+        const auto &range_a = a.ranges();
+        const auto &range_b = b.ranges();
+        for (auto it_a = range_a.begin(), it_b = range_b.begin(); it_a != range_a.end(); it_a++, it_b++) {
+            if (*it_a != *it_b) {
+                return false;
+            }
+        }
+        return true;
+    }
+    return false;
+}
+
+std::vector<CoreCoord> grid_to_cores(uint32_t num_cores, uint32_t grid_size_x, uint32_t grid_size_y, bool row_wise) {
+    std::vector<CoreCoord> cores;
+    cores.reserve(num_cores);
+    TT_ASSERT(
+        num_cores <= grid_size_x * grid_size_y,
+        "Number of cores {} exceeds grid size {}x{}",
+        num_cores,
+        grid_size_x,
+        grid_size_y);
+    if (row_wise) {
+        for (uint32_t i = 0; i < num_cores; ++i) {
+            cores.push_back({i % grid_size_x, i / grid_size_x});
+        }
+    } else {
+        for (uint32_t i = 0; i < num_cores; ++i) {
+            cores.push_back({i / grid_size_y, i % grid_size_y});
+        }
+    }
+    return cores;
+}
+
+std::vector<CoreCoord> grid_to_cores(CoreCoord start, CoreCoord end, bool row_wise) {
+    std::vector<CoreCoord> cores;
+    auto num_cores_x = (end.x + 1) - start.x;
+    auto num_cores_y = (end.y + 1) - start.y;
+    uint32_t num_cores = num_cores_x * num_cores_y;
+    cores.reserve(num_cores);
+    if (row_wise) {
+        for (uint32_t j = start.y; j < (end.y + 1); j++) {
+            for (uint32_t i = start.x; i < (end.x + 1); i++) {
+                cores.push_back({i, j});
+            }
+        }
+
+    } else {
+        for (uint32_t i = start.x; i < (end.x + 1); i++) {
+            for (uint32_t j = start.y; j < (end.y + 1); j++) {
+                cores.push_back({i, j});
+            }
+        }
+    }
+    return cores;
+}
+
+// Noop cores are appended at the end with no guarantees on ordering
+std::vector<CoreCoord> grid_to_cores_with_noop(
+    const uint32_t bbox_x,
+    const uint32_t bbox_y,
+    const uint32_t grid_size_x,
+    const uint32_t grid_size_y,
+    const bool row_wise) {
+    ZoneScoped;
+    std::vector<CoreCoord> cores;
+    cores.reserve(grid_size_x * grid_size_y);
+    TT_ASSERT(bbox_x < grid_size_x);
+    TT_ASSERT(bbox_y < grid_size_y);
+    const uint32_t box_size_x = bbox_x + 1;
+    const uint32_t box_size_y = bbox_y + 1;
+
+    if (row_wise) {
+        for (uint32_t i = 0; i < box_size_x * box_size_y; ++i) {
+            cores.push_back({i % box_size_x, i / box_size_x});
+        }
+    } else {
+        for (uint32_t i = 0; i < box_size_x * box_size_y; ++i) {
+            cores.push_back({i / box_size_y, i % box_size_y});
+        }
+    }
+
+    // Right rectangle noops
+    for (uint32_t x = box_size_x; x < grid_size_x; ++x) {
+        for (uint32_t y = 0; y < grid_size_y; ++y) {
+            cores.push_back({x, y});
+        }
+    }
+
+    // Bottom rectangle noops
+    for (uint32_t y = box_size_y; y < grid_size_y; ++y) {
+        for (uint32_t x = 0; x < box_size_x; ++x) {
+            cores.push_back({x, y});
+        }
+    }
+
+    return cores;
+}
+
+std::vector<CoreCoord> corerange_to_cores(const CoreRangeSet &crs, std::optional<uint32_t> max_cores, bool row_wise) {
+    // Flatten a CoreRangeSet into the individual cores it covers, optionally
+    // truncated to at most max_cores entries.
+    std::vector<CoreCoord> all_cores;
+
+    for (const auto &core_range : crs.ranges()) {
+        const auto &start_coord = core_range.start_coord;
+        const auto &end_coord = core_range.end_coord;
+        auto cores = grid_to_cores(start_coord, end_coord, row_wise);
+        if (max_cores.has_value()) {
+            if (all_cores.size() + cores.size() > max_cores.value()) {
+                uint32_t num_cores_to_add = max_cores.value() - all_cores.size();
+                all_cores.insert(all_cores.end(), cores.begin(), cores.begin() + num_cores_to_add);
+            } else {
+                all_cores.insert(all_cores.end(), cores.begin(), cores.end());
+            }
+        } else {
+            all_cores.insert(all_cores.end(), cores.begin(), cores.end());
+        }
+    }
+
+    return all_cores;
+}
+
+bool operator!=(const CoreRangeSet &a, const CoreRangeSet &b) { return !(a == b); }
+
+auto fmt::formatter<CoreRangeSet>::format(const CoreRangeSet &core_range_set, format_context &ctx) const
+    -> format_context::iterator {
+    std::stringstream ss;
+    ss << core_range_set.str();
+    return fmt::format_to(ctx.out(), "{}", ss.str());
+}
+
+namespace std {
+
+std::size_t hash<RelativeCoreCoord>::operator()(RelativeCoreCoord const &o) const {
+    std::size_t seed = 0;
+    seed = std::hash<long>()(o.x) ^ std::hash<long>()(o.y) << 1;
+    return seed;
+}
+
+std::size_t hash<CoreRange>::operator()(const CoreRange &core_range) const {
+    std::size_t seed = 0;
+    seed = std::hash<CoreCoord>{}(core_range.start_coord) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+    seed = std::hash<CoreCoord>{}(core_range.end_coord) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+    return seed;
+}
+
+std::size_t hash<CoreRangeSet>::operator()(const CoreRangeSet &core_range_set) const {
+    std::size_t seed = 0;
+    for (const auto &core_range : core_range_set.ranges()) {
+        seed = std::hash<CoreRange>{}(core_range) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+    }
+    return seed;
+}
+
+} // namespace std
+
+namespace tt::stl::json {
+
+nlohmann::json to_json_t<CoreCoord>::operator()(const CoreCoord &core_coord) noexcept {
+    return {{"x", to_json(core_coord.x)}, {"y", to_json(core_coord.y)}};
+}
+
+CoreCoord from_json_t<CoreCoord>::operator()(const nlohmann::json &json) noexcept {
+    return {from_json<std::size_t>(json.at("x")), from_json<std::size_t>(json.at("y"))};
+}
+
+nlohmann::json to_json_t<RelativeCoreCoord>::operator()(const RelativeCoreCoord &relative_core_coord) noexcept {
+    return {{"x", to_json(relative_core_coord.x)}, {"y", to_json(relative_core_coord.y)}};
+}
+
+RelativeCoreCoord from_json_t<RelativeCoreCoord>::operator()(const nlohmann::json &json) noexcept {
+    return {from_json<long>(json.at("x")), from_json<long>(json.at("y"))};
+}
+
+nlohmann::json to_json_t<CoreRange>::operator()(const CoreRange &core_range) noexcept {
+    return {{"start", to_json(core_range.start_coord)}, {"end", to_json(core_range.end_coord)}};
+}
+
+CoreRange from_json_t<CoreRange>::operator()(const nlohmann::json &json) noexcept {
+    return {from_json<CoreCoord>(json.at("start")), from_json<CoreCoord>(json.at("end"))};
+}
+
+nlohmann::json to_json_t<CoreRangeSet>::operator()(const CoreRangeSet &core_range_set) noexcept {
+    // Serialized simply as the array of its ranges (unused local removed).
+    return to_json(core_range_set.ranges());
+}
+
+CoreRangeSet from_json_t<CoreRangeSet>::operator()(const nlohmann::json &json) noexcept {
+    return CoreRangeSet(from_json<std::vector<CoreRange>>(json));
+}
+
+} // namespace tt::stl::json
diff --git a/tt_metal/common/core_coord.h b/tt_metal/common/core_coord.h
deleted file mode 100644
index 448ef85edb1..00000000000
--- a/tt_metal/common/core_coord.h
+++ /dev/null
@@ -1,636 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "third_party/json/json.hpp"
-#include "third_party/umd/device/tt_xy_pair.h"
-#include "tt_metal/common/assert.hpp"
-#include "tt_metal/third_party/tracy/public/tracy/Tracy.hpp"
-#include "tt_metal/tt_stl/reflection.hpp"
-
-using std::pair;
-
-using CoreCoord = tt_xy_pair;
-
-template <>
-struct fmt::formatter {
- constexpr auto parse(format_parse_context &ctx) -> format_parse_context::iterator { return ctx.end(); }
-
- auto format(const CoreCoord &core_coord, format_context &ctx) const -> format_context::iterator {
- std::stringstream ss;
- ss << core_coord.str();
- return fmt::format_to(ctx.out(), "{}", ss.str());
- }
-};
-
-constexpr inline bool operator<=(const CoreCoord &a, const CoreCoord &b) { return (a < b) or (a == b); }
-
-struct RelativeCoreCoord {
- long x = 0;
- long y = 0;
-
- std::string str() const { return "(x=" + std::to_string(x) + ",y=" + std::to_string(y) + ")"; }
-};
-
-constexpr inline bool operator==(const RelativeCoreCoord &a, const RelativeCoreCoord &b) {
- return a.x == b.x && a.y == b.y;
-}
-
-constexpr inline bool operator!=(const RelativeCoreCoord &a, const RelativeCoreCoord &b) { return !(a == b); }
-
-namespace std {
-template <>
-struct hash {
- std::size_t operator()(RelativeCoreCoord const &o) const {
- std::size_t seed = 0;
- seed = std::hash()(o.x) ^ std::hash()(o.y) << 1;
- return seed;
- }
-};
-} // namespace std
-
-inline CoreCoord get_core_coord_from_relative(const RelativeCoreCoord &in, const CoreCoord &grid_size) {
- CoreCoord coord;
- coord.x = in.x + ((in.x < 0) ? grid_size.x : 0);
- coord.y = in.y + ((in.y < 0) ? grid_size.y : 0);
- return coord;
-}
-
-struct CoreRange {
- CoreCoord start_coord;
- CoreCoord end_coord;
- CoreRange(const CoreCoord &point) {
- this->start_coord = point;
- this->end_coord = point;
- }
-
- CoreRange(const CoreCoord &start_coord, const CoreCoord &end_coord) {
- TT_ASSERT(
- end_coord.x >= start_coord.x and end_coord.y >= start_coord.y,
- "Invalid core range for start_coord: {}, end_coord: {}", start_coord.str(), end_coord.str());
-
- this->start_coord = start_coord;
- this->end_coord = end_coord;
- }
-
- CoreRange(const CoreRange &other) = default;
- CoreRange &operator=(const CoreRange &other) = default;
- CoreRange(CoreRange &&other) = default;
- CoreRange &operator=(CoreRange &&other) = default;
-
- // void validate() {
- // TT_FATAL(
- // end_coord.x >= start_coord.x and end_coord.y >= start_coord.y,
- // "Invalid core range for start_coord: {}, end_coord: {}", start_coord.str(), end_coord.str());
- // }
-
- inline std::optional intersects(const CoreRange &other) const {
- std::size_t x1 = std::max(this->start_coord.x, other.start_coord.x);
- std::size_t y1 = std::max(this->start_coord.y, other.start_coord.y);
- std::size_t x2 = std::min(this->end_coord.x, other.end_coord.x);
- std::size_t y2 = std::min(this->end_coord.y, other.end_coord.y);
- if (x1 <= x2 and y1 <= y2)
- return CoreRange({x1, y1}, {x2, y2});
-
- return {};
- }
-
- inline bool adjacent(const CoreRange &other) const {
- std::size_t x1 = std::max(this->start_coord.x, other.start_coord.x);
- std::size_t y1 = std::max(this->start_coord.y, other.start_coord.y);
- std::size_t x2 = std::min(this->end_coord.x, other.end_coord.x);
- std::size_t y2 = std::min(this->end_coord.y, other.end_coord.y);
- return ((x2 + 1 == x1 && y1 <= y2) || (y2 + 1 == y1 && x1 <= x2));
- }
-
- inline bool contains(const CoreRange &other) const {
- return (other.start_coord.x >= this->start_coord.x) && (other.end_coord.x <= this->end_coord.x) && (other.start_coord.y >= this->start_coord.y) &&
- (other.end_coord.y <= this->end_coord.y);
- }
-
- inline bool contains(const CoreCoord &other) const {
- return (other.x >= this->start_coord.x) && (other.x <= this->end_coord.x) && (other.y >= this->start_coord.y) &&
- (other.y <= this->end_coord.y);
- }
-
- // Merge lined-up (in x or y dimension) intersecting/adjacent rectangles
- std::optional merge(const CoreRange &cr) const {
- if (this->intersects(cr) || this->adjacent(cr)) {
- if (this->start_coord.x == cr.start_coord.x && this->end_coord.x == cr.end_coord.x)
- return CoreRange(
- {this->start_coord.x, std::min(this->start_coord.y, cr.start_coord.y)},
- {this->end_coord.x, std::max(this->end_coord.y, cr.end_coord.y)});
-
- else if (this->start_coord.y == cr.start_coord.y && this->end_coord.y == cr.end_coord.y)
- return CoreRange(
- {std::min(this->start_coord.x, cr.start_coord.x), this->start_coord.y},
- {std::max(this->end_coord.x, cr.end_coord.x), this->end_coord.y});
- }
- return std::nullopt;
- }
-
- std::string str() const { return "[" + this->start_coord.str() + " - " + this->end_coord.str() + "]"; }
-
- size_t size() const { return (this->end_coord.x - this->start_coord.x + 1) * (this->end_coord.y - this->start_coord.y + 1); }
-
- CoreCoord grid_size() const { return {this->end_coord.x - this->start_coord.x + 1, this->end_coord.y - this->start_coord.y + 1}; }
-
- class CoreIterator
- {
- public:
- CoreIterator(const CoreCoord& current, const CoreRange& core_range) :
- current_(current),
- range_(core_range)
- {}
-
- CoreCoord& operator*()
- {
- return current_;
- }
-
- CoreIterator& operator++()
- {
- CoreCoord next;
-
- const bool is_curr_core_at_end_of_row = current_.x == range_.end_coord.x;
- if (is_curr_core_at_end_of_row)
- {
- // Go to the beginning of the next row
- next.x = range_.start_coord.x;
- next.y = current_.y + 1;
- }
- else
- {
- next.x = current_.x + 1;
- next.y = current_.y;
- }
-
- current_ = next;
- return *this;
- }
-
- bool operator==(const CoreIterator& other) const
- {
- return current_ == other.current_;
- }
-
- bool operator!=(const CoreIterator& other) const
- {
- return !(current_ == other.current_);
- }
-
- private:
- CoreCoord current_;
- const CoreRange& range_;
- };
-
- CoreIterator begin() const
- {
- return CoreIterator(this->start_coord, *this);
- }
-
- CoreIterator end() const
- {
- const CoreCoord iterator_end(this->start_coord.x, this->end_coord.y + 1);
- return CoreIterator(iterator_end, *this);
- }
-};
-
-constexpr inline bool operator==(const CoreRange &a, const CoreRange &b) {
- return a.start_coord == b.start_coord && a.end_coord == b.end_coord;
-}
-
-constexpr inline bool operator!=(const CoreRange &a, const CoreRange &b) { return !(a == b); }
-
-constexpr inline bool operator<(const CoreRange &left, const CoreRange &right) {
- return (left.start_coord < right.start_coord || (left.start_coord == right.start_coord && left.end_coord < right.end_coord));
-}
-
-template <>
-struct fmt::formatter {
- constexpr auto parse(format_parse_context &ctx) -> format_parse_context::iterator { return ctx.end(); }
-
- auto format(const CoreRange &core_range, format_context &ctx) const -> format_context::iterator {
- std::stringstream ss;
- ss << core_range.str();
- return fmt::format_to(ctx.out(), "{}", ss.str());
- }
-};
-
-namespace std {
-template <>
-struct hash {
- std::size_t operator()(const CoreRange &core_range) const {
- std::size_t seed = 0;
- seed = std::hash{}(core_range.start_coord) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
- seed = std::hash{}(core_range.end_coord) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
- return seed;
- }
-};
-} // namespace std
-
-class CoreRangeSet {
- public:
- CoreRangeSet(const std::set &core_ranges) : ranges_(core_ranges) {
- ZoneScoped;
- for (auto outer_it = this->ranges_.begin(); outer_it != this->ranges_.end(); outer_it++) {
- for (auto inner_it = this->ranges_.begin(); inner_it != this->ranges_.end(); inner_it++) {
- if (outer_it == inner_it) {
- continue;
- }
- CoreRange first_core_range = *outer_it;
- CoreRange second_core_range = *inner_it;
- bool first_core_left_of_second = first_core_range.end_coord.x < second_core_range.start_coord.x;
- bool first_core_right_of_second = first_core_range.start_coord.x > second_core_range.end_coord.x;
- bool first_core_above_second = first_core_range.end_coord.y < second_core_range.start_coord.y;
- bool first_core_below_second = first_core_range.start_coord.y > second_core_range.end_coord.y;
- auto no_overlap = first_core_left_of_second or first_core_right_of_second or first_core_above_second or
- first_core_below_second;
- if (not no_overlap) {
- TT_THROW(
- "Cannot create CoreRangeSet with specified core ranges because core ranges {} and {} overlap!",
- first_core_range.str(),
- second_core_range.str());
- }
- }
- }
- }
-
- friend void swap(CoreRangeSet& first, CoreRangeSet& second) {
- std::scoped_lock lock(first.ranges_guard, second.ranges_guard);
- std::swap(first.ranges_, second.ranges_);
- }
-
- CoreRangeSet(const CoreRangeSet &other) {
- std::scoped_lock lock(other.ranges_guard);
- this->ranges_ = other.ranges_;
- }
- CoreRangeSet &operator=(const CoreRangeSet &other) {
- std::scoped_lock lock(other.ranges_guard);
- this->ranges_ = other.ranges_;
- return *this;
- }
-
- CoreRangeSet(CoreRangeSet &&other) {
- swap(*this, other);
- }
-
- CoreRangeSet &operator=(CoreRangeSet &&other) {;
- swap(*this, other);
- return *this;
- }
-
- auto size() const { return ranges_.size(); }
-
- CoreRangeSet merge(const std::set &other) const {
- size_t min_x = std::numeric_limits::max(), max_x = 0, min_y = std::numeric_limits::max(),
- max_y = 0;
- std::set crs = this->ranges_;
- crs.insert(other.begin(), other.end());
-
- for (const auto &cr : crs) {
- // std::cout << "merging " << cr.str() << std::endl;
- min_x = std::min(min_x, cr.start_coord.x);
- max_x = std::max(max_x, cr.end_coord.x);
- min_y = std::min(min_y, cr.start_coord.y);
- max_y = std::max(max_y, cr.end_coord.y);
- }
-
- // By overallocating by one x entry, we can avoid needing to check for
- // boundary conditions when iterating, since there'll always be one
- // last false entry
- bool grid[max_y + 1][max_x + 2];
- memset(grid, 0, sizeof(grid));
-
- for (const auto &cr : crs)
- for (unsigned y = cr.start_coord.y; y <= cr.end_coord.y; y++)
- for (unsigned x = cr.start_coord.x; x <= cr.end_coord.x; x++) grid[y][x] = true;
-
- crs.clear();
- for (unsigned y = min_y; y <= max_y; y++) {
- std::set filter_set, tmp, new_crs;
- std::vector ranges;
- for (unsigned x = min_x; x <= max_x + 1; x++) {
- if (grid[y][x]) {
- unsigned x_start = x;
- while (grid[y][x]) x++;
- ranges.push_back(CoreRange({x_start, y}, {x - 1, y}));
- }
- }
-
- for (const auto &cr : ranges) {
- for (const auto &prev_cr : crs) {
- if (auto merged = cr.merge(prev_cr)) {
- // std::cout << "merging " << cr.str() << " and " << prev_cr.str() << " with " <<
- // merged.value().str() << std::endl;
- new_crs.insert(merged.value());
- filter_set.insert(prev_cr);
- filter_set.insert(cr);
- }
- }
- crs.insert(cr);
- }
- // Set(A) = Set(A) - Set(B)
- std::set_difference(
- std::make_move_iterator(crs.begin()),
- std::make_move_iterator(crs.end()),
- filter_set.begin(),
- filter_set.end(),
- std::inserter(tmp, tmp.end()));
- crs.swap(tmp);
- crs.insert(new_crs.begin(), new_crs.end());
- }
- // for ( const auto & cr : crs ){
- // std::cout << " final merged CR:" << cr.str() << std::endl;
- // }
- return CoreRangeSet(crs);
- }
-
- CoreRangeSet merge(const CoreRangeSet &s) const { return this->merge(s.ranges()); }
-
- inline bool core_coord_in_core_ranges(const CoreCoord &core_coord) const {
- ZoneScoped;
- for (const auto &cr : this->ranges_) {
- if (cr.contains(core_coord))
- return true;
- }
- return false;
- }
-
- inline bool intersects(const CoreRange &cr) const {
- for (const auto &local_cr : this->ranges_) {
- if (local_cr.intersects(cr))
- return true;
- }
- return false;
- }
-
- const std::set &ranges() const { return this->ranges_; }
-
- std::string str() const {
- if (this->ranges().size() > 0) {
- std::string core_range_set_str = "{";
- for (const auto &core_range : this->ranges_) {
- core_range_set_str += core_range.str() + ", ";
- }
- core_range_set_str[core_range_set_str.length() - 2] = '}';
- core_range_set_str.pop_back();
- return core_range_set_str;
- } else {
- return "{}";
- }
- }
-
- const uint32_t num_cores() const {
- uint32_t num_cores = 0;
- for (const auto &core_range : this->ranges()) {
- num_cores += core_range.size();
- }
- return num_cores;
- }
-
- CoreRange bounding_box() const {
- TT_FATAL(this->ranges().size() > 0, "Cannot get bounding_box of an empty CoreRangeSet!");
- size_t min_x = UINT32_MAX, min_y = UINT32_MAX, max_x = 0, max_y = 0;
- for (const auto &cr : this->ranges()) {
- min_x = std::min(min_x, cr.start_coord.x);
- max_x = std::max(max_x, cr.end_coord.x);
- min_y = std::min(min_y, cr.start_coord.y);
- max_y = std::max(max_y, cr.end_coord.y);
- }
- return {{min_x, min_y}, {max_x, max_y}};
- }
-
- private:
- mutable std::mutex ranges_guard;
- std::set ranges_;
-};
-
-const inline bool operator==(const CoreRangeSet &a, const CoreRangeSet &b) {
- if (a.ranges().size() == b.ranges().size()) {
- auto range_a = a.ranges();
- auto range_b = b.ranges();
- for (auto it_a = range_a.begin(), it_b = range_b.begin(); it_a != range_a.end(); it_a++, it_b++) {
- if (*it_a != *it_b) {
- return false;
- }
- }
- return true;
- }
- return false;
-}
-
-inline std::vector grid_to_cores(
- uint32_t num_cores, uint32_t grid_size_x, uint32_t grid_size_y, bool row_wise = false) {
- std::vector cores;
- cores.reserve(num_cores);
- TT_ASSERT(
- num_cores <= grid_size_x * grid_size_y,
- "Number of cores {} exceeds grid size {}x{}",
- num_cores,
- grid_size_x,
- grid_size_y);
- if (row_wise) {
- for (uint32_t i = 0; i < num_cores; ++i) {
- cores.push_back({i % grid_size_x, i / grid_size_x});
- }
- } else {
- for (uint32_t i = 0; i < num_cores; ++i) {
- cores.push_back({i / grid_size_y, i % grid_size_y});
- }
- }
- return cores;
-}
-
-inline std::vector grid_to_cores(CoreCoord start, CoreCoord end, bool row_wise = false) {
- std::vector cores;
- auto num_cores_x = (end.x + 1) - start.x;
- auto num_cores_y = (end.y + 1) - start.y;
- uint32_t num_cores = num_cores_x * num_cores_y;
- cores.reserve(num_cores);
- if (row_wise) {
- for (uint32_t j = start.y; j < (end.y + 1); j++) {
- for (uint32_t i = start.x; i < (end.x + 1); i++) {
- cores.push_back({i, j});
- }
- }
-
- } else {
- for (uint32_t i = start.x; i < (end.x + 1); i++) {
- for (uint32_t j = start.y; j < (end.y + 1); j++) {
- cores.push_back({i, j});
- }
- }
- }
- return cores;
-}
-
-// Noop cores are appended at the end with no guarantees on ordering
-inline std::vector grid_to_cores_with_noop(
- const uint32_t bbox_x,
- const uint32_t bbox_y,
- const uint32_t grid_size_x,
- const uint32_t grid_size_y,
- const bool row_wise = false) {
- ZoneScoped;
- std::vector cores;
- cores.reserve(grid_size_x * grid_size_y);
- TT_ASSERT(bbox_x < grid_size_x);
- TT_ASSERT(bbox_y < grid_size_y);
- const uint32_t box_size_x = bbox_x + 1;
- const uint32_t box_size_y = bbox_y + 1;
-
- if (row_wise) {
- for (uint32_t i = 0; i < box_size_x * box_size_y; ++i) {
- cores.push_back({i % box_size_x, i / box_size_x});
- }
- } else {
- for (uint32_t i = 0; i < box_size_x * box_size_y; ++i) {
- cores.push_back({i / box_size_y, i % box_size_y});
- }
- }
-
- // Right rectangle noops
- for (uint32_t x = box_size_x; x < grid_size_x; ++x) {
- for (uint32_t y = 0; y < grid_size_y; ++y) {
- cores.push_back({x, y});
- }
- }
-
- // Bottom rectangle noops
- for (uint32_t y = box_size_y; y < grid_size_y; ++y) {
- for (uint32_t x = 0; x < box_size_x; ++x) {
- cores.push_back({x, y});
- }
- }
-
- return cores;
-}
-
-inline std::vector corerange_to_cores(
- const CoreRangeSet &crs, std::optional max_cores = std::nullopt, bool row_wise = false) {
- uint32_t num_total_cores = 0;
- std::vector all_cores;
- uint32_t offset = 0;
-
- for (auto core_range : crs.ranges()) {
- auto start_coord = core_range.start_coord;
- auto end_coord = core_range.end_coord;
- auto cores = grid_to_cores(start_coord, end_coord, row_wise);
- if (max_cores.has_value()) {
- if (all_cores.size() + cores.size() > max_cores.value()) {
- uint32_t num_cores_to_add = max_cores.value() - all_cores.size();
- all_cores.insert(all_cores.end(), cores.begin(), cores.begin() + num_cores_to_add);
- } else {
- all_cores.insert(all_cores.end(), cores.begin(), cores.end());
- }
- } else {
- all_cores.insert(all_cores.end(), cores.begin(), cores.end());
- }
- }
-
- return all_cores;
-}
-
-const inline bool operator!=(const CoreRangeSet &a, const CoreRangeSet &b) { return !(a == b); }
-
-template <>
-struct fmt::formatter {
- constexpr auto parse(format_parse_context &ctx) -> format_parse_context::iterator { return ctx.end(); }
-
- auto format(const CoreRangeSet &core_range_set, format_context &ctx) const -> format_context::iterator {
- std::stringstream ss;
- ss << core_range_set.str();
- return fmt::format_to(ctx.out(), "{}", ss.str());
- }
-};
-
-// Adding to tt::tt_metal namespace as we transition to moving this out of global namespace eventually.
-namespace tt::tt_metal {
- using ::CoreCoord;
- using ::CoreRange;
- using ::CoreRangeSet;
-}
-
-namespace std {
-template <>
-struct hash {
- std::size_t operator()(const CoreRangeSet &core_range_set) const {
- std::size_t seed = 0;
- for (const auto &core_range : core_range_set.ranges()) {
- seed = std::hash{}(core_range) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
- }
- return seed;
- }
-};
-} // namespace std
-
-namespace tt::stl::json {
-
-template <>
-struct to_json_t {
- nlohmann::json operator()(const CoreCoord &core_coord) noexcept {
- return {{"x", to_json(core_coord.x)}, {"y", to_json(core_coord.y)}};
- }
-};
-
-template <>
-struct from_json_t {
- CoreCoord operator()(const nlohmann::json &json) noexcept {
- return {from_json(json.at("x")), from_json(json.at("y"))};
- }
-};
-
-template <>
-struct to_json_t {
- nlohmann::json operator()(const RelativeCoreCoord &relative_core_coord) noexcept {
- return {{"x", to_json(relative_core_coord.x)}, {"y", to_json(relative_core_coord.y)}};
- }
-};
-
-template <>
-struct from_json_t {
- RelativeCoreCoord operator()(const nlohmann::json &json) noexcept {
- return {from_json(json.at("x")), from_json(json.at("y"))};
- }
-};
-
-template <>
-struct to_json_t {
- nlohmann::json operator()(const CoreRange &core_range) noexcept {
- return {{"start", to_json(core_range.start_coord)}, {"end", to_json(core_range.end_coord)}};
- }
-};
-
-template <>
-struct from_json_t {
- CoreRange operator()(const nlohmann::json &json) noexcept {
- return {from_json(json.at("start")), from_json(json.at("end"))};
- }
-};
-
-template <>
-struct to_json_t {
- nlohmann::json operator()(const CoreRangeSet &core_range_set) noexcept {
- nlohmann::json core_range_set_json = nlohmann::json::array();
- return to_json(core_range_set.ranges());
- }
-};
-
-template <>
-struct from_json_t {
- CoreRangeSet operator()(const nlohmann::json &json) noexcept {
- return CoreRangeSet(from_json>(json));
- }
-};
-
-} // namespace tt::stl::json
diff --git a/tt_metal/common/core_coord.hpp b/tt_metal/common/core_coord.hpp
new file mode 100644
index 00000000000..d5623836152
--- /dev/null
+++ b/tt_metal/common/core_coord.hpp
@@ -0,0 +1,255 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "third_party/json/json.hpp"
+#include "third_party/umd/device/tt_xy_pair.h"
+#include "tt_metal/tt_stl/reflection.hpp"
+
+using CoreCoord = tt_xy_pair;
+
+template <>
+struct fmt::formatter<CoreCoord> {
+    constexpr auto parse(format_parse_context &ctx) -> format_parse_context::iterator { return ctx.end(); }
+
+    auto format(const CoreCoord &core_coord, format_context &ctx) const -> format_context::iterator;
+};
+
+constexpr inline bool operator<=(const CoreCoord &a, const CoreCoord &b) { return (a < b) or (a == b); }
+
+struct RelativeCoreCoord {
+    long x = 0;  // signed: negative values index from the far edge of the grid (see get_core_coord_from_relative)
+    long y = 0;  // same wrap-around convention as x
+
+    std::string str() const;  // "(x=...,y=...)" — defined in core_coord.cpp
+};
+
+constexpr inline bool operator==(const RelativeCoreCoord &a, const RelativeCoreCoord &b) {
+ return a.x == b.x && a.y == b.y;
+}
+
+constexpr inline bool operator!=(const RelativeCoreCoord &a, const RelativeCoreCoord &b) { return !(a == b); }
+
+CoreCoord get_core_coord_from_relative(const RelativeCoreCoord &in, const CoreCoord &grid_size);
+
+struct CoreRange {
+ CoreCoord start_coord;
+ CoreCoord end_coord;
+ CoreRange(const CoreCoord &point);
+
+ CoreRange(const CoreCoord &start_coord, const CoreCoord &end_coord);
+
+ CoreRange(const CoreRange &other) = default;
+ CoreRange &operator=(const CoreRange &other) = default;
+ CoreRange(CoreRange &&other) = default;
+ CoreRange &operator=(CoreRange &&other) = default;
+
+ std::optional