diff --git a/.github/workflows/_produce-data.yaml b/.github/workflows/_produce-data.yaml index 6f35a04f007..fd547b44aa9 100644 --- a/.github/workflows/_produce-data.yaml +++ b/.github/workflows/_produce-data.yaml @@ -12,6 +12,10 @@ on: description: "Run attempt of the workflow run" default: 1 type: number + upload_data: + description: "Upload data to datastore cluster for our dashboard" + default: false + type: boolean workflow_run: workflows: - "All post-commit tests" @@ -33,6 +37,8 @@ on: - "(TGG) TGG unit tests" - "(TGG) TGG demo tests" - "(TGG) TGG frequent tests" + - "ttnn - Run sweeps" + - "Blackhole post-commit tests" types: - completed @@ -111,7 +117,7 @@ jobs: run: ls -hal - name: Upload cicd data uses: ./.github/actions/upload-data-via-sftp - if: ${{ github.event_name == 'workflow_run' }} + if: ${{ github.event_name == 'workflow_run' || inputs.upload_data }} with: ssh-private-key: ${{ secrets.SFTP_CICD_WRITER_KEY }} sftp-batchfile: .github/actions/upload-data-via-sftp/cicd_data_batchfile.txt diff --git a/.github/workflows/single-card-demo-tests.yaml b/.github/workflows/single-card-demo-tests.yaml index 4918c0faa56..ef7c101d8fb 100644 --- a/.github/workflows/single-card-demo-tests.yaml +++ b/.github/workflows/single-card-demo-tests.yaml @@ -4,7 +4,7 @@ on: workflow_dispatch: workflow_call: schedule: - - cron: "0 0 * * 1,2,3,4,5" + - cron: "0 */6 * * 1,2,3,4,5" - cron: "0 */4 * * 0,6" jobs: diff --git a/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml b/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml index 81d24a83835..d2e1d8b63a3 100644 --- a/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml +++ b/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml @@ -19,8 +19,6 @@ jobs: fail-fast: false matrix: runner-info: [ - {arch: grayskull, runs-on: ["pipeline-stress", "E150", "bare-metal", "in-service"], machine-type: "bare_metal", name: "E150"}, - {arch: wormhole_b0, runs-on: ["pipeline-stress", "N300", "bare-metal", "in-service"], machine-type: "bare_metal", name: "N300"}, # E150 {arch: grayskull, runs-on: ["cloud-virtual-machine", "E150", "in-service"], machine-type: "virtual_machine", name: "E150"}, # N150 diff --git a/.github/workflows/ttnn-run-sweeps.yaml b/.github/workflows/ttnn-run-sweeps.yaml index 99cd17e0f01..1e8ed77978f 100644 --- a/.github/workflows/ttnn-run-sweeps.yaml +++ b/.github/workflows/ttnn-run-sweeps.yaml @@ -166,6 +166,10 @@ on: - eltwise.unary.hardtanh.hardtanh_pytorch2 - eltwise.unary.leaky_relu.leaky_relu - eltwise.unary.reglu.reglu + - eltwise.unary_complex.polar.polar + - eltwise.unary_complex.angle.angle + - eltwise.unary_complex.polar_bw.polar_bw + - eltwise.unary_complex.angle_bw.angle_bw - eltwise.binary.subtract.subtract - eltwise.binary.subtract.subtract_tensor_pytorch2 - eltwise.binary.multiply.multiply diff --git a/CODEOWNERS b/CODEOWNERS index e87255b0cb4..1a4b8906716 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -22,7 +22,7 @@ third_party/ @tt-rkim @TT-billteng MANIFEST.in @tt-rkim setup.py @tt-rkim pyproject.toml @tt-rkim @TT-billteng -requirements*.txt @tt-rkim @TT-billteng +requirements*.txt @tt-rkim @TT-billteng @ttmchiou setup_hugepages.py @tt-rkim @TT-billteng scripts/docker @TT-billteng @@ -55,6 +55,7 @@ tt_metal/ @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @ubcheema tt_metal/host_api.hpp @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @davorchap tt_metal/impl/device/ @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @ubcheema @davorchap @cfjchu 
tt_metal/distributed/ @cfjchu @aliuTT @tt-asaigal +tt_metal/**/requirements*.txt @tt-rkim @TT-billteng @ttmchiou # metal - dispatch tt_metal/impl/dispatch/kernels/packet_* @ubcheema @aliuTT diff --git a/README.md b/README.md index 5ffc2a53f70..00cf1dad925 100644 --- a/README.md +++ b/README.md @@ -21,21 +21,20 @@ --- ## LLMs -| Model | Batch | Hardware | ttft (s) | t/s/u | Target
t/s/u | t/s | Release | +| Model | Batch | Hardware | ttft (ms) | t/s/u | Target
t/s/u | t/s | Release | |---------------------------------------------------------------|-------|----------------------------------------------------------|----------|-------|-----------------|--------|---------------------------------------------------------------------------| | [Falcon7B-decode](./models/demos/ttnn_falcon7b) | 32 | [e150](https://tenstorrent.com/hardware/grayskull) | | 4.2 | 4.4 | 134.4 | | -| [Falcon7B](./models/demos/wormhole/falcon7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.07 | 16.7 | 26 | 534.4 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) | +| [Falcon7B](./models/demos/wormhole/falcon7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 75 | 17.0 | 26 | 544.0 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) | | [Mistral-7B](./models/demos/wormhole/mistral7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | | 9.9 | 25 | 316.8 | [v0.51.0-rc28](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc28) | -| [Mamba-2.8B](./models/demos/wormhole/mamba) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.04 | 12.3 | 41 | 393.6 | [v0.51.0-rc26](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc26) | -| [LLaMA-3.1-8B](./models/demos/wormhole/llama31_8b) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.20 | 21.4 | 23 | 21.4 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) | -| [Falcon7B (DP=8)](./models/demos/t3000/falcon7b) | 256 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 0.10 | 14.4 | 26 | 3686.4 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) | -| [LLaMA-2-70B - (TP=8)](./models/demos/t3000/llama2_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 0.19 | 15.1 | 20 | 483.2 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) | -| [LLaMA-3.1-70B (TP=8)](./models/demos/t3000/llama3_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 0.19 | 15.1 | 20 | 483.2 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) | -| [Falcon40B (TP=8)](./models/demos/t3000/falcon40b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | | 5.3 | 36 | 169.6 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) | -| [Mixtral7Bx8 (TP=8)](./models/demos/t3000/mixtral8x7b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 0.23 | 14.2 | 33 | 454.4 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) | -| [Falcon7B (DP=32)](./models/demos/tg/falcon7b) | 1024 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 0.24 | 4.4 | 26 | 4505.6 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) | -| [LLaMA-3.1-70B (DP=4, TP=8)](./models/demos/t3000/llama3_70b) | 128 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 0.19 | 14.3 | 20 | 1835.5 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) | -> **Last Update:** October 7, 2024 +| [Mamba-2.8B](./models/demos/wormhole/mamba) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 48 | 12.3 | 41 | 393.6 | [v0.51.0-rc26](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc26) | +| [LLaMA-3.1-8B](./models/demos/wormhole/llama31_8b) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 291 | 22.9 | 23 | 22.9 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) | +| [Falcon7B (DP=8)](./models/demos/t3000/falcon7b) | 256 | 
[QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 101 | 14.4 | 26 | 3686.4 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) |
+| [LLaMA-3.1-70B (TP=8)](./models/demos/t3000/llama3_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 190 | 15.1 | 20 | 483.2 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) |
+| [Falcon40B (TP=8)](./models/demos/t3000/falcon40b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | | 5.3 | 36 | 169.6 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) |
+| [Mixtral7Bx8 (TP=8)](./models/demos/t3000/mixtral8x7b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 235 | 14.2 | 33 | 454.4 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) |
+| [Falcon7B (DP=32)](./models/demos/tg/falcon7b) | 1024 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 242 | 4.4 | 26 | 4505.6 | [v0.53.0-rc16](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc16) |
+| [LLaMA-3.1-70B (DP=4, TP=8)](./models/demos/t3000/llama3_70b) | 128 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 190 | 14.3 | 20 | 1835.5 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
+> **Last Update:** October 21, 2024
> **Notes:**
> - TP = Tensor Parallel, DP = Data Parallel; defines parallelization factors across multiple devices.
@@ -54,6 +53,8 @@
| [ViT](./models/demos/grayskull/vit) | 9 | [e150](https://tenstorrent.com/hardware/grayskull) | 1,360 | 2,000 | |
| [ViT](./models/demos/wormhole/vit) | 8 | [n150](https://tenstorrent.com/hardware/wormhole) | 912 | 1,600 | |
| [Stable Diffusion 1.4 (512x512)](./models/demos/wormhole/stable_diffusion) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.167 | 0.3 | |
+| [U-Net](./models/experimental/functional_unet) | 2 | [n150](https://tenstorrent.com/hardware/wormhole) | 530 | 1000 | [v0.53.0-rc22](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc22) |
+
## NLPs
| Model | Batch | Hardware | sen/sec | Target sen/sec | Release |
@@ -70,6 +71,7 @@ For the latest model updates and features, please see [MODEL_UPDATES.md](models/
- [Advanced Performance Optimizations for Models](./tech_reports/AdvancedPerformanceOperationsForModels/AdvancedPerformanceOptimizationsForModels.md) (updated Oct 17th)
- [Programming Mesh of Devices](./tech_reports/Programming%20Mesh%20of%20Devices/Programming%20Mesh%20of%20Devices%20with%20TT-NN.md) (updated Sept 9th)
- [ViT Implementation in TT-NN on GS](./tech_reports/ViT-TTNN/vit.md) (updated Sept 22nd)
+- [LLM Bring-up in TT-NN](./tech_reports/LLMs/llms.md) (updated Oct 29th)

---
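Reviewer note on the README table changes above: `ttft` switched from seconds to milliseconds, and `t/s` is the per-user rate multiplied by batch. A quick sanity check of those column relationships, using rows from the updated table (the column semantics are my reading of the README, not an official script):

```python
# Check the README LLM table columns: t/s == t/s/u * batch,
# and ttft is now reported in milliseconds (e.g. 0.07 s -> 75 ms).
rows = [
    # (model, batch, ttft_ms, tokens_per_s_per_user, tokens_per_s)
    ("Falcon7B (n150)", 32, 75, 17.0, 544.0),
    ("Falcon7B (DP=8)", 256, 101, 14.4, 3686.4),
    ("Mixtral7Bx8 (TP=8)", 32, 235, 14.2, 454.4),
]

for name, batch, ttft_ms, tsu, ts in rows:
    assert abs(tsu * batch - ts) < 1e-6, name  # throughput scales with batch
    print(f"{name}: {tsu} t/s/u * {batch} users = {tsu * batch} t/s, ttft = {ttft_ms} ms")
```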
diff --git a/models/MODEL_UPDATES.md b/models/MODEL_UPDATES.md index feaa61cc031..aa8fc1d9232 100644 --- a/models/MODEL_UPDATES.md +++ b/models/MODEL_UPDATES.md @@ -4,6 +4,11 @@
>
> Please refer to the front-page [README](../README.md) for the latest verified release for each model.
+## October 21, 2024
+
+### [Llama 3/3.1 - 70B](demos/t3000/llama3_70b)
+- Enabled prefill workloads to pad to multiples of 1024 instead of powers of 2, improving overall performance for longer sequences.
+
## October 7, 2024

### [Llama 3.1 - 8B](demos/wormhole/llama31_8b)
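The prefill-padding note in the MODEL_UPDATES entry above is easy to illustrate. A toy comparison of the two padding policies (my own sketch, not code from the Llama demo):

```python
def pad_pow2(seq_len: int) -> int:
    # Old policy: round up to the next power of 2.
    p = 1
    while p < seq_len:
        p *= 2
    return p

def pad_1024(seq_len: int) -> int:
    # New policy: round up to the next multiple of 1024.
    return -(-seq_len // 1024) * 1024

for seq_len in (1500, 3000, 5000, 9000):
    print(seq_len, pad_pow2(seq_len), pad_1024(seq_len))
# e.g. a 5000-token prefill pads to 8192 under the old policy but only
# 5120 under the new one, so long sequences waste far less compute.
```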
diff --git a/tech_reports/LLMs/llms.md b/tech_reports/LLMs/llms.md new file mode 100644 index 00000000000..4b4a34f6a7c --- /dev/null +++ b/tech_reports/LLMs/llms.md @@ -0,0 +1,112 @@
+# LLMs in TT-NN
+Authors:
+## Contents
+- [LLMs in TT-NN](#llms-in-tt-nn)
+  - [Contents](#contents)
+  - [1. Overview](#1-overview)
+  - [2. Modules](#2-modules)
+    - [2.1 Embedding](#21-embedding)
+    - [2.2 RoPE](#22-rope)
+    - [2.3 Norm](#23-norm)
+    - [2.4 Attention](#24-attention)
+    - [2.5 MLP](#25-mlp)
+    - [2.6 Decoder](#26-decoder)
+    - [2.7 LM Head](#27-lm-head)
+  - [3. Features](#3-features)
+    - [3.1 Generative Decoding](#31-generative-decoding)
+    - [3.2 Prefill and Decode](#32-prefill-and-decode)
+    - [3.3 Multi-Device](#33-multi-device)
+    - [3.4 Continuous Batching](#34-continuous-batching)
+    - [3.5 vLLM Integration](#35-vllm-integration)
+  - [4. Best Practices and Optimizations](#4-best-practices-and-optimizations)
+    - [4.1 Tracing](#41-tracing)
+    - [4.2 Async Mode](#42-async-mode)
+    - [4.3 Multiple CQs](#43-multiple-cqs)
+    - [4.4 Op Configs](#44-op-configs)
+    - [4.5 Accuracy](#45-accuracy)
+    - [4.6 Performance Analysis](#46-performance-analysis)
+    - [4.7 Misc. Performance Optimizations](#47-misc-performance-optimizations)
+    - [4.8 Module Tests](#48-module-tests)
+    - [4.9 Performance Testing](#49-performance-testing)
+    - [4.10 Common Pitfalls](#410-common-pitfalls)
+      - [4.10.1 Error Messages](#4101-error-messages)
+      - [4.10.2 Shard Spec Mismatches](#4102-shard-spec-mismatches)
+      - [4.10.3 Ethernet Dispatch Cores](#4103-ethernet-dispatch-cores)
+      - [4.10.4 Hangs](#4104-hangs)
+        - [4.10.4.1 Tracing](#41041-tracing)
+        - [4.10.4.2 Large Matmuls](#41042-large-matmuls)
+
+## 1. Overview
+## 2. Modules
+### 2.1 Embedding
+### 2.2 RoPE
+  - Iterative update system
+  - When to use our fused op
+### 2.3 Norm
+  - Replicated layernorm vs distributed layernorm
+  - Layernorm/rmsnorm weights in row major / wrapped around tile size trick
+### 2.4 Attention
+  - Flash Attention and Flash Decode
+  - general description
+  - limitations
+  - which dims are parallelized
+### 2.5 MLP
+### 2.6 Decoder
+### 2.7 LM Head
+## 3. Features
+### 3.1 Generative Decoding
+### 3.2 Prefill and Decode
+  - submodules, tests
+  - how to combine prefill and decode,
+  - slicing prefill to fit in L1
+### 3.3 Multi-Device
+  - device mesh
+  - column parallel followed by row parallel
+  - sharding, CCL ops, reducing CCL overheads, etc.
+### 3.4 Continuous Batching
+  - quick intro and how it is implemented in demos.
+### 3.5 vLLM Integration
+  - Our vLLM repo and what's needed to integrate with it.
+## 4. Best Practices and Optimizations
+### 4.1 Tracing
+  - link to existing doc, why it helps decode more
+### 4.2 Async Mode
+### 4.3 Multiple CQs
+  - how to feed back output to input and read output asynchronously
+### 4.4 Op Configs
+  - Writing correct program configs and shard specs
+  - Deciding how many cores to run an op on
+  - Why did we use 16 cores for MLP
+  - Which matmul to use when @Colman Glagovich
+  - 1d, 2d, dram-sharded, ...
+  - Implicitly padding weights in program config for matmuls
+### 4.5 Accuracy
+  - How we measure it (PCC, perplexity, top-1/top-5, end-user tests, benchmarking)
+  - How much PCC is enough? Rules of thumb.
+  - Accuracy tests
+  - Debugging PCC issues
+### 4.6 Performance Analysis
+  - Performance tooling, tracy
+### 4.7 Misc. Performance Optimizations
+  - Which dim to shard matmuls on
+  - DRAM-sharding
+  - Avoiding sharded to interleaved calls
+### 4.8 Module Tests
+### 4.9 Performance Testing
+### 4.10 Common Pitfalls
+#### 4.10.1 Error Messages
+  - Running out of L1
+  - Shard spec and program config mismatches
+  - For some TTNN ops (e.g. ttnn.all_gather) it's not supported to pass -1 in the dim argument.
+    - You'll see an error related to op invocation where the arguments don't match
+#### 4.10.2 Shard Spec Mismatches
+#### 4.10.3 Ethernet Dispatch Cores
+  - link to any other description, and mention it is needed for N300 and T3K
+#### 4.10.4 Hangs
+##### 4.10.4.1 Tracing
+  - Host communications cause tracing to hang
+  - Running without async mode enabled causes tracing to hang
+  - Careful with print in tracing
+##### 4.10.4.2 Large Matmuls
+  - Large matmuls hanging? Link to appropriate ticket with workaround
+  - Issue is being investigated with a workaround of setting the output subblock to 1,1 and grid size to 8x7
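A note on the `ttnn.all_gather` pitfall called out in section 4.10.1 of the outline above: negative dims can be normalized on the host before invoking the op. A hedged sketch (the normalization helper is my own; the commented call is illustrative, not taken from this report):

```python
def normalize_dim(dim: int, rank: int) -> int:
    # ttnn.all_gather does not accept dim=-1, so resolve negative
    # dims against the tensor rank before invoking the op.
    return dim if dim >= 0 else dim + rank

shape = (1, 1, 32, 8192)
dim = normalize_dim(-1, len(shape))  # -> 3
# output = ttnn.all_gather(input_tensor, dim=dim)  # call sketch only
```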
diff --git a/tech_reports/MetalProfiler/media/profiler-diagram.png b/tech_reports/MetalProfiler/media/profiler-diagram.png new file mode 100644 index 00000000000..855f051d86a Binary files /dev/null and b/tech_reports/MetalProfiler/media/profiler-diagram.png differ
diff --git a/tech_reports/MetalProfiler/metal-profiler.md b/tech_reports/MetalProfiler/metal-profiler.md new file mode 100644 index 00000000000..44b844612c4 --- /dev/null +++ b/tech_reports/MetalProfiler/metal-profiler.md @@ -0,0 +1,331 @@
+# Metal Profiler
+
+## Quick Links
+- Tracy Profiler Repo: https://github.com/wolfpld/tracy
+- Tracy Documentation: https://github.com/wolfpld/tracy/releases/latest/download/tracy.pdf
+- Metal Fork of Tracy: https://github.com/tenstorrent-metal/tracy/tree/71d4c8d378b52af7da7012b9b595a61e9304f0bb
+
+## Introduction
+Tracy is an open-source C++ profiling tool with sampling and code instrumentation profiling capabilities. The profiled application is a client, and the profiler itself is a server (by default it runs on port 8086). The naming may seem backwards: the client is a thin layer that just collects events and sends them for processing and long-term storage on the server. The fact that the server needs to connect to the client to begin the profiling session may be a bit confusing at first.
+
+## Things built from Tracy that are needed in tt-metal
+tt-metal is still on v0.10 of Tracy. tt-metal has forked the Tracy repo and added specific functionality to support running Tracy on Tenstorrent devices. The repo is located here: https://github.com/tenstorrent-metal/tracy/tree/71d4c8d378b52af7da7012b9b595a61e9304f0bb. The key differences between v0.10 and v0.11.1 are in how the tools within Tracy are built, with the former built from Makefiles and the latter moving to CMake. Metal plans to uplift to this version in the near future.
+
+For instructional purposes, the following section describes metal's fork of v0.10.
+
+### tracy-client
+tracy-client is a library that you link to your application. It acts as a thin wrapper between your application and the server. All macro calls that you insert into your application via Tracy APIs will interface with the tracy-client.
+```
+cmake -B build -S . -DCMAKE_BUILD_TYPE=Release -DTRACY_ENABLE=ON
+cmake --build build --config Release --parallel
+libTracyClient.a
+```
+
+### tracy-capture
+tracy-capture is a command-line executable that acts as the Tracy server to capture events from tracy-client. It dumps a .tracy file which you can feed into the tracy-profiler GUI.
+```
+cd capture/build/unix
+make all
+./tracy-capture -o test.tracy
+```
+
+### tracy-profiler
+tracy-profiler is a GUI application that also acts as the Tracy server. It can consume events from tracy-client live or can ingest a .tracy file computed offline. Typically you would run this on your local MacBook while running tracy-client + application on some remote machine. This is usually built on a machine in the same network as your host machine.
+```
+cd profiler/build/unix
+make all
+./tracy-profiler
+```
+
+### tracy-csvexport
+tracy-csvexport is a command-line executable that consumes a .tracy file and outputs a csv file with all the data within the .tracy file. It is meant as an easier way to view the data.
+```
+cd csvexport/build/unix
+make all
+./tracy-csvexport test.tracy
+```
+
+## Basic Tracy Application
+The following section describes how to integrate Tracy into your project. It is meant for devs to understand the flow of Tracy before using it in tt-metal or extending tt-metal within their own application.
+
+### 1. Add Tracy
+Add the Tracy repository to your project directory (as a third_party submodule)
+```
+mkdir third_party
+cd third_party
+git clone https://github.com/tenstorrent-metal/tracy.git
+cd tracy
+git checkout 71d4c8d378b52af7da7012b9b595a61e9304f0bb
+```
+
+### 2. Build tracy-client
+Build tracy-client as a static lib and link it to your executable target (or use Tracy::TracyClient)
+```
+add_executable(runner main.cpp)
+add_subdirectory(third_party/tracy)
+target_link_libraries(runner PUBLIC hellolib Tracy::TracyClient)
+target_include_directories(runner PUBLIC third_party/tracy/public)
+```
+
+### 3. Add Tracy includes
+Add tracy/Tracy.hpp as an include file in any file that will use Tracy APIs. Tracy source files are located in the tracy/public directory
+```
+#include "tracy/Tracy.hpp"
+```
+
+### 4. Define compile options
+Define TRACY_ENABLE=ON for the WHOLE project (otherwise you won't be able to collect any data)
+```
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTRACY_ENABLE=ON")
+```
+
+### 5. Insert macros
+Insert Tracy-related macro calls into your code, e.g. zones. Zones in Tracy are marked sections of code that users are interested in profiling. Tracy provides macros such as ZoneScoped; to accomplish this. Please refer to section 3 of Tracy's documentation for further information on zones and available macros.
+```
+TracyMessageL("hello");
+ZoneScoped;
+FrameMarkNamed("main");
+```
+
+### 6. Build tracy-capture
+
+### 7. Build tracy-csvexport
+
+### 8. Build tracy-profiler
+## Developer Flow for using Tracy
+
+### 1. Start tracy-capture
+This starts a server listening on the default port and address, which the tracy-client will connect to. It will dump all of its results into a .tracy file (which can be loaded into tracy-profiler afterwards)
+```
+./tracy-capture -o hello.tracy -f
+```
+
+### 2. (Optional) Start tracy-profiler
+Instead of starting tracy-capture via the command line, you can start tracy-profiler from your MacBook. The tracy-profiler is a GUI that communicates in real time with the tracy-client and displays the results.
+```
+./tracy-profiler
+```
+
+### 3. Start application
+Start your application in a different terminal. This is the application that has been compiled with the Tracy integration described in Basic Tracy Application. As your application runs, you will see tracy-capture or tracy-profiler collecting events.
+```
+./runner
+```
+
+### 4. (Only if you did 1.) Feed .tracy into tracy-profiler
+If you used tracy-capture, it will dump a .tracy file once this is complete. You can then feed this .tracy file into the tracy-profiler to view the results.
+
+### 5. (Only if you did 1.) View .tracy contents
+You can also view the contents of the .tracy file as a csv file using tracy-csvexport. This dumps the results in csv format, which you can pipe into a file for viewing. Optionally, you can also save the .tracy file via the GUI itself and then feed it into the tracy-csvexport tool.
+```
+./tracy-csvexport hello.tracy
+```
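The capture → run → export flow above is easy to script. A hedged Python sketch (the tool and binary paths are assumptions for illustration; the `-o`/`-f` flags are the ones shown in the doc):

```python
import subprocess, time

CAPTURE = "./tracy-capture"     # assumed build locations
CSVEXPORT = "./tracy-csvexport"
APP = "./runner"                # the instrumented application

# 1. Start the capture server in the background.
capture = subprocess.Popen([CAPTURE, "-o", "hello.tracy", "-f"])
time.sleep(1)  # give the server a moment to start listening

# 2. Run the instrumented application; events stream to the server.
subprocess.run([APP], check=True)

# 3. Wait for the capture to finish writing hello.tracy.
capture.wait()

# 4. Export the capture to CSV for quick inspection.
with open("hello.csv", "w") as f:
    subprocess.run([CSVEXPORT, "hello.tracy"], stdout=f, check=True)
```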
+## Tracy Example
+The following section provides an example of how to use Tracy in a sample app, step by step.
+
+### 1. Setup project directory structure
+```
+- project/
+  - third_party/
+    - tracy/
+  - include/
+    - hellolib.hpp
+  - CMakeLists.txt
+  - main.cpp
+  - hellolib.cpp
+```
+
+### 2. Fill in contents of each file found below
+// third_party/tracy
+```
+mkdir third_party
+cd third_party
+git clone https://github.com/tenstorrent-metal/tracy.git
+cd tracy
+git checkout 71d4c8d378b52af7da7012b9b595a61e9304f0bb
+```
+
+// hellolib.hpp
+```
+#include "tracy/Tracy.hpp"
+
+int add(int a, int b);
+int subtract(int a, int b);
+int multiply(int a, int b);
+int divide(int a, int b);
+```
+
+// CMakeLists.txt
+```
+cmake_minimum_required(VERSION 3.10)
+
+project(TracyTest LANGUAGES CXX)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTRACY_ENABLE=ON")
+
+add_library(hellolib SHARED hellolib.cpp)
+
+target_include_directories(hellolib
+    PUBLIC
+        ${CMAKE_SOURCE_DIR}/include
+        ${CMAKE_SOURCE_DIR}/third_party/tracy/public
+)
+
+add_executable(runner main.cpp)
+
+add_subdirectory(third_party/tracy)
+
+# Link the executable against the shared library
+target_link_libraries(runner PUBLIC hellolib Tracy::TracyClient)
+```
+
+// main.cpp
+```
+#include "tracy/Tracy.hpp"
+#include <iostream>
+#include "hellolib.hpp"
+#include <unistd.h> // for sleep()
+
+int main() {
+    sleep(5); // need to add short sleep call so that tracy client can establish connection
+    FrameMarkNamed("main");
+    TracyMessageL("Hello");
+    ZoneScoped;
+    int c = 0;
+    std::cout << "Hello, World = " << c << std::endl;
+    c = add(5, 5);
+    c = subtract(5, 5);
+    c = multiply(5, 5);
+    c = divide(5, 5);
+    return 0;
+}
+```
+
+// hellolib.cpp
+```
+#include "tracy/Tracy.hpp"
+#include "hellolib.hpp"
+
+int add(int a, int b) {
+    TracyMessageL("add");
+    ZoneScoped;
+    int c = a + b;
+    return c;
+}
+
+int subtract(int a, int b) {
+    TracyMessageL("subtract");
+    ZoneScoped;
+    return a - b;
+}
+
+int multiply(int a, int b) {
+    TracyMessageL("multiply");
+    ZoneScoped;
+    return a * b;
+}
+
+int divide(int a, int b) {
+    TracyMessageL("divide");
+    ZoneScoped;
+    return a / b;
+}
+```
+
+### 3. Build Project
+This builds the executable and binaries in the `build/` folder.
+```
+mkdir -p build
+cd build/
+cmake -G Ninja -DTRACY_ENABLE=ON ..
+ninja
+```
+
+### 4. Build tracy-capture
+
+### 5. Optional: Build tracy-profiler (on MacBook)
+
+### 6. Build tracy-csvexport
+
+### 7. Start tracy-capture OR tracy-profiler
+Start this in a separate terminal. This will dump all events into hello.tracy.
+```
+./tracy-capture -o hello.tracy
+```
+
+Start this on your MacBook. This will collect all events live. You need to make sure you port forward from your remote machine to your MacBook.
+```
+./tracy-profiler
+```
+
+### 8. Run project
+This runs the executable; the results will be collected by tracy-capture and stored in the `hello.tracy` file, or displayed live in tracy-profiler.
+```
+cd build
+./runner
+```
+
+### (Optional) Run tracy-csvexport
+If you used tracy-capture and want to view the results, you can pass them through tracy-csvexport. This dumps out all the results in csv format, which you can then pipe to a file. You can also save a .tracy file via the tracy-profiler GUI and view it using this tool.
+```
+./tracy-csvexport hello.tracy
+```
+
+### (Optional) Upload .tracy file into tracy-profiler
+If you used tracy-capture to get the .tracy file, you can upload it into the tracy-profiler GUI offline on your MacBook. Follow the instructions on the GUI widget.
+
+## Tracy + Metal
+The following sections relate to tt-metal's usage of Tracy. tt-metal uses v0.10 of Tracy and has built on top of it with custom files to support device-side profiling. The repo is found here: https://github.com/tenstorrent-metal/tracy/tree/71d4c8d378b52af7da7012b9b595a61e9304f0bb. There are several components regarding how tt-metal integrates Tracy and provides profiler support.
+
+### Building in profiler mode
+You can build metal in profiler mode using the following
+```
+./build_metal -p
+```
+All of the tools that are needed by metal are generated under `build/tools/profiler/bin/`.
+
+### tt_metal.so
+The tt_metal shared library is generated with all the low-level implementation details. This library can be used standalone if calling tt_metal APIs, and it is linked against ttnn.so if using ttnn APIs. This library is also what Tracy links against.
+```
+location: tt_metal/CMakeLists.txt
+eg: target_link_libraries(tt_metal PUBLIC compiler_flags $<$:TracyClient>)
+```
+
+### profiler.o
+A profiler object gets generated with various low-level API calls within tt-metal. This object is linked against tt_metal.so.
+```
+location: tt-metal/tt_metal/tools/profiler
+eg: profiler.cpp
+```
+
+### Tracy module tool for dev convenience
+```
+location: tt-metal/ttnn/tracy
+eg: __main__.py
+```
+
+Developers can use the tracy module tool that will handle everything internally for them (such as tracy-capture, tracy-csvexport etc.). This is provided for convenience. Profiling python code with Tracy requires running your python code with the python tracy module. For profiling your entire python program, run your program as follows.
+```
+python -m tracy {test_script}.py
+```
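As a concrete (hypothetical) example of a `{test_script}.py` you might profile this way — a minimal TT-NN workload along the lines of the sweep tests elsewhere in this diff; the exact ops chosen are illustrative:

```python
# minimal_ttnn_op.py - a tiny workload to run under `python -m tracy`.
import torch
import ttnn

device = ttnn.open_device(device_id=0)

torch_a = torch.rand((1, 1, 32, 32), dtype=torch.bfloat16)
torch_b = torch.rand((1, 1, 32, 32), dtype=torch.bfloat16)

a = ttnn.from_torch(torch_a, layout=ttnn.TILE_LAYOUT, device=device)
b = ttnn.from_torch(torch_b, layout=ttnn.TILE_LAYOUT, device=device)

out = ttnn.add(a, b)      # the op whose host/device zones we want to see
result = ttnn.to_torch(out)

ttnn.close_device(device)
```

Running `python -m tracy minimal_ttnn_op.py` would then feed the captured data to the post-processing scripts described next.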
+### Tracy post-processing scripts
+Metal will dump out various information about kernel profiling data. All this information gets cleaned and presented in a visible format through various post-processing scripts. If you are using the tracy module script infrastructure provided by metal, it will handle all of this for you.
+```
+location: tt-metal/tt_metal/tools/profiler
+eg: process_ops_logs.py
+```
+
+### Tracy + Metal Architecture
+The following image depicts the architectural diagram and program flow for how metal integrates Tracy and how the internal flow works. Everything is handled internally by the Tracy module tool that metal provides for dev convenience.
+
+![Tracy + Metal architecture](media/profiler-diagram.png)
+
+## Extending Tracy to External Applications
+For an example of how metal Tracy is used in another application, please refer to https://github.com/tenstorrent/tt-mlir, specifically https://github.com/tenstorrent/tt-mlir/tree/main/runtime/tools/python (ttrt) - an independent runtime wrapper around metal APIs.
diff --git a/tests/scripts/run_moreh_microbenchmark.sh b/tests/scripts/run_moreh_microbenchmark.sh index cdccd2f8302..2b7107bb7df 100755 --- a/tests/scripts/run_moreh_microbenchmark.sh +++ b/tests/scripts/run_moreh_microbenchmark.sh @@ -35,6 +35,7 @@ run_profiling_test() {
  if [[ "$ARCH_NAME" == "wormhole_b0" ]]; then
    pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_single_core_sharded -k $ARCH_NAME
    pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_12_core -k $ARCH_NAME
+   pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_remote_cb_sync -k $ARCH_NAME
  fi
  # bypass wh_b0 for now until we can move FD cores to last col
  if [[ "$ARCH_NAME" != "wormhole_b0" ]]; then
diff --git a/tests/scripts/test_moreh_microbenchmark.py b/tests/scripts/test_moreh_microbenchmark.py index dc1e3b9b4c9..c93b82c45d6 100755 --- a/tests/scripts/test_moreh_microbenchmark.py +++ b/tests/scripts/test_moreh_microbenchmark.py @@ -287,6 +287,33 @@ def run_dram_read_l1_write_cmd(k, n, num_blocks, df, num_banks, bank_start_id):
    run_moreh_single_test("DRAM BW test multi-core", command)
+
+def run_dram_read_remote_cb_sync_cmd(
+    k, n, num_blocks, cb_num_blocks, cb_padding, df, num_receivers, num_mixed_df_layers
+):
+    command = (
+        "TT_METAL_DEVICE_PROFILER=1 ./build/test/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb "
+        + " --k "
+        + str(k)
+        + " --n "
+        + str(n)
+        + " --num-blocks "
+        + str(num_blocks)
+        + " --cb-num-blocks "
+        + str(cb_num_blocks)
+        + " --cb-padding "
+        + str(cb_padding)
+        + " --num-tests "
+        + str(1)
+        + " --data-type "
+        + str(df)
+        + " --num-receivers "
+        + str(num_receivers)
+        + " --num-mixed-df-layers "
+        + str(num_mixed_df_layers)
+    )
+    run_moreh_single_test("DRAM read remote CB sync single-core ", command)
+
+
# noc
def test_noc_local(r=9, c=12, nt=256, cb=1):
    command = (
@@ -739,6 +766,64 @@ def test_dram_read_l1_write_core(arch, freq, test_vector, num_tests, nblock, dat
    assert bw_bound <= throughput
+
+@pytest.mark.parametrize(
+    "arch, freq, test_vector, num_tests, nblock, cb_nblock, cb_padding, data_format, num_receivers, num_mixed_df_layers",
+    [
+        # single layer single receiver test
+        ("wormhole_b0", 1000, np.array([32768, 128]), 1, 64, 5, 256, 1, 1, 1),
+        # single layer multi receiver test
+        ("wormhole_b0", 1000, np.array([32768, 128]), 1, 64, 3, 256, 1, 2, 1),
+        # multi layer multi receiver test
+        ("wormhole_b0", 1000, np.array([32768, 256]), 1, 64, 5, 256, 1, 4, 15),
+    ],
+)
+def test_dram_read_remote_cb_sync(
+    arch, freq, test_vector, num_tests, nblock, cb_nblock, cb_padding, data_format, num_receivers, num_mixed_df_layers
+):
+    data = []
+    cycle_list = []
+    time_list = []
+    throughput_list = []
+    for _ in range(num_tests):
+        k = int(test_vector[0])
+        n = int(test_vector[1])
+        input_size = 0
+        if data_format == 0:
+            # bfloat8_b: a 32x32 tile is 1088 bytes (1024 data bytes + 64 bytes of shared exponents)
+            input_size += k * n * 1088 // 1024
+        elif data_format == 1:
+            # bfloat16: a 32x32 tile is 2048 bytes (2 bytes per datum)
+            input_size += k * n * 2048 // 1024
+        for i in range(num_mixed_df_layers - 1):
+            if i % 2 == 0:
+                input_size += k * n * 1088 // 1024
+            else:
+                input_size += k * n * 2048 // 1024
+        run_dram_read_remote_cb_sync_cmd(
+            k, n, nblock, cb_nblock, cb_padding, data_format, num_receivers, num_mixed_df_layers
+        )
+        cycle = profile_results_kernel_duration()
+        time = cycle / freq / 1000.0 / 1000.0
+        throughput = input_size / cycle * freq / 1000.0
+        cycle_list.append(cycle)
+        time_list.append(time)
+        throughput_list.append(throughput)
+    cycle = sum(cycle_list) / len(cycle_list)
+    time = sum(time_list) / len(time_list)
+    throughput = sum(throughput_list) / len(throughput_list)
+    logger.info("DRAM read cycle: " + str(cycle))
+    logger.info("DRAM read time: " + str(time))
+    logger.info("DRAM read throughput: " + str(throughput))
+    data.append([throughput])
+    # check within range
+    dev_freq = get_device_freq()
+    if arch == "grayskull":
+        bw_bound = 100.0
+    elif arch == "wormhole_b0":
+        bw_bound = 22.0
+    elif arch == "blackhole":
+        bw_bound = 340.0
+    assert bw_bound <= throughput
+
+
@pytest.mark.parametrize(
    "arch, freq, r, c, test_vector_global, test_vector_local",
    [
diff --git a/tests/scripts/tg/run_tg_nightly_tests.sh b/tests/scripts/tg/run_tg_nightly_tests.sh index e56038e0ca6..6810e5b58d4 100755 --- a/tests/scripts/tg/run_tg_nightly_tests.sh +++ b/tests/scripts/tg/run_tg_nightly_tests.sh @@ -7,6 +7,8 @@ run_tg_llama3_70b_tests() {
  echo "LOG_METAL: Running run_tg_llama3_70b_tests"
+  pytest tests/ttnn/unit_tests/operations/test_all_gather_TG_nightly.py ; fail+=$?
+
  # Falcon40B prefill 60 layer end to end with 10 loops; we need 8x8 grid size
  pytest tests/nightly/tg/models/demos/tg/llama3_70b ; fail+=$?
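For readers of `test_dram_read_remote_cb_sync` above: the asserted bound is a bandwidth in GB/s, derived from kernel cycles and the core clock. A standalone restatement of that arithmetic (the inputs below are made up for illustration, not measured results):

```python
def dram_read_bw_gb_s(total_bytes: int, cycles: int, freq_mhz: int) -> float:
    # bytes / cycles = bytes per cycle; * freq (MHz) = MB/s; / 1000 = GB/s,
    # matching `input_size / cycle * freq / 1000.0` in the test above.
    return total_bytes / cycles * freq_mhz / 1000.0

k, n = 32768, 128
bf16_tile_bytes_per_1024_datums = 2048
total_bytes = k * n * bf16_tile_bytes_per_1024_datums // 1024
print(dram_read_bw_gb_s(total_bytes, cycles=350_000, freq_mhz=1000))
# ~24 GB/s, which would clear the 22.0 GB/s wormhole_b0 bound
```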
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_complex/angle/angle.py b/tests/sweep_framework/sweeps/eltwise/unary_complex/angle/angle.py new file mode 100644 index 00000000000..e1e872d0585 --- /dev/null +++ b/tests/sweep_framework/sweeps/eltwise/unary_complex/angle/angle.py @@ -0,0 +1,103 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+    "xfail": {
+        "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 16)
+        + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 16)
+        + gen_shapes([1, 1], [256, 256], [1, 1], 16),
+        "input_a_dtype": [ttnn.bfloat16],
+        "input_layout": [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT],
+        "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+    },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+    if test_vector["input_layout"] == ttnn.ROW_MAJOR_LAYOUT:
+        return True, "Inputs to eltwise unary ops must be tilized"
+    if test_vector["input_layout"] == ttnn.ROW_MAJOR_LAYOUT and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+        return True, "bfloat8_b is only supported on tiled layout"
+    return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+    input_shape,
+    input_a_dtype,
+    input_layout,
+    input_a_memory_config,
+    output_memory_config,
+    *,
+    device,
+) -> list:
+    data_seed = random.randint(0, 20000000)
+    torch.manual_seed(data_seed)
+
+    if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+        input_shape = sanitize_shape_rm(input_shape)
+
+    torch_real = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype)(
+        input_shape
+    ).to(torch.float32)
+    torch_imag = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype)(
+        input_shape
+    ).to(torch.float32)
+    torch_input_tensor_a = torch.complex(torch_real, torch_imag)
+
+    golden_function = ttnn.get_golden_function(ttnn.angle)
+    torch_output_tensor = golden_function(torch_input_tensor_a)
+
+    input_tensor_a_real = ttnn.from_torch(
+        torch_real,
+        dtype=input_a_dtype,
+        layout=input_layout,
+        device=device,
+        memory_config=input_a_memory_config,
+    )
+    input_tensor_a_imag = ttnn.from_torch(
+        torch_imag,
+        dtype=input_a_dtype,
+        layout=input_layout,
+        device=device,
+        memory_config=input_a_memory_config,
+    )
+    input_tensor_a = ttnn.complex_tensor(input_tensor_a_real, input_tensor_a_imag)
+
+    start_time = start_measuring_time()
+    output_tensor = ttnn.angle(input_tensor_a, memory_config=output_memory_config)
+    e2e_perf = stop_measuring_time(start_time)
+
+    output_tensor = ttnn.to_torch(output_tensor)
+
+    return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_complex/angle_bw/angle_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_complex/angle_bw/angle_bw.py new file mode 100644 index 00000000000..ce3dd28f636 --- /dev/null +++ b/tests/sweep_framework/sweeps/eltwise/unary_complex/angle_bw/angle_bw.py @@ -0,0 +1,125 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+    "nightly": {
+        "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 8)
+        + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 8)
+        + gen_shapes([1, 1], [256, 256], [1, 1], 8),
+        "grad_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+        "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+        "input_layout": [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT],
+        "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+    },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+    if test_vector["input_layout"] == ttnn.ROW_MAJOR_LAYOUT:
+        return True, "Inputs to eltwise unary ops must be tilized"
+    if test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+        return True, "bfloat8_b is not supported on input_tensor_a"
+    if test_vector["input_layout"] == ttnn.ROW_MAJOR_LAYOUT and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+        return True, "bfloat8_b is only supported on tiled layout"
+    return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run( + input_shape, + grad_dtype, + input_a_dtype, + input_layout, + grad_memory_config, + input_a_memory_config, + output_memory_config, + *, + device, +) -> list: + data_seed = random.randint(0, 20000000) + torch.manual_seed(data_seed) + + if input_layout == ttnn.ROW_MAJOR_LAYOUT: + input_shape = sanitize_shape_rm(input_shape) + + torch_grad_tensor = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype + )(input_shape).to(torch.float32) + + torch_real = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype)( + input_shape + ).to(torch.float32) + torch_imag = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype)( + input_shape + ).to(torch.float32) + + torch_input_tensor_a = torch.complex(torch_real, torch_imag) + + torch_input_tensor_a.requires_grad = True + + golden_function = ttnn.get_golden_function(ttnn.angle_bw) + torch_output_tensor = golden_function(torch_grad_tensor, torch_input_tensor_a)[0] + + grad_tensor = ttnn.from_torch( + torch_grad_tensor, + dtype=grad_dtype, + layout=input_layout, + device=device, + memory_config=grad_memory_config, + ) + + input_tensor_a_real = ttnn.from_torch( + torch_real, + dtype=input_a_dtype, + layout=input_layout, + device=device, + memory_config=input_a_memory_config, + ) + input_tensor_a_imag = ttnn.from_torch( + torch_imag, + dtype=input_a_dtype, + layout=input_layout, + device=device, + memory_config=input_a_memory_config, + ) + + input_tensor_a = ttnn.complex_tensor(input_tensor_a_real, input_tensor_a_imag) + + start_time = start_measuring_time() + output_tensor = ttnn.angle_bw(grad_tensor, input_tensor_a, memory_config=output_memory_config)[0] + e2e_perf = stop_measuring_time(start_time) + + output_tensor = torch.cat((ttnn.to_torch(output_tensor.real), ttnn.to_torch(output_tensor.imag)), dim=-1) + + return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf] diff --git a/tests/sweep_framework/sweeps/eltwise/unary_complex/polar/polar.py b/tests/sweep_framework/sweeps/eltwise/unary_complex/polar/polar.py new file mode 100644 index 00000000000..857f4d533fd --- /dev/null +++ b/tests/sweep_framework/sweeps/eltwise/unary_complex/polar/polar.py @@ -0,0 +1,109 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +from typing import Optional, Tuple +from functools import partial + +import torch +import random +import ttnn +from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm +from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt + +from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time +from models.utility_functions import torch_random + +# Override the default timeout in seconds for hang detection. +TIMEOUT = 30 + +random.seed(0) + + +# Parameters provided to the test vector generator are defined here. +# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values. +# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs. +# Developers can create their own generator functions and pass them to the parameters as inputs. 
+parameters = {
+    "nightly": {
+        "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 16)
+        + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 16)
+        + gen_shapes([1, 1], [256, 256], [1, 1], 16),
+        "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+        "input_layout": [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT],
+        "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+    },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+    if test_vector["input_layout"] == ttnn.ROW_MAJOR_LAYOUT:
+        return True, "Inputs to eltwise unary ops must be tilized"
+    if test_vector["input_layout"] == ttnn.ROW_MAJOR_LAYOUT and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+        return True, "bfloat8_b is only supported on tiled layout"
+    return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+    input_shape,
+    input_a_dtype,
+    input_layout,
+    input_a_memory_config,
+    output_memory_config,
+    *,
+    device,
+) -> list:
+    data_seed = random.randint(0, 20000000)
+    torch.manual_seed(data_seed)
+
+    if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+        input_shape = sanitize_shape_rm(input_shape)
+
+    torch_real = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype)(
+        input_shape
+    ).to(torch.float32)
+    torch_imag = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype)(
+        input_shape
+    ).to(torch.float32)
+
+    golden_function = torch.polar
+    torch_output_tensor = golden_function(torch_real, torch_imag)
+
+    input_tensor_a_real = ttnn.from_torch(
+        torch_real,
+        dtype=input_a_dtype,
+        layout=input_layout,
+        device=device,
+        memory_config=input_a_memory_config,
+    )
+    input_tensor_a_imag = ttnn.from_torch(
+        torch_imag,
+        dtype=input_a_dtype,
+        layout=input_layout,
+        device=device,
+        memory_config=input_a_memory_config,
+    )
+    input_tensor_a = ttnn.complex_tensor(input_tensor_a_real, input_tensor_a_imag)
+
+    start_time = start_measuring_time()
+    output_tensor = ttnn.polar(input_tensor_a, memory_config=output_memory_config)
+    e2e_perf = stop_measuring_time(start_time)
+
+    output_tensor = torch.complex(
+        ttnn.to_torch(output_tensor.real).to(torch.float32), ttnn.to_torch(output_tensor.imag).to(torch.float32)
+    )
+
+    return [
+        check_with_pcc(
+            torch.view_as_real(torch_output_tensor.clone()), torch.view_as_real(output_tensor.clone()), 0.999
+        ),
+        e2e_perf,
+    ]
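A side note on the comparison in polar.py above: complex outputs are checked through `torch.view_as_real`, which turns a complex tensor into (real, imag) pairs so that a real-valued PCC can be computed. A tiny torch-only illustration:

```python
import torch

# torch.polar(abs, angle) builds abs*cos(angle) + i*abs*sin(angle).
z = torch.polar(torch.tensor([1.0, 2.0]), torch.tensor([0.0, 3.1415926]))
pairs = torch.view_as_real(z)  # shape (..., 2): last dim holds (real, imag)
print(pairs)
# tensor([[ 1.0000e+00,  0.0000e+00],
#         [-2.0000e+00,  1.0718e-07]])
```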
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_complex/polar_bw/polar_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_complex/polar_bw/polar_bw.py new file mode 100644 index 00000000000..2ac0d2dec36 --- /dev/null +++ b/tests/sweep_framework/sweeps/eltwise/unary_complex/polar_bw/polar_bw.py @@ -0,0 +1,137 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+    "nightly": {
+        "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 8)
+        + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 8)
+        + gen_shapes([1, 1], [256, 256], [1, 1], 8),
+        "grad_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+        "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+        "input_layout": [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT],
+        "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+    },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+    if test_vector["input_layout"] == ttnn.ROW_MAJOR_LAYOUT:
+        return True, "Inputs to eltwise unary ops must be tilized"
+    if test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+        return True, "bfloat8_b is not supported on input_tensor_a"
+    if test_vector["input_layout"] == ttnn.ROW_MAJOR_LAYOUT and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+        return True, "bfloat8_b is only supported on tiled layout"
+    return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run( + input_shape, + grad_dtype, + input_a_dtype, + input_layout, + grad_memory_config, + input_a_memory_config, + output_memory_config, + *, + device, +) -> list: + data_seed = random.randint(0, 20000000) + torch.manual_seed(data_seed) + + if input_layout == ttnn.ROW_MAJOR_LAYOUT: + input_shape = sanitize_shape_rm(input_shape) + + torch_grad_real = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype)( + input_shape + ).to(torch.float32) + torch_grad_imag = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype)( + input_shape + ).to(torch.float32) + + torch_real = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype)( + input_shape + ).to(torch.float32) + torch_imag = gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype)( + input_shape + ).to(torch.float32) + + torch_grad_tensor = torch.complex(torch_grad_real, torch_grad_imag) + torch_input_tensor_a = torch.complex(torch_real, torch_imag) + + torch_input_tensor_a.requires_grad = True + + golden_function = ttnn.get_golden_function(ttnn.polar_bw) + torch_output_tensor = golden_function(torch_grad_tensor, torch_input_tensor_a)[0] + + grad_real = ttnn.from_torch( + torch_grad_real, + dtype=grad_dtype, + layout=input_layout, + device=device, + memory_config=grad_memory_config, + ) + grad_imag = ttnn.from_torch( + torch_grad_imag, + dtype=grad_dtype, + layout=input_layout, + device=device, + memory_config=grad_memory_config, + ) + + input_tensor_a_real = ttnn.from_torch( + torch_real, + dtype=input_a_dtype, + layout=input_layout, + device=device, + memory_config=input_a_memory_config, + ) + input_tensor_a_imag = ttnn.from_torch( + torch_imag, + dtype=input_a_dtype, + layout=input_layout, + device=device, + memory_config=input_a_memory_config, + ) + + grad_tensor = ttnn.complex_tensor(grad_real, grad_imag) + input_tensor_a = ttnn.complex_tensor(input_tensor_a_real, input_tensor_a_imag) + + start_time = start_measuring_time() + output_tensor = ttnn.polar_bw(grad_tensor, input_tensor_a, memory_config=output_memory_config)[0] + e2e_perf = stop_measuring_time(start_time) + + output_tensor = torch.cat((ttnn.to_torch(output_tensor.real), ttnn.to_torch(output_tensor.imag)), dim=-1) + + return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf] diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/reader_dram.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/reader_dram.cpp new file mode 100644 index 00000000000..9b5988f4e63 --- /dev/null +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/reader_dram.cpp @@ -0,0 +1,145 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <stdint.h>
+
+#include "dataflow_api.h"
+#include "ttnn/cpp/ttnn/operations/ccl/kernel_common/worker_sync_utils.hpp"
+
+#include "debug/dprint.h"
+
+template <uint32_t bank_base_address, uint32_t page_size, bool use_vc>
+FORCE_INLINE
+void noc_async_read_tile_dram_sharded(uint32_t src_addr, uint32_t dest_addr, uint32_t bank_id = 0, const uint32_t vc = 0) {
+    uint32_t src_addr_;
+    uint32_t src_noc_xy;
+
+    src_addr_ = src_addr + bank_base_address;
+    src_addr_ += bank_to_dram_offset[bank_id];
+    src_noc_xy = dram_bank_to_noc_xy[noc_index][bank_id];
+
+    WAYPOINT("NRTW");
+    DEBUG_SANITIZE_NOC_READ_TRANSACTION(noc_index, get_noc_addr_helper(src_noc_xy, src_addr_), dest_addr, page_size);
+    while (!noc_cmd_buf_ready(noc_index, NCRISC_RD_CMD_BUF));
+    WAYPOINT("NRTD");
+
+    if constexpr(use_vc) {
+        uint32_t noc_rd_cmd_field = NOC_CMD_CPY | NOC_CMD_RD | NOC_CMD_RESP_MARKED | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(vc);
+        NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_CTRL, noc_rd_cmd_field);
+    }
+
+    NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_RET_ADDR_LO, dest_addr);
+    NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_TARG_ADDR_LO, src_addr_);      // (uint32_t)src_addr
+    NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_TARG_ADDR_COORDINATE, src_noc_xy);   // src_addr >> 32
+    NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_AT_LEN_BE, page_size);  // len_bytes
+    NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ);
+    noc_reads_num_issued[noc_index] += 1;
+}
+
+void kernel_main() {
+    constexpr uint32_t input_addr = get_compile_time_arg_val(0);
+    constexpr uint32_t input_start_tile_id = get_compile_time_arg_val(1);
+    constexpr uint32_t noc = get_compile_time_arg_val(2);
+    constexpr uint32_t num_layers = get_compile_time_arg_val(3);
+
+    uint32_t rt_args_idx = 0;
+    const uint32_t bank_id = get_arg_val<uint32_t>(rt_args_idx++);
+    const uint32_t vc = get_arg_val<uint32_t>(rt_args_idx++);
+    tt_l1_ptr uint32_t* page_size = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+    tt_l1_ptr uint32_t* num_pages = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+    tt_l1_ptr uint32_t* num_blocks = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+    tt_l1_ptr uint32_t* block_num_tiles = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+
+    constexpr uint32_t cb_id = 0;
+    constexpr uint32_t total_num_blocks_in_buffer = 3;
+
+    uint32_t block_size_bytes = num_pages[0] * page_size[0];
+    uint32_t l1_buffer_start_addr = get_write_ptr(cb_id);
+    uint32_t l1_buffer_end_addr = get_write_ptr(cb_id) + block_size_bytes * total_num_blocks_in_buffer;
+
+    uint32_t src_read_addr = 0;
+    uint32_t src_read_addr_offset_bytes = 0;
+
+    for (uint32_t l = 0; l < num_layers; ++l) {
+        uint32_t curr_page_size = page_size[l];
+        uint32_t curr_num_pages = num_pages[l];
+        uint32_t curr_num_blocks = num_blocks[l];
+        uint32_t curr_block_num_tiles = block_num_tiles[l];
+
+        uint32_t curr_block_size_bytes = curr_num_pages * curr_page_size;
+        uint32_t curr_layer_size_bytes = curr_num_blocks * curr_block_size_bytes;
+
+        uint32_t src_base_addr = noc_async_read_tile_dram_sharded_set_state(input_addr, curr_page_size, bank_id, vc);
+        src_read_addr = src_read_addr_offset_bytes;
+
+        // For debugging purposes, use the trivial DRAM read method below
+        // for (uint32_t block = 0; block < curr_num_blocks; ++block) {
+        //     // Operand 1
+        //     cb_reserve_back(cb_id, curr_block_num_tiles);
+        //     auto l1_write_addr = get_write_ptr(cb_id);
+
        //     for (uint32_t h = 0; h < curr_num_pages; ++h) {
        //         noc_async_read_tile_dram_sharded_with_state(src_base_addr, src_read_addr, l1_write_addr);
        //         src_read_addr += curr_page_size;
        //         l1_write_addr += curr_page_size;
        //     }
+
        //     noc_async_read_barrier();
+
        //     cb_push_back(cb_id, curr_block_num_tiles);
        // }
+
+        uint32_t num_free_blocks_in_buffer = total_num_blocks_in_buffer;
+        uint32_t curr_block_trid = 1;
+        uint32_t block_trid_to_wait = 1;
+
+        cb_reserve_back(cb_id, curr_block_num_tiles);
+        uint32_t l1_write_addr_offset = 0;
+        uint32_t l1_write_addr_start = get_write_ptr(cb_id);
+        if (l1_write_addr_start >= l1_buffer_end_addr) {
+            l1_write_addr_start = l1_buffer_start_addr;
+        }
+        uint32_t l1_write_addr = l1_write_addr_start;
+        for (uint32_t block = 0; block < curr_num_blocks; ++block) {
+            noc_async_read_tile_dram_sharded_set_trid(curr_block_trid);
+
+            uint32_t temp_l1_write_addr = l1_write_addr;
+            for (uint32_t h = 0; h < curr_num_pages; ++h) {
+                noc_async_read_tile_dram_sharded_with_state_with_trid(
+                    src_base_addr, src_read_addr, temp_l1_write_addr, curr_block_trid);
+                src_read_addr += curr_page_size;
+                temp_l1_write_addr += curr_page_size;
+            }
+
+            if (num_free_blocks_in_buffer == 2) {
+                noc_async_read_barrier_with_trid(block_trid_to_wait);
+                cb_push_back(cb_id, curr_block_num_tiles);
+                // wait for next block trid
+                block_trid_to_wait = block_trid_to_wait == 3 ? 1 : (block_trid_to_wait + 1);
+                // reserve for next block
+                cb_reserve_back(cb_id, curr_block_num_tiles * 2);
+            } else {
+                num_free_blocks_in_buffer -= 1;
+            }
+
+            if (curr_block_trid == total_num_blocks_in_buffer) {
+                curr_block_trid = 1;
+            } else {
+                curr_block_trid += 1;
+            }
+
+            l1_write_addr += block_size_bytes;
+            if (l1_write_addr >= l1_buffer_end_addr) {
+                l1_write_addr = l1_buffer_start_addr;
+            }
+        }
+        // last block to wait
+        noc_async_read_barrier_with_trid(block_trid_to_wait);
+        cb_push_back(cb_id, curr_block_num_tiles);
+
+        src_read_addr_offset_bytes += curr_layer_size_bytes;
+
+    }
+
+}
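The reader kernel above overlaps DRAM reads with CB pushes by tagging each block's reads with a transaction ID (trid) and waiting only on the oldest outstanding one once the 3-slot staging buffer is full. A small Python sketch of that scheduling pattern (slot/trid bookkeeping only, no NoC semantics):

```python
TOTAL_SLOTS = 3  # matches total_num_blocks_in_buffer in the kernel

def read_blocks(num_blocks: int):
    free_slots = TOTAL_SLOTS
    issue_trid, wait_trid = 1, 1
    for block in range(num_blocks):
        print(f"issue block {block} with trid {issue_trid}")
        if free_slots == 2:
            # Buffer is about to be full: retire the oldest read first.
            print(f"  wait on trid {wait_trid}, push block to CB")
            wait_trid = 1 if wait_trid == 3 else wait_trid + 1
        else:
            free_slots -= 1
        issue_trid = 1 if issue_trid == TOTAL_SLOTS else issue_trid + 1
    print(f"drain: wait on trid {wait_trid}")  # the final barrier in the kernel

read_blocks(6)
```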
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <stdint.h>
+
+#include "dataflow_api.h"
+#include "ttnn/cpp/ttnn/operations/ccl/kernel_common/worker_sync_utils.hpp"
+
+#include "debug/dprint.h"
+
+constexpr uint32_t ALIGNED_PAGE_SIZE = 16;
+
+constexpr uint32_t cb_start_addr = get_compile_time_arg_val(0);
+constexpr uint32_t cb_rd_ptr = get_compile_time_arg_val(0);
+constexpr uint32_t cb_size = get_compile_time_arg_val(1);
+constexpr uint32_t num_layers = get_compile_time_arg_val(2);
+
+uint32_t rt_args_idx = 0;
+uint32_t vc;
+uint32_t noc_x;
+uint32_t noc_y;
+uint32_t pages_acked_semaphore_addr;
+uint32_t pages_sent_semaphore_addr;
+tt_l1_ptr uint32_t* page_size;
+tt_l1_ptr uint32_t* num_blocks;
+tt_l1_ptr uint32_t* block_num_tiles;
+
+uint32_t start_page_size;
+
+struct RemoteReceiverCBInterface {
+    volatile tt_l1_ptr uint32_t* pages_acked;
+    volatile tt_l1_ptr uint32_t* pages_sent;
+
+    uint32_t fifo_size;
+    uint32_t fifo_limit;
+    uint32_t fifo_limit_page_aligned;
+
+    uint32_t fifo_page_size;
+    uint32_t fifo_aligned_num_pages;
+
+    uint32_t fifo_rd_ptr;
+
+    uint32_t fifo_start_addr;
+
+    uint32_t aligned_page_size;
+};
+
+RemoteReceiverCBInterface remote_cb_interface;
+
+template <uint32_t aligned_page_size>
+FORCE_INLINE void setup_remote_receiver_cb_interface() {
+    uint32_t num_pages = cb_size / start_page_size;
+    uint32_t cb_size_page_aligned = num_pages * start_page_size;
+
+    remote_cb_interface.fifo_size = cb_size;
+    remote_cb_interface.fifo_limit = cb_size + cb_start_addr;
+    remote_cb_interface.fifo_limit_page_aligned = cb_size_page_aligned + cb_start_addr;
+
+    remote_cb_interface.fifo_page_size = start_page_size;
+    remote_cb_interface.fifo_aligned_num_pages = num_pages * start_page_size / aligned_page_size;
+
+    remote_cb_interface.fifo_rd_ptr = cb_rd_ptr;
+
+    remote_cb_interface.fifo_start_addr = cb_start_addr;
+
+    remote_cb_interface.pages_acked = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(get_semaphore(pages_acked_semaphore_addr));
+    remote_cb_interface.pages_sent = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(get_semaphore(pages_sent_semaphore_addr));
+
+    remote_cb_interface.aligned_page_size = aligned_page_size;
+}
+
+FORCE_INLINE void setup_remote_cb_page_size(uint32_t page_size, uint32_t remote_noc_x, uint32_t remote_noc_y, uint8_t noc = noc_index) {
+    uint32_t num_pages = remote_cb_interface.fifo_size / page_size;
+    uint32_t cb_size_page_aligned = num_pages * page_size;
+
+    remote_cb_interface.fifo_limit_page_aligned = cb_size_page_aligned + remote_cb_interface.fifo_start_addr;
+    remote_cb_interface.fifo_page_size = page_size;
+    remote_cb_interface.fifo_aligned_num_pages = num_pages * page_size / remote_cb_interface.aligned_page_size;
+
+    uint32_t curr_fifo_rd_ptr = remote_cb_interface.fifo_rd_ptr;
+    bool fifo_rd_ptr_exceed_fifo_limit = curr_fifo_rd_ptr > remote_cb_interface.fifo_limit_page_aligned;
+    uint32_t num_pages_till_fifo_limit = (remote_cb_interface.fifo_limit_page_aligned - curr_fifo_rd_ptr) / page_size;
+
+    if (fifo_rd_ptr_exceed_fifo_limit) {
+        remote_cb_interface.fifo_rd_ptr = remote_cb_interface.fifo_start_addr;
+    } else {
+        uint32_t next_fifo_rd_ptr = remote_cb_interface.fifo_limit_page_aligned - num_pages_till_fifo_limit * page_size;
+        uint32_t pages_acked = (next_fifo_rd_ptr - remote_cb_interface.fifo_rd_ptr) / remote_cb_interface.aligned_page_size;
+        remote_cb_interface.fifo_rd_ptr = next_fifo_rd_ptr;
+
+        // increment the aligned pages acked because we skipped to the next aligned page location
+        *remote_cb_interface.pages_acked += pages_acked;
+        uint64_t remote_ack_ptr_addr = get_noc_addr(remote_noc_x, remote_noc_y, (uint32_t)remote_cb_interface.pages_acked, noc);
+        noc_semaphore_inc(remote_ack_ptr_addr, pages_acked, noc);
+    }
+}
+
+FORCE_INLINE void remote_cb_wait_front(uint32_t num_pages) {
+    uint32_t len_bytes = num_pages * remote_cb_interface.fifo_page_size;
+    uint32_t num_pages_wait = len_bytes / remote_cb_interface.aligned_page_size;
+    volatile uint32_t num_pages_recv = 0;
+    uint32_t pages_acked = 0;
+    uint32_t pages_sent = 0;
+
+    do {
+        pages_acked = (uint32_t)reg_read((uint32_t)remote_cb_interface.pages_acked);
+        pages_sent = (uint32_t)reg_read((uint32_t)remote_cb_interface.pages_sent);
+        num_pages_recv = pages_sent - pages_acked;
+    } while (num_pages_recv < num_pages_wait);
+}
+
+FORCE_INLINE void remote_cb_pop_front(uint32_t num_pages, uint32_t remote_noc_x, uint32_t remote_noc_y, uint8_t noc = noc_index) {
+    uint32_t len_bytes = num_pages * remote_cb_interface.fifo_page_size;
+    uint32_t num_aligned_pages = len_bytes / remote_cb_interface.aligned_page_size;
+
+    *remote_cb_interface.pages_acked += num_aligned_pages;
+    remote_cb_interface.fifo_rd_ptr += len_bytes;
+
+    if (remote_cb_interface.fifo_rd_ptr >= remote_cb_interface.fifo_limit_page_aligned) {
+        remote_cb_interface.fifo_rd_ptr = remote_cb_interface.fifo_start_addr;
+    }
+
+    uint64_t remote_ack_ptr_addr = get_noc_addr(remote_noc_x, remote_noc_y, (uint32_t)remote_cb_interface.pages_acked, noc);
+    noc_semaphore_inc(remote_ack_ptr_addr, num_aligned_pages, noc);
+}
+
+void kernel_main() {
+    uint32_t rt_args_idx = 0;
+    vc = get_arg_val<uint32_t>(rt_args_idx++);
+    noc_x = get_arg_val<uint32_t>(rt_args_idx++);
+    noc_y = get_arg_val<uint32_t>(rt_args_idx++);
+    pages_acked_semaphore_addr = get_arg_val<uint32_t>(rt_args_idx++);
+    pages_sent_semaphore_addr = get_arg_val<uint32_t>(rt_args_idx++);
+
+    page_size = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+    num_blocks = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+    block_num_tiles = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+
+    start_page_size = page_size[0];
+
+    constexpr uint32_t cb_id = 0;
+
+    setup_remote_receiver_cb_interface<ALIGNED_PAGE_SIZE>();
+
+    for (uint32_t l = 0; l < num_layers; ++l) {
+        uint32_t curr_page_size = page_size[l];
+        uint32_t curr_num_blocks = num_blocks[l];
+        uint32_t curr_block_num_tiles = block_num_tiles[l];
+
+        setup_remote_cb_page_size(curr_page_size, noc_x, noc_y);
+
+        for (uint32_t block = 0; block < curr_num_blocks; ++block) {
+            remote_cb_wait_front(curr_block_num_tiles);
+            remote_cb_pop_front(curr_block_num_tiles, noc_x, noc_y);
+        }
+    }
+}
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/writer_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/writer_l1.cpp
new file mode 100644
index 00000000000..0fefcfbf9b1
--- /dev/null
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/writer_l1.cpp
@@ -0,0 +1,331 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
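The sender kernel that follows re-anchors its FIFO write pointer whenever a layer switches page size: partial pages at the end of the FIFO are skipped, and the skipped bytes are credited as already sent. The standalone sketch below restates that realignment arithmetic under the same assumptions; realign_fifo_ptr and Realign are hypothetical names used only for illustration.

#include <cstdint>

// Realign a FIFO pointer when the page size changes: the pointer keeps its
// position when it already sits on the new page grid, otherwise it snaps
// forward, and the skipped bytes must be credited to the remote side.
struct Realign { uint32_t new_ptr; uint32_t skipped_bytes; };

inline Realign realign_fifo_ptr(uint32_t fifo_start, uint32_t fifo_size,
                                uint32_t ptr, uint32_t page_size) {
    uint32_t num_pages = fifo_size / page_size;
    uint32_t limit = fifo_start + num_pages * page_size;  // page-aligned limit
    if (ptr > limit) {
        // Pointer already past the new aligned limit: wrap to the start.
        return {fifo_start, 0};
    }
    // Snap forward to the page grid anchored at the aligned limit, mirroring
    // the kernel's `limit - num_pages_till_limit * page_size` computation.
    uint32_t pages_till_limit = (limit - ptr) / page_size;
    uint32_t new_ptr = limit - pages_till_limit * page_size;
    return {new_ptr, new_ptr - ptr};  // skipped bytes to credit as sent/acked
}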
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <stdint.h>
+
+#include "dataflow_api.h"
+#include "ttnn/cpp/ttnn/operations/ccl/kernel_common/worker_sync_utils.hpp"
+
+#include "debug/dprint.h"
+
+constexpr uint32_t ALIGNED_PAGE_SIZE = 16;
+
+constexpr uint32_t noc = get_compile_time_arg_val(0);
+constexpr uint32_t cb_start_addr = get_compile_time_arg_val(1);
+constexpr uint32_t cb_wr_ptr = get_compile_time_arg_val(1);
+constexpr uint32_t cb_size = get_compile_time_arg_val(2);
+constexpr uint32_t num_receivers = get_compile_time_arg_val(3);
+constexpr uint32_t num_layers = get_compile_time_arg_val(4);
+
+tt_l1_ptr uint32_t* noc_x;
+tt_l1_ptr uint32_t* noc_y;
+tt_l1_ptr uint32_t* pages_acked_semaphore_addr;
+tt_l1_ptr uint32_t* pages_sent_semaphore_addr;
+tt_l1_ptr uint32_t* coalesced_page_size;
+tt_l1_ptr uint32_t* coalesced_num_pages;
+tt_l1_ptr uint32_t* num_blocks;
+tt_l1_ptr uint32_t* block_num_tiles;
+tt_l1_ptr uint32_t* page_size;
+tt_l1_ptr uint32_t* num_tile_rows;
+
+uint32_t start_page_size;
+uint32_t layer = 0;
+
+template <uint32_t num_recv_cbs>
+struct RemoteSenderCBInterface {
+    uint32_t num_receivers;
+
+    volatile tt_l1_ptr uint32_t* pages_acked[num_recv_cbs];
+    volatile tt_l1_ptr uint32_t* pages_sent[num_recv_cbs];
+
+    uint32_t fifo_size;
+    uint32_t fifo_limit;
+    uint32_t fifo_limit_page_aligned;
+
+    uint32_t fifo_page_size;
+    uint32_t fifo_aligned_num_pages;
+
+    uint32_t fifo_wr_ptr;
+
+    uint32_t fifo_start_addr;
+
+    uint32_t aligned_page_size;
+};
+
+RemoteSenderCBInterface<num_receivers> remote_cb_interface;
+
+template <uint32_t aligned_page_size>
+FORCE_INLINE void setup_remote_sender_cb_interface() {
+    uint32_t num_pages = cb_size / start_page_size;
+    uint32_t cb_size_page_aligned = num_pages * start_page_size;
+
+    remote_cb_interface.fifo_size = cb_size;
+    remote_cb_interface.fifo_limit = cb_size + cb_start_addr;
+    remote_cb_interface.fifo_limit_page_aligned = cb_size_page_aligned + cb_start_addr;
+
+    remote_cb_interface.fifo_page_size = start_page_size;
+    remote_cb_interface.fifo_aligned_num_pages = num_pages * start_page_size / aligned_page_size;
+
+    remote_cb_interface.fifo_wr_ptr = cb_wr_ptr;
+
+    remote_cb_interface.fifo_start_addr = cb_start_addr;
+
+    remote_cb_interface.num_receivers = num_receivers;
+
+    for (uint32_t i = 0; i < num_receivers; ++i) {
+        remote_cb_interface.pages_acked[i] = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(get_semaphore(pages_acked_semaphore_addr[i]));
+        remote_cb_interface.pages_sent[i] = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(get_semaphore(pages_sent_semaphore_addr[i]));
+    }
+
+    remote_cb_interface.aligned_page_size = aligned_page_size;
+}
+
+FORCE_INLINE void setup_remote_cb_page_size(uint32_t page_size, uint32_t* remote_noc_x, uint32_t* remote_noc_y, uint8_t noc = noc_index) {
+    uint32_t num_pages = remote_cb_interface.fifo_size / page_size;
+    uint32_t cb_size_page_aligned = num_pages * page_size;
+
+    remote_cb_interface.fifo_limit_page_aligned = cb_size_page_aligned + remote_cb_interface.fifo_start_addr;
+    remote_cb_interface.fifo_page_size = page_size;
+    remote_cb_interface.fifo_aligned_num_pages = num_pages * page_size / remote_cb_interface.aligned_page_size;
+
+    uint32_t curr_fifo_wr_ptr = remote_cb_interface.fifo_wr_ptr;
+    bool fifo_wr_ptr_exceed_fifo_limit = curr_fifo_wr_ptr > remote_cb_interface.fifo_limit_page_aligned;
+    uint32_t num_pages_till_fifo_limit = (remote_cb_interface.fifo_limit_page_aligned - curr_fifo_wr_ptr) / page_size;
+
+    if (fifo_wr_ptr_exceed_fifo_limit) {
+        remote_cb_interface.fifo_wr_ptr = remote_cb_interface.fifo_start_addr;
+    } else {
+        uint32_t next_fifo_wr_ptr = remote_cb_interface.fifo_limit_page_aligned - num_pages_till_fifo_limit * page_size;
+        uint32_t pages_sent = (next_fifo_wr_ptr - remote_cb_interface.fifo_wr_ptr) / remote_cb_interface.aligned_page_size;
+        remote_cb_interface.fifo_wr_ptr = next_fifo_wr_ptr;
+
+        // increment the aligned pages sent because we skipped to the next aligned page location
+        for (uint32_t i = 0; i < remote_cb_interface.num_receivers; ++i) {
+            uint32_t remote_noc_xy = uint32_t(NOC_XY_ENCODING(DYNAMIC_NOC_X(noc, remote_noc_x[i]), DYNAMIC_NOC_Y(noc, remote_noc_y[i])));
+            *remote_cb_interface.pages_sent[i] += pages_sent;
+            uint64_t remote_ack_ptr_addr = get_noc_addr_helper(remote_noc_xy, (uint32_t)remote_cb_interface.pages_sent[i]);
+            noc_semaphore_inc(remote_ack_ptr_addr, pages_sent, noc);
+        }
+    }
+}
+
+FORCE_INLINE void remote_cb_reserve_back(uint32_t num_pages) {
+    uint32_t len_bytes = num_pages * remote_cb_interface.fifo_page_size;
+    uint32_t num_pages_wait = len_bytes / remote_cb_interface.aligned_page_size;
+    uint32_t free_pages;
+
+    for (uint32_t i = 0; i < remote_cb_interface.num_receivers; ++i) {
+        do {
+            uint32_t pages_acked = (uint32_t)reg_read((uint32_t)remote_cb_interface.pages_acked[i]);
+            uint32_t pages_sent = (uint32_t)reg_read((uint32_t)remote_cb_interface.pages_sent[i]);
+            free_pages = remote_cb_interface.fifo_aligned_num_pages - (pages_sent - pages_acked);
+        } while (free_pages < num_pages_wait);
+    }
+}
+
+// unused for now, but we might need to use this one if we want to transfer the maximum noc packet
+FORCE_INLINE void remote_cb_push_back_and_write_pages_(uint32_t local_cb_addr, uint32_t num_pages, uint32_t remote_noc_x, uint32_t remote_noc_y, uint8_t noc = noc_index) {
+    uint32_t len_bytes = num_pages * remote_cb_interface.fifo_page_size;
+    uint32_t pages_sent = len_bytes / remote_cb_interface.aligned_page_size;
+
+    uint32_t local_fifo_rd_ptr = local_cb_addr;
+    uint32_t remote_fifo_wr_ptr = remote_cb_interface.fifo_wr_ptr;
+
+    uint32_t src_addr = local_cb_addr;
+    uint32_t dest_addr = remote_cb_interface.fifo_wr_ptr;
+    uint32_t remote_noc_xy = uint32_t(NOC_XY_ENCODING(DYNAMIC_NOC_X(noc, remote_noc_x), DYNAMIC_NOC_Y(noc, remote_noc_y)));
+    uint64_t dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+    while (len_bytes > NOC_MAX_BURST_SIZE) {
+        src_addr = local_fifo_rd_ptr;
+        dest_addr = remote_fifo_wr_ptr;
+        dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+        // split one write into two chunks
+        if ((dest_addr + NOC_MAX_BURST_SIZE) >= remote_cb_interface.fifo_limit_page_aligned) {
+            uint32_t first_len_bytes = remote_cb_interface.fifo_limit_page_aligned - dest_addr;
+            uint32_t second_len_bytes = NOC_MAX_BURST_SIZE - first_len_bytes;
+
+            // issue the first write transfer
+            while (!noc_cmd_buf_ready(noc, write_cmd_buf));
+            ncrisc_noc_fast_write(noc, write_cmd_buf, src_addr, dest_noc_addr, first_len_bytes, NOC_UNICAST_WRITE_VC, false, false, 1, true);
+            src_addr += first_len_bytes;
+            dest_addr = remote_cb_interface.fifo_start_addr;
+            dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+            if (second_len_bytes != 0) {
+                // issue the second write transfer
+                while (!noc_cmd_buf_ready(noc, write_cmd_buf));
+                ncrisc_noc_fast_write(noc, write_cmd_buf, src_addr, dest_noc_addr, second_len_bytes, NOC_UNICAST_WRITE_VC, false, false, 1, true);
+                src_addr += second_len_bytes;
+                dest_addr += second_len_bytes;
+            }
+        } else { // issue the write in one request
+            while (!noc_cmd_buf_ready(noc, write_cmd_buf));
+            ncrisc_noc_fast_write(noc, write_cmd_buf, src_addr, dest_noc_addr, NOC_MAX_BURST_SIZE, NOC_UNICAST_WRITE_VC, false, false, 1, true);
+            src_addr += NOC_MAX_BURST_SIZE;
+            dest_addr += NOC_MAX_BURST_SIZE;
+        }
+
+        // update the local and remote pointers
+        local_fifo_rd_ptr = src_addr;
+        remote_fifo_wr_ptr = dest_addr;
+
+        len_bytes -= NOC_MAX_BURST_SIZE;
+    }
+
+    dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+    // split the last write into two chunks
+    if ((dest_addr + len_bytes) >= remote_cb_interface.fifo_limit_page_aligned) {
+        uint32_t first_len_bytes = remote_cb_interface.fifo_limit_page_aligned - dest_addr;
+        uint32_t second_len_bytes = len_bytes - first_len_bytes;
+
+        // issue the first write transfer
+        while (!noc_cmd_buf_ready(noc, write_cmd_buf));
+        ncrisc_noc_fast_write(noc, write_cmd_buf, src_addr, dest_noc_addr, first_len_bytes, NOC_UNICAST_WRITE_VC, false, false, 1, true);
+        src_addr += first_len_bytes;
+        dest_addr = remote_cb_interface.fifo_start_addr;
+        dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+        if (second_len_bytes != 0) {
+            // issue the second write transfer
+            while (!noc_cmd_buf_ready(noc, write_cmd_buf));
+            ncrisc_noc_fast_write(noc, write_cmd_buf, src_addr, dest_noc_addr, second_len_bytes, NOC_UNICAST_WRITE_VC, false, false, 1, true);
+            src_addr += second_len_bytes;
+            dest_addr += second_len_bytes;
+        }
+    } else { // issue the write in one request
+        while (!noc_cmd_buf_ready(noc, write_cmd_buf));
+        ncrisc_noc_fast_write(noc, write_cmd_buf, src_addr, dest_noc_addr, len_bytes, NOC_UNICAST_WRITE_VC, false, false, 1, true);
+        src_addr += len_bytes;
+        dest_addr += len_bytes;
+    }
+
+    *remote_cb_interface.pages_sent[0] += pages_sent;
+    remote_cb_interface.fifo_wr_ptr = dest_addr;
+
+    uint64_t remote_ack_ptr_addr = get_noc_addr_helper(remote_noc_xy, (uint32_t)remote_cb_interface.pages_sent[0]);
+    noc_semaphore_inc(remote_ack_ptr_addr, pages_sent, noc);
+}
+
+FORCE_INLINE void remote_cb_push_back_and_write_pages(uint32_t local_cb_addr, uint32_t num_pages, uint32_t num_rows, uint32_t coalesced_num_pages_per_row, uint32_t coalesced_page_size, uint32_t* remote_noc_x, uint32_t* remote_noc_y, uint8_t noc = noc_index) {
+    uint32_t len_bytes = num_pages * remote_cb_interface.fifo_page_size;
+    uint32_t pages_sent = len_bytes / remote_cb_interface.aligned_page_size;
+
+    uint32_t next_receiver_start_addr_stride = coalesced_num_pages_per_row * coalesced_page_size;
+    uint32_t next_block_row_stride = next_receiver_start_addr_stride * remote_cb_interface.num_receivers;
+
+    uint32_t dest_addr;
+
+    uint32_t next_receiver_start_addr_offset = 0;
+    for (uint32_t i = 0; i < remote_cb_interface.num_receivers; ++i) {
+        uint32_t src_addr = local_cb_addr + next_receiver_start_addr_offset;
+        dest_addr = remote_cb_interface.fifo_wr_ptr;
+
+        uint32_t remote_noc_xy = uint32_t(NOC_XY_ENCODING(DYNAMIC_NOC_X(noc, remote_noc_x[i]), DYNAMIC_NOC_Y(noc, remote_noc_y[i])));
+        uint64_t dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+        noc_async_write_one_packet_set_state(dest_noc_addr, coalesced_page_size, noc);
+
+        for (uint32_t h = 0; h < num_rows; ++h) {
+            uint32_t prev_src_addr = src_addr;
+            for (uint32_t w = 0; w < coalesced_num_pages_per_row; ++w) {
+                dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+                if ((dest_addr + coalesced_page_size) > remote_cb_interface.fifo_limit_page_aligned) {
+                    uint32_t first_len_bytes = remote_cb_interface.fifo_limit_page_aligned - dest_addr;
+                    uint32_t second_len_bytes = coalesced_page_size - first_len_bytes;
+
+                    if (first_len_bytes != 0) {
+                        noc_async_write_one_packet(src_addr, dest_noc_addr, first_len_bytes, noc);
+                        src_addr += first_len_bytes;
+                    }
+                    dest_addr = remote_cb_interface.fifo_start_addr;
+                    dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+                    noc_async_write_one_packet(src_addr, dest_noc_addr, second_len_bytes, noc);
+
+                    src_addr += second_len_bytes;
+                    dest_addr += second_len_bytes;
+                    dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+                    noc_async_write_one_packet_set_state(dest_noc_addr, coalesced_page_size, noc);
+                } else {
+                    noc_async_write_one_packet_with_state(src_addr, dest_noc_addr, noc);
+
+                    src_addr += coalesced_page_size;
+                    dest_addr += coalesced_page_size;
+                }
+            }
+            src_addr = prev_src_addr + next_block_row_stride;
+        }
+        next_receiver_start_addr_offset += next_receiver_start_addr_stride;
+
+        *remote_cb_interface.pages_sent[i] += pages_sent;
+
+        uint64_t remote_ack_ptr_addr = get_noc_addr_helper(remote_noc_xy, (uint32_t)remote_cb_interface.pages_sent[i]);
+        noc_semaphore_inc(remote_ack_ptr_addr, pages_sent, noc);
+    }
+
+    remote_cb_interface.fifo_wr_ptr = dest_addr;
+}
+
+void kernel_main() {
+    uint32_t rt_args_idx = 0;
+    noc_x = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_receivers)));
+    noc_y = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_receivers)));
+    pages_acked_semaphore_addr = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_receivers)));
+    pages_sent_semaphore_addr = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_receivers)));
+
+    coalesced_page_size = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+    coalesced_num_pages = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+    num_blocks = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+    block_num_tiles = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+    page_size = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+    num_tile_rows = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+
+    start_page_size = page_size[0];
+
+    constexpr uint32_t cb_id = 0;
+
+    setup_remote_sender_cb_interface<ALIGNED_PAGE_SIZE>();
+
+    for (uint32_t l = 0; l < num_layers; ++l) {
+        uint32_t curr_coalesced_page_size = coalesced_page_size[l];
+        uint32_t curr_coalesced_num_pages = coalesced_num_pages[l];
+        uint32_t curr_num_blocks = num_blocks[l];
+        uint32_t curr_block_num_tiles = block_num_tiles[l];
+        uint32_t curr_page_size = page_size[l];
+        uint32_t curr_num_tile_rows = num_tile_rows[l];
+        uint32_t curr_receiver_block_num_tiles = curr_block_num_tiles / num_receivers;
+
+        setup_remote_cb_page_size(curr_page_size, noc_x, noc_y, noc);
+
+        for (uint32_t block = 0; block < curr_num_blocks; ++block) {
+            cb_wait_front(cb_id, curr_block_num_tiles);
+
+            uint32_t local_cb_addr = get_read_ptr(cb_id);
+            remote_cb_reserve_back(curr_receiver_block_num_tiles);
+            remote_cb_push_back_and_write_pages(local_cb_addr, curr_receiver_block_num_tiles, curr_num_tile_rows, curr_coalesced_num_pages, curr_coalesced_page_size, noc_x, noc_y, noc);
+
+            cb_pop_front(cb_id, curr_block_num_tiles);
+        }
+        layer++;
+    }
+}
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp
new file mode 100644
index 00000000000..5bbf0ca25b0
--- /dev/null
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp
@@ -0,0 +1,832 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "common/bfloat8.hpp"
+#include "common/bfloat16.hpp"
+#include "common/tt_backend_api_types.hpp"
+#include "tt_metal/detail/tt_metal.hpp"
+#include "tt_metal/detail/util.hpp"
+#include "tt_metal/host_api.hpp"
+#include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp"
+#include "tt_metal/common/work_split.hpp"
+#include "tests/tt_metal/test_utils/tilization.hpp"
+#include "tt_metal/test_utils/deprecated/tensor.hpp"
+#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp"
+#include
+
+using namespace tt;
+using std::chrono::duration_cast;
+using std::chrono::microseconds;
+
+////////////////////////////////////////////////////////////////////////////////
+// A tensix core that's next to a DRAM bank reads from the bank and writes to
+// the neighbouring receiver tensix core. It creates a bfloat16/bfloat8_b format
+// DRAM buffer of a given input size and writes it to the DRAM banks in
+// round-robin style.
+//
+// Disclaimer:
+// - This benchmark is designed to support an input size larger than 4GB, but
+//   current tt-metal does not seem to support buffer allocation larger than 4GB
+//   yet.
+// - Also, the detail::ReadFromBuffer API used in the DRAM write test may take a
+//   long time if the input size is large.
+//
+// Usage example:
+//   ./test_dram_offchip
+//     --k
+//     --n
+//     --num-blocks
+//     --cb-num-blocks
+//     --cb-padding
+//     --num-tests
+//     --data-type
+//     --num-banks
+//     --bank-start-id
+//     --bypass-check (set to bypass checking performance criteria fulfillment)
+////////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+std::vector<T> slice_vec(std::vector<T> const &v, int m, int n) {
+    auto first = v.cbegin() + m;
+    auto last = v.cbegin() + n + 1;
+
+    std::vector<T> vec(first, last);
+    return vec;
+}
+
+void get_max_page_size_and_num_pages(uint32_t num_tiles, uint32_t num_datums_per_tile, uint32_t& page_size, uint32_t& num_pages) {
+    uint64_t total_size = static_cast<uint64_t>(num_tiles) * num_datums_per_tile;
+
+    page_size = (8192 / num_datums_per_tile) * num_datums_per_tile;
+    while (total_size % page_size != 0 && page_size >= num_datums_per_tile) {
+        page_size -= num_datums_per_tile;
+    }
+    num_pages = total_size / page_size;
+}
+
+std::tuple<tt_metal::Program, tt_metal::KernelHandle, uint32_t> create_program(
+    tt_metal::Device *device,
+    const CoreRangeSet &dram_reader_core,
+    const CoreRangeSet &l1_receiver_cores,
+    const uint32_t &single_tile_size,
+    const tt::DataFormat &tile_format,
+    uint32_t k,
+    uint32_t n,
+    uint32_t num_blocks,
+    uint32_t cb_num_blocks,
+    uint32_t num_receivers,
+    uint32_t num_mixed_df_layers,
+    uint32_t cb_padding,
+    std::shared_ptr<tt_metal::Buffer> input_buffer,
+    std::shared_ptr<tt_metal::Buffer> output_buffer
+    ) {
+
+    log_info("created program");
+
+    tt_metal::Program program = tt_metal::Program();
+
+    auto all_cores = dram_reader_core.merge(l1_receiver_cores);
+
+    uint32_t start_tile_id = 0;
+    uint32_t kt = k / 32;
+    uint32_t nt = n / 32;
+    uint32_t block_h = kt / num_blocks;
+    uint32_t num_tile_rows_write = block_h;
+    uint32_t block_w = nt;
+    uint32_t block_num_tiles = block_h * block_w;
+
+    // DRAM reader CB
+    uint32_t reader_cb_index = 0;
+    uint32_t reader_cb_size = block_h * block_w * single_tile_size * 3;
+    // For debug purposes
+    // uint32_t reader_cb_size = block_h * block_w * single_tile_size;
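// Sizing note (worked example, values assumed from this test's defaults):
// the reader CB above holds three blocks so that up to three read
// transactions, one per NOC transaction ID, can be in flight while earlier
// blocks drain. For instance, with kt = 256, nt = 4, num_blocks = 8 and
// 2048-byte bfloat16 tiles:
//   block_h = 256 / 8 = 32, block_w = 4, block_num_tiles = 32 * 4 = 128
//   reader_cb_size = 128 * 2048 * 3 = 786432 bytes per reader core.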
+    uint32_t reader_page_size, reader_num_pages;
+    get_max_page_size_and_num_pages(block_num_tiles, single_tile_size, reader_page_size, reader_num_pages);
+
+    uint32_t receiver_block_num_tile = block_h * block_w / num_receivers;
+    uint32_t writer_page_size, writer_num_pages;
+    get_max_page_size_and_num_pages(block_w / num_receivers, single_tile_size, writer_page_size, writer_num_pages);
+
+    log_info("writer_page_size: {}", writer_page_size);
+    log_info("writer_num_pages: {}", writer_num_pages);
+
+    uint32_t reader_cb_addr = device->get_base_allocator_addr(HalMemType::L1);
+    tt_metal::CircularBufferConfig reader_cb_config =
+        tt_metal::CircularBufferConfig(reader_cb_size, {{reader_cb_index, tile_format}})
+            .set_page_size(reader_cb_index, single_tile_size);
+    auto reader_cb = tt_metal::CreateCircularBuffer(program, dram_reader_core, reader_cb_config);
+
+    // mixed cb dataformat
+    uint32_t next_layer_num_blocks = num_blocks * 2;
+    uint32_t next_layer_block_h = kt / next_layer_num_blocks;
+    uint32_t next_layer_block_num_tiles = next_layer_block_h * block_w;
+    uint32_t next_layer_num_tile_rows_write = next_layer_block_h;
+    uint32_t next_layer_receiver_block_num_tile = next_layer_block_num_tiles / num_receivers;
+
+    uint32_t next_layer_single_tile_size = single_tile_size;
+    if (tile_format == tt::DataFormat::Float16_b) {
+        next_layer_single_tile_size = 1088;
+    } else {
+        next_layer_single_tile_size = 2048;
+    }
+    uint32_t next_layer_reader_page_size, next_layer_reader_num_pages;
+    get_max_page_size_and_num_pages(next_layer_block_num_tiles, next_layer_single_tile_size, next_layer_reader_page_size, next_layer_reader_num_pages);
+
+    uint32_t next_layer_writer_page_size, next_layer_writer_num_pages;
+    get_max_page_size_and_num_pages(block_w / num_receivers, next_layer_single_tile_size, next_layer_writer_page_size, next_layer_writer_num_pages);
+
+    // L1 receiver CB
+    uint32_t receiver_cb_index = 0;
+    uint32_t receiver_cb_size = block_h * block_w * single_tile_size * cb_num_blocks / num_receivers + cb_padding;
+    uint32_t receiver_page_size = 32;
+    uint32_t receiver_cb_addr = output_buffer->address();
+    tt_metal::CircularBufferConfig receiver_cb_config =
+        tt_metal::CircularBufferConfig(receiver_cb_size, {{receiver_cb_index, tile_format}})
+            .set_page_size(receiver_cb_index, receiver_page_size).set_globally_allocated_address(*output_buffer);
+    auto receiver_cb = tt_metal::CreateCircularBuffer(program, l1_receiver_cores, receiver_cb_config);
+
+    log_info("reader_cb_size: {}", reader_cb_size);
+    log_info("receiver_cb_size: {}", receiver_cb_size);
+
+    // semaphores
+    std::vector<uint32_t> pages_acked_semaphore_ids(num_receivers);
+    std::vector<uint32_t> pages_sent_semaphore_ids(num_receivers);
+    for (uint32_t i = 0; i < num_receivers; ++i) {
+        pages_acked_semaphore_ids[i] = tt_metal::CreateSemaphore(program, all_cores, INVALID);
+        pages_sent_semaphore_ids[i] = tt_metal::CreateSemaphore(program, all_cores, INVALID);
+    }
+
+    std::vector<uint32_t> reader_compile_time_args = {
+        (std::uint32_t) input_buffer->address(),
+        (std::uint32_t) start_tile_id,
+        (std::uint32_t) tt_metal::NOC::RISCV_0_default,
+        (std::uint32_t) num_mixed_df_layers
+    };
+
+    auto reader_kernel = tt_metal::CreateKernel(
+        program,
+        "tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/reader_dram.cpp",
+        dram_reader_core,
+        tt_metal::DataMovementConfig{
+            .processor = tt_metal::DataMovementProcessor::RISCV_0,
+            .noc = tt_metal::NOC::RISCV_0_default,
+            .noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC,
+            .compile_args = reader_compile_time_args});
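// Page-size selection note (worked example, values assumed): the
// get_max_page_size_and_num_pages() helper used above picks the largest page
// size that is a multiple of the tile size, at most 8192 B, and divides the
// total transfer evenly. E.g. for 128 tiles of 2048 B each:
//   initial page_size = (8192 / 2048) * 2048 = 8192 B
//   total = 128 * 2048 = 262144 B, and 262144 % 8192 == 0,
//   so page_size = 8192 and num_pages = 262144 / 8192 = 32.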
+
+    std::vector<uint32_t> writer_compile_time_args = {
+        (std::uint32_t) tt_metal::NOC::RISCV_0_default,
+        (std::uint32_t) receiver_cb_addr,
+        (std::uint32_t) receiver_cb_size,
+        (std::uint32_t) num_receivers,
+        (std::uint32_t) num_mixed_df_layers
+    };
+
+    auto writer_kernel = tt_metal::CreateKernel(
+        program,
+        "tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/writer_l1.cpp",
+        dram_reader_core,
+        tt_metal::DataMovementConfig{
+            .processor = tt_metal::DataMovementProcessor::RISCV_1,
+            .noc = tt_metal::NOC::RISCV_1_default,
+            .noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC,
+            .compile_args = writer_compile_time_args});
+
+    std::vector<uint32_t> receiver_compile_time_args = {
+        (std::uint32_t) reader_cb_addr,
+        (std::uint32_t) receiver_cb_size,
+        (std::uint32_t) num_mixed_df_layers,
+    };
+
+    auto receiver_kernel = tt_metal::CreateKernel(
+        program,
+        "tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/receiver_l1.cpp",
+        l1_receiver_cores,
+        tt_metal::DataMovementConfig{
+            .processor = tt_metal::DataMovementProcessor::RISCV_1,
+            .noc = tt_metal::NOC::RISCV_1_default,
+            .compile_args = receiver_compile_time_args});
+
+    // reader rt args
+    auto dram_reader_core_coord = dram_reader_core.ranges().begin()->start_coord;
+    log_info("dram_reader_core_coord: {}", dram_reader_core_coord);
+    auto dram_reader_core_coord_physical = device->worker_core_from_logical_core(dram_reader_core_coord);
+    uint32_t bank_id = 0;
+    uint32_t vc = bank_id & 0x1;
+    std::vector<uint32_t> reader_rt_args = {
+        (std::uint32_t) bank_id,
+        (std::uint32_t) vc
+    };
+    for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+        reader_rt_args.push_back(i % 2 == 0 ? reader_page_size : next_layer_reader_page_size);
+    }
+    for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+        reader_rt_args.push_back(i % 2 == 0 ? reader_num_pages : next_layer_reader_num_pages);
+    }
+    for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+        reader_rt_args.push_back(i % 2 == 0 ? num_blocks : next_layer_num_blocks);
+    }
+    for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+        reader_rt_args.push_back(i % 2 == 0 ? block_num_tiles : next_layer_block_num_tiles);
+    }
+    tt_metal::SetRuntimeArgs(program, reader_kernel, dram_reader_core_coord, reader_rt_args);
+
+    // writer rt args
+    std::vector<CoreCoord> l1_receiver_core_coords;
+    for (auto l1_receiver_core_coord : *l1_receiver_cores.ranges().begin()) {
+        l1_receiver_core_coords.push_back(l1_receiver_core_coord);
+    }
+    std::vector<uint32_t> writer_rt_args;
+    for (uint32_t i = 0; i < num_receivers; ++i) {
+        auto l1_receiver_core_coord_physical = device->worker_core_from_logical_core(l1_receiver_core_coords[i]);
+        writer_rt_args.push_back(l1_receiver_core_coord_physical.x);
+    }
+    for (uint32_t i = 0; i < num_receivers; ++i) {
+        auto l1_receiver_core_coord_physical = device->worker_core_from_logical_core(l1_receiver_core_coords[i]);
+        writer_rt_args.push_back(l1_receiver_core_coord_physical.y);
+    }
+    for (uint32_t i = 0; i < num_receivers; ++i) {
+        writer_rt_args.push_back(pages_acked_semaphore_ids[i]);
+    }
+    for (uint32_t i = 0; i < num_receivers; ++i) {
+        writer_rt_args.push_back(pages_sent_semaphore_ids[i]);
+    }
+    for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+        writer_rt_args.push_back(i % 2 == 0 ? writer_page_size : next_layer_writer_page_size);
+    }
+    for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+        writer_rt_args.push_back(i % 2 == 0 ? writer_num_pages : next_layer_writer_num_pages);
+    }
+    for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+        writer_rt_args.push_back(i % 2 == 0 ? num_blocks : next_layer_num_blocks);
+    }
+    for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+        writer_rt_args.push_back(i % 2 == 0 ? block_num_tiles : next_layer_block_num_tiles);
+    }
+    for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+        writer_rt_args.push_back(i % 2 == 0 ? single_tile_size : next_layer_single_tile_size);
+    }
+    for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+        writer_rt_args.push_back(i % 2 == 0 ? num_tile_rows_write : next_layer_num_tile_rows_write);
+    }
+    tt_metal::SetRuntimeArgs(program, writer_kernel, dram_reader_core_coord, writer_rt_args);
+
+    // receiver rt args
+    for (uint32_t i = 0; i < num_receivers; ++i) {
+        std::vector<uint32_t> receiver_rt_args = {
+            (std::uint32_t) vc & 0x3,
+            (std::uint32_t) dram_reader_core_coord_physical.x,
+            (std::uint32_t) dram_reader_core_coord_physical.y
+        };
+        vc++;
+
+        receiver_rt_args.push_back(pages_acked_semaphore_ids[i]);
+        receiver_rt_args.push_back(pages_sent_semaphore_ids[i]);
+
+        for (uint32_t j = 0; j < num_mixed_df_layers; ++j) {
+            receiver_rt_args.push_back(j % 2 == 0 ? single_tile_size : next_layer_single_tile_size);
+        }
+        for (uint32_t j = 0; j < num_mixed_df_layers; ++j) {
+            receiver_rt_args.push_back(j % 2 == 0 ? num_blocks : next_layer_num_blocks);
+        }
+        for (uint32_t j = 0; j < num_mixed_df_layers; ++j) {
+            receiver_rt_args.push_back(j % 2 == 0 ? receiver_block_num_tile : next_layer_receiver_block_num_tile);
+        }
+
+        log_info("l1_receiver_core_coords: {}", l1_receiver_core_coords[i]);
+
+        tt_metal::SetRuntimeArgs(program, receiver_kernel, l1_receiver_core_coords[i], receiver_rt_args);
+    }
+
+    return {std::move(program), reader_kernel, reader_cb_addr};
+}
+
+float to_float(bfloat16 bfloat16_num) {
+    return bfloat16_num.to_float();
+}
+
+float pcc(const std::vector<float>& x, const std::vector<float>& y) {
+    if (x.size() != y.size()) {
+        throw std::invalid_argument("Vectors must be of the same length.");
+    }
+
+    int n = x.size();
+    float mean_x = 0, mean_y = 0;
+    for (int i = 0; i < n; ++i) {
+        mean_x += x[i];
+        mean_y += y[i];
+    }
+    mean_x /= n;
+    mean_y /= n;
+
+    float numerator = 0, sum_sq_x = 0, sum_sq_y = 0;
+    for (int i = 0; i < n; ++i) {
+        float diff_x = x[i] - mean_x;
+        float diff_y = y[i] - mean_y;
+        numerator += diff_x * diff_y;
+        sum_sq_x += diff_x * diff_x;
+        sum_sq_y += diff_y * diff_y;
+    }
+
+    float denominator = std::sqrt(sum_sq_x * sum_sq_y);
+    if (denominator == 0) {
+        return 0;
+    }
+
+    return numerator / denominator;
+}
+
+bool validation_bfp8_b(
+    tt::deprecated::Tensor<float> input_tensor,
+    const tt::DataFormat &data_format,
+    uint32_t num_blocks,
+    uint32_t cb_num_blocks,
+    uint32_t kt,
+    uint32_t nt,
+    std::shared_ptr<tt_metal::Buffer> out_buffer
+) {
+    bool pass = true;
+    std::vector<float> golden_vec(kt * nt * 32 * 32 / num_blocks * cb_num_blocks, 0); // Initialize with zeros
+    std::vector<float> result_vec(kt * nt * 32 * 32 / num_blocks * cb_num_blocks, 0);
+    auto num_datums_per_cb = kt * nt * 32 * 32 / num_blocks * cb_num_blocks;
+
+    std::vector<float> result_untilized;
+    std::vector<uint32_t> result;
+    tt::tt_metal::detail::ReadFromBuffer(out_buffer, result);
+    auto result_bfp8 = unpack_bfp8_tiles_into_float_vec(result, true, false);
+    result_untilized = tt::test_utils::untilize(result_bfp8, kt*32 / num_blocks * cb_num_blocks, nt*32);
+
+    const auto& values = input_tensor.get_values();
+
+    int index = 0;
+    for (int i = 0; i < kt * nt * 32 * 32; ++i) {
+        golden_vec[index] = float(values[i]);
+        index++;
+
+        if (index == num_datums_per_cb) {
+            index = 0;
+        }
+    }
+
+    for (int i = 0; i < result_vec.size(); ++i) {
+        result_vec[i] = result_untilized[i];
+    }
+
+    pass &= pcc(golden_vec, result_vec) >= 0.9999;
+    if (!pass) {
+        log_error(LogTest, "validation single core failed");
+    }
+    return pass;
+}
+
+bool validation_fp16(
+    tt::deprecated::Tensor<bfloat16> input_tensor,
+    const tt::DataFormat &data_format,
+    uint32_t num_blocks,
+    uint32_t cb_num_blocks,
+    uint32_t kt,
+    uint32_t nt,
+    std::shared_ptr<tt_metal::Buffer> out_buffer
+) {
+    bool pass = true;
+    std::vector<float> golden_vec(kt * nt * 32 * 32 / num_blocks * cb_num_blocks, 0); // Initialize with zeros
+    std::vector<float> result_vec(kt * nt * 32 * 32 / num_blocks * cb_num_blocks, 0);
+    auto num_datums_per_cb = kt * nt * 32 * 32 / num_blocks * cb_num_blocks;
+
+    std::vector<uint32_t> result;
+    tt::tt_metal::detail::ReadFromBuffer(out_buffer, result);
+    auto result_bfp16 = unpack_uint32_vec_into_bfloat16_vec(result);
+    auto result_flat_layout = convert_to_flat_layout(result_bfp16);
+    auto result_untilized = tt::test_utils::untilize(result_flat_layout, kt*32 / num_blocks * cb_num_blocks, nt*32);
+
+    const auto& values = input_tensor.get_values();
+
+    int index = 0;
+    for (int i = 0; i < kt * nt * 32 * 32; ++i) {
+        golden_vec[index] = to_float(values[i]);
+        index++;
+
+        if (index == num_datums_per_cb) {
+            index = 0;
+        }
+    }
+
+    for (int i = 0; i < result_vec.size(); ++i) {
+        result_vec[i] = to_float(static_cast<bfloat16>(result_untilized[i]));
+    }
+
+    pass &= (golden_vec == result_vec);
+    if (!pass) {
+        log_error(LogTest, "validation single core failed");
+    }
+    return pass;
+}
+
+bool validation_mixed_df(
+    tt::deprecated::Tensor<bfloat16> input_tensor_fp16,
+    tt::deprecated::Tensor<float> input_tensor_fp8,
+    const tt::DataFormat &data_format,
+    uint32_t num_blocks,
+    uint32_t cb_num_blocks,
+    uint32_t kt,
+    uint32_t nt,
+    std::shared_ptr<tt_metal::Buffer> out_buffer,
+    uint32_t num_mixed_df_layers,
+    uint32_t num_receivers
+) {
+    bool pass = true;
+
+    std::vector<uint32_t> result;
+    tt::tt_metal::detail::ReadFromBuffer(out_buffer, result);
+
+    auto result_bfp16 = unpack_uint32_vec_into_bfloat16_vec(result);
+    auto result_untilized_fp16 = convert_to_flat_layout(result_bfp16);
+
+    std::vector<float> golden_vec(kt*32 / num_blocks * cb_num_blocks * nt*32);
+    std::vector<float> result_vec_fp16(kt*32 / num_blocks * cb_num_blocks * nt*32);
+
+    // compare against the tilized input
+    auto values_fp16 = tt::test_utils::tilize(input_tensor_fp16.get_values(), kt*32, nt*32);
+
+    auto num_datums_per_cb = kt * nt * 32 * 32 / num_blocks * cb_num_blocks / num_receivers;
+    int start_index = 0;
+    int fifo_size = kt*32 / num_blocks * cb_num_blocks * nt*32 * 2 / num_receivers;
+    int fifo_size_page_aligned, page_size, num_pages, layer_transfer_size, fifo_wr_ptr = 0;
+    for (int l = 0; l < num_mixed_df_layers; ++l) {
+        if (l % 2 == 0) { // fp16
+            page_size = 2048;
+        } else {
+            page_size = 1088;
+        }
+        layer_transfer_size = page_size * kt * nt / num_receivers;
+        num_pages = fifo_size / page_size;
+        fifo_size_page_aligned = page_size * num_pages;
+
+        bool fifo_wr_ptr_exceed_fifo_limit = fifo_wr_ptr > fifo_size_page_aligned;
+        uint32_t num_pages_till_fifo_limit = (fifo_size_page_aligned - fifo_wr_ptr) / page_size;
+        // start pointer addr of the current layer
+        fifo_wr_ptr = fifo_wr_ptr_exceed_fifo_limit ? 0 : fifo_size_page_aligned - num_pages_till_fifo_limit * page_size;
+        // start index to read; fifo_wr_ptr / 2 because of the fp16 format
+        start_index = fifo_wr_ptr == fifo_size_page_aligned ? 0 : fifo_wr_ptr / 2;
+        // end pointer addr of the current layer
+        fifo_wr_ptr = (fifo_wr_ptr + layer_transfer_size) % fifo_size_page_aligned;
+    }
+
+    std::vector<std::vector<float> > values_fp16_split(num_receivers, std::vector<float>(values_fp16.size() / num_receivers));
+
+    int index = 0;
+    for (int k = 0; k < kt; ++k) {
+        for (int n = 0; n < num_receivers; ++n) {
+            for (int i = 0; i < nt * 32 * 32 / num_receivers; ++i) {
+                values_fp16_split[n][i + k * nt * 32 * 32 / num_receivers] = to_float(values_fp16[index]);
+                index++;
+            }
+        }
+    }
+
+    std::vector<std::vector<float> > golden_vec_split(num_receivers, std::vector<float>(golden_vec.size() / num_receivers));
+
+    for (int n = 0; n < num_receivers; ++n) {
+        index = start_index;
+        for (int i = 0; i < kt * nt * 32 * 32 / num_receivers; ++i) {
+            golden_vec_split[n][index] = values_fp16_split[n][i];
+            index++;
+
+            if (index == num_datums_per_cb) {
+                index = 0;
+            }
+        }
+    }
+
+    index = 0;
+    for (int k = 0; k < kt / num_blocks * cb_num_blocks; ++k) {
+        for (int n = 0; n < num_receivers; ++n) {
+            for (int i = 0; i < nt * 32 * 32 / num_receivers; ++i) {
+                golden_vec[index] = golden_vec_split[n][i + k * nt * 32 * 32 / num_receivers];
+                index++;
+            }
+        }
+    }
+
+    for (int i = 0; i < result_vec_fp16.size(); ++i) {
+        result_vec_fp16[i] = to_float(static_cast<bfloat16>(result_untilized_fp16[i]));
+    }
+
+    // For debug purposes
+    // for (int i = 0; i < golden_vec.size(); ++i) {
+    //     std::cout << golden_vec[i] << " ";
+    //     if ((i+1) % 32 == 0) {
+    //         std::cout << std::endl;
+    //     }
+    // }
+    // std::cout << std::endl;
+    // std::cout << std::endl;
+    // for (int i = 0; i < result_vec_fp16.size(); ++i) {
+    //     std::cout << result_vec_fp16[i] << " ";
+    //     if ((i+1) % 32 == 0) {
+    //         std::cout << std::endl;
+    //     }
+    // }
+
+    pass &= pcc(golden_vec, result_vec_fp16) == 1.0;
+
+    if (!pass) {
+        log_error(LogTest, "validation single core failed");
+    }
+    return pass;
+}
+
+std::shared_ptr<tt_metal::Buffer> create_and_transfer_data_sharded_cb(
+    tt_metal::Device* device,
+    vector<uint32_t> input_vec,
+    uint32_t ht,
+    uint32_t wt,
+    BufferType buffer_type,
+    tt::DataFormat data_format,
+    CoreRangeSet cores,
+    uint32_t num_receivers
+) {
+    uint32_t size_bytes;
+    uint32_t page_size_bytes;
+    if (data_format == tt::DataFormat::Bfp8_b) {
+        size_bytes = ht * wt * 1088;
+        page_size_bytes = 1088;
+    } else {
+        size_bytes = ht * tt::constants::TILE_HEIGHT * wt * tt::constants::TILE_WIDTH * 2;
+        page_size_bytes = tt::constants::TILE_HW * 2;
+    }
+
+    ShardSpecBuffer shard_spec = ShardSpecBuffer(
+        cores,
+        {ht * tt::constants::TILE_HEIGHT, wt * tt::constants::TILE_WIDTH / num_receivers},
+        ShardOrientation::ROW_MAJOR,
+        false,
+        {tt::constants::TILE_HEIGHT, tt::constants::TILE_WIDTH},
+        {ht, wt});
+
+    log_info("cores: {}", cores);
+    log_info("size_bytes: {}", size_bytes);
+    log_info("page_size_bytes: {}", page_size_bytes);
+
+    auto input_buffer = CreateBuffer(tt::tt_metal::ShardedBufferConfig{
+        .device = device,
+        .size = size_bytes,
+        .page_size = page_size_bytes,
+        .buffer_type = buffer_type,
+        .buffer_layout = TensorMemoryLayout::WIDTH_SHARDED,
+        .shard_parameters = shard_spec});
+    tt::tt_metal::detail::WriteToBuffer(input_buffer, input_vec);
+
+    log_info("created sharded tensor");
+
+    return input_buffer;
+}
+
+int main(int argc, char **argv) {
+    if (getenv("TT_METAL_SLOW_DISPATCH_MODE") != nullptr) {
+        log_error("Test not supported w/ slow dispatch, exiting");
+    }
+
+    bool pass = true;
+    bool use_device_profiler = false;
+    uint32_t df = 0;
+    std::vector<double> dram_bandwidth;
+    uint32_t num_tests = 1;
+    uint32_t num_blocks = 8;
+    uint32_t cb_num_blocks = 8;
+    uint32_t cb_padding = 16;
+    uint32_t num_receivers = 1;
+    uint32_t num_mixed_df_layers = 1;
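// Example invocation (the flags match the parser below; the binary name is
// assumed from the source file name, and the values are illustrative only):
//   ./test_dram_read_remote_cb --k 8192 --n 1536 --num-blocks 8 \
//     --cb-num-blocks 8 --cb-padding 16 --num-tests 1 --data-type 1 \
//     --num-receivers 2 --num-mixed-df-layers 1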
+    uint64_t k = 8192, n = 128;
+
+    try {
+        ////////////////////////////////////////////////////////////////////////////
+        // Initial Runtime Args Parse
+        ////////////////////////////////////////////////////////////////////////////
+        std::vector<std::string> input_args(argv, argv + argc);
+        try {
+            std::tie(k, input_args) =
+                test_args::get_command_option_uint64_and_remaining_args(input_args, "--k", 8192);
+            std::tie(n, input_args) =
+                test_args::get_command_option_uint64_and_remaining_args(input_args, "--n", 12*128);
+            std::tie(num_blocks, input_args) =
+                test_args::get_command_option_uint64_and_remaining_args(input_args, "--num-blocks", 8);
+            std::tie(cb_num_blocks, input_args) =
+                test_args::get_command_option_uint64_and_remaining_args(input_args, "--cb-num-blocks", 8);
+            std::tie(cb_padding, input_args) =
+                test_args::get_command_option_uint64_and_remaining_args(input_args, "--cb-padding", 16);
+            std::tie(num_tests, input_args) =
+                test_args::get_command_option_uint32_and_remaining_args(input_args, "--num-tests", 1);
+            std::tie(use_device_profiler, input_args) =
+                test_args::has_command_option_and_remaining_args(input_args, "--use-device-profiler");
+            std::tie(df, input_args) =
+                test_args::get_command_option_uint32_and_remaining_args(input_args, "--data-type", 0);
+            std::tie(num_receivers, input_args) =
+                test_args::get_command_option_uint64_and_remaining_args(input_args, "--num-receivers", 1);
+            std::tie(num_mixed_df_layers, input_args) =
+                test_args::get_command_option_uint64_and_remaining_args(input_args, "--num-mixed-df-layers", 1);
+
+            test_args::validate_remaining_args(input_args);
+        } catch (const std::exception &e) {
+            log_error(tt::LogTest, "Command line arguments found exception: {}", e.what());
+            TT_ASSERT(false);
+        }
+
+        log_info("num_mixed_df_layers: {} ", num_mixed_df_layers);
+        log_info("num_receivers: {} ", num_receivers);
+
+        TT_FATAL(num_mixed_df_layers % 2 == 1, "currently only supports testing an odd number of layers, due to an issue with validation");
+        if (num_mixed_df_layers > 1) {
+            TT_FATAL(df == 1, "must start with the bfloat16 format for the mixed-df test");
+        }
+
+        if (use_device_profiler) {
+            #if !defined(TRACY_ENABLE)
+            log_error(
+                LogTest,
+                "Metal library and test code should be built with the "
+                "profiler option using ./scripts/build_scripts/build_with_profiler_opt.sh");
+            #endif
+            auto device_profiler = getenv("TT_METAL_DEVICE_PROFILER");
+            TT_FATAL(
+                device_profiler,
+                "Before running the program, do one of the following in a shell: "
+                "either export the environment variable by executing export TT_METAL_DEVICE_PROFILER=1, "
+                "or run the program with TT_METAL_DEVICE_PROFILER=1 prefixed to the command");
+        }
+
+        ////////////////////////////////////////////////////////////////////////////
+        // Parameters Setup
+        ////////////////////////////////////////////////////////////////////////////
+        uint32_t num_banks = 1;
+        uint32_t input_size = 0;
+        tt::DataFormat tile_format = tt::DataFormat::Bfp8_b;
+        if (df == 0) {
+            input_size = k * n * 1088 / 1024;
+            tile_format = tt::DataFormat::Bfp8_b;
+        } else if (df == 1) {
+            input_size = k * n * 2;
+            tile_format = tt::DataFormat::Float16_b;
+        } else {
+            TT_THROW("Input data format {} is invalid. Please change.", df);
+        }
+        uint32_t output_size = input_size / num_blocks * cb_num_blocks;
+        uint32_t kt = k / 32;
+        uint32_t nt = n / 32;
+        uint32_t block_h = kt / num_blocks;
+        uint32_t block_w = nt;
+        uint32_t num_datums_per_tile = 32 * 32;
+
+        uint32_t single_tile_size = tt_metal::detail::TileSize(tile_format);
+
+        TT_FATAL(input_size % single_tile_size == 0, "input size is not aligned to tile size");
+        ////////////////////////////////////////////////////////////////////////////
+        // Device Setup
+        ////////////////////////////////////////////////////////////////////////////
+        int device_id = 0;
+        tt_metal::Device *device = tt_metal::CreateDevice(device_id);
+
+        CoreCoord dram_bank_coord = CoreCoord{0, 0};
+        CoreCoord dram_reader_core_coord = CoreCoord{0, 0};
+        CoreRange dram_reader_core_coord_range = CoreRange(dram_reader_core_coord);
+        CoreRangeSet dram_reader_core{std::set<CoreRange>{CoreRange{dram_reader_core_coord}}};
+        CoreRange l1_receiver_core_coord_range = CoreRange(CoreCoord{0, 0});
+        if (device->arch() == tt::ARCH::GRAYSKULL) {
+            l1_receiver_core_coord_range = CoreRange{CoreCoord{0, 1}, CoreCoord{0, num_receivers}};
+        } else {
+            l1_receiver_core_coord_range = CoreRange{CoreCoord{1, 0}, CoreCoord{num_receivers, 0}};
+        }
+        CoreRangeSet l1_receiver_core{std::set<CoreRange>{l1_receiver_core_coord_range}};
+
+        ////////////////////////////////////////////////////////////////////////////
+        // Input Setup
+        ////////////////////////////////////////////////////////////////////////////
+        std::vector<std::shared_ptr<tt_metal::Buffer> > input_buffers(num_mixed_df_layers);
+        std::shared_ptr<tt_metal::Buffer> output_buffer;
+        auto input_shape = SHAPE{1, 1, k, n};
+        tt::deprecated::Tensor<bfloat16> tensor_fp16 = tt::deprecated::initialize_tensor<bfloat16>(input_shape, tt::deprecated::Initialize::INCREMENT, 100, std::chrono::system_clock::now().time_since_epoch().count());
+        tt::deprecated::Tensor<float> tensor_fp8 = tt::deprecated::initialize_tensor<float>(input_shape, tt::deprecated::Initialize::INCREMENT, 100, std::chrono::system_clock::now().time_since_epoch().count());
+        if (tile_format == tt::DataFormat::Bfp8_b) {
+            for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+                if (i % 2 == 0) { // even layers
+                    auto input_vec_tilized = tt::test_utils::tilize(tensor_fp8.get_values(), k, n);
+                    std::vector<uint32_t> packed_input_vec_tile_layout = pack_fp32_vec_as_bfp8_tiles(input_vec_tilized, true, false);
+                    input_buffers[i] = create_and_transfer_data_sharded_cb(device, packed_input_vec_tile_layout, kt, nt, tt_metal::BufferType::DRAM, tt::DataFormat::Bfp8_b, dram_reader_core, num_banks);
+                } else { // odd layers
+                    auto input_vec_tilized = tt::test_utils::tilize(tensor_fp16.get_values(), k, n);
+                    auto input_vec_tile_layout = convert_to_tile_layout(input_vec_tilized);
+                    vector<uint32_t> packed_input_vec_tile_layout = pack_bfloat16_vec_into_uint32_vec(input_vec_tile_layout);
+                    input_buffers[i] = create_and_transfer_data_sharded_cb(device, packed_input_vec_tile_layout, kt, nt, tt_metal::BufferType::DRAM, tt::DataFormat::Float16_b, dram_reader_core, num_banks);
+                }
+            }
+
+            // output
+            vector<uint32_t> outputs = create_constant_vector_of_bfp8(output_size, 0, true);
+            output_buffer = create_and_transfer_data_sharded_cb(device, outputs, kt / num_blocks * cb_num_blocks, nt, tt_metal::BufferType::L1, tt::DataFormat::Bfp8_b, l1_receiver_core, num_receivers);
+
+        } else {
+            for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+                if (i % 2 == 0) { // even layers
+                    auto input_vec_tilized = tt::test_utils::tilize(tensor_fp16.get_values(), k, n);
+                    auto input_vec_tile_layout = convert_to_tile_layout(input_vec_tilized);
+                    vector<uint32_t> packed_input_vec_tile_layout = pack_bfloat16_vec_into_uint32_vec(input_vec_tile_layout);
+                    input_buffers[i] = create_and_transfer_data_sharded_cb(device, packed_input_vec_tile_layout, kt, nt, tt_metal::BufferType::DRAM, tt::DataFormat::Float16_b, dram_reader_core, num_banks);
+                } else {
+                    auto input_vec_tilized = tt::test_utils::tilize(tensor_fp8.get_values(), k, n);
+                    std::vector<uint32_t> packed_input_vec_tile_layout = pack_fp32_vec_as_bfp8_tiles(input_vec_tilized, true, false);
+                    input_buffers[i] = create_and_transfer_data_sharded_cb(device, packed_input_vec_tile_layout, kt, nt, tt_metal::BufferType::DRAM, tt::DataFormat::Bfp8_b, dram_reader_core, num_banks);
+                }
+            }
+
+            // output
+            vector<uint32_t> outputs = create_constant_vector_of_bfloat16(output_size, 0);
+            output_buffer = create_and_transfer_data_sharded_cb(device, outputs, kt / num_blocks * cb_num_blocks, nt, tt_metal::BufferType::L1, tt::DataFormat::Float16_b, l1_receiver_core, num_receivers);
+        }
+
+        for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+            log_info("input_buffers addr: {}", input_buffers[i]->address());
+        }
+
+        ////////////////////////////////////////////////////////////////////////////
+        // Application Setup
+        ////////////////////////////////////////////////////////////////////////////
+        auto [program, kernel, output_cb_addr] = create_program(device, dram_reader_core, l1_receiver_core, single_tile_size, tile_format, k, n, num_blocks, cb_num_blocks, num_receivers, num_mixed_df_layers, cb_padding, input_buffers[0], output_buffer);
+
+        ////////////////////////////////////////////////////////////////////////////
+        // Execution Application
+        ////////////////////////////////////////////////////////////////////////////
+        tt_metal::detail::CompileProgram(device, program);
+
+        log_info(LogTest, "Num tests {}", num_tests);
+        for (uint32_t i = 0; i < num_tests; ++i) {
+            EnqueueProgram(device->command_queue(), program, false);
+            Finish(device->command_queue());
+            tt_metal::DumpDeviceProfileResults(device, program);
+        }
+
+        ////////////////////////////////////////////////////////////////////////////
+        // Validation & Teardown
+        ////////////////////////////////////////////////////////////////////////////
+        if (num_mixed_df_layers == 1) {
+            if (tile_format == tt::DataFormat::Bfp8_b) {
+                pass = validation_bfp8_b(
+                    tensor_fp8,
+                    tile_format,
+                    num_blocks,
+                    cb_num_blocks,
+                    kt,
+                    nt,
+                    output_buffer);
+            } else {
+                pass = validation_fp16(
+                    tensor_fp16,
+                    tile_format,
+                    num_blocks,
+                    cb_num_blocks,
+                    kt,
+                    nt,
+                    output_buffer);
+            }
+        } else {
+            pass = validation_mixed_df(
+                tensor_fp16,
+                tensor_fp8,
+                tile_format,
+                num_blocks,
+                cb_num_blocks,
+                kt,
+                nt,
+                output_buffer,
+                num_mixed_df_layers,
+                num_receivers);
+        }
+
+        pass &= tt_metal::CloseDevice(device);
+    } catch (const std::exception &e) {
+        pass = false;
+        log_error(LogTest, "{}", e.what());
+        log_error(LogTest, "System error message: {}", std::strerror(errno));
+    }
+
+    if (pass) {
+        log_info(LogTest, "Test Passed");
+    } else {
+        log_error(LogTest, "Test Failed");
+    }
+
+    return 0;
+}
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/kernels/reader_dram.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/kernels/reader_dram.cpp
index e42ab99525a..17509788f2e 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/kernels/reader_dram.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/kernels/reader_dram.cpp
@@ -49,7 +49,7 @@ void kernel_main() {
 
     constexpr uint32_t cb_id = 0;
-    uint32_t src_base_addr = noc_async_read_tile_dram_sharded_set_state(input_addr, bank_id, vc);
+    uint32_t src_base_addr = noc_async_read_tile_dram_sharded_set_state(input_addr, page_size, bank_id, vc);
 
     uint32_t l1_read_addr = 0;
     constexpr uint32_t total_num_blocks_in_buffer = 3;
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp
index facfd0ab019..773380ebee9 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp
@@ -632,7 +632,7 @@ int main(int argc, char **argv) {
     uint32_t num_cores = num_banks; // number of DRAM banks
     // uint32_t num_banks_all = 12;
 
-    CoreRangeSet all_cores = CoreRangeSet{{}};
+    CoreRangeSet all_cores;
     std::vector<CoreCoord> all_cores_list;
     if (device->arch() == tt::ARCH::WORMHOLE_B0) {
         get_dram_reader_core_coords_wormhole_b0(device, all_cores, all_cores_list);
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/reader_dram.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/reader_dram.cpp
index 48c659c54ce..479dec38ec1 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/reader_dram.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/reader_dram.cpp
@@ -51,7 +51,7 @@ void kernel_main() {
 
     constexpr uint32_t cb_id = 0;
 
-    uint32_t src_base_addr = noc_async_read_tile_dram_sharded_set_state(input_addr, bank_id, vc);
+    uint32_t src_base_addr = noc_async_read_tile_dram_sharded_set_state(input_addr, page_size, bank_id, vc);
 
     uint32_t src_read_addr = 0;
 
 #ifdef ARCH_GRAYSKULL
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp
index a3d62706327..814b28abe02 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp
@@ -815,9 +815,9 @@ int main(int argc, char **argv) {
     uint32_t num_tiles = static_cast<uint32_t>((input_size + single_tile_size - 1) / single_tile_size);
     uint32_t num_cores = num_banks; // number of DRAM banks
 
-    CoreRangeSet all_dram_reader_cores = CoreRangeSet{{}};
+    CoreRangeSet all_dram_reader_cores;
     std::vector<CoreCoord> all_dram_reader_cores_ordered;
-    CoreRangeSet all_l1_receiver_cores = CoreRangeSet{{}};
+    CoreRangeSet all_l1_receiver_cores;
     std::vector<CoreCoord> all_l1_writer_cores_ordered;
     if (device->arch() == tt::ARCH::BLACKHOLE) {
         get_dram_reader_core_coords_blackhole(device, all_dram_reader_cores, all_dram_reader_cores_ordered);
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
index 94875c6114f..5d839ed65ba 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
@@ -38,6 +38,7 @@ set(PERF_MICROBENCH_TESTS_SRCS
     7_kernel_launch/test_kernel_launch.cpp
     8_dram_adjacent_core_read/test_dram_read.cpp
     9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp
+    10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp
 )
 foreach (TEST_SRC ${PERF_MICROBENCH_TESTS_SRCS})
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h
index 7382029c62f..0959bd24c98 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h
@@ -6,7 +6,7 @@
 #include
 #include
 
-#include "core_coord.h"
+#include "core_coord.hpp"
 #include "tt_metal/common/logger.hpp"
 #include "tt_metal/host_api.hpp"
 #include "tt_metal/impl/device/device.hpp"
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp
index 1128d5d7809..b660e49d921 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp
@@ -8,7 +8,7 @@
 #include
 #include
 
-#include "core_coord.h"
+#include "core_coord.hpp"
 #include "logger.hpp"
 #include "tt_metal/host_api.hpp"
 #include "tt_metal/detail/tt_metal.hpp"
@@ -263,7 +263,7 @@ int main(int argc, char **argv) {
         {"MCAST_NOC_END_ADDR_Y", std::to_string(mcast_noc_addr_end_y)}
     };
     if (!page_size_as_runtime_arg_g) {
-        defines.insert(pair("PAGE_SIZE", std::to_string(page_size_g)));
+        defines.insert(std::pair("PAGE_SIZE", std::to_string(page_size_g)));
     }
     tt_metal::CircularBufferConfig cb_config = tt_metal::CircularBufferConfig(page_size_g * page_count_g, {{0, tt::DataFormat::Float32}})
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_bidirectional_bandwidth_no_edm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_bidirectional_bandwidth_no_edm.cpp
index 981e7b56dd1..7d5d555ab82 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_bidirectional_bandwidth_no_edm.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_bidirectional_bandwidth_no_edm.cpp
@@ -13,7 +13,7 @@
 #include "impl/device/device.hpp"
 #include "impl/kernels/kernel_types.hpp"
 #include "tt_backend_api_types.hpp"
-#include "tt_metal/common/core_coord.h"
+#include "tt_metal/common/core_coord.hpp"
 #include "tt_metal/common/math.hpp"
 #include "tt_metal/detail/tt_metal.hpp"
 #include "tt_metal/host_api.hpp"
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp
index 5a3deb226f2..11a8d230a1f 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp
@@ -15,7 +15,7 @@
 #include "impl/kernels/data_types.hpp"
 #include "impl/kernels/kernel_types.hpp"
 #include "tt_backend_api_types.hpp"
-#include "tt_metal/common/core_coord.h"
+#include "tt_metal/common/core_coord.hpp"
 #include "tt_metal/common/math.hpp"
 #include "tt_metal/detail/tt_metal.hpp"
 #include "tt_metal/host_api.hpp"
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_link_ping_latency_no_edm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_link_ping_latency_no_edm.cpp
index 8aae6595809..636bd6bfa48 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_link_ping_latency_no_edm.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_link_ping_latency_no_edm.cpp
-14,7 +14,7 @@ #include "impl/device/device.hpp" #include "impl/kernels/kernel_types.hpp" #include "tt_backend_api_types.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "tt_metal/common/math.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp index 32177a45799..cea16f0d0c1 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp @@ -8,7 +8,7 @@ #include #include -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "tt_metal/common/math.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_send_data_looping.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_send_data_looping.cpp index 2283cf458a9..41dda7d647f 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_send_data_looping.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_send_data_looping.cpp @@ -8,7 +8,7 @@ #include #include -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "tt_metal/common/math.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp index 626d6ed7668..ca52fc83771 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp @@ -10,7 +10,7 @@ #include "device/tt_arch_types.h" #include "tt_backend_api_types.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "tt_metal/common/math.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" diff --git a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp index 6626b1ee418..eaf18c1aada 100644 --- a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp +++ b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp @@ -180,6 +180,8 @@ int main(int argc, char **argv) { std::vector ths; ths.reserve(num_devices); + uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); + uint32_t compute_class_idx = magic_enum::enum_integer(HalProcessorClassType::COMPUTE); for (int i = 0; i < num_devices; i++) { auto& device = devices[i]; auto& program = new_programs[i]; @@ -197,28 +199,35 @@ int main(int argc, char **argv) { TT_FATAL(riscv1_kernel->binaries(mask) == ncrisc_binaries.at(mask), "Error"); std::string brisc_hex_path = device->build_kernel_target_path( - JitBuildProcessorType::DATA_MOVEMENT, + programmable_core_index, + dm_class_idx, 0, get_latest_kernel_binary_path(mask, riscv0_kernel)); - ll_api::memory brisc_binary = llrt::get_risc_binary(brisc_hex_path, 0, llrt::PackSpans::PACK); + ll_api::memory brisc_binary = 
llrt::get_risc_binary(brisc_hex_path, 0, ll_api::memory::PackSpans::PACK, ll_api::memory::Relocate::XIP); TT_FATAL( brisc_binary == brisc_binaries.at(mask).at(0), "Expected saved BRISC binary to be the same as binary in persistent cache"); std::string ncrisc_hex_path = device->build_kernel_target_path( - JitBuildProcessorType::DATA_MOVEMENT, + programmable_core_index, + dm_class_idx, 1, get_latest_kernel_binary_path(mask, riscv1_kernel)); - ll_api::memory ncrisc_binary = llrt::get_risc_binary(ncrisc_hex_path, 1, llrt::PackSpans::PACK); + ll_api::memory::Relocate relo_type = + (device->arch() == tt::ARCH::GRAYSKULL || device->arch() == tt::ARCH::WORMHOLE_B0) ? + ll_api::memory::Relocate::NONE : ll_api::memory::Relocate::XIP; + + ll_api::memory ncrisc_binary = llrt::get_risc_binary(ncrisc_hex_path, 1, ll_api::memory::PackSpans::PACK, relo_type); TT_FATAL( ncrisc_binary == ncrisc_binaries.at(mask).at(0), "Expected saved NCRISC binary to be the same as binary in persistent cache"); for (int trisc_id = 0; trisc_id <= 2; trisc_id++) { std::string trisc_id_str = std::to_string(trisc_id); std::string trisc_hex_path = device->build_kernel_target_path( - JitBuildProcessorType::COMPUTE, + programmable_core_index, + compute_class_idx, trisc_id, get_latest_kernel_binary_path(mask, compute_kernel)); - ll_api::memory trisc_binary = llrt::get_risc_binary(trisc_hex_path, 2, llrt::PackSpans::PACK); + ll_api::memory trisc_binary = llrt::get_risc_binary(trisc_hex_path, 2, ll_api::memory::PackSpans::PACK, ll_api::memory::Relocate::XIP); TT_FATAL( trisc_binary == compute_binaries.at(mask).at(trisc_id), "Expected saved TRISC binary for {} to be the same as binary in persistent cache", trisc_id_str); diff --git a/tests/tt_metal/tt_metal/test_core_range_set.cpp b/tests/tt_metal/tt_metal/test_core_range_set.cpp index 0c6cbb21ff0..d40d2516128 100644 --- a/tests/tt_metal/tt_metal/test_core_range_set.cpp +++ b/tests/tt_metal/tt_metal/test_core_range_set.cpp @@ -223,7 +223,7 @@ int main(int argc, char **argv) { tt_metal::Program program = tt_metal::CreateProgram(); CoreRange core_range_one({0, 0}, {1, 1}); CoreRange core_range_two({2, 2}, {3, 3}); - CoreRangeSet core_ranges = CoreRangeSet({core_range_one, core_range_two}); + CoreRangeSet core_ranges = CoreRangeSet(std::vector{core_range_one, core_range_two}); pass &= test_program_specified_with_core_range_set(device, program, core_ranges); diff --git a/tests/tt_metal/tt_metal/test_create_kernel_from_string.cpp b/tests/tt_metal/tt_metal/test_create_kernel_from_string.cpp index be751bcd70b..fcf79a112f0 100644 --- a/tests/tt_metal/tt_metal/test_create_kernel_from_string.cpp +++ b/tests/tt_metal/tt_metal/test_create_kernel_from_string.cpp @@ -4,7 +4,7 @@ #include -#include "core_coord.h" +#include "core_coord.hpp" #include "detail/tt_metal.hpp" #include "host_api.hpp" #include "impl/device/device.hpp" diff --git a/tests/tt_metal/tt_metal/test_kernel_path_env_var.cpp b/tests/tt_metal/tt_metal/test_kernel_path_env_var.cpp index 7d23b4a302a..aceb624577e 100644 --- a/tests/tt_metal/tt_metal/test_kernel_path_env_var.cpp +++ b/tests/tt_metal/tt_metal/test_kernel_path_env_var.cpp @@ -8,7 +8,7 @@ #include #include "assert.hpp" -#include "core_coord.h" +#include "core_coord.hpp" #include "detail/tt_metal.hpp" #include "host_api.hpp" #include "impl/kernels/data_types.hpp" diff --git a/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp b/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp index 601ce80f696..5a1e242b314 100644 --- a/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp 
+++ b/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp @@ -9,7 +9,7 @@ #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "common/bfloat16.hpp" -#include "common/core_coord.h" +#include "common/core_coord.hpp" // #include "tt_gdb/tt_gdb.hpp" @@ -251,7 +251,7 @@ bool test_multi_core_kernel_unique_runtime_args(tt_metal::Device *device) { CoreRange core_group({0, 1}, {1, 1}); CoreRange single_core({1, 0}, {1, 0}); CoreRange all_cores(start_core, end_core); - CoreRangeSet core_blocks = CoreRangeSet({start_core_range, single_core, core_group}); + CoreRangeSet core_blocks = CoreRangeSet(std::vector{start_core_range, single_core, core_group}); uint32_t single_tile_size = 2 * 1024; int32_t num_tiles = 2048; diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp b/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp index 0865725e13f..d43254f7c37 100644 --- a/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp @@ -132,7 +132,7 @@ TEST_F(DeviceFixture, CreateMultipleSemaphoresOnSameCore) { CoreRange core_range({1, 0}, {3, 0}); CoreRangeSet core_range_set({core_range}); - CoreRangeSet core_range_set2 = core_range_set.merge({core1}); + CoreRangeSet core_range_set2 = core_range_set.merge(std::set{core1}); std::set<CoreRange> set_of_cores({CoreRange({2,0}, {2,0}), CoreRange({3,0}, {3,0}), CoreRange({5,0}, {5,0})}); CoreRangeSet core_range_set3(set_of_cores); CoreRangeSet core_range_set4({CoreRange({5,0}, {6,0})}); diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp b/tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp index ec54d27d5d6..520d04986d2 100644 --- a/tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp @@ -173,7 +173,7 @@ TEST_F(DeviceFixture, LegallyModifyRTArgsDataMovement) { // First run the program with the initial runtime args CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1)); CoreRange second_core_range(CoreCoord(3, 3), CoreCoord(5, 5)); - CoreRangeSet core_range_set({first_core_range, second_core_range}); + CoreRangeSet core_range_set(std::vector{first_core_range, second_core_range}); auto program = unit_tests::runtime_args::initialize_program_data_movement_rta(this->devices_.at(id), core_range_set, 2); ASSERT_TRUE(program.num_kernels() == 1); std::vector<uint32_t> initial_runtime_args = {101, 202}; @@ -219,7 +219,7 @@ TEST_F(DeviceFixture, LegallyModifyRTArgsCompute) { // First run the program with the initial runtime args CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1)); CoreRange second_core_range(CoreCoord(3, 3), CoreCoord(5, 5)); - CoreRangeSet core_range_set({first_core_range, second_core_range}); + CoreRangeSet core_range_set(std::vector{first_core_range, second_core_range}); std::vector<uint32_t> initial_runtime_args = {101, 202}; std::vector<uint32_t> common_runtime_args = {11, 22, 33, 44}; auto program = unit_tests::runtime_args::initialize_program_compute(this->devices_.at(id), core_range_set, initial_runtime_args.size(), common_runtime_args.size()); @@ -249,7 +249,7 @@ TEST_F(DeviceFixture, SetRuntimeArgsSubsetOfCoresCompute) { // First run the program with the initial runtime args CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1)); CoreRange second_core_range(CoreCoord(3, 3), CoreCoord(5, 5)); - CoreRangeSet core_range_set({first_core_range, second_core_range}); + CoreRangeSet core_range_set(std::vector{first_core_range,
second_core_range}); std::vector<uint32_t> initial_runtime_args = {101, 202}; std::vector<uint32_t> common_runtime_args = {11, 22, 33, 44}; @@ -277,7 +277,7 @@ TEST_F(DeviceFixture, SetRuntimeArgsUniqueValuesCompute) { // First run the program with the initial runtime args CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1)); CoreRange second_core_range(CoreCoord(3, 3), CoreCoord(5, 5)); - CoreRangeSet core_range_set({first_core_range, second_core_range}); + CoreRangeSet core_range_set(std::vector{first_core_range, second_core_range}); std::vector<uint32_t> common_runtime_args = {11, 22, 33, 44}; auto program = unit_tests::runtime_args::initialize_program_compute(this->devices_.at(id), core_range_set, 2, common_runtime_args.size()); @@ -311,7 +311,7 @@ TEST_F(DeviceFixture, SetRuntimeArgsVaryingLengthPerCore) { // First run the program with the initial runtime args CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1)); CoreRange second_core_range(CoreCoord(3, 3), CoreCoord(5, 5)); - CoreRangeSet core_range_set({first_core_range, second_core_range}); + CoreRangeSet core_range_set(std::vector{first_core_range, second_core_range}); std::vector<uint32_t> common_runtime_args = {11, 22, 33, 44}; // Figure out max number of unique runtime args across all cores, so kernel @@ -359,7 +359,7 @@ TEST_F(DeviceFixture, SetRuntimeArgsVaryingLengthPerCore) { TEST_F(DeviceFixture, IllegalTooManyRuntimeArgs) { for (unsigned int id = 0; id < num_devices_; id++) { CoreRange first_core_range(CoreCoord(1, 1), CoreCoord(2, 2)); - CoreRangeSet core_range_set({first_core_range}); + CoreRangeSet core_range_set(first_core_range); auto program = unit_tests::runtime_args::initialize_program_compute(this->devices_.at(id), core_range_set, 0, 0); // Kernel isn't run here. // Set 100 unique args, then try to set 300 common args and fail.
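Aside on the pattern running through these unit-test hunks: CoreRangeSet no longer accepts a bare braced initializer list, so call sites either pass a single CoreRange or name the container type explicitly. A short sketch of the constructor spellings the updated tests rely on, grounded in the call sites above (the free function name is hypothetical; CoreRange and CoreRangeSet come from tt_metal/common/core_coord.hpp):

#include <set>
#include <vector>
#include "tt_metal/common/core_coord.hpp"

void core_range_set_spellings() {
    CoreRange cr0({0, 0}, {1, 1});
    CoreRange cr1({3, 3}, {5, 5});

    CoreRangeSet empty;                        // replaces the old CoreRangeSet{{}} idiom
    CoreRangeSet single(cr0);                  // single-range overload, as in IllegalTooManyRuntimeArgs
    CoreRangeSet both(std::vector{cr0, cr1});  // CTAD deduces std::vector<CoreRange>
    CoreRangeSet merged = both.merge(std::set{CoreRange({5, 0}, {6, 0})});  // merge() also takes a named container
}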
@@ -381,7 +381,7 @@ TEST_F(DeviceFixture, IllegallyModifyRTArgs) { // First run the program with the initial runtime args CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1)); CoreRange second_core_range(CoreCoord(3, 3), CoreCoord(5, 5)); - CoreRangeSet core_range_set({first_core_range, second_core_range}); + CoreRangeSet core_range_set(std::vector{first_core_range, second_core_range}); auto program = unit_tests::runtime_args::initialize_program_data_movement_rta(this->devices_.at(id), core_range_set, 2); ASSERT_TRUE(program.num_kernels() == 1); std::vector<uint32_t> initial_runtime_args = {101, 202}; diff --git a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp index 199aa429f88..3b6173fb45d 100644 --- a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp @@ -75,6 +75,7 @@ TEST_F(DeviceFixture, TestCreateCircularBufferAtValidIndices) { auto cb = CreateCircularBuffer(program, cr_set, config); for (unsigned int id = 0; id < num_devices_; id++) { + detail::CompileProgram(devices_.at(id), program); program.finalize(devices_.at(id)); EXPECT_TRUE(test_cb_config_written_to_core(program, this->devices_.at(id), cr_set, golden_cb_config)); } diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp index 532276b9e39..56a9dbc37e0 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp @@ -105,7 +105,7 @@ struct SfpuConfig { size_t tile_byte_size = 0; tt::DataFormat l1_input_data_format = tt::DataFormat::Invalid; tt::DataFormat l1_output_data_format = tt::DataFormat::Invalid; - CoreRangeSet cores = {{}}; + CoreRangeSet cores = CoreRangeSet(); std::string sfpu_op = ""; bool approx_mode = true; }; @@ -398,7 +398,7 @@ TEST_F(DeviceFixture, DISABLED_AllCoreSingleTileSfpuApproxCompute) { .tile_byte_size = 2 * 32 * 32, .l1_input_data_format = tt::DataFormat::Float16_b, .l1_output_data_format = tt::DataFormat::Float16_b, - .cores = {{}}, + .cores = CoreRangeSet(), .approx_mode = true}; auto arch = this->arch_; @@ -437,7 +437,7 @@ TEST_F(DeviceFixture, DISABLED_AllCoreMultiTileSfpuApproxCompute) { .tile_byte_size = 2 * 32 * 32, .l1_input_data_format = tt::DataFormat::Float16_b, .l1_output_data_format = tt::DataFormat::Float16_b, - .cores = {{}}, + .cores = CoreRangeSet(), .approx_mode = true}; auto arch = this->arch_; diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_construct.cpp b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_construct.cpp index c1ba749cab4..04cdc9f15c2 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_construct.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_construct.cpp @@ -3,25 +3,25 @@ // SPDX-License-Identifier: Apache-2.0 #include "gtest/gtest.h" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "core_coord_fixture.hpp" namespace basic_tests::CoreRangeSet{ TEST_F(CoreCoordHarness, TestCoreRangeSetValidConstruct) { - EXPECT_NO_THROW ( ::CoreRangeSet({this->sc1, this->cr2})); - EXPECT_NO_THROW ( ::CoreRangeSet({this->cr1, this->cr2}) ); + EXPECT_NO_THROW(::CoreRangeSet(std::vector{this->sc1, this->cr2})); +
EXPECT_NO_THROW(::CoreRangeSet(std::vector{this->cr1, this->cr2})); - ::CoreRangeSet valid_ranges = ::CoreRangeSet({this->cr1, this->cr2}); + ::CoreRangeSet valid_ranges = ::CoreRangeSet(std::vector{this->cr1, this->cr2}); EXPECT_EQ(valid_ranges.ranges().size(), 2); } TEST_F(CoreCoordHarness, TestCoreRangeSetInvalidConstruct) { ::CoreRange overlapping_range({1, 2}, {3, 3}); - EXPECT_ANY_THROW( ::CoreRangeSet({this->cr1, this->cr2, overlapping_range}) ); - EXPECT_ANY_THROW( ::CoreRangeSet({this->sc1, this->cr1}) ); + EXPECT_ANY_THROW(::CoreRangeSet(std::vector{this->cr1, this->cr2, overlapping_range})); + EXPECT_ANY_THROW(::CoreRangeSet(std::vector{this->sc1, this->cr1})); } diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_merge.cpp b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_merge.cpp index 32c0092741f..d8bd37fa6c7 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_merge.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_merge.cpp @@ -4,7 +4,7 @@ #include "gtest/gtest.h" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "core_coord_fixture.hpp" #include @@ -12,18 +12,19 @@ namespace basic_tests::CoreRangeSet{ TEST_F(CoreCoordHarness, TestCoreRangeSetMergeNoSolution) { - EXPECT_EQ ( ::CoreRangeSet({sc1}).merge({sc3}).ranges() , std::set<::CoreRange>( {sc1,sc3}) ); - EXPECT_EQ ( ::CoreRangeSet({cr1}).merge({cr2}).ranges() , std::set<::CoreRange>( {cr1,cr2}) ); - EXPECT_EQ ( ::CoreRangeSet({cr1}).merge({cr1,cr2}).ranges() , std::set<::CoreRange>( {cr1,cr2}) ); - EXPECT_EQ ( ::CoreRangeSet({cr1}).merge({cr2}).merge({cr3}).ranges() , std::set<::CoreRange>( {cr1,cr2,cr3}) ); + EXPECT_EQ(::CoreRangeSet(sc1).merge(std::set{sc3}).ranges(), std::set<::CoreRange>({sc1, sc3})); + EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{cr2}).ranges(), std::set<::CoreRange>({cr1, cr2})); + EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{cr1, cr2}).ranges(), std::set<::CoreRange>({cr1, cr2})); + EXPECT_EQ( + ::CoreRangeSet(cr1).merge(std::set{cr2}).merge(std::set{cr3}).ranges(), std::set<::CoreRange>({cr1, cr2, cr3})); } TEST_F(CoreCoordHarness, TestCoreRangeSetMergeCoreCoord) { - ::CoreRangeSet empty_crs({}); - EXPECT_EQ ( empty_crs.merge({this->sc1}).ranges().size(), 1); - EXPECT_EQ ( ::CoreRangeSet({cr1}).merge({sc3, sc4}).ranges() , std::set<::CoreRange>( {cr16}) ); - EXPECT_EQ ( ::CoreRangeSet({cr1}).merge({sc3}).merge({sc4}).ranges() , std::set<::CoreRange>( {cr16}) ); + ::CoreRangeSet empty_crs; + EXPECT_EQ(empty_crs.merge(std::set{this->sc1}).ranges().size(), 1); + EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{sc3, sc4}).ranges(), std::set<::CoreRange>({cr16})); + EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{sc3}).merge(std::set{sc4}).ranges(), std::set<::CoreRange>({cr16})); CoreRange rect ( {0,0}, {4,2}); std::set rect_pts; for (unsigned y = rect.start_coord.y; y <= rect.end_coord.y; y++){ @@ -38,18 +39,22 @@ TEST_F(CoreCoordHarness, TestCoreRangeSetMergeCoreCoord) EXPECT_EQ ( empty_crs.merge(rect_pts).ranges(), std::set<::CoreRange>( {rect, CoreRange( {2,3}, {3,5} ) } )); // "H", sub-optimal currently, should be reduced down to 3 CRs instead of 5 - EXPECT_EQ ( empty_crs.merge( { CoreRange { {0,0}, {1,5} }, CoreRange { {3,0}, {4,5}}, CoreRange { {0,2} , {4,3} } } ).ranges(), + EXPECT_EQ ( empty_crs.merge( std::vector{ CoreRange { {0,0}, {1,5} }, CoreRange { {3,0}, {4,5}}, CoreRange { {0,2} , {4,3} } } ).ranges(), std::set<::CoreRange>( { CoreRange { {0,0}, {1,1} }, 
CoreRange { {0,2}, {4,3}}, CoreRange{ {0,4}, {1,5}}, CoreRange { {3,0}, {4,1} }, CoreRange{ {3,4}, {4,5} } } )); } TEST_F(CoreCoordHarness, TestCoreRangeSetMergeCoreRange) { - EXPECT_EQ ( ::CoreRangeSet({cr1}).merge({cr1}).ranges() , std::set<::CoreRange>( {cr1}) ); - EXPECT_EQ ( ::CoreRangeSet({cr7}).merge({cr6}).merge({cr4}).ranges() , std::set<::CoreRange>( {cr8} ) ); - EXPECT_EQ ( ::CoreRangeSet({cr8}).merge({cr7}).merge({cr6}).merge({cr4}).ranges() , std::set<::CoreRange>( {cr8} ) ); - EXPECT_EQ ( ::CoreRangeSet({cr1, cr2, cr3}).merge({cr4}).ranges() , std::set<::CoreRange>( {cr4}) ); - EXPECT_EQ ( ::CoreRangeSet({cr1, cr2}).merge({cr4}).merge({cr6}).ranges() , std::set<::CoreRange>( {cr6}) ); + EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{cr1}).ranges(), std::set<::CoreRange>({cr1})); + EXPECT_EQ(::CoreRangeSet(cr7).merge(std::set{cr6}).merge(std::set{cr4}).ranges(), std::set<::CoreRange>({cr8})); + EXPECT_EQ( + ::CoreRangeSet(cr8).merge(std::set{cr7}).merge(std::set{cr6}).merge(std::set{cr4}).ranges(), + std::set<::CoreRange>({cr8})); + EXPECT_EQ(::CoreRangeSet(std::vector{cr1, cr2, cr3}).merge(std::set{cr4}).ranges(), std::set<::CoreRange>({cr4})); + EXPECT_EQ( + ::CoreRangeSet(std::vector{cr1, cr2}).merge(std::set{cr4}).merge(std::set{cr6}).ranges(), + std::set<::CoreRange>({cr6})); } } diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_adjacent.cpp b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_adjacent.cpp index 45b44f6e7bc..f08976402d6 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_adjacent.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_adjacent.cpp @@ -5,7 +5,7 @@ #include "gtest/gtest.h" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "core_coord_fixture.hpp" namespace basic_tests::CoreRange{ diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_contains.cpp b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_contains.cpp index 32828a50539..c9080e08da3 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_contains.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_contains.cpp @@ -4,7 +4,7 @@ #include "gtest/gtest.h" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "core_coord_fixture.hpp" namespace basic_tests::CoreRange{ diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_intersects.cpp b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_intersects.cpp index df1b90e40ac..0ee1fe3608e 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_intersects.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_intersects.cpp @@ -4,7 +4,7 @@ #include "gtest/gtest.h" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "core_coord_fixture.hpp" namespace basic_tests::CoreRange{ diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp index 6ba11cc71eb..d475d3c897b 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp @@ -4,7 +4,7 @@ #include "gtest/gtest.h" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "core_coord_fixture.hpp" namespace basic_tests::CoreRange { diff --git 
a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_merge.cpp b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_merge.cpp index d23cd67188a..db8a1b2c7ad 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_merge.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_merge.cpp @@ -5,7 +5,7 @@ #include "gtest/gtest.h" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "core_coord_fixture.hpp" namespace basic_tests::CoreRange{ diff --git a/tests/tt_metal/tt_metal/unit_tests/multichip/erisc_app_direct_send.cpp b/tests/tt_metal/tt_metal/unit_tests/multichip/erisc_app_direct_send.cpp index 72fcf923b11..de55e3f17f5 100644 --- a/tests/tt_metal/tt_metal/unit_tests/multichip/erisc_app_direct_send.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/multichip/erisc_app_direct_send.cpp @@ -98,8 +98,9 @@ bool send_over_eth( llrt::write_hex_vec_to_core(receiver_device->id(), receiver_core, args_1, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); // TODO: this should be updated to use kernel api - ll_api::memory binary_mem_send = llrt::get_risc_binary(sender_device->build_firmware_target_path(JitBuildProcessorType::ETHERNET, 0)); - ll_api::memory binary_mem_receive = llrt::get_risc_binary(receiver_device->build_firmware_target_path(JitBuildProcessorType::ETHERNET, 0)); + uint32_t active_eth_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH); + ll_api::memory binary_mem_send = llrt::get_risc_binary(sender_device->build_firmware_target_path(active_eth_index, 0, 0)); + ll_api::memory binary_mem_receive = llrt::get_risc_binary(receiver_device->build_firmware_target_path(active_eth_index, 0, 0)); for (const auto& eth_core : eth_cores) { llrt::write_hex_vec_to_core( diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp index 2223519bc63..3b1a12c88ba 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp @@ -12,7 +12,7 @@ struct TestBufferConfig { tt::tt_metal::BufferType buftype; }; -inline pair> EnqueueWriteBuffer_prior_to_wrap(tt::tt_metal::Device* device, tt::tt_metal::CommandQueue& cq, const TestBufferConfig& config) { +inline std::pair> EnqueueWriteBuffer_prior_to_wrap(tt::tt_metal::Device* device, tt::tt_metal::CommandQueue& cq, const TestBufferConfig& config) { // This function just enqueues a buffer (which should be large in the config) // write as a precursor to testing the wrap mechanism size_t buf_size = config.num_pages * config.page_size; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp index 3194e16e35c..461f07c2825 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp @@ -980,7 +980,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllCbConfigsCorrectlySentMultipleCoreR CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); CoreRange cr1({worker_grid_size.x - 2, worker_grid_size.y - 2}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); - CoreRangeSet core_ranges({cr0, cr1}); + CoreRangeSet 
core_ranges(std::vector{cr0, cr1}); DummyProgramMultiCBConfig config = {.cr_set = core_ranges, .cb_config_vector = cb_config_vector}; @@ -1001,7 +1001,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllCbConfigsCorrectlySentUpdateSizeMul CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); CoreRange cr1({worker_grid_size.x - 2, worker_grid_size.y - 2}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); - CoreRangeSet core_ranges({cr0, cr1}); + CoreRangeSet core_ranges(std::vector{cr0, cr1}); DummyProgramMultiCBConfig config = {.cr_set = core_ranges, .cb_config_vector = cb_config_vector}; @@ -1023,7 +1023,7 @@ TEST_F(CommandQueueSingleCardFixture, TestMultiCbConfigsCorrectlySentUpdateSizeM CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); CoreRange cr1({worker_grid_size.x - 2, worker_grid_size.y - 2}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); - CoreRangeSet core_ranges({cr0, cr1}); + CoreRangeSet core_ranges(std::vector{cr0, cr1}); DummyProgramMultiCBConfig config = {.cr_set = core_ranges, .cb_config_vector = cb_config_vector}; @@ -1036,7 +1036,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllSemConfigsCorrectlySentMultiCore) { CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); CoreRange cr({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); - CoreRangeSet cr_set({cr}); + CoreRangeSet cr_set(cr); DummyProgramConfig config = {.cr_set = cr_set, .num_sems = NUM_SEMAPHORES}; @@ -1052,7 +1052,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllSemaphoreConfigsCorrectlySentMultip CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); CoreRange second_cr({worker_grid_size.x - 2, worker_grid_size.y - 2}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); - CoreRangeSet cr_set({first_cr, second_cr}); + CoreRangeSet cr_set(std::vector{first_cr, second_cr}); Program program; DummyProgramConfig config = {.cr_set = cr_set, .num_sems = NUM_SEMAPHORES}; @@ -1089,7 +1089,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllRuntimeArgsCorrectlySentMultiCore) CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); CoreRange cr({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); - CoreRangeSet cr_set({cr}); + CoreRangeSet cr_set(cr); DummyProgramConfig dummy_program_config = {.cr_set = cr_set}; EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_runtime_args(device, device->command_queue(), dummy_program_config, 13, 17, 19, 1)); @@ -1101,7 +1101,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllRuntimeArgsCorrectlySentMultiCore_2 CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); CoreRange cr({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); - CoreRangeSet cr_set({cr}); + CoreRangeSet cr_set(cr); DummyProgramConfig dummy_program_config = {.cr_set = cr_set}; EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_runtime_args(device, device->command_queue(), dummy_program_config, 255, 255, 255, 1)); @@ -1114,7 +1114,7 @@ TEST_F(CommandQueueSingleCardFixture, TestSendRuntimeArgsMultiCoreRange) { CoreRange cr0({0, 0}, {worker_grid_size.x - 1, 3}); CoreRange cr1({0, 4}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); - CoreRangeSet cr_set({cr0, cr1}); + CoreRangeSet cr_set(std::vector{cr0, cr1}); DummyProgramConfig dummy_program_config = {.cr_set = cr_set}; EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_runtime_args_multi_crs( @@ -1129,7 +1129,7 @@ TEST_F(CommandQueueSingleCardFixture, TestSendRuntimeArgsMultiNonOverlappingCore 
CoreRange cr0({0, 0}, {worker_grid_size.x - 1, 3}); CoreRange cr1({0, 5}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); - CoreRangeSet cr_set({cr0, cr1}); + CoreRangeSet cr_set(std::vector{cr0, cr1}); DummyProgramConfig dummy_program_config = {.cr_set = cr_set}; EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_runtime_args_multi_crs( @@ -1143,7 +1143,7 @@ TEST_F(CommandQueueSingleCardFixture, TestUpdateRuntimeArgsMultiCoreRange) { CoreRange cr0({0, 0}, {worker_grid_size.x - 1, 3}); CoreRange cr1({0, 5}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); - CoreRangeSet cr_set({cr0, cr1}); + CoreRangeSet cr_set(std::vector{cr0, cr1}); DummyProgramConfig dummy_program_config = {.cr_set = cr_set}; EXPECT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_runtime_args_multi_crs( @@ -1155,7 +1155,7 @@ TEST_F(CommandQueueSingleCardFixture, TestUpdateRuntimeArgsMultiCoreRange) { TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute) { CoreRange cr0({1, 1}, {2, 2}); CoreRange cr1({3, 3}, {4, 4}); - CoreRangeSet cr_set({cr0, cr1}); + CoreRangeSet cr_set(std::vector{cr0, cr1}); DummyProgramConfig dummy_program_config = {.cr_set = cr_set}; for (Device *device : devices_) { EXPECT_TRUE(local_test_functions::test_increment_runtime_args_sanity(device, dummy_program_config, 16, 16, tt::RISCV::COMPUTE)); @@ -1166,7 +1166,7 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute_255_UniqueArgs) { CoreRange cr0({1, 1}, {2, 2}); CoreRange cr1({3, 3}, {4, 4}); - CoreRangeSet cr_set({cr0, cr1}); + CoreRangeSet cr_set(std::vector{cr0, cr1}); DummyProgramConfig dummy_program_config = {.cr_set = cr_set}; for (Device *device : devices_) { EXPECT_TRUE(local_test_functions::test_increment_runtime_args_sanity(device, dummy_program_config, 255, 0, tt::RISCV::COMPUTE)); @@ -1177,7 +1177,7 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute_255_CommonArgs) { CoreRange cr0({1, 1}, {2, 2}); CoreRange cr1({3, 3}, {4, 4}); - CoreRangeSet cr_set({cr0, cr1}); + CoreRangeSet cr_set(std::vector{cr0, cr1}); DummyProgramConfig dummy_program_config = {.cr_set = cr_set}; for (Device *device : devices_) { EXPECT_TRUE(local_test_functions::test_increment_runtime_args_sanity(device, dummy_program_config, 0, 255, tt::RISCV::COMPUTE)); @@ -1188,7 +1188,7 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreDataMovementBrisc) { CoreRange cr0({1, 1}, {2, 2}); CoreRange cr1({3, 3}, {4, 4}); - CoreRangeSet cr_set({cr0, cr1}); + CoreRangeSet cr_set(std::vector{cr0, cr1}); DummyProgramConfig dummy_program_config = {.cr_set = cr_set}; for (Device *device : devices_) { EXPECT_TRUE(local_test_functions::test_increment_runtime_args_sanity(device, dummy_program_config, 16, 16, tt::RISCV::BRISC)); @@ -1199,7 +1199,7 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreDataMov TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreDataMovementNcrisc) { CoreRange cr0({1, 1}, {2, 2}); CoreRange cr1({3, 3}, {4, 4}); - CoreRangeSet cr_set({cr0, cr1}); + CoreRangeSet cr_set(std::vector{cr0, cr1}); DummyProgramConfig dummy_program_config = {.cr_set = cr_set}; for (Device *device : devices_) { 
EXPECT_TRUE(local_test_functions::test_increment_runtime_args_sanity(device, dummy_program_config, 16, 16, tt::RISCV::NCRISC)); @@ -1219,7 +1219,7 @@ TEST_F(CommandQueueSingleCardFixture, DISABLED_TestFillDispatchCoreBuffer) { CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); CoreRange cr({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); - CoreRangeSet cr_set({cr}); + CoreRangeSet cr_set(cr); DummyProgramConfig dummy_program_config = {.cr_set = cr_set}; @@ -1240,7 +1240,7 @@ TEST_F(CommandQueueFixture, TestRandomizedProgram) { CoreCoord worker_grid_size = this->device_->compute_with_storage_grid_size(); CoreRange cr({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); - CoreRangeSet cr_set({cr}); + CoreRangeSet cr_set(cr); log_info(tt::LogTest, "Starting compile of {} programs now.", NUM_PROGRAMS); diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp index 1e6e8c54362..8cb072266de 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp @@ -104,7 +104,7 @@ struct SfpuConfig { size_t tile_byte_size = 0; tt::DataFormat l1_input_data_format = tt::DataFormat::Invalid; tt::DataFormat l1_output_data_format = tt::DataFormat::Invalid; - CoreRangeSet cores = {{}}; + CoreRangeSet cores = CoreRangeSet(); std::string sfpu_op = ""; bool approx_mode = true; }; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/streams/test_autonomous_relay_streams.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/streams/test_autonomous_relay_streams.cpp index 9a07d724462..74080be0bb8 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/streams/test_autonomous_relay_streams.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/streams/test_autonomous_relay_streams.cpp @@ -18,7 +18,7 @@ #include "impl/buffers/circular_buffer.hpp" #include "impl/kernels/data_types.hpp" #include "impl/kernels/kernel_types.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "tt_metal/common/math.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" diff --git a/tests/ttnn/unit_tests/gtests/ccl/test_erisc_data_mover_with_workers.cpp b/tests/ttnn/unit_tests/gtests/ccl/test_erisc_data_mover_with_workers.cpp index 4c1d15083e0..a62985f8bd3 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/test_erisc_data_mover_with_workers.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/test_erisc_data_mover_with_workers.cpp @@ -12,7 +12,7 @@ #include "device/tt_arch_types.h" // #include "tt_backend_api_types.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "tt_metal/common/math.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" diff --git a/tests/ttnn/unit_tests/operations/test_all_gather_TG_nightly.py b/tests/ttnn/unit_tests/operations/test_all_gather_TG_nightly.py new file mode 100644 index 00000000000..c982be2bd9b --- /dev/null +++ b/tests/ttnn/unit_tests/operations/test_all_gather_TG_nightly.py @@ -0,0 +1,291 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +import torch +import pytest +from loguru import logger +import ttnn +from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_equal, comp_pcc +from tests.ttnn.unit_tests.operations.test_all_gather_TG_post_commit import ( + run_line_all_gather_on_TG_with_mesh_tensor_along_rows, +) +from models.utility_functions import skip_for_grayskull + +from ttnn import ShardTensor2dMesh, ConcatMesh2dToTensor + + +# Enumerate the post-commit cases explicitly +@skip_for_grayskull("Requires eth connected devices to run") +@pytest.mark.parametrize( + "num_devices, num_links", + [(4, 3)], +) +@pytest.mark.parametrize( + "input_dtype", + [ + ttnn.bfloat16, + ttnn.bfloat8_b, + ], +) +@pytest.mark.parametrize("shard_grid_orientation", [ttnn.ShardOrientation.ROW_MAJOR]) +@pytest.mark.parametrize( + "tensor_mem_layout,per_chip_output_shape, dim, input_shard_shape,shard_grid,layout", + ( + # LLama + ( + ttnn.TensorMemoryLayout.WIDTH_SHARDED, + (1, 1, 32, 1024 * 4), + 3, + (32, 32), + ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}), + ttnn.TILE_LAYOUT, + ), + ( + ttnn.TensorMemoryLayout.WIDTH_SHARDED, + (4, 1, 32, 1280), + 0, + (32, 32), + ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 4))}), + ttnn.TILE_LAYOUT, + ), + ), +) +@pytest.mark.parametrize("replication_factor", [8]) +@pytest.mark.parametrize("enable_async", [True]) +@pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True) +def test_line_all_gather_sharded_on_TG_rows_post_commit( + mesh_device, + num_devices, + per_chip_output_shape, + input_shard_shape, + shard_grid, + shard_grid_orientation, + tensor_mem_layout, + dim, + num_links, + input_dtype, + layout, + use_program_cache, + function_level_defaults, + enable_async, + replication_factor, + num_iters=1, +): + input_shard_spec = ttnn.ShardSpec( + shard_grid, + input_shard_shape, + shard_grid_orientation, + False, + ) + run_line_all_gather_on_TG_with_mesh_tensor_along_rows( + mesh_device, + num_devices, + per_chip_output_shape, + tensor_mem_layout, + dim, + num_links, + input_dtype, + layout, + ttnn.BufferType.L1, + use_program_cache, + function_level_defaults, + enable_async=enable_async, + input_shard_spec=input_shard_spec, + num_iters=num_iters, + num_all_gather_instances=replication_factor, + cluster_axis=1, + ) + + +# Enumerate the post-commit cases explicitly +@skip_for_grayskull("Requires eth connected devices to run") +@pytest.mark.parametrize( + "num_devices, num_links", + [(8, 4), (8, 3), (8, 2)], +) +@pytest.mark.parametrize( + "input_dtype", + [ + ttnn.bfloat16, + ttnn.bfloat8_b, + ], +) +@pytest.mark.parametrize("shard_grid_orientation", [ttnn.ShardOrientation.ROW_MAJOR]) +@pytest.mark.parametrize( + "tensor_mem_layout, input_shape, dim, input_shard_shape,shard_grid,layout", + ( + ( + ttnn.TensorMemoryLayout.WIDTH_SHARDED, + (8, 1, 32, 2048), + 0, + (32, 64), + ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}), + ttnn.TILE_LAYOUT, + ), + ( + ttnn.TensorMemoryLayout.WIDTH_SHARDED, + (1, 8, 32, 2048), + 1, + (32, 64), + ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}), + ttnn.TILE_LAYOUT, + ), + ( + ttnn.TensorMemoryLayout.WIDTH_SHARDED, + (1, 1, 256, 2048), + 2, + (32, 64), + ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}), + ttnn.TILE_LAYOUT, + ), + ( + ttnn.TensorMemoryLayout.WIDTH_SHARDED, + (1, 1, 32, 16384), + 3, + (32, 64), + 
ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}), + ttnn.TILE_LAYOUT, + ), + ( + ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + (8, 1, 2048, 32), + 0, + (64, 32), + ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}), + ttnn.TILE_LAYOUT, + ), + ( + ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + (1, 8, 2048, 32), + 1, + (64, 32), + ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}), + ttnn.TILE_LAYOUT, + ), + ( + ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + (1, 1, 16384, 32), + 2, + (64, 32), + ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}), + ttnn.TILE_LAYOUT, + ), + ( + ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + (1, 1, 2048, 256), + 3, + (64, 32), + ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}), + ttnn.TILE_LAYOUT, + ), + ), +) +@pytest.mark.parametrize("replication_factor", [4]) +@pytest.mark.parametrize("enable_async", [True]) +@pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True) +def test_line_all_gather_sharded_on_TG_cols_post_commit( + mesh_device, + num_devices, + input_shape, + input_shard_shape, + shard_grid, + shard_grid_orientation, + tensor_mem_layout, + dim, + num_links, + input_dtype, + layout, + use_program_cache, + function_level_defaults, + enable_async, + replication_factor, + num_iters=1, +): + input_shard_spec = ttnn.ShardSpec( + shard_grid, + input_shard_shape, + shard_grid_orientation, + False, + ) + + run_line_all_gather_on_TG_with_mesh_tensor_along_rows( + mesh_device, + num_devices, + input_shape, + tensor_mem_layout, + dim, + num_links, + input_dtype, + layout, + ttnn.BufferType.L1, + use_program_cache, + function_level_defaults, + enable_async=enable_async, + num_iters=num_iters, + input_shard_spec=input_shard_spec, + num_all_gather_instances=replication_factor, + cluster_axis=0, + ) + + +@skip_for_grayskull("Requires eth connected devices to run") +@pytest.mark.parametrize( + "num_devices, num_links, per_chip_output_shape, dim, layout", + [ + (8, 4, [1, 8, 32, 1280], 1, ttnn.TILE_LAYOUT), + (8, 4, [8, 1, 32, 1280], 0, ttnn.TILE_LAYOUT), + (8, 4, [1, 8, 32, 2048], 1, ttnn.TILE_LAYOUT), + (8, 4, [1, 8, 32, 2304], 1, ttnn.TILE_LAYOUT), + (8, 4, [1, 8, 32, 4096], 1, ttnn.TILE_LAYOUT), + ], +) +@pytest.mark.parametrize( + "input_dtype", + [ + ttnn.bfloat16, + ttnn.bfloat8_b, + ], +) +@pytest.mark.parametrize( + "buffer_type", + [ + ttnn.BufferType.DRAM, + ttnn.BufferType.L1, + ], +) +@pytest.mark.parametrize("enable_async", [True]) +@pytest.mark.parametrize("replication_factor", [4]) # 1, 4]) +@pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True) +def test_line_all_gather_on_TG_cols_nightly( + mesh_device, + num_devices, + per_chip_output_shape, + dim, + num_links, + input_dtype, + layout, + buffer_type, + use_program_cache, + function_level_defaults, + enable_async, + replication_factor, + num_iters=1, +): + run_line_all_gather_on_TG_with_mesh_tensor_along_rows( + mesh_device, + num_devices, + per_chip_output_shape, + ttnn.TensorMemoryLayout.INTERLEAVED, + dim, + num_links, + input_dtype, + layout, + buffer_type, + use_program_cache, + function_level_defaults, + enable_async=enable_async, + num_iters=num_iters, + num_all_gather_instances=replication_factor, + cluster_axis=0, + ) diff --git a/tests/ttnn/unit_tests/operations/test_all_gather_TG_post_commit.py b/tests/ttnn/unit_tests/operations/test_all_gather_TG_post_commit.py index 
2f940250eba..be6572f75e6 100644 --- a/tests/ttnn/unit_tests/operations/test_all_gather_TG_post_commit.py +++ b/tests/ttnn/unit_tests/operations/test_all_gather_TG_post_commit.py @@ -23,8 +23,8 @@ def report_mismatches(golden, actual, max_printable=None): ] if print_it: printed += 1 - logger.error( - f"output mismatch for tensor at [{w}, {z}, {y}, {x}]: expected {int(golden[w, z, y, x])} != actual {int(actual[w, z, y, x])}" + print( + f"output mismatch for tensor at [{w}, {z}, {y}, {x}]: expected {golden[w, z, y, x]} != actual {actual[w, z, y, x]}" ) @@ -40,92 +40,79 @@ def print_tile_corners_of_tensor(t): for x in range(0, t.shape[3], 32): yy = 0 xx = 0 - str_vals += f"{int(t[w, z, y + yy, x + xx]):<5} "[:5] + val = int(t[w, z, y + yy, x + xx].item()) + str_vals += f"{val:<5} "[:5] print(f"{str_vals}") def run_line_all_gather_on_TG_with_mesh_tensor_along_rows( mesh_device, num_devices_per_line, - input_shape_per_all_gather, + per_chip_output_shape, + tensor_memory_layout, dim, num_links, input_dtype, layout, - mem_config, + buffer_type: ttnn.BufferType, use_program_cache, function_level_defaults, enable_async, - num_all_gather_instances=1, - num_iters=1, - cluster_axis=0, + input_shard_spec: ttnn.ShardSpec = None, + num_all_gather_instances: int = 1, + num_iters: int = 1, + cluster_axis: int = 0, ): if len(mesh_device.get_devices()) != 32: pytest.skip("Not TG!") mesh_device.enable_async(enable_async) - input_shape_per_chip = list(input_shape_per_all_gather) - input_shape_per_chip[2 if cluster_axis == 0 else 3] //= num_devices_per_line - tensor_height_per_all_gather = input_shape_per_all_gather[-2] - - full_mesh_input_shape = list(input_shape_per_all_gather) - full_mesh_input_shape[-2] *= num_all_gather_instances - logger.info(f"tensor_height_per_all_gather: {tensor_height_per_all_gather}") - logger.info(f"input_shape_per_all_gather: {input_shape_per_all_gather}") - logger.info(f"input_shape_per_chip: {input_shape_per_chip}") - logger.info(f"full_mesh_input_shape: {full_mesh_input_shape}") - logger.info(f"input_shape_per_all_gather: {input_shape_per_all_gather}") - - full_tensor = torch.zeros(full_mesh_input_shape, dtype=torch.bfloat16) + input_shape_per_chip = list(per_chip_output_shape) + input_shape_per_chip[dim] //= num_devices_per_line + tensor_height_per_all_gather = per_chip_output_shape[-2] - for i in range(num_all_gather_instances): - full_tensor[0, 0, i * tensor_height_per_all_gather : (i + 1) * tensor_height_per_all_gather, :] = torch.rand( - input_shape_per_all_gather - ).bfloat16() + full_mesh_input_shape = list(per_chip_output_shape) + ## The `all_gather_instances_concat_dim` is the dimension we will split the cluster spanning tensor along in order to split it + ## off into per-all-gather tensors + all_gather_instances_concat_dim = 1 if dim == 0 else 0 + full_mesh_input_shape[all_gather_instances_concat_dim] *= num_all_gather_instances + logger.info( + f"per_chip_output_shape: {full_mesh_input_shape}, dim: {dim}, all_gather_instances_concat_dim: {all_gather_instances_concat_dim}, num_devices_per_line: {num_devices_per_line}" + ) - logger.info(f"full_tensor.shape: {full_tensor.shape}") - debug = False - if debug: - tile_id = 0 - for w in range(full_tensor.shape[0]): - for z in range(full_tensor.shape[1]): - for y in range(0, full_tensor.shape[2], 32): - for x in range(0, full_tensor.shape[3], 32): - yy_max = 32 if y + 32 < full_tensor.shape[2] else full_tensor.shape[2] - y - xx_max = 32 if x + 32 < full_tensor.shape[3] else full_tensor.shape[3] - x - full_tensor[w, z, y : y + 
yy_max, x : x + xx_max] = tile_id - tile_id += 1 + all_gather_instances_goldens = [] + full_input_tensor_unfractured = torch.rand(full_mesh_input_shape, dtype=torch.bfloat16) - # - # assemble the golden output tensor - # - inner_dim_concat_axis = 2 - outer_dim_concat_axis = 3 - full_tensor_chunks_per_allgather = torch.chunk(full_tensor, num_all_gather_instances, dim=inner_dim_concat_axis) - output_chunks_per_allgather = [] - for i, chunk in enumerate(full_tensor_chunks_per_allgather): - width_chunks = torch.chunk(chunk, num_devices_per_line, dim=outer_dim_concat_axis) - output_chunk = torch.cat(width_chunks, dim=dim) - output_chunks_per_allgather.append(output_chunk) - full_mesh_output_golden_per_chip = torch.cat(output_chunks_per_allgather, dim=inner_dim_concat_axis) - logger.info(f"full_mesh_output_golden_per_chip.shape: {full_mesh_output_golden_per_chip.shape}") - non_replicated_output_golden_tensors = [full_mesh_output_golden_per_chip] * num_devices_per_line - full_mesh_output_golden = torch.cat(non_replicated_output_golden_tensors, dim=outer_dim_concat_axis) - logger.info(f"full_mesh_output_golden.shape: {full_mesh_output_golden.shape}") + input_mem_config = ttnn.MemoryConfig(tensor_memory_layout, buffer_type=buffer_type, shard_spec=input_shard_spec) + shard_dims = (dim, all_gather_instances_concat_dim) if cluster_axis == 0 else (all_gather_instances_concat_dim, dim) + concat_dims = shard_dims - shard_dims = (-1, -2) if cluster_axis == 0 else (-2, -1) mesh_shape = ( (num_devices_per_line, num_all_gather_instances) if cluster_axis == 0 else (num_all_gather_instances, num_devices_per_line) ) - logger.info(f"mesh_shape: {mesh_shape}") + + output_shard_spec = None + if input_shard_spec is not None: + output_shard_shape = list(input_shard_spec.shape) + if dim == 3: + output_shard_shape[1] *= num_devices_per_line + else: + output_shard_shape[0] *= num_devices_per_line + output_shard_spec = ttnn.ShardSpec( + input_shard_spec.grid, + output_shard_shape, + input_shard_spec.orientation, + False, + ) + output_mem_config = ttnn.MemoryConfig(tensor_memory_layout, buffer_type=buffer_type, shard_spec=output_shard_spec) ttnn_tensor = ttnn.from_torch( - full_tensor, + full_input_tensor_unfractured, dtype=input_dtype, device=mesh_device, layout=layout, - memory_config=mem_config, + memory_config=input_mem_config, mesh_mapper=ShardTensor2dMesh(mesh_device, mesh_shape=mesh_shape, dims=shard_dims), ) ttnn_tensor = ttnn.to_device(ttnn_tensor, mesh_device) @@ -138,61 +125,48 @@ def run_line_all_gather_on_TG_with_mesh_tensor_along_rows( cluster_axis=cluster_axis, mesh_device=mesh_device, num_links=num_links, + memory_config=output_mem_config, topology=ttnn.Topology.Linear, ) - concat_dims = (3, 2) if cluster_axis == 0 else (2, 3) - if debug: - readback_input_tensor = ttnn.to_torch( - ttnn_tensor, mesh_composer=ConcatMesh2dToTensor(mesh_device, mesh_shape=mesh_shape, dims=concat_dims) - ) - print(f"readback_input_tensor") - print_tile_corners_of_tensor(readback_input_tensor) - - if debug: - for i, t in enumerate(ttnn.get_device_tensors(ttnn_tensor)): - print(f"readback_input_tensor {i}") - print_tile_corners_of_tensor(t) - - if debug: - for i, t in enumerate(ttnn.get_device_tensors(ttnn_tensor_out)): - t = t.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch() - print(f"OUTPUT TENSOR {i}") - print_tile_corners_of_tensor(t) - # ttnn.visualize_mesh_device(mesh_device, tensor=ttnn_tensor_out) - logger.info(f"concat_dims: {concat_dims}") tt_output_tensor = ttnn.to_torch( ttnn_tensor_out, 
mesh_composer=ConcatMesh2dToTensor(mesh_device, mesh_shape=mesh_shape, dims=concat_dims) ) - logger.info(f"tt_output_tensor.shape: {tt_output_tensor.shape}") + output_tensors_list = torch.chunk(tt_output_tensor, num_all_gather_instances, dim=all_gather_instances_concat_dim) + output_golden = torch.zeros(tt_output_tensor.shape) - if debug: - print(f"tt_output_tensor") - print_tile_corners_of_tensor(tt_output_tensor) + # Repeat the input tensor to represent the fact that the full concatenated input tensor lives across every + # device in the line + repeat_factor = [1] * len(output_golden.shape) + repeat_factor[dim] = num_devices_per_line + output_golden[:, :, :, :] = full_input_tensor_unfractured.repeat(repeat_factor) - ## This full_tensor will only be 1/num_devices_per_line of the tt_output_tensor. We should just be able to concatenate it along the + eq = True + debug = False if input_dtype == ttnn.bfloat16: - eq, output = comp_equal(tt_output_tensor, full_mesh_output_golden) - if not eq and debug: - report_mismatches(full_mesh_output_golden, tt_output_tensor) + eq, output = comp_equal(tt_output_tensor, output_golden) + if not eq and debug: + logger.error("found mismatches") + report_mismatches(output_golden, tt_output_tensor, 100) + print_tile_corners_of_tensor(tt_output_tensor) else: - eq, output = comp_pcc(tt_output_tensor, full_mesh_output_golden) + eq, output = comp_pcc(tt_output_tensor, output_golden) if not eq: - logger.error(f"output mismatch for tensor") + logger.error(f"output mismatch for tensor: {output}") + assert eq, f"FAILED: {output}" # Enumerate the post-commit cases explicitly @skip_for_grayskull("Requires eth connected devices to run") @pytest.mark.parametrize( - "num_devices, num_links, input_shape, dim, layout", + "num_devices, num_links, per_chip_output_shape, dim, layout", [ - (4, 3, [1, 1, 32, 1280], 0, ttnn.TILE_LAYOUT), - (4, 3, [1, 1, 32, 16384], 3, ttnn.TILE_LAYOUT), - (4, 3, [1, 1, 32, 2304], 1, ttnn.TILE_LAYOUT), - (4, 3, [1, 1, 32, 4096], 1, ttnn.TILE_LAYOUT), - (4, 3, [1, 1, 32, 6656], 1, ttnn.TILE_LAYOUT), + (4, 3, [4, 1, 32, 1280], 0, ttnn.TILE_LAYOUT), + (4, 3, [1, 1, 32, 16384 * 4], 3, ttnn.TILE_LAYOUT), + (4, 3, [1, 4, 32, 2304], 1, ttnn.TILE_LAYOUT), + (4, 3, [1, 4, 32, 4096], 1, ttnn.TILE_LAYOUT), + (4, 3, [1, 4, 32, 6656], 1, ttnn.TILE_LAYOUT), ], ) @pytest.mark.parametrize( @@ -203,10 +177,10 @@ def run_line_all_gather_on_TG_with_mesh_tensor_along_rows( ], ) @pytest.mark.parametrize( - "mem_config", + "buffer_type", [ - ttnn.MemoryConfig(buffer_type=ttnn.BufferType.DRAM), - ttnn.MemoryConfig(buffer_type=ttnn.BufferType.L1), + ttnn.BufferType.DRAM, + ttnn.BufferType.L1, ], ) @pytest.mark.parametrize("replication_factor", [8]) # 1, 8]) @@ -215,12 +189,12 @@ def run_line_all_gather_on_TG_with_mesh_tensor_along_rows( def test_line_all_gather_on_TG_rows_post_commit( mesh_device, num_devices, - input_shape, + per_chip_output_shape, dim, num_links, input_dtype, layout, - mem_config, + buffer_type, use_program_cache, function_level_defaults, enable_async, @@ -230,12 +204,13 @@ def test_line_all_gather_on_TG_rows_post_commit( run_line_all_gather_on_TG_with_mesh_tensor_along_rows( mesh_device, num_devices, - input_shape, + per_chip_output_shape, + ttnn.TensorMemoryLayout.INTERLEAVED, dim, num_links, input_dtype, layout, - mem_config, + buffer_type, use_program_cache, function_level_defaults, enable_async=enable_async, @@ -247,40 +222,39 @@ def test_line_all_gather_on_TG_rows_post_commit( @skip_for_grayskull("Requires eth connected devices to run")
@pytest.mark.parametrize( - "num_devices, num_links, input_shape, dim, layout", + "num_devices, num_links, per_chip_output_shape, dim, layout", [ - # (8, 4, [1, 1, 32, 1280], 1, ttnn.TILE_LAYOUT), # Rightmost column of tiles per input not copied to final output - (8, 4, [1, 1, 32, 2048], 1, ttnn.TILE_LAYOUT), # passes - (8, 4, [1, 1, 32, 2304], 1, ttnn.TILE_LAYOUT), # passes - (8, 4, [1, 1, 32, 4096], 1, ttnn.TILE_LAYOUT), # passes + (8, 4, [1, 8, 32, 1280], 1, ttnn.TILE_LAYOUT), + (8, 4, [8, 1, 32, 1280], 0, ttnn.TILE_LAYOUT), + (8, 4, [1, 8, 32, 2048], 1, ttnn.TILE_LAYOUT), + (8, 4, [1, 8, 32, 2304], 1, ttnn.TILE_LAYOUT), + (8, 4, [1, 8, 32, 4096], 1, ttnn.TILE_LAYOUT), ], ) @pytest.mark.parametrize( "input_dtype", [ ttnn.bfloat16, - # ttnn.bfloat8_b, ], ) @pytest.mark.parametrize( - "mem_config", + "buffer_type", [ - ttnn.MemoryConfig(buffer_type=ttnn.BufferType.DRAM), - # ttnn.MemoryConfig(buffer_type=ttnn.BufferType.L1), + ttnn.BufferType.DRAM, ], ) -@pytest.mark.parametrize("enable_async", [False]) -@pytest.mark.parametrize("replication_factor", [4]) # 1, 4]) +@pytest.mark.parametrize("enable_async", [True]) +@pytest.mark.parametrize("replication_factor", [4]) @pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True) def test_line_all_gather_on_TG_cols_post_commit( mesh_device, num_devices, - input_shape, + per_chip_output_shape, dim, num_links, input_dtype, layout, - mem_config, + buffer_type, use_program_cache, function_level_defaults, enable_async, @@ -290,12 +264,13 @@ def test_line_all_gather_on_TG_cols_post_commit( run_line_all_gather_on_TG_with_mesh_tensor_along_rows( mesh_device, num_devices, - input_shape, + per_chip_output_shape, + ttnn.TensorMemoryLayout.INTERLEAVED, dim, num_links, input_dtype, layout, - mem_config, + buffer_type, use_program_cache, function_level_defaults, enable_async=enable_async, diff --git a/tests/ttnn/unit_tests/operations/test_maxpool2d.py b/tests/ttnn/unit_tests/operations/test_maxpool2d.py index 192d47e2a78..b90fd026c96 100644 --- a/tests/ttnn/unit_tests/operations/test_maxpool2d.py +++ b/tests/ttnn/unit_tests/operations/test_maxpool2d.py @@ -23,6 +23,7 @@ def run_max_pool( device, dtype, memory_config=None, + shard_scheme=None, ): in_n, in_c, in_h, in_w = act_shape kernel_h, kernel_w = kernel_size @@ -30,22 +31,37 @@ def run_max_pool( stride_h, stride_w = stride dilation_h, dilation_w = dilation - if 2 * pad_h > kernel_h or 2 * pad_w > kernel_w: - pytest.skip("Invalid case") - - if (kernel_h == 3 and pad_h != 1) or (kernel_h == 2 and pad_h != 0): - pytest.skip("kernel size and padding combination not supported") + if shard_scheme != ttnn.TensorMemoryLayout.WIDTH_SHARDED: + if 2 * pad_h > kernel_h or 2 * pad_w > kernel_w: + pytest.skip("Invalid case") + if (kernel_h == 3 and pad_h != 1) or (kernel_h == 2 and pad_h != 0): + pytest.skip("kernel size and padding combination not supported") out_h = math.floor((in_h + 2 * pad_h - (dilation_h * kernel_h - 1) - 1) / stride_h) + 1 out_w = math.floor((in_w + 2 * pad_w - (dilation_w * kernel_w - 1) - 1) / stride_w) + 1 - if in_c % 16 != 0: - pytest.skip("Current maxpool writer needs nchannels to be multiple of 16!") - - if in_c == 16 and dtype == ttnn.bfloat8_b and in_n * in_h * in_w > 600000: - pytest.skip("This case runs out of memory on Grayskull") - - if in_n > 16 and in_c > 64 and dtype == ttnn.bfloat8_b and is_wormhole_b0(): - pytest.skip("This case runs out of memory on Wormhole b0") + cores_x = device.core_grid.x + cores_y = device.core_grid.y + max_cores = 
+
+    if shard_scheme == ttnn.TensorMemoryLayout.HEIGHT_SHARDED or shard_scheme is None:
+        if in_c % 16 != 0:
+            pytest.skip("Current maxpool writer needs nchannels to be multiple of 16!")
+        if in_c == 16 and dtype == ttnn.bfloat8_b and in_n * in_h * in_w > 600000:
+            pytest.skip("This case runs out of memory on Grayskull")
+        if in_n > 16 and in_c > 64 and dtype == ttnn.bfloat8_b and is_wormhole_b0():
+            pytest.skip("This case runs out of memory on Wormhole b0")
+
+    if shard_scheme == ttnn.TensorMemoryLayout.WIDTH_SHARDED:
+        if in_c < max_cores:
+            pytest.skip("Width sharding requires channels >= cores")
+        if in_c / max_cores < 16:
+            pytest.skip("Width sharding requires large enough channels to shard (at least 16 per core)")
+
+    if shard_scheme == ttnn.TensorMemoryLayout.BLOCK_SHARDED:
+        if in_c < cores_x:
+            pytest.skip("Block sharding requires channels >= cores")
+        if in_c / cores_x < 16:
+            pytest.skip("Block sharding requires large enough channels to shard (at least 16 per core)")

     torch.manual_seed(0)
     torch.set_printoptions(precision=3, sci_mode=False, linewidth=500, threshold=10000, edgeitems=32)
@@ -72,12 +88,15 @@ def run_max_pool(
     if dtype == ttnn.bfloat8_b:
         if (in_h * in_w) % 32 != 0:
             pytest.skip("For BFP8_B datatype, input height * width should be multiple of 32")
+        if shard_scheme == ttnn.TensorMemoryLayout.WIDTH_SHARDED and (in_c / max_cores) % 32 != 0:
+            pytest.skip("For BFP8_B datatype, input channels / max_cores should be multiple of 32")
+        if shard_scheme == ttnn.TensorMemoryLayout.BLOCK_SHARDED and (in_c / cores_x) % 32 != 0:
+            pytest.skip("For BFP8_B datatype, input channels / cores_x should be multiple of 32")
         ttact = ttnn.from_torch(act_reshaped, dtype, layout=ttnn.TILE_LAYOUT)
     else:
         ttact = ttnn.from_torch(act_reshaped, dtype)

-    pre_shard = True
-    # pre_shard = False
+    pre_shard = shard_scheme is None

     ttact_device = ttnn.to_device(ttact, device)
     if pre_shard:
@@ -109,6 +128,7 @@ def run_max_pool(
         padding=[pad_h, pad_w],
         dilation=[dilation_h, dilation_w],
         memory_config=memory_config,
+        applied_shard_scheme=shard_scheme,
     )

     output_host = output.cpu()
@@ -249,6 +269,141 @@ def test_run_max_pool_mem_config(
     run_max_pool(act_shape, (3, 3), (1, 1), (2, 2), (1, 1), device, ttnn.bfloat16, memory_config=memory_config)


+@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True)
+@pytest.mark.parametrize(
+    "act_shape",  ## NCHW
+    (
+        (
+            [1, 512, 28, 28],
+            [1, 512, 14, 14],
+            [1, 1024, 6, 6],
+            [1, 2048, 6, 6],
+            [1, 4096, 6, 6],
+            [4, 1024, 40, 40],
+            [2, 2048, 40, 40],
+            [8, 4096, 10, 16],
+        )
+    ),
+)
+@pytest.mark.parametrize(
+    "kernel_size",
+    (
+        (2, 2),
+        (3, 3),
+    ),
+)
+@pytest.mark.parametrize(
+    "padding",
+    (
+        (0, 0),
+        (1, 1),
+    ),
+)
+@pytest.mark.parametrize(
+    "stride",
+    ((2, 2),),
+)
+@pytest.mark.parametrize("dilation", ((1, 1),))  ## default
+@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.bfloat8_b])
+def test_run_max_pool_width_shard(
+    act_shape,
+    kernel_size,
+    padding,
+    stride,
+    dilation,
+    device,
+    dtype,
+    use_program_cache,
+):
+    run_max_pool(
+        act_shape,
+        kernel_size,
+        padding,
+        stride,
+        dilation,
+        device,
+        dtype,
+        shard_scheme=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
+    )
+
+
+@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True)
+@pytest.mark.parametrize(
+    "act_shape",  ## NCHW
+    (
+        (
+            [1, 256, 56, 56],
+            [1, 256, 28, 28],
+            [1, 256, 14, 14],
+            [1, 256, 10, 14],
+            [1, 512, 8, 6],
+            [1, 1024, 6, 6],
+            [1, 2048, 4, 6],
+            [4, 512, 40, 40],
+            [2, 1024, 40, 40],
+            [8,
2048, 10, 16],
+            ## resnet shapes
+            [1, 64, 112, 112],
+            [4, 64, 112, 112],
+            [8, 64, 112, 112],
+            [16, 64, 112, 112],
+            ## hpr shapes
+            [8, 32, 132, 20],
+            [16, 32, 132, 20],
+            [32, 32, 132, 20],
+            [64, 32, 132, 20],
+            [128, 32, 132, 20],
+            [8, 32, 264, 40],
+            [16, 32, 264, 40],
+            [32, 32, 264, 40],
+            [4, 16, 1056, 160],
+            [8, 16, 528, 80],
+            [16, 16, 528, 80],
+        )
+    ),
+)
+@pytest.mark.parametrize(
+    "kernel_size",
+    (
+        (2, 2),
+        (3, 3),
+    ),
+)
+@pytest.mark.parametrize(
+    "padding",
+    (
+        (0, 0),
+        (1, 1),
+    ),
+)
+@pytest.mark.parametrize(
+    "stride",
+    ((2, 2),),
+)
+@pytest.mark.parametrize("dilation", ((1, 1),))  ## default
+@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.bfloat8_b])
+def test_run_max_pool_block_shard(
+    act_shape,
+    kernel_size,
+    padding,
+    stride,
+    dilation,
+    device,
+    dtype,
+    use_program_cache,
+):
+    run_max_pool(
+        act_shape,
+        kernel_size,
+        padding,
+        stride,
+        dilation,
+        device,
+        dtype,
+        shard_scheme=ttnn.TensorMemoryLayout.BLOCK_SHARDED,
+    )
+
+
 @pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True)
 @pytest.mark.parametrize(
     "act_shape",  ## NCHW
diff --git a/tests/ttnn/unit_tests/operations/test_softmax.py b/tests/ttnn/unit_tests/operations/test_softmax.py
index bf8e285cd5d..ff45493669c 100644
--- a/tests/ttnn/unit_tests/operations/test_softmax.py
+++ b/tests/ttnn/unit_tests/operations/test_softmax.py
@@ -15,22 +15,46 @@
 @pytest.mark.parametrize(
     "input_vector",
-    [[100.0, 101.0], [100.0, 1000.0], [-100.0, -101.0], [-1000.0, -100.0], [-100, -108, -99, -100, -101, -98]],
+    [
+        [100.0, 101.0],
+        [100.0, 1000.0],
+        [-100.0, -99.0],
+        [-100.0, -101.0],
+        [-1000.0, -100.0],
+        [-100, -108, -99, -100, -101, -98],
+    ],
 )
-def test_softmax_stable_neg_values(device, input_vector):
+@pytest.mark.parametrize("math_approx", [True, False])
+@pytest.mark.parametrize("fp32_acc_en", [True, False])
+def test_softmax_stable_neg_values(device, input_vector, math_approx, fp32_acc_en):
     torch.manual_seed(0)

     torch_input_tensor = torch.tensor([[[input_vector]]], dtype=torch.bfloat16)
     torch_output_tensor = F.softmax(torch_input_tensor, dim=-1, dtype=torch.bfloat16)

+    if is_grayskull():
+        compute_kernel_config = ttnn.GrayskullComputeKernelConfig(
+            math_fidelity=ttnn.MathFidelity.HiFi4,
+            math_approx_mode=math_approx,
+        )
+    else:
+        compute_kernel_config = ttnn.WormholeComputeKernelConfig(
+            math_fidelity=ttnn.MathFidelity.HiFi4,
+            math_approx_mode=math_approx,
+            fp32_dest_acc_en=fp32_acc_en,
+            packer_l1_acc=False,
+        )
+
     input_tensor = ttnn.from_torch(torch_input_tensor, layout=ttnn.TILE_LAYOUT, device=device)
-    output_tensor = ttnn.softmax(input_tensor, dim=-1, numeric_stable=True)
+    output_tensor = ttnn.softmax(input_tensor, dim=-1, compute_kernel_config=compute_kernel_config, numeric_stable=True)
     output_tensor = ttnn.to_torch(output_tensor)

     assert_with_pcc(torch_output_tensor, output_tensor, 0.999)


-def run_softmax_stable_with_program_cache(device, batch_size, h, w, skip_scale_mask, math_approx):
+def run_softmax_stable_with_program_cache(
+    device, batch_size, h, w, skip_scale_mask, math_approx, fp32_acc_en, in_dtype
+):
     torch.manual_seed(0)

     scale = 1.0
@@ -47,7 +71,7 @@ def run_softmax_stable_with_program_cache(device, batch_size, h, w, skip_scale_m
         torch_output_tensor = torch_input_tensor
     torch_output_tensor = F.softmax(torch_output_tensor, dim=-1, dtype=torch.bfloat16)

-    input_tensor = ttnn.from_torch(torch_input_tensor, layout=ttnn.TILE_LAYOUT, device=device)
+    input_tensor = ttnn.from_torch(torch_input_tensor, dtype=in_dtype, layout=ttnn.TILE_LAYOUT, device=device)

     if is_grayskull():
         compute_kernel_config = ttnn.GrayskullComputeKernelConfig(
@@ -58,7 +82,7 @@ def run_softmax_stable_with_program_cache(device, batch_size, h, w, skip_scale_m
         compute_kernel_config = ttnn.WormholeComputeKernelConfig(
             math_fidelity=ttnn.MathFidelity.HiFi4,
             math_approx_mode=math_approx,
-            fp32_dest_acc_en=False,
+            fp32_dest_acc_en=fp32_acc_en,
             packer_l1_acc=False,
         )

@@ -80,9 +104,15 @@ def run_softmax_stable_with_program_cache(device, batch_size, h, w, skip_scale_m
 @pytest.mark.parametrize("w", [1024, 1500])
 @pytest.mark.parametrize("skip_scale_mask", [True, False])
 @pytest.mark.parametrize("math_approx", [True, False])
-def test_softmax_stable_with_program_cache(device, batch_size, h, w, skip_scale_mask, math_approx, use_program_cache):
+@pytest.mark.parametrize("fp32_acc_en", [True, False])
+@pytest.mark.parametrize("in_dtype", [ttnn.bfloat8_b, ttnn.bfloat16])
+def test_softmax_stable_with_program_cache(
+    device, batch_size, h, w, skip_scale_mask, math_approx, fp32_acc_en, in_dtype, use_program_cache
+):
     for _ in range(2):
-        run_softmax_stable_with_program_cache(device, batch_size, h, w, skip_scale_mask, math_approx)
+        run_softmax_stable_with_program_cache(
+            device, batch_size, h, w, skip_scale_mask, math_approx, fp32_acc_en, in_dtype
+        )
         # dummy tensor to change tensor alloc
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
@@ -96,7 +126,9 @@ def test_softmax_stable_with_program_cache(device, batch_size, h, w, skip_scale_
     assert device.num_program_cache_entries() == 1


-def run_softmax_sharded_stable(device, batch_size, num_heads, h, w, skip_scale_mask):
+def run_softmax_sharded_stable(
+    device, batch_size, num_heads, h, w, skip_scale_mask, math_approx, fp32_acc_en, in_dtype
+):
     torch.manual_seed(0)

     grid_size = (batch_size, num_heads)
@@ -123,21 +155,41 @@ def run_softmax_sharded_stable(device, batch_size, num_heads, h, w, skip_scale_m
     )
     program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig(
         compute_with_storage_grid_size=grid_size,
-        subblock_w=6,
+        subblock_w=6 if not fp32_acc_en else 3,  # fp32 dest accumulation halves dest register capacity, so use a narrower subblock
         block_h=h // 32,
         block_w=w // 32,
     )
+    if is_grayskull():
+        compute_kernel_config = ttnn.GrayskullComputeKernelConfig(
+            math_fidelity=ttnn.MathFidelity.HiFi4,
+            math_approx_mode=math_approx,
+        )
+    else:
+        compute_kernel_config = ttnn.WormholeComputeKernelConfig(
+            math_fidelity=ttnn.MathFidelity.HiFi4,
+            math_approx_mode=math_approx,
+            fp32_dest_acc_en=fp32_acc_en,
+            packer_l1_acc=False,
+        )

     input_tensor = ttnn.from_torch(
-        torch_input_tensor, layout=ttnn.TILE_LAYOUT, device=device, memory_config=memory_config
+        torch_input_tensor, dtype=in_dtype, layout=ttnn.TILE_LAYOUT, device=device, memory_config=memory_config
     )
     if not skip_scale_mask:
         output_tensor = ttnn.scale_mask_softmax_in_place(
-            input_tensor, scale, attention_mask_t, program_config=program_config, numeric_stable=True
+            input_tensor,
+            scale,
+            attention_mask_t,
+            program_config=program_config,
+            compute_kernel_config=compute_kernel_config,
+            numeric_stable=True,
         )
     else:
         output_tensor = ttnn.scale_mask_softmax_in_place(
-            input_tensor, program_config=program_config, numeric_stable=True
+            input_tensor,
+            program_config=program_config,
+            compute_kernel_config=compute_kernel_config,
+            numeric_stable=True,
         )

     output_tensor = ttnn.to_torch(output_tensor)
@@ -149,11 +201,16 @@ def run_softmax_sharded_stable(device, batch_size, num_heads, h, w, skip_scale_m
 @pytest.mark.parametrize("h", [384])
 @pytest.mark.parametrize("w", [384])
 @pytest.mark.parametrize("skip_scale_mask", [True, False])
+@pytest.mark.parametrize("math_approx", [True, False])
+@pytest.mark.parametrize("fp32_acc_en", [True, False])
+@pytest.mark.parametrize("in_dtype", [ttnn.bfloat8_b, ttnn.bfloat16])
 def test_softmax_sharded_stable_with_program_cache(
-    device, batch_size, num_heads, h, w, skip_scale_mask, use_program_cache
+    device, batch_size, num_heads, h, w, skip_scale_mask, math_approx, fp32_acc_en, in_dtype, use_program_cache
 ):
     for _ in range(2):
-        run_softmax_sharded_stable(device, batch_size, num_heads, h, w, skip_scale_mask)
+        run_softmax_sharded_stable(
+            device, batch_size, num_heads, h, w, skip_scale_mask, math_approx, fp32_acc_en, in_dtype
+        )
         # dummy tensor to change tensor alloc
         dummy_shape = [1, 1, 32, 32]
         py_dummy_tensor = torch.randn(dummy_shape)
diff --git a/tt_metal/common/CMakeLists.txt b/tt_metal/common/CMakeLists.txt
index 75433fda8c8..294d5700810 100644
--- a/tt_metal/common/CMakeLists.txt
+++ b/tt_metal/common/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(COMMON_SRCS
+    ${CMAKE_CURRENT_SOURCE_DIR}/core_coord.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/core_descriptor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/metal_soc_descriptor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/tt_backend_api_types.cpp
diff --git a/tt_metal/common/core_coord.cpp b/tt_metal/common/core_coord.cpp
new file mode 100644
index 00000000000..7ea7ea6a5d7
--- /dev/null
+++ b/tt_metal/common/core_coord.cpp
@@ -0,0 +1,525 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tt_metal/common/core_coord.hpp"
+
+#include <algorithm>
+#include <limits>
+#include <mutex>
+#include <optional>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "third_party/umd/device/tt_xy_pair.h"
+#include "tt_metal/common/assert.hpp"
+#include "tt_metal/third_party/tracy/public/tracy/Tracy.hpp"
+#include "tt_metal/tt_stl/reflection.hpp"
+#include "tt_metal/tt_stl/span.hpp"
+
+auto fmt::formatter<CoreCoord>::format(const CoreCoord &core_coord, format_context &ctx) const
+    -> format_context::iterator {
+    std::stringstream ss;
+    ss << core_coord.str();
+    return fmt::format_to(ctx.out(), "{}", ss.str());
+}
+
+std::string RelativeCoreCoord::str() const { return "(x=" + std::to_string(x) + ",y=" + std::to_string(y) + ")"; }
+
+CoreCoord get_core_coord_from_relative(const RelativeCoreCoord &in, const CoreCoord &grid_size) {
+    CoreCoord coord;
+    coord.x = in.x + ((in.x < 0) ? grid_size.x : 0);
+    coord.y = in.y + ((in.y < 0) ? grid_size.y : 0);
+    return coord;
+}
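For reference, the relative-coordinate helper above wraps negative inputs around the grid size, in the spirit of Python's negative indexing. A minimal sketch with made-up values, not part of the patch:

static void example_relative_coord() {
    RelativeCoreCoord rel{-1, -2};
    CoreCoord grid_size{8, 10};
    // Negative components count back from the grid edge: (-1, -2) on an 8x10 grid resolves to (x=7, y=8).
    CoreCoord resolved = get_core_coord_from_relative(rel, grid_size);
    (void)resolved;
}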
+
+CoreRange::CoreRange(const CoreCoord &point) : start_coord(point), end_coord(point) {}
+
+CoreRange::CoreRange(const CoreCoord &start_coord, const CoreCoord &end_coord) {
+    TT_FATAL(
+        end_coord.x >= start_coord.x and end_coord.y >= start_coord.y,
+        "Invalid core range for start_coord: {}, end_coord: {}",
+        start_coord.str(),
+        end_coord.str());
+
+    this->start_coord = start_coord;
+    this->end_coord = end_coord;
+}
+
+std::optional<CoreRange> CoreRange::intersects(const CoreRange &other) const {
+    std::size_t x1 = std::max(this->start_coord.x, other.start_coord.x);
+    std::size_t y1 = std::max(this->start_coord.y, other.start_coord.y);
+    std::size_t x2 = std::min(this->end_coord.x, other.end_coord.x);
+    std::size_t y2 = std::min(this->end_coord.y, other.end_coord.y);
+    if (x1 <= x2 and y1 <= y2) {
+        return CoreRange({x1, y1}, {x2, y2});
+    }
+
+    return {};
+}
+
+bool CoreRange::adjacent(const CoreRange &other) const {
+    std::size_t x1 = std::max(this->start_coord.x, other.start_coord.x);
+    std::size_t y1 = std::max(this->start_coord.y, other.start_coord.y);
+    std::size_t x2 = std::min(this->end_coord.x, other.end_coord.x);
+    std::size_t y2 = std::min(this->end_coord.y, other.end_coord.y);
+    return ((x2 + 1 == x1 && y1 <= y2) || (y2 + 1 == y1 && x1 <= x2));
+}
+
+bool CoreRange::contains(const CoreRange &other) const {
+    return (other.start_coord.x >= this->start_coord.x) && (other.end_coord.x <= this->end_coord.x) &&
+           (other.start_coord.y >= this->start_coord.y) && (other.end_coord.y <= this->end_coord.y);
+}
+
+bool CoreRange::contains(const CoreCoord &other) const {
+    return (other.x >= this->start_coord.x) && (other.x <= this->end_coord.x) && (other.y >= this->start_coord.y) &&
+           (other.y <= this->end_coord.y);
+}
+
+// Merge lined-up (in x or y dimension) intersecting/adjacent rectangles
+std::optional<CoreRange> CoreRange::merge(const CoreRange &cr) const {
+    if (this->intersects(cr) || this->adjacent(cr)) {
+        if (this->start_coord.x == cr.start_coord.x && this->end_coord.x == cr.end_coord.x)
+            return CoreRange(
+                {this->start_coord.x, std::min(this->start_coord.y, cr.start_coord.y)},
+                {this->end_coord.x, std::max(this->end_coord.y, cr.end_coord.y)});
+
+        else if (this->start_coord.y == cr.start_coord.y && this->end_coord.y == cr.end_coord.y)
+            return CoreRange(
+                {std::min(this->start_coord.x, cr.start_coord.x), this->start_coord.y},
+                {std::max(this->end_coord.x, cr.end_coord.x), this->end_coord.y});
+    }
+    return std::nullopt;
+}
+
+std::string CoreRange::str() const { return "[" + this->start_coord.str() + " - " + this->end_coord.str() + "]"; }
+
+size_t CoreRange::size() const {
+    return (this->end_coord.x - this->start_coord.x + 1) * (this->end_coord.y - this->start_coord.y + 1);
+}
+
+CoreCoord CoreRange::grid_size() const {
+    return {this->end_coord.x - this->start_coord.x + 1, this->end_coord.y - this->start_coord.y + 1};
+}
+
+CoreRange::CoreIterator::CoreIterator(const CoreCoord &current, const CoreRange &core_range) :
+    current_(current), range_(core_range) {}
+
+CoreCoord &CoreRange::CoreIterator::operator*() { return current_; }
+
+CoreRange::CoreIterator &CoreRange::CoreIterator::operator++() {
+    CoreCoord next;
+
+    const bool is_curr_core_at_end_of_row = current_.x == range_.end_coord.x;
+    if (is_curr_core_at_end_of_row) {
+        // Go to the beginning of the next row
+        next.x = range_.start_coord.x;
+        next.y = current_.y + 1;
+    } else {
+        next.x = current_.x + 1;
+        next.y = current_.y;
+    }
+
+    current_ = next;
+    return *this;
+}
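The iterator above walks a rectangle in row-major order, with end() (defined just below) being the coordinate one row past the last. A small sketch of what a traversal visits, illustrative only:

static void example_core_iteration() {
    CoreRange r({0, 0}, {2, 1});
    for (const CoreCoord &c : r) {
        // Visits, in order: (0,0) (1,0) (2,0) (0,1) (1,1) (2,1)
        (void)c;
    }
}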
+CoreRange::CoreIterator CoreRange::begin() const { return CoreRange::CoreIterator(this->start_coord, *this); }
+
+CoreRange::CoreIterator CoreRange::end() const {
+    const CoreCoord iterator_end(this->start_coord.x, this->end_coord.y + 1);
+    return CoreRange::CoreIterator(iterator_end, *this);
+}
+
+bool CoreRange::CoreIterator::operator==(const CoreIterator &other) const { return current_ == other.current_; }
+
+bool CoreRange::CoreIterator::operator!=(const CoreIterator &other) const { return !(current_ == other.current_); }
+
+auto fmt::formatter<CoreRange>::format(const CoreRange &core_range, format_context &ctx) const
+    -> format_context::iterator {
+    std::stringstream ss;
+    ss << core_range.str();
+    return fmt::format_to(ctx.out(), "{}", ss.str());
+}
+
+CoreRangeSet::CoreRangeSet(const std::vector<CoreRange> &core_ranges) :
+    ranges_(core_ranges.begin(), core_ranges.end()) {
+    ZoneScoped;
+    this->validate_no_overlap();
+}
+
+CoreRangeSet::CoreRangeSet(const std::set<CoreRange> &core_ranges) : ranges_(core_ranges.begin(), core_ranges.end()) {
+    ZoneScoped;
+    this->validate_no_overlap();
+}
+
+CoreRangeSet::CoreRangeSet(const CoreRange &core_range) : ranges_{core_range} {}
+
+void swap(CoreRangeSet &first, CoreRangeSet &second) {
+    std::scoped_lock lock(first.ranges_guard, second.ranges_guard);
+    std::swap(first.ranges_, second.ranges_);
+}
+
+CoreRangeSet::CoreRangeSet(const CoreRangeSet &other) {
+    std::scoped_lock lock(other.ranges_guard);
+    this->ranges_ = other.ranges_;
+}
+
+CoreRangeSet &CoreRangeSet::operator=(const CoreRangeSet &other) {
+    std::scoped_lock lock(other.ranges_guard);
+    this->ranges_ = other.ranges_;
+    return *this;
+}
+
+CoreRangeSet::CoreRangeSet(CoreRangeSet &&other) { swap(*this, other); }
+
+CoreRangeSet &CoreRangeSet::operator=(CoreRangeSet &&other) {
+    swap(*this, other);
+    return *this;
+}
+
+CoreRangeSet::CoreRangeSet(std::vector<CoreRange> &&core_ranges) : ranges_(std::move(core_ranges)) {
+    ZoneScoped;
+    this->validate_no_overlap();
+}
+
+size_t CoreRangeSet::size() const { return ranges_.size(); }
+
+template <typename T>
+CoreRangeSet CoreRangeSet::merge(const T &other) const {
+    size_t min_x = std::numeric_limits<size_t>::max(), max_x = 0, min_y = std::numeric_limits<size_t>::max(), max_y = 0;
+    std::set<CoreRange> crs(this->ranges_.begin(), this->ranges_.end());
+    crs.insert(other.begin(), other.end());
+
+    for (const auto &cr : crs) {
+        min_x = std::min(min_x, cr.start_coord.x);
+        max_x = std::max(max_x, cr.end_coord.x);
+        min_y = std::min(min_y, cr.start_coord.y);
+        max_y = std::max(max_y, cr.end_coord.y);
+    }
+
+    // By overallocating by one x entry, we can avoid needing to check for
+    // boundary conditions when iterating, since there'll always be one
+    // last false entry
+    bool grid[max_y + 1][max_x + 2];
+    memset(grid, 0, sizeof(grid));
+
+    for (const auto &cr : crs)
+        for (unsigned y = cr.start_coord.y; y <= cr.end_coord.y; y++)
+            for (unsigned x = cr.start_coord.x; x <= cr.end_coord.x; x++) grid[y][x] = true;
+
+    crs.clear();
+    for (unsigned y = min_y; y <= max_y; y++) {
+        std::set<CoreRange> filter_set, tmp, new_crs;
+        std::vector<CoreRange> ranges;
+        for (unsigned x = min_x; x <= max_x + 1; x++) {
+            if (grid[y][x]) {
+                unsigned x_start = x;
+                while (grid[y][x]) x++;
+                ranges.push_back(CoreRange({x_start, y}, {x - 1, y}));
+            }
+        }
+
+        for (const auto &cr : ranges) {
+            for (const auto &prev_cr : crs) {
+                if (auto merged = cr.merge(prev_cr)) {
+                    new_crs.insert(merged.value());
+                    filter_set.insert(prev_cr);
+                    filter_set.insert(cr);
+                }
+            }
+            crs.insert(cr);
+        }
+        // Set(A) = Set(A) - Set(B)
+        std::set_difference(
+            std::make_move_iterator(crs.begin()),
+            std::make_move_iterator(crs.end()),
+            filter_set.begin(),
+            filter_set.end(),
+            std::inserter(tmp, tmp.end()));
+        crs.swap(tmp);
+        crs.insert(new_crs.begin(), new_crs.end());
+    }
+    return CoreRangeSet(crs);
+}
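The merge above rasterizes every range onto a boolean grid, rebuilds maximal horizontal runs row by row, and relies on CoreRange::merge to fuse runs whose x-extents line up vertically. A worked sketch with made-up ranges, not part of the patch:

static void example_merge() {
    // Row 0 yields the run (0,0)-(2,0); row 1 yields (0,1)-(2,1). The runs share
    // x-extents, so they fuse into the single rectangle (0,0)-(2,1).
    CoreRangeSet a(CoreRange({0, 0}, {1, 0}));
    std::vector<CoreRange> more = {CoreRange({0, 1}, {1, 1}), CoreRange({2, 0}, {2, 1})};
    CoreRangeSet merged = a.merge(more);
    (void)merged;
}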
+
+template CoreRangeSet CoreRangeSet::merge<std::vector<CoreRange>>(const std::vector<CoreRange> &other) const;
+template CoreRangeSet CoreRangeSet::merge<std::set<CoreRange>>(const std::set<CoreRange> &other) const;
+
+template <>
+CoreRangeSet CoreRangeSet::merge(const CoreRangeSet &other) const {
+    return this->merge(other.ranges());
+}
+
+bool CoreRangeSet::core_coord_in_core_ranges(const CoreCoord &core_coord) const {
+    ZoneScoped;
+    for (const auto &cr : this->ranges_) {
+        if (cr.contains(core_coord))
+            return true;
+    }
+    return false;
+}
+
+bool CoreRangeSet::intersects(const CoreRange &cr) const {
+    for (const auto &local_cr : this->ranges_) {
+        if (local_cr.intersects(cr))
+            return true;
+    }
+    return false;
+}
+
+const std::vector<CoreRange> &CoreRangeSet::ranges() const { return this->ranges_; }
+
+std::string CoreRangeSet::str() const {
+    if (this->ranges().size() > 0) {
+        std::string core_range_set_str = "{";
+        for (const auto &core_range : this->ranges_) {
+            core_range_set_str += core_range.str() + ", ";
+        }
+        core_range_set_str[core_range_set_str.length() - 2] = '}';
+        core_range_set_str.pop_back();
+        return core_range_set_str;
+    } else {
+        return "{}";
+    }
+}
+
+uint32_t CoreRangeSet::num_cores() const {
+    uint32_t num_cores = 0;
+    for (const auto &core_range : this->ranges()) {
+        num_cores += core_range.size();
+    }
+    return num_cores;
+}
+
+CoreRange CoreRangeSet::bounding_box() const {
+    TT_FATAL(this->ranges().size() > 0, "Cannot get bounding_box of an empty CoreRangeSet!");
+    size_t min_x = UINT32_MAX, min_y = UINT32_MAX, max_x = 0, max_y = 0;
+    for (const auto &cr : this->ranges()) {
+        min_x = std::min(min_x, cr.start_coord.x);
+        max_x = std::max(max_x, cr.end_coord.x);
+        min_y = std::min(min_y, cr.start_coord.y);
+        max_y = std::max(max_y, cr.end_coord.y);
+    }
+    return {{min_x, min_y}, {max_x, max_y}};
+}
+
+void CoreRangeSet::validate_no_overlap() {
+    if (this->ranges_.size() < 2) {
+        return;
+    }
+    for (auto outer_it = this->ranges_.begin(); outer_it != this->ranges_.end() - 1; outer_it++) {
+        for (auto inner_it = outer_it + 1; inner_it != this->ranges_.end(); inner_it++) {
+            CoreRange &first_core_range = *outer_it;
+            CoreRange &second_core_range = *inner_it;
+            bool first_core_left_of_second = first_core_range.end_coord.x < second_core_range.start_coord.x;
+            bool first_core_right_of_second = first_core_range.start_coord.x > second_core_range.end_coord.x;
+            bool first_core_above_second = first_core_range.end_coord.y < second_core_range.start_coord.y;
+            bool first_core_below_second = first_core_range.start_coord.y > second_core_range.end_coord.y;
+            auto no_overlap = first_core_left_of_second or first_core_right_of_second or first_core_above_second or
+                              first_core_below_second;
+            if (not no_overlap) {
+                TT_THROW(
+                    "Cannot create CoreRangeSet with specified core ranges because core ranges {} and {} overlap!",
+                    first_core_range.str(),
+                    second_core_range.str());
+            }
+        }
+    }
+}
+
+bool operator==(const CoreRangeSet &a, const CoreRangeSet &b) {
+    if (a.ranges().size() == b.ranges().size()) {
+        auto range_a = a.ranges();
+        auto range_b = b.ranges();
+        for (auto it_a = range_a.begin(), it_b = range_b.begin(); it_a != range_a.end(); it_a++, it_b++) {
+            if (*it_a != *it_b) {
+                return false;
+            }
+        }
+        return true;
+    }
+    return false;
+}
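grid_to_cores, defined next, enumerates the first num_cores coordinates of a grid in either row-major or column-major order. A quick sketch of the two orderings on a 3x2 grid, illustrative values only:

static void example_grid_to_cores() {
    std::vector<CoreCoord> row_major = grid_to_cores(6, 3, 2, /*row_wise=*/true);   // (0,0) (1,0) (2,0) (0,1) (1,1) (2,1)
    std::vector<CoreCoord> col_major = grid_to_cores(6, 3, 2, /*row_wise=*/false);  // (0,0) (0,1) (1,0) (1,1) (2,0) (2,1)
    (void)row_major;
    (void)col_major;
}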
+
+std::vector<CoreCoord> grid_to_cores(uint32_t num_cores, uint32_t grid_size_x, uint32_t grid_size_y, bool row_wise) {
+    std::vector<CoreCoord> cores;
+    cores.reserve(num_cores);
+    TT_ASSERT(
+        num_cores <= grid_size_x * grid_size_y,
+        "Number of cores {} exceeds grid size {}x{}",
+        num_cores,
+        grid_size_x,
+        grid_size_y);
+    if (row_wise) {
+        for (uint32_t i = 0; i < num_cores; ++i) {
+            cores.push_back({i % grid_size_x, i / grid_size_x});
+        }
+    } else {
+        for (uint32_t i = 0; i < num_cores; ++i) {
+            cores.push_back({i / grid_size_y, i % grid_size_y});
+        }
+    }
+    return cores;
+}
+
+std::vector<CoreCoord> grid_to_cores(CoreCoord start, CoreCoord end, bool row_wise) {
+    std::vector<CoreCoord> cores;
+    auto num_cores_x = (end.x + 1) - start.x;
+    auto num_cores_y = (end.y + 1) - start.y;
+    uint32_t num_cores = num_cores_x * num_cores_y;
+    cores.reserve(num_cores);
+    if (row_wise) {
+        for (uint32_t j = start.y; j < (end.y + 1); j++) {
+            for (uint32_t i = start.x; i < (end.x + 1); i++) {
+                cores.push_back({i, j});
+            }
+        }
+
+    } else {
+        for (uint32_t i = start.x; i < (end.x + 1); i++) {
+            for (uint32_t j = start.y; j < (end.y + 1); j++) {
+                cores.push_back({i, j});
+            }
+        }
+    }
+    return cores;
+}
+
+// Noop cores are appended at the end with no guarantees on ordering
+std::vector<CoreCoord> grid_to_cores_with_noop(
+    const uint32_t bbox_x,
+    const uint32_t bbox_y,
+    const uint32_t grid_size_x,
+    const uint32_t grid_size_y,
+    const bool row_wise) {
+    ZoneScoped;
+    std::vector<CoreCoord> cores;
+    cores.reserve(grid_size_x * grid_size_y);
+    TT_ASSERT(bbox_x < grid_size_x);
+    TT_ASSERT(bbox_y < grid_size_y);
+    const uint32_t box_size_x = bbox_x + 1;
+    const uint32_t box_size_y = bbox_y + 1;
+
+    if (row_wise) {
+        for (uint32_t i = 0; i < box_size_x * box_size_y; ++i) {
+            cores.push_back({i % box_size_x, i / box_size_x});
+        }
+    } else {
+        for (uint32_t i = 0; i < box_size_x * box_size_y; ++i) {
+            cores.push_back({i / box_size_y, i % box_size_y});
+        }
+    }
+
+    // Right rectangle noops
+    for (uint32_t x = box_size_x; x < grid_size_x; ++x) {
+        for (uint32_t y = 0; y < grid_size_y; ++y) {
+            cores.push_back({x, y});
+        }
+    }
+
+    // Bottom rectangle noops
+    for (uint32_t y = box_size_y; y < grid_size_y; ++y) {
+        for (uint32_t x = 0; x < box_size_x; ++x) {
+            cores.push_back({x, y});
+        }
+    }
+
+    return cores;
+}
+
+std::vector<CoreCoord> corerange_to_cores(const CoreRangeSet &crs, std::optional<uint32_t> max_cores, bool row_wise) {
+    uint32_t num_total_cores = 0;
+    std::vector<CoreCoord> all_cores;
+    uint32_t offset = 0;
+
+    for (auto core_range : crs.ranges()) {
+        auto start_coord = core_range.start_coord;
+        auto end_coord = core_range.end_coord;
+        auto cores = grid_to_cores(start_coord, end_coord, row_wise);
+        if (max_cores.has_value()) {
+            if (all_cores.size() + cores.size() > max_cores.value()) {
+                uint32_t num_cores_to_add = max_cores.value() - all_cores.size();
+                all_cores.insert(all_cores.end(), cores.begin(), cores.begin() + num_cores_to_add);
+            } else {
+                all_cores.insert(all_cores.end(), cores.begin(), cores.end());
+            }
+        } else {
+            all_cores.insert(all_cores.end(), cores.begin(), cores.end());
+        }
+    }
+
+    return all_cores;
+}
+
+bool operator!=(const CoreRangeSet &a, const CoreRangeSet &b) { return !(a == b); }
+
+auto fmt::formatter<CoreRangeSet>::format(const CoreRangeSet &core_range_set, format_context &ctx) const
+    -> format_context::iterator {
+    std::stringstream ss;
+    ss << core_range_set.str();
+    return fmt::format_to(ctx.out(), "{}", ss.str());
+}
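corerange_to_cores above flattens a CoreRangeSet into an ordered core list and, when max_cores is set, truncates mid-range once the cap is reached. A usage sketch with made-up ranges, not part of the patch:

static void example_corerange_to_cores() {
    CoreRangeSet crs(std::vector<CoreRange>{CoreRange({0, 0}, {3, 0}), CoreRange({0, 1}, {3, 1})});
    // Two rows of four cores each, capped at six: (0,0)..(3,0), then only (0,1) (1,1).
    std::vector<CoreCoord> capped = corerange_to_cores(crs, 6, /*row_wise=*/true);
    (void)capped;
}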
+
+namespace std {
+
+std::size_t hash<RelativeCoreCoord>::operator()(RelativeCoreCoord const &o) const {
+    std::size_t seed = 0;
+    seed = std::hash<long>()(o.x) ^ std::hash<long>()(o.y) << 1;
+    return seed;
+}
+
+std::size_t hash<CoreRange>::operator()(const CoreRange &core_range) const {
+    std::size_t seed = 0;
+    seed = std::hash<CoreCoord>{}(core_range.start_coord) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+    seed = std::hash<CoreCoord>{}(core_range.end_coord) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+    return seed;
+}
+
+std::size_t hash<CoreRangeSet>::operator()(const CoreRangeSet &core_range_set) const {
+    std::size_t seed = 0;
+    for (const auto &core_range : core_range_set.ranges()) {
+        seed = std::hash<CoreRange>{}(core_range) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+    }
+    return seed;
+}
+
+}  // namespace std
+
+namespace tt::stl::json {
+
+nlohmann::json to_json_t<CoreCoord>::operator()(const CoreCoord &core_coord) noexcept {
+    return {{"x", to_json(core_coord.x)}, {"y", to_json(core_coord.y)}};
+}
+
+CoreCoord from_json_t<CoreCoord>::operator()(const nlohmann::json &json) noexcept {
+    return {from_json<std::size_t>(json.at("x")), from_json<std::size_t>(json.at("y"))};
+}
+
+nlohmann::json to_json_t<RelativeCoreCoord>::operator()(const RelativeCoreCoord &relative_core_coord) noexcept {
+    return {{"x", to_json(relative_core_coord.x)}, {"y", to_json(relative_core_coord.y)}};
+}
+
+RelativeCoreCoord from_json_t<RelativeCoreCoord>::operator()(const nlohmann::json &json) noexcept {
+    return {from_json<long>(json.at("x")), from_json<long>(json.at("y"))};
+}
+
+nlohmann::json to_json_t<CoreRange>::operator()(const CoreRange &core_range) noexcept {
+    return {{"start", to_json(core_range.start_coord)}, {"end", to_json(core_range.end_coord)}};
+}
+
+CoreRange from_json_t<CoreRange>::operator()(const nlohmann::json &json) noexcept {
+    return {from_json<CoreCoord>(json.at("start")), from_json<CoreCoord>(json.at("end"))};
+}
+
+nlohmann::json to_json_t<CoreRangeSet>::operator()(const CoreRangeSet &core_range_set) noexcept {
+    nlohmann::json core_range_set_json = nlohmann::json::array();
+    return to_json(core_range_set.ranges());
+}
+
+CoreRangeSet from_json_t<CoreRangeSet>::operator()(const nlohmann::json &json) noexcept {
+    return CoreRangeSet(from_json<std::vector<CoreRange>>(json));
+}
+
+}  // namespace tt::stl::json
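The hooks above serialize coordinates as {"x", "y"} objects and ranges as {"start", "end"} pairs. A round-trip sketch, assuming the free tt::stl::json::to_json/from_json helper templates that these operators call are visible to the caller:

static void example_core_range_json() {
    CoreRange r({0, 0}, {7, 7});
    nlohmann::json j = tt::stl::json::to_json(r);  // {"start": {"x": 0, "y": 0}, "end": {"x": 7, "y": 7}}
    CoreRange back = tt::stl::json::from_json<CoreRange>(j);
    (void)back;
}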
diff --git a/tt_metal/common/core_coord.h b/tt_metal/common/core_coord.h
deleted file mode 100644
index 448ef85edb1..00000000000
--- a/tt_metal/common/core_coord.h
+++ /dev/null
@@ -1,636 +0,0 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "third_party/json/json.hpp"
-#include "third_party/umd/device/tt_xy_pair.h"
-#include "tt_metal/common/assert.hpp"
-#include "tt_metal/third_party/tracy/public/tracy/Tracy.hpp"
-#include "tt_metal/tt_stl/reflection.hpp"
-
-using std::pair;
-
-using CoreCoord = tt_xy_pair;
-
-template <>
-struct fmt::formatter<CoreCoord> {
-    constexpr auto parse(format_parse_context &ctx) -> format_parse_context::iterator { return ctx.end(); }
-
-    auto format(const CoreCoord &core_coord, format_context &ctx) const -> format_context::iterator {
-        std::stringstream ss;
-        ss << core_coord.str();
-        return fmt::format_to(ctx.out(), "{}", ss.str());
-    }
-};
-
-constexpr inline bool operator<=(const CoreCoord &a, const CoreCoord &b) { return (a < b) or (a == b); }
-
-struct RelativeCoreCoord {
-    long x = 0;
-    long y = 0;
-
-    std::string str() const { return "(x=" + std::to_string(x) + ",y=" + std::to_string(y) + ")"; }
-};
-
-constexpr inline bool operator==(const RelativeCoreCoord &a, const RelativeCoreCoord &b) {
-    return a.x == b.x && a.y == b.y;
-}
-
-constexpr inline bool operator!=(const RelativeCoreCoord &a, const RelativeCoreCoord &b) { return !(a == b); }
-
-namespace std {
-template <>
-struct hash<RelativeCoreCoord> {
-    std::size_t operator()(RelativeCoreCoord const &o) const {
-        std::size_t seed = 0;
-        seed = std::hash<long>()(o.x) ^ std::hash<long>()(o.y) << 1;
-        return seed;
-    }
-};
-}  // namespace std
-
-inline CoreCoord get_core_coord_from_relative(const RelativeCoreCoord &in, const CoreCoord &grid_size) {
-    CoreCoord coord;
-    coord.x = in.x + ((in.x < 0) ? grid_size.x : 0);
-    coord.y = in.y + ((in.y < 0) ? grid_size.y : 0);
-    return coord;
-}
-
-struct CoreRange {
-    CoreCoord start_coord;
-    CoreCoord end_coord;
-    CoreRange(const CoreCoord &point) {
-        this->start_coord = point;
-        this->end_coord = point;
-    }
-
-    CoreRange(const CoreCoord &start_coord, const CoreCoord &end_coord) {
-        TT_ASSERT(
-            end_coord.x >= start_coord.x and end_coord.y >= start_coord.y,
-            "Invalid core range for start_coord: {}, end_coord: {}", start_coord.str(), end_coord.str());
-
-        this->start_coord = start_coord;
-        this->end_coord = end_coord;
-    }
-
-    CoreRange(const CoreRange &other) = default;
-    CoreRange &operator=(const CoreRange &other) = default;
-    CoreRange(CoreRange &&other) = default;
-    CoreRange &operator=(CoreRange &&other) = default;
-
-    // void validate() {
-    //     TT_FATAL(
-    //         end_coord.x >= start_coord.x and end_coord.y >= start_coord.y,
-    //         "Invalid core range for start_coord: {}, end_coord: {}", start_coord.str(), end_coord.str());
-    // }
-
-    inline std::optional<CoreRange> intersects(const CoreRange &other) const {
-        std::size_t x1 = std::max(this->start_coord.x, other.start_coord.x);
-        std::size_t y1 = std::max(this->start_coord.y, other.start_coord.y);
-        std::size_t x2 = std::min(this->end_coord.x, other.end_coord.x);
-        std::size_t y2 = std::min(this->end_coord.y, other.end_coord.y);
-        if (x1 <= x2 and y1 <= y2)
-            return CoreRange({x1, y1}, {x2, y2});
-
-        return {};
-    }
-
-    inline bool adjacent(const CoreRange &other) const {
-        std::size_t x1 = std::max(this->start_coord.x, other.start_coord.x);
-        std::size_t y1 = std::max(this->start_coord.y, other.start_coord.y);
-        std::size_t x2 = std::min(this->end_coord.x, other.end_coord.x);
-        std::size_t y2 = std::min(this->end_coord.y, other.end_coord.y);
-        return ((x2 + 1 == x1 && y1 <= y2) || (y2 + 1 == y1 && x1 <= x2));
-    }
-
-    inline bool contains(const CoreRange &other) const {
-        return (other.start_coord.x >= this->start_coord.x) && (other.end_coord.x <= this->end_coord.x) &&
-               (other.start_coord.y >= this->start_coord.y) && (other.end_coord.y <= this->end_coord.y);
-    }
-
-    inline bool contains(const CoreCoord &other) const {
-        return (other.x >= this->start_coord.x) && (other.x <= this->end_coord.x) && (other.y >= this->start_coord.y) &&
-               (other.y <= this->end_coord.y);
-    }
-
-    // Merge lined-up (in x or y dimension) intersecting/adjacent rectangles
-    std::optional<CoreRange> merge(const CoreRange &cr) const {
-        if (this->intersects(cr) || this->adjacent(cr)) {
-            if (this->start_coord.x == cr.start_coord.x && this->end_coord.x == cr.end_coord.x)
-                return CoreRange(
-                    {this->start_coord.x, std::min(this->start_coord.y, cr.start_coord.y)},
-                    {this->end_coord.x, std::max(this->end_coord.y, cr.end_coord.y)});
-
-            else if (this->start_coord.y == cr.start_coord.y && this->end_coord.y == cr.end_coord.y)
-                return CoreRange(
-                    {std::min(this->start_coord.x, cr.start_coord.x), this->start_coord.y},
-                    {std::max(this->end_coord.x, cr.end_coord.x), this->end_coord.y});
-        }
-        return std::nullopt;
-    }
-
-    std::string str() const { return "[" + this->start_coord.str() + " - " + this->end_coord.str() + "]"; }
-
-    size_t size() const { return (this->end_coord.x - this->start_coord.x + 1) * (this->end_coord.y - this->start_coord.y + 1); }
-
-    CoreCoord grid_size() const { return {this->end_coord.x - this->start_coord.x + 1, this->end_coord.y - this->start_coord.y + 1}; }
-
-    class CoreIterator
-    {
-    public:
-        CoreIterator(const CoreCoord& current, const CoreRange& core_range) :
-            current_(current),
-            range_(core_range)
-        {}
-
-        CoreCoord& operator*()
-        {
-            return current_;
-        }
-
-        CoreIterator& operator++()
-        {
-            CoreCoord next;
-
-            const bool is_curr_core_at_end_of_row = current_.x == range_.end_coord.x;
-            if (is_curr_core_at_end_of_row)
-            {
-                // Go to the beginning of the next row
-                next.x = range_.start_coord.x;
-                next.y = current_.y + 1;
-            }
-            else
-            {
-                next.x = current_.x + 1;
-                next.y = current_.y;
-            }
-
-            current_ = next;
-            return *this;
-        }
-
-        bool operator==(const CoreIterator& other) const
-        {
-            return current_ == other.current_;
-        }
-
-        bool operator!=(const CoreIterator& other) const
-        {
-            return !(current_ == other.current_);
-        }
-
-    private:
-        CoreCoord current_;
-        const CoreRange& range_;
-    };
-
-    CoreIterator begin() const
-    {
-        return CoreIterator(this->start_coord, *this);
-    }
-
-    CoreIterator end() const
-    {
-        const CoreCoord iterator_end(this->start_coord.x, this->end_coord.y + 1);
-        return CoreIterator(iterator_end, *this);
-    }
-};
-
-constexpr inline bool operator==(const CoreRange &a, const CoreRange &b) {
-    return a.start_coord == b.start_coord && a.end_coord == b.end_coord;
-}
-
-constexpr inline bool operator!=(const CoreRange &a, const CoreRange &b) { return !(a == b); }
-
-constexpr inline bool operator<(const CoreRange &left, const CoreRange &right) {
-    return (left.start_coord < right.start_coord || (left.start_coord == right.start_coord && left.end_coord < right.end_coord));
-}
-
-template <>
-struct fmt::formatter<CoreRange> {
-    constexpr auto parse(format_parse_context &ctx) -> format_parse_context::iterator { return ctx.end(); }
-
-    auto format(const CoreRange &core_range, format_context &ctx) const -> format_context::iterator {
-        std::stringstream ss;
-        ss << core_range.str();
-        return fmt::format_to(ctx.out(), "{}", ss.str());
-    }
-};
-
-namespace std {
-template <>
-struct hash<CoreRange> {
-    std::size_t operator()(const CoreRange &core_range) const {
-        std::size_t seed = 0;
-        seed = std::hash<CoreCoord>{}(core_range.start_coord) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
-        seed = std::hash<CoreCoord>{}(core_range.end_coord) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
-        return seed;
-    }
-};
-}  // namespace std
-
-class CoreRangeSet {
-   public:
-    CoreRangeSet(const std::set<CoreRange> &core_ranges) : ranges_(core_ranges) {
-        ZoneScoped;
-        for (auto outer_it = this->ranges_.begin(); outer_it != this->ranges_.end(); outer_it++) {
-            for (auto inner_it = this->ranges_.begin(); inner_it != this->ranges_.end(); inner_it++) {
-                if (outer_it == inner_it) {
-                    continue;
-                }
-                CoreRange first_core_range = *outer_it;
-                CoreRange second_core_range = *inner_it;
-                bool first_core_left_of_second = first_core_range.end_coord.x < second_core_range.start_coord.x;
-                bool first_core_right_of_second = first_core_range.start_coord.x > second_core_range.end_coord.x;
-                bool first_core_above_second = first_core_range.end_coord.y < second_core_range.start_coord.y;
-                bool first_core_below_second = first_core_range.start_coord.y > second_core_range.end_coord.y;
-                auto no_overlap = first_core_left_of_second or first_core_right_of_second or first_core_above_second or
-                                  first_core_below_second;
-                if (not no_overlap) {
-                    TT_THROW(
-                        "Cannot create CoreRangeSet with specified core ranges because core ranges {} and {} overlap!",
-                        first_core_range.str(),
-                        second_core_range.str());
-                }
-            }
-        }
-    }
-
-    friend void swap(CoreRangeSet& first, CoreRangeSet& second) {
-        std::scoped_lock lock(first.ranges_guard, second.ranges_guard);
-        std::swap(first.ranges_, second.ranges_);
-    }
-
-    CoreRangeSet(const CoreRangeSet &other) {
-        std::scoped_lock lock(other.ranges_guard);
-        this->ranges_ = other.ranges_;
-    }
-    CoreRangeSet &operator=(const CoreRangeSet &other) {
-        std::scoped_lock lock(other.ranges_guard);
-        this->ranges_ = other.ranges_;
-        return *this;
-    }
-
-    CoreRangeSet(CoreRangeSet &&other) {
-        swap(*this, other);
-    }
-
-    CoreRangeSet &operator=(CoreRangeSet &&other) {;
-        swap(*this, other);
-        return *this;
-    }
-
-    auto size() const { return ranges_.size(); }
-
-    CoreRangeSet merge(const std::set<CoreRange> &other) const {
-        size_t min_x = std::numeric_limits<size_t>::max(), max_x = 0, min_y = std::numeric_limits<size_t>::max(),
-               max_y = 0;
-        std::set<CoreRange> crs = this->ranges_;
-        crs.insert(other.begin(), other.end());
-
-        for (const auto &cr : crs) {
-            // std::cout << "merging " << cr.str() << std::endl;
-            min_x = std::min(min_x, cr.start_coord.x);
-            max_x = std::max(max_x, cr.end_coord.x);
-            min_y = std::min(min_y, cr.start_coord.y);
-            max_y = std::max(max_y, cr.end_coord.y);
-        }
-
-        // By overallocating by one x entry, we can avoid needing to check for
-        // boundary conditions when iterating, since there'll always be one
-        // last false entry
-        bool grid[max_y + 1][max_x + 2];
-        memset(grid, 0, sizeof(grid));
-
-        for (const auto &cr : crs)
-            for (unsigned y = cr.start_coord.y; y <= cr.end_coord.y; y++)
-                for (unsigned x = cr.start_coord.x; x <= cr.end_coord.x; x++) grid[y][x] = true;
-
-        crs.clear();
-        for (unsigned y = min_y; y <= max_y; y++) {
-            std::set<CoreRange> filter_set, tmp, new_crs;
-            std::vector<CoreRange> ranges;
-            for (unsigned x = min_x; x <= max_x + 1; x++) {
-                if (grid[y][x]) {
-                    unsigned x_start = x;
-                    while (grid[y][x]) x++;
-                    ranges.push_back(CoreRange({x_start, y}, {x - 1, y}));
-                }
-            }
-
-            for (const auto &cr : ranges) {
-                for (const auto &prev_cr : crs) {
-                    if (auto merged = cr.merge(prev_cr)) {
-                        // std::cout << "merging " << cr.str() << " and " << prev_cr.str() << " with " <<
-                        // merged.value().str() << std::endl;
-                        new_crs.insert(merged.value());
-                        filter_set.insert(prev_cr);
-                        filter_set.insert(cr);
-                    }
-                }
-                crs.insert(cr);
-            }
-            // Set(A) = Set(A) - Set(B)
-            std::set_difference(
-                std::make_move_iterator(crs.begin()),
-                std::make_move_iterator(crs.end()),
-                filter_set.begin(),
-                filter_set.end(),
-                std::inserter(tmp, tmp.end()));
-            crs.swap(tmp);
-            crs.insert(new_crs.begin(), new_crs.end());
-        }
-        // for ( const auto & cr : crs ){
-        //     std::cout << " final merged CR:" << cr.str() << std::endl;
-        // }
-        return CoreRangeSet(crs);
-    }
-
-    CoreRangeSet merge(const CoreRangeSet &s) const { return this->merge(s.ranges()); }
-
-    inline bool core_coord_in_core_ranges(const CoreCoord &core_coord) const {
-        ZoneScoped;
-        for (const auto &cr : this->ranges_) {
-            if (cr.contains(core_coord))
-                return true;
-        }
-        return false;
-    }
-
-    inline bool intersects(const CoreRange &cr) const {
-        for (const auto &local_cr : this->ranges_) {
-            if (local_cr.intersects(cr))
-                return true;
-        }
-        return false;
-    }
-
-    const std::set<CoreRange> &ranges() const { return this->ranges_; }
-
-    std::string str() const {
-        if (this->ranges().size() > 0) {
-            std::string core_range_set_str = "{";
-            for (const auto &core_range : this->ranges_) {
-                core_range_set_str += core_range.str() + ", ";
-            }
-            core_range_set_str[core_range_set_str.length() - 2] = '}';
-            core_range_set_str.pop_back();
-            return core_range_set_str;
-        } else {
-            return "{}";
-        }
-    }
-
-    const uint32_t num_cores() const {
-        uint32_t num_cores = 0;
-        for (const auto &core_range : this->ranges()) {
-            num_cores += core_range.size();
-        }
-        return num_cores;
-    }
-
-    CoreRange bounding_box() const {
-        TT_FATAL(this->ranges().size() > 0, "Cannot get bounding_box of an empty CoreRangeSet!");
-        size_t min_x = UINT32_MAX, min_y = UINT32_MAX, max_x = 0, max_y = 0;
-        for (const auto &cr : this->ranges()) {
-            min_x = std::min(min_x, cr.start_coord.x);
-            max_x = std::max(max_x, cr.end_coord.x);
-            min_y = std::min(min_y, cr.start_coord.y);
-            max_y = std::max(max_y, cr.end_coord.y);
-        }
-        return {{min_x, min_y}, {max_x, max_y}};
-    }
-
-   private:
-    mutable std::mutex ranges_guard;
-    std::set<CoreRange> ranges_;
-};
-
-const inline bool operator==(const CoreRangeSet &a, const CoreRangeSet &b) {
-    if (a.ranges().size() == b.ranges().size()) {
-        auto range_a = a.ranges();
-        auto range_b = b.ranges();
-        for (auto it_a = range_a.begin(), it_b = range_b.begin(); it_a != range_a.end(); it_a++, it_b++) {
-            if (*it_a != *it_b) {
-                return false;
-            }
-        }
-        return true;
-    }
-    return false;
-}
-
-inline std::vector<CoreCoord> grid_to_cores(
-    uint32_t num_cores, uint32_t grid_size_x, uint32_t grid_size_y, bool row_wise = false) {
-    std::vector<CoreCoord> cores;
-    cores.reserve(num_cores);
-    TT_ASSERT(
-        num_cores <= grid_size_x * grid_size_y,
-        "Number of cores {} exceeds grid size {}x{}",
-        num_cores,
-        grid_size_x,
-        grid_size_y);
-    if (row_wise) {
-        for (uint32_t i = 0; i < num_cores; ++i) {
-            cores.push_back({i % grid_size_x, i / grid_size_x});
-        }
-    } else {
-        for (uint32_t i = 0; i < num_cores; ++i) {
-            cores.push_back({i / grid_size_y, i % grid_size_y});
-        }
-    }
-    return cores;
-}
-
-inline std::vector<CoreCoord> grid_to_cores(CoreCoord start, CoreCoord end, bool row_wise = false) {
-    std::vector<CoreCoord> cores;
-    auto num_cores_x = (end.x + 1) - start.x;
-    auto num_cores_y = (end.y + 1) - start.y;
-    uint32_t num_cores = num_cores_x * num_cores_y;
-    cores.reserve(num_cores);
-    if (row_wise) {
-        for (uint32_t j = start.y; j < (end.y + 1); j++) {
-            for (uint32_t i = start.x; i < (end.x + 1); i++) {
-                cores.push_back({i, j});
-            }
-        }
-
-    } else {
-        for (uint32_t i = start.x; i < (end.x + 1); i++) {
-            for (uint32_t j = start.y; j < (end.y + 1); j++) {
-                cores.push_back({i, j});
-            }
-        }
-    }
-    return cores;
-}
-
-// Noop cores are appended at the end with no guarantees on ordering
-inline std::vector<CoreCoord> grid_to_cores_with_noop(
-    const uint32_t bbox_x,
-    const uint32_t bbox_y,
-    const uint32_t grid_size_x,
-    const uint32_t grid_size_y,
-    const bool row_wise = false) {
-    ZoneScoped;
-    std::vector<CoreCoord> cores;
-    cores.reserve(grid_size_x * grid_size_y);
-    TT_ASSERT(bbox_x < grid_size_x);
-    TT_ASSERT(bbox_y < grid_size_y);
-    const uint32_t box_size_x = bbox_x + 1;
-    const uint32_t box_size_y = bbox_y + 1;
-
-    if (row_wise) {
-        for (uint32_t i = 0; i < box_size_x * box_size_y; ++i) {
-            cores.push_back({i % box_size_x, i / box_size_x});
-        }
-    } else {
-        for (uint32_t i = 0; i < box_size_x * box_size_y; ++i) {
-            cores.push_back({i / box_size_y, i % box_size_y});
-        }
-    }
-
-    // Right rectangle noops
-    for (uint32_t x = box_size_x; x < grid_size_x; ++x) {
-        for (uint32_t y = 0; y < grid_size_y; ++y) {
-            cores.push_back({x, y});
-        }
-    }
-
-    // Bottom rectangle noops
-    for (uint32_t y = box_size_y; y < grid_size_y; ++y) {
-        for (uint32_t x = 0; x < box_size_x; ++x) {
-            cores.push_back({x, y});
-        }
-    }
-
-    return cores;
-}
-
-inline std::vector<CoreCoord> corerange_to_cores(
-    const CoreRangeSet &crs, std::optional<uint32_t> max_cores = std::nullopt, bool row_wise = false) {
-    uint32_t num_total_cores = 0;
-    std::vector<CoreCoord> all_cores;
-    uint32_t offset = 0;
-
-    for (auto core_range : crs.ranges()) {
-        auto start_coord = core_range.start_coord;
-        auto end_coord = core_range.end_coord;
-        auto cores = grid_to_cores(start_coord, end_coord, row_wise);
-        if (max_cores.has_value()) {
-            if (all_cores.size() + cores.size() > max_cores.value()) {
-                uint32_t num_cores_to_add = max_cores.value() - all_cores.size();
-                all_cores.insert(all_cores.end(), cores.begin(), cores.begin() + num_cores_to_add);
-            } else {
-                all_cores.insert(all_cores.end(), cores.begin(), cores.end());
-            }
-        } else {
-            all_cores.insert(all_cores.end(), cores.begin(), cores.end());
-        }
-    }
-
-    return all_cores;
-}
-
-const inline bool operator!=(const CoreRangeSet &a, const CoreRangeSet &b) { return !(a == b); }
-
-template <>
-struct fmt::formatter<CoreRangeSet> {
-    constexpr auto parse(format_parse_context &ctx) -> format_parse_context::iterator { return ctx.end(); }
-
-    auto format(const CoreRangeSet &core_range_set, format_context &ctx) const -> format_context::iterator {
-        std::stringstream ss;
-        ss << core_range_set.str();
-        return fmt::format_to(ctx.out(), "{}", ss.str());
-    }
-};
-
-// Adding to tt::tt_metal namespace as we transition to moving this out of global namespace eventually.
-namespace tt::tt_metal {
-    using ::CoreCoord;
-    using ::CoreRange;
-    using ::CoreRangeSet;
-}
-
-namespace std {
-template <>
-struct hash<CoreRangeSet> {
-    std::size_t operator()(const CoreRangeSet &core_range_set) const {
-        std::size_t seed = 0;
-        for (const auto &core_range : core_range_set.ranges()) {
-            seed = std::hash<CoreRange>{}(core_range) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
-        }
-        return seed;
-    }
-};
-}  // namespace std
-
-namespace tt::stl::json {
-
-template <>
-struct to_json_t<CoreCoord> {
-    nlohmann::json operator()(const CoreCoord &core_coord) noexcept {
-        return {{"x", to_json(core_coord.x)}, {"y", to_json(core_coord.y)}};
-    }
-};
-
-template <>
-struct from_json_t<CoreCoord> {
-    CoreCoord operator()(const nlohmann::json &json) noexcept {
-        return {from_json<std::size_t>(json.at("x")), from_json<std::size_t>(json.at("y"))};
-    }
-};
-
-template <>
-struct to_json_t<RelativeCoreCoord> {
-    nlohmann::json operator()(const RelativeCoreCoord &relative_core_coord) noexcept {
-        return {{"x", to_json(relative_core_coord.x)}, {"y", to_json(relative_core_coord.y)}};
-    }
-};
-
-template <>
-struct from_json_t<RelativeCoreCoord> {
-    RelativeCoreCoord operator()(const nlohmann::json &json) noexcept {
-        return {from_json<long>(json.at("x")), from_json<long>(json.at("y"))};
-    }
-};
-
-template <>
-struct to_json_t<CoreRange> {
-    nlohmann::json operator()(const CoreRange &core_range) noexcept {
-        return {{"start", to_json(core_range.start_coord)}, {"end", to_json(core_range.end_coord)}};
-    }
-};
-
-template <>
-struct from_json_t<CoreRange> {
-    CoreRange operator()(const nlohmann::json &json) noexcept {
-        return {from_json<CoreCoord>(json.at("start")), from_json<CoreCoord>(json.at("end"))};
-    }
-};
-
-template <>
-struct to_json_t<CoreRangeSet> {
-    nlohmann::json operator()(const CoreRangeSet &core_range_set) noexcept {
-        nlohmann::json core_range_set_json = nlohmann::json::array();
-        return to_json(core_range_set.ranges());
-    }
-};
-
-template <>
-struct from_json_t<CoreRangeSet> {
-    CoreRangeSet operator()(const nlohmann::json &json) noexcept {
-        return CoreRangeSet(from_json<std::vector<CoreRange>>(json));
-    }
-};
-
-}  // namespace tt::stl::json
diff --git a/tt_metal/common/core_coord.hpp b/tt_metal/common/core_coord.hpp
new file mode 100644
index 00000000000..d5623836152
--- /dev/null
+++ b/tt_metal/common/core_coord.hpp
@@ -0,0 +1,255 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <cstdint>
+#include <mutex>
+#include <optional>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "third_party/json/json.hpp"
+#include "third_party/umd/device/tt_xy_pair.h"
+#include "tt_metal/tt_stl/reflection.hpp"
+
+using CoreCoord = tt_xy_pair;
+
+template <>
+struct fmt::formatter<CoreCoord> {
+    constexpr auto parse(format_parse_context &ctx) -> format_parse_context::iterator { return ctx.end(); }
+
+    auto format(const CoreCoord &core_coord, format_context &ctx) const -> format_context::iterator;
+};
+
+constexpr inline bool operator<=(const CoreCoord &a, const CoreCoord &b) { return (a < b) or (a == b); }
+
+struct RelativeCoreCoord {
+    long x = 0;
+    long y = 0;
+
+    std::string str() const;
+};
+
+constexpr inline bool operator==(const RelativeCoreCoord &a, const RelativeCoreCoord &b) {
+    return a.x == b.x && a.y == b.y;
+}
+
+constexpr inline bool operator!=(const RelativeCoreCoord &a, const RelativeCoreCoord &b) { return !(a == b); }
+
+CoreCoord get_core_coord_from_relative(const RelativeCoreCoord &in, const CoreCoord &grid_size);
+
+struct CoreRange {
+    CoreCoord start_coord;
+    CoreCoord end_coord;
+    CoreRange(const CoreCoord &point);
+
+    CoreRange(const CoreCoord &start_coord, const CoreCoord &end_coord);
+
+    CoreRange(const CoreRange &other) = default;
+    CoreRange &operator=(const CoreRange &other) = default;
+    CoreRange(CoreRange &&other) = default;
+    CoreRange &operator=(CoreRange &&other) = default;
+
+    std::optional<CoreRange> intersects(const CoreRange &other) const;
+
+    bool adjacent(const CoreRange &other) const;
+
+    bool contains(const CoreRange &other) const;
+
+    bool contains(const CoreCoord &other) const;
+
+    // Merge lined-up (in x or y dimension) intersecting/adjacent rectangles
+    std::optional<CoreRange> merge(const CoreRange &cr) const;
+
+    std::string str() const;
+
+    size_t size() const;
+
+    CoreCoord grid_size() const;
+
+    class CoreIterator {
+       public:
+        CoreIterator(const CoreCoord &current, const CoreRange &core_range);
+
+        CoreCoord &operator*();
+
+        CoreIterator &operator++();
+
+        bool operator==(const CoreIterator &other) const;
+
+        bool operator!=(const CoreIterator &other) const;
+
+       private:
+        CoreCoord current_;
+        const CoreRange &range_;
+    };
+
+    CoreIterator begin() const;
+
+    CoreIterator end() const;
+};
+
+constexpr bool operator==(const CoreRange &a, const CoreRange &b) {
+    return a.start_coord == b.start_coord && a.end_coord == b.end_coord;
+}
+
+constexpr bool operator!=(const CoreRange &a, const CoreRange &b) { return !(a == b); }
+
+constexpr bool operator<(const CoreRange &left, const CoreRange &right) {
+    return (
+        left.start_coord < right.start_coord ||
+        (left.start_coord == right.start_coord && left.end_coord < right.end_coord));
+}
+
+template <>
+struct fmt::formatter<CoreRange> {
+    constexpr auto parse(format_parse_context &ctx) -> format_parse_context::iterator { return ctx.end(); }
+
+    auto format(const CoreRange &core_range, format_context &ctx) const -> format_context::iterator;
+};
+
+class CoreRangeSet {
+   public:
+    CoreRangeSet(const std::vector<CoreRange> &core_ranges);
+
+    CoreRangeSet(const std::set<CoreRange> &core_ranges);
+
+    CoreRangeSet(const CoreRange &core_range);
+
+    CoreRangeSet() = default;
+
+    friend void swap(CoreRangeSet &first, CoreRangeSet &second);
+
+    CoreRangeSet(const CoreRangeSet &other);
+
+    CoreRangeSet &operator=(const CoreRangeSet &other);
+
+    CoreRangeSet(CoreRangeSet &&other);
+
+    CoreRangeSet &operator=(CoreRangeSet &&other);
+
+    CoreRangeSet(std::vector<CoreRange> &&core_ranges);
+
+    size_t size() const;
+
+    template <typename T>
+    CoreRangeSet merge(const T &other) const;
+
+    bool core_coord_in_core_ranges(const CoreCoord &core_coord) const;
+
+    bool intersects(const CoreRange &cr) const;
+
+    const std::vector<CoreRange> &ranges() const;
+
+    std::string str() const;
+
+    uint32_t num_cores() const;
+
+    CoreRange bounding_box() const;
+
+   private:
+    void validate_no_overlap();
+
+    mutable std::mutex ranges_guard;
+    std::vector<CoreRange> ranges_;
+};
+
+bool operator==(const CoreRangeSet &a, const CoreRangeSet &b);
+
+std::vector<CoreCoord> grid_to_cores(
+    uint32_t num_cores, uint32_t grid_size_x, uint32_t grid_size_y, bool row_wise = false);
+
+std::vector<CoreCoord> grid_to_cores(CoreCoord start, CoreCoord end, bool row_wise = false);
+
+// Noop cores are appended at the end with no guarantees on ordering
+std::vector<CoreCoord> grid_to_cores_with_noop(
+    const uint32_t bbox_x,
+    const uint32_t bbox_y,
+    const uint32_t grid_size_x,
+    const uint32_t grid_size_y,
+    const bool row_wise = false);
+
+std::vector<CoreCoord> corerange_to_cores(
+    const CoreRangeSet &crs, std::optional<uint32_t> max_cores = std::nullopt, bool row_wise = false);
+
+bool operator!=(const CoreRangeSet &a, const CoreRangeSet &b);
+
+template <>
+struct fmt::formatter<CoreRangeSet> {
+    constexpr auto parse(format_parse_context &ctx) -> format_parse_context::iterator { return ctx.end(); }
+
+    auto format(const CoreRangeSet &core_range_set, format_context &ctx) const -> format_context::iterator;
+};
+
+// Adding to tt::tt_metal namespace as we transition to moving this out of global namespace eventually.
+namespace tt::tt_metal {
+using ::CoreCoord;
+using ::CoreRange;
+using ::CoreRangeSet;
+}  // namespace tt::tt_metal
+
+namespace std {
+
+template <>
+struct hash<CoreRange> {
+    std::size_t operator()(const CoreRange &core_range) const;
+};
+
+template <>
+struct hash<RelativeCoreCoord> {
+    std::size_t operator()(RelativeCoreCoord const &o) const;
+};
+
+template <>
+struct hash<CoreRangeSet> {
+    std::size_t operator()(const CoreRangeSet &core_range_set) const;
+};
+
+}  // namespace std
+
+namespace tt::stl::json {
+
+template <>
+struct to_json_t<CoreCoord> {
+    nlohmann::json operator()(const CoreCoord &core_coord) noexcept;
+};
+
+template <>
+struct from_json_t<CoreCoord> {
+    CoreCoord operator()(const nlohmann::json &json) noexcept;
+};
+
+template <>
+struct to_json_t<RelativeCoreCoord> {
+    nlohmann::json operator()(const RelativeCoreCoord &relative_core_coord) noexcept;
+};
+
+template <>
+struct from_json_t<RelativeCoreCoord> {
+    RelativeCoreCoord operator()(const nlohmann::json &json) noexcept;
+};
+
+template <>
+struct to_json_t<CoreRange> {
+    nlohmann::json operator()(const CoreRange &core_range) noexcept;
+};
+
+template <>
+struct from_json_t<CoreRange> {
+    CoreRange operator()(const nlohmann::json &json) noexcept;
+};
+
+template <>
+struct to_json_t<CoreRangeSet> {
+    nlohmann::json operator()(const CoreRangeSet &core_range_set) noexcept;
+};
+
+template <>
+struct from_json_t<CoreRangeSet> {
+    CoreRangeSet operator()(const nlohmann::json &json) noexcept;
+};
+
+}  // namespace tt::stl::json
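With the storage switched from std::set to std::vector, construction now keeps ranges in caller order and validates overlap once up front. A usage sketch against the declarations above, with illustrative values:

static void example_core_range_set() {
    std::vector<CoreRange> rs = {CoreRange({0, 0}, {3, 3}), CoreRange({5, 0}, {6, 3})};
    CoreRangeSet crs(std::move(rs));  // validate_no_overlap() runs in the constructor
    uint32_t n = crs.num_cores();     // 4*4 + 2*4 = 24
    (void)n;
}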
#include "third_party/umd/device/tt_soc_descriptor.h" //! tt_SocDescriptor contains information regarding the SOC configuration targetted. diff --git a/tt_metal/common/work_split.hpp b/tt_metal/common/work_split.hpp index c4c3153cc59..d0b2087759b 100644 --- a/tt_metal/common/work_split.hpp +++ b/tt_metal/common/work_split.hpp @@ -9,7 +9,7 @@ #pragma once #include "tt_metal/common/assert.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "tt_metal/common/math.hpp" #include "tt_metal/host_api.hpp" @@ -174,7 +174,7 @@ inline std::tuple(all_cores.ranges().begin(), all_cores.ranges().end()); // Uneven division of units across cores // This case should only be hit when there are more units of work than a full grid of cores // which is implicitly assumed in the following logic diff --git a/tt_metal/detail/tt_metal.hpp b/tt_metal/detail/tt_metal.hpp index 5ff82e13f43..d362bee71a2 100644 --- a/tt_metal/detail/tt_metal.hpp +++ b/tt_metal/detail/tt_metal.hpp @@ -9,7 +9,7 @@ #include "tt_metal/third_party/umd/device/tt_cluster_descriptor_types.h" #include "tt_metal/third_party/umd/device/tt_soc_descriptor.h" #include "tt_metal/hostdevcommon/common_values.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "tt_metal/impl/dispatch/dispatch_core_manager.hpp" namespace tt::tt_metal { diff --git a/tt_metal/graph/graph_tracking.hpp b/tt_metal/graph/graph_tracking.hpp index dcea7b8dcd9..a39cf3d4a56 100644 --- a/tt_metal/graph/graph_tracking.hpp +++ b/tt_metal/graph/graph_tracking.hpp @@ -9,7 +9,7 @@ #include #include -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "tt_metal/impl/buffers/buffer.hpp" namespace tt::tt_metal { diff --git a/tt_metal/hostdevcommon/common_runtime_address_map.h b/tt_metal/hostdevcommon/common_runtime_address_map.h index 308f1b21b99..3b1d25268bc 100644 --- a/tt_metal/hostdevcommon/common_runtime_address_map.h +++ b/tt_metal/hostdevcommon/common_runtime_address_map.h @@ -13,11 +13,9 @@ * This file contains addresses that are visible to both host and device compiled code. */ -// Kernel config buffer is WIP -// Size is presently based on the old sizes of the RTAs + CB config + Sems -// plus some extra space freed up in the mem map +// TODO: move this to the memory manager, make configurable through the API constexpr static std::uint32_t L1_KERNEL_CONFIG_BASE = MEM_MAP_END; -constexpr static std::uint32_t L1_KERNEL_CONFIG_SIZE = 4 * 1024 + 256 + 128 + 512; +constexpr static std::uint32_t L1_KERNEL_CONFIG_SIZE = 69 * 1024; constexpr static std::uint32_t NUM_CIRCULAR_BUFFERS = 32; constexpr static std::uint32_t UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG = 4; diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index a3b22ccfdf1..8b59ec9bc0e 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -169,13 +169,13 @@ void set_deassert_addresses() { #endif } -void l1_to_ncrisc_iram_copy(uint16_t size, uint32_t address_offset = 0) { +void l1_to_ncrisc_iram_copy(uint32_t src_addr, uint16_t size, uint32_t address_offset = 0) { #ifdef NCRISC_HAS_IRAM // Always copy ncrisc even if its size is 0 (save branch)... 
// Copy NCRISC firmware from L1 to local IRAM using tensix DMA tdma_xmov( TDMA_MOVER0, - (MEM_NCRISC_INIT_IRAM_L1_BASE >> 4) + address_offset, + src_addr, MEM_MOVER_VIEW_IRAM_BASE_ADDR + address_offset, size, XMOV_L1_TO_L0); @@ -267,16 +267,22 @@ void init_sync_registers() { } } -inline void deassert_ncrisc_trisc() { - // Below sets ncrisc to go so we can wait until it is cleared on first iteration - mailboxes->slave_sync.all = RUN_SYNC_MSG_ALL_SLAVES_DONE; - +inline void init_ncrisc_iram() { +#ifdef NCRISC_HAS_IRAM uint16_t fw_size16 = mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.ncrisc_kernel_size16; ncrisc_kernel_start_offset16 = fw_size16; // Copies from L1 to IRAM on chips where NCRISC has IRAM - l1_to_ncrisc_iram_copy(fw_size16); + l1_to_ncrisc_iram_copy(MEM_NCRISC_INIT_IRAM_L1_BASE >> 4, fw_size16); l1_to_ncrisc_iram_copy_wait(); +#endif +} + +inline void deassert_ncrisc_trisc() { + // Below sets ncrisc to go so we can wait until it is cleared on first iteration + mailboxes->slave_sync.all = RUN_SYNC_MSG_ALL_SLAVES_DONE; + + init_ncrisc_iram(); // Bring ncrisc/triscs out of reset deassert_all_reset(); @@ -400,8 +406,13 @@ int main() { DeviceValidateProfiler(launch_msg_address->kernel_config.enables); DeviceZoneSetCounter(launch_msg_address->kernel_config.host_assigned_id); // Copies from L1 to IRAM on chips where NCRISC has IRAM - l1_to_ncrisc_iram_copy(launch_msg_address->kernel_config.ncrisc_kernel_size16, ncrisc_kernel_start_offset16); - + uint32_t kernel_config_base = firmware_config_init(mailboxes, ProgrammableCoreType::TENSIX, DISPATCH_CLASS_TENSIX_DM0); + int ncrisc_index = static_cast<std::underlying_type<TensixProcessorTypes>::type>(TensixProcessorTypes::DM1); + uint32_t ncrisc_kernel_src_address = + kernel_config_base + launch_msg_address->kernel_config.kernel_text_offset[ncrisc_index]; + l1_to_ncrisc_iram_copy(ncrisc_kernel_src_address >> 4, + launch_msg_address->kernel_config.ncrisc_kernel_size16, + ncrisc_kernel_start_offset16); // Invalidate the i$ now the kernels have loaded and before running volatile tt_reg_ptr uint32_t* cfg_regs = core.cfg_regs_base(0); cfg_regs[RISCV_IC_INVALIDATE_InvalidateAll_ADDR32] = RISCV_IC_BRISC_MASK | RISCV_IC_TRISC_ALL_MASK | RISCV_IC_NCRISC_MASK; @@ -423,7 +434,6 @@ int main() { } prev_noc_mode = noc_mode; - uint32_t kernel_config_base = firmware_config_init(mailboxes, ProgrammableCoreType::TENSIX, DISPATCH_CLASS_TENSIX_DM0); uint32_t tt_l1_ptr *cb_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + launch_msg_address->kernel_config.cb_offset); setup_cb_read_write_interfaces(cb_l1_base, 0, num_cbs_to_early_init, true, true, false); @@ -433,10 +443,13 @@ int main() { WAYPOINT("R"); if (enables & DISPATCH_CLASS_MASK_TENSIX_ENABLE_DM0) { setup_cb_read_write_interfaces(cb_l1_base, num_cbs_to_early_init, launch_msg_address->kernel_config.max_cb_index, true, true, false); - kernel_init(); + int index = static_cast<std::underlying_type<TensixProcessorTypes>::type>(TensixProcessorTypes::DM0); + void (*kernel_address)(uint32_t) = (void (*)(uint32_t)) + (kernel_config_base + launch_msg_address->kernel_config.kernel_text_offset[index]); + (*kernel_address)((uint32_t)kernel_address); RECORD_STACK_USAGE(); } else { - // This was not initialized in kernel_init + // This was not initialized in the kernel if (noc_mode == DM_DEDICATED_NOC) { noc_local_state_init(noc_index); } diff --git a/tt_metal/hw/firmware/src/brisck.cc b/tt_metal/hw/firmware/src/brisck.cc index 7b01d4ba354..f9f04eec011 100644 --- a/tt_metal/hw/firmware/src/brisck.cc +++ b/tt_metal/hw/firmware/src/brisck.cc @@ -19,8 +19,9 @@ #include extern uint32_t
__kernel_init_local_l1_base[]; +extern uint32_t __fw_export_end_text[]; -void kernel_launch() { +void kernel_launch(uint32_t kernel_base_addr) { #if defined(DEBUG_NULL_KERNELS) && !defined(DISPATCH_KERNEL) #ifdef KERNEL_RUN_TIME @@ -28,7 +29,7 @@ void kernel_launch() { while (c_tensix_core::read_wall_clock() < end_time); #endif #else - firmware_kernel_common_init((void tt_l1_ptr *)(__kernel_init_local_l1_base)); + firmware_kernel_common_init((void tt_l1_ptr *)(kernel_base_addr + (uint32_t) __kernel_init_local_l1_base - (uint32_t)__fw_export_end_text)); if constexpr (NOC_MODE == DM_DEDICATED_NOC) { noc_local_state_init(NOC_INDEX); diff --git a/tt_metal/hw/firmware/src/erisc.cc b/tt_metal/hw/firmware/src/erisc.cc index 7660022bc55..5feedb03d3a 100644 --- a/tt_metal/hw/firmware/src/erisc.cc +++ b/tt_metal/hw/firmware/src/erisc.cc @@ -72,10 +72,13 @@ void __attribute__((section("erisc_l1_code.1"), noinline)) Application(void) { launch_msg_t* launch_msg_address = &(mailboxes->launch[launch_msg_rd_ptr]); DeviceValidateProfiler(launch_msg_address->kernel_config.enables); DeviceZoneSetCounter(launch_msg_address->kernel_config.host_assigned_id); + // Note that a core may get "GO" w/ enable false to keep its launch_msg's in sync enum dispatch_core_processor_masks enables = (enum dispatch_core_processor_masks)launch_msg_address->kernel_config.enables; if (enables & DISPATCH_CLASS_MASK_ETH_DM0) { + WAYPOINT("R"); firmware_config_init(mailboxes, ProgrammableCoreType::ACTIVE_ETH, DISPATCH_CLASS_ETH_DM0); - kernel_init(); + kernel_init(0); + WAYPOINT("D"); } mailboxes->go_message.signal = RUN_MSG_DONE; @@ -89,7 +92,6 @@ void __attribute__((section("erisc_l1_code.1"), noinline)) Application(void) { // Only executed if watcher is enabled. Ensures that we don't report stale data due to invalid launch messages in the ring buffer CLEAR_PREVIOUS_LAUNCH_MESSAGE_ENTRY_FOR_WATCHER(); } - WAYPOINT("R"); } else if (go_message_signal == RUN_MSG_RESET_READ_PTR) { // Reset the launch message buffer read ptr diff --git a/tt_metal/hw/firmware/src/erisck.cc b/tt_metal/hw/firmware/src/erisck.cc index d6e916728f5..b977fa9aab9 100644 --- a/tt_metal/hw/firmware/src/erisck.cc +++ b/tt_metal/hw/firmware/src/erisck.cc @@ -23,7 +23,7 @@ CBInterface cb_interface[NUM_CIRCULAR_BUFFERS]; -void __attribute__((section("erisc_l1_code"))) kernel_launch() { +void __attribute__((section("erisc_l1_code"))) kernel_launch(uint32_t) { DeviceZoneScopedMainChildN("ERISC-KERNEL"); rtos_context_switch_ptr = (void (*)())RtosTable[0]; diff --git a/tt_metal/hw/firmware/src/idle_erisc.cc b/tt_metal/hw/firmware/src/idle_erisc.cc index 518b33f544c..d0c779bec52 100644 --- a/tt_metal/hw/firmware/src/idle_erisc.cc +++ b/tt_metal/hw/firmware/src/idle_erisc.cc @@ -25,7 +25,6 @@ #include "debug/watcher_common.h" #include "debug/waypoint.h" -#include "debug/dprint.h" #include "debug/stack_usage.h" uint8_t noc_index; @@ -135,7 +134,10 @@ int main() { // Run the ERISC kernel WAYPOINT("R"); - kernel_init(); + int index = static_cast<std::underlying_type<EthProcessorTypes>::type>(EthProcessorTypes::DM0); + void (*kernel_address)(uint32_t) = (void (*)(uint32_t)) + (kernel_config_base + mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.kernel_text_offset[index]); + (*kernel_address)((uint32_t)kernel_address); RECORD_STACK_USAGE(); WAYPOINT("D"); mailboxes->go_message.signal = RUN_MSG_DONE; diff --git a/tt_metal/hw/firmware/src/idle_erisck.cc b/tt_metal/hw/firmware/src/idle_erisck.cc index 99f000c3de6..756c71d0448 100644 --- a/tt_metal/hw/firmware/src/idle_erisck.cc +++
b/tt_metal/hw/firmware/src/idle_erisck.cc @@ -22,10 +22,12 @@ #include extern uint32_t __kernel_init_local_l1_base[]; +extern uint32_t __fw_export_end_text[]; -void kernel_launch() { +void kernel_launch(uint32_t kernel_base_addr) { DeviceZoneScopedMainChildN("ERISC-KERNEL"); - firmware_kernel_common_init((void tt_l1_ptr *)__kernel_init_local_l1_base); + + firmware_kernel_common_init((void tt_l1_ptr *)(kernel_base_addr + (uint32_t) __kernel_init_local_l1_base - (uint32_t)__fw_export_end_text)); noc_local_state_init(NOC_INDEX); diff --git a/tt_metal/hw/firmware/src/ncrisc.cc b/tt_metal/hw/firmware/src/ncrisc.cc index 48b735afd9a..d5cb2b614f9 100644 --- a/tt_metal/hw/firmware/src/ncrisc.cc +++ b/tt_metal/hw/firmware/src/ncrisc.cc @@ -91,13 +91,23 @@ int main(int argc, char *argv[]) { notify_brisc_and_wait(); DeviceZoneScopedMainN("NCRISC-FW"); + uint32_t launch_msg_rd_ptr = mailboxes->launch_msg_rd_ptr; + launch_msg_t* launch_msg = &(mailboxes->launch[launch_msg_rd_ptr]); + uint32_t kernel_config_base = firmware_config_init(mailboxes, ProgrammableCoreType::TENSIX, DISPATCH_CLASS_TENSIX_DM1); uint32_t tt_l1_ptr *cb_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + - mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.cb_offset); - setup_cb_read_write_interfaces(cb_l1_base, 0, mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.max_cb_index, true, true, false); - + launch_msg->kernel_config.cb_offset); + setup_cb_read_write_interfaces(cb_l1_base, 0, launch_msg->kernel_config.max_cb_index, true, true, false); WAYPOINT("R"); - kernel_init(); + + int index = static_cast<std::underlying_type<TensixProcessorTypes>::type>(TensixProcessorTypes::DM1); + void (*kernel_address)(uint32_t) = (void (*)(uint32_t)) + (kernel_config_base + launch_msg->kernel_config.kernel_text_offset[index]); +#ifdef ARCH_BLACKHOLE + (*kernel_address)((uint32_t)kernel_address); +#else + kernel_init((uint32_t)kernel_address); +#endif RECORD_STACK_USAGE(); WAYPOINT("D"); diff --git a/tt_metal/hw/firmware/src/ncrisck.cc b/tt_metal/hw/firmware/src/ncrisck.cc index f59e2ce313e..6f24d5b107b 100644 --- a/tt_metal/hw/firmware/src/ncrisck.cc +++ b/tt_metal/hw/firmware/src/ncrisck.cc @@ -27,8 +27,9 @@ uint32_t noc_nonposted_atomics_acked[NUM_NOCS]; uint32_t noc_posted_writes_num_issued[NUM_NOCS]; extern uint32_t __kernel_init_local_l1_base[]; +extern uint32_t __fw_export_end_text[]; -void kernel_launch() { +void kernel_launch(uint32_t kernel_base_addr) { DeviceZoneScopedMainChildN("NCRISC-KERNEL"); #if defined(DEBUG_NULL_KERNELS) && !defined(DISPATCH_KERNEL) @@ -37,11 +38,8 @@ void kernel_launch() { while (c_tensix_core::read_wall_clock() < KERNEL_RUN_TIME); #endif #else -#ifdef ARCH_BLACKHOLE - firmware_kernel_common_init((void tt_l1_ptr *)__kernel_init_local_l1_base); -#else - firmware_kernel_common_init((void tt_l1_ptr *)(MEM_NCRISC_INIT_IRAM_L1_BASE + (uint32_t)__kernel_init_local_l1_base - MEM_NCRISC_IRAM_BASE)); -#endif + + firmware_kernel_common_init((void tt_l1_ptr *)(kernel_base_addr + (uint32_t) __kernel_init_local_l1_base - (uint32_t)__fw_export_end_text)); if constexpr (NOC_MODE == DM_DEDICATED_NOC) { noc_local_state_init(NOC_INDEX); diff --git a/tt_metal/hw/firmware/src/trisc.cc b/tt_metal/hw/firmware/src/trisc.cc index f71c698167e..505e0bce3bf 100644 --- a/tt_metal/hw/firmware/src/trisc.cc +++ b/tt_metal/hw/firmware/src/trisc.cc @@ -94,21 +94,27 @@ int main(int argc, char *argv[]) { while (*trisc_run != RUN_SYNC_MSG_GO); DeviceZoneScopedMainN("TRISC-FW"); - uint32_t kernel_config_base =
mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.kernel_config_base[ProgrammableCoreType::TENSIX]; + uint32_t launch_msg_rd_ptr = mailboxes->launch_msg_rd_ptr; + launch_msg_t* launch_msg = &(mailboxes->launch[launch_msg_rd_ptr]); + + uint32_t kernel_config_base = launch_msg->kernel_config.kernel_config_base[ProgrammableCoreType::TENSIX]; #if !defined(UCK_CHLKC_MATH) uint32_t tt_l1_ptr *cb_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + - mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.cb_offset); - setup_cb_read_write_interfaces(cb_l1_base, 0, mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.max_cb_index, cb_init_read, cb_init_write, cb_init_write); + launch_msg->kernel_config.cb_offset); + setup_cb_read_write_interfaces(cb_l1_base, 0, launch_msg->kernel_config.max_cb_index, cb_init_read, cb_init_write, cb_init_write); #endif rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + - mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.rta_offset[DISPATCH_CLASS_TENSIX_COMPUTE].rta_offset); + launch_msg->kernel_config.rta_offset[DISPATCH_CLASS_TENSIX_COMPUTE].rta_offset); crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + - mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.rta_offset[DISPATCH_CLASS_TENSIX_COMPUTE].crta_offset); + launch_msg->kernel_config.rta_offset[DISPATCH_CLASS_TENSIX_COMPUTE].crta_offset); WAYPOINT("R"); - kernel_init(); + int index = static_cast<std::underlying_type<TensixProcessorTypes>::type>(TensixProcessorTypes::MATH0) + thread_id; + void (*kernel_address)(uint32_t) = (void (*)(uint32_t)) + (kernel_config_base + launch_msg->kernel_config.kernel_text_offset[index]); + (*kernel_address)((uint32_t)kernel_address); RECORD_STACK_USAGE(); WAYPOINT("D"); diff --git a/tt_metal/hw/firmware/src/trisck.cc b/tt_metal/hw/firmware/src/trisck.cc index f6c1cb57a38..862c2964808 100644 --- a/tt_metal/hw/firmware/src/trisck.cc +++ b/tt_metal/hw/firmware/src/trisck.cc @@ -34,8 +34,9 @@ volatile tt_reg_ptr uint * mailbox_base[4] = { } extern uint32_t __kernel_init_local_l1_base[]; +extern uint32_t __fw_export_end_text[]; -void kernel_launch() +void kernel_launch(uint32_t kernel_base_addr) { DeviceZoneScopedMainChildN("TRISC-KERNEL"); #if defined(DEBUG_NULL_KERNELS) && !defined(DISPATCH_KERNEL) @@ -43,7 +44,7 @@ void kernel_launch() ckernel::wait(KERNEL_RUN_TIME); #endif #else - firmware_kernel_common_init((void tt_l1_ptr *)(__kernel_init_local_l1_base)); + firmware_kernel_common_init((void tt_l1_ptr *)(kernel_base_addr + (uint32_t) __kernel_init_local_l1_base - (uint32_t)__fw_export_end_text)); #if defined(UCK_CHLKC_UNPACK) // Make sure DBG_FEATURE_DISABLE register is cleared before every kernel is executed diff --git a/tt_metal/hw/inc/blackhole/core_config.h b/tt_metal/hw/inc/blackhole/core_config.h index 497ff5512e5..bde8ce444b0 100644 --- a/tt_metal/hw/inc/blackhole/core_config.h +++ b/tt_metal/hw/inc/blackhole/core_config.h @@ -24,21 +24,26 @@ enum class AddressableCoreType : uint8_t { }; enum class TensixProcessorTypes : uint8_t { - BRISC = 0, - NCRISC = 1, - TRISC0 = 2, - TRISC1 = 3, - TRISC2 = 4, - DM0 = 0, DM1 = 1, MATH0 = 2, MATH1 = 3, MATH2 = 4, + COUNT = 5 }; enum class EthProcessorTypes : uint8_t { DM0 = 0, + DM1 = 1, + COUNT = 2 +}; + +enum class DramProcessorTypes : uint8_t { + DM0 = 0, + COUNT = 1 }; constexpr uint8_t MaxProcessorsPerCoreType = 5; +constexpr uint8_t NumTensixDispatchClasses = 3; +constexpr uint8_t NumEthDispatchClasses = 1; +constexpr uint8_t NumDramDispatchClasses = 1; diff --git a/tt_metal/hw/inc/blackhole/dev_mem_map.h
b/tt_metal/hw/inc/blackhole/dev_mem_map.h index 274f787f58a..8afc35b6000 100644 --- a/tt_metal/hw/inc/blackhole/dev_mem_map.h +++ b/tt_metal/hw/inc/blackhole/dev_mem_map.h @@ -43,11 +43,19 @@ ///////////// // Firmware/kernel code holes -#define MEM_BRISC_FIRMWARE_SIZE (10 * 1024 + MEM_BRISC_LOCAL_SIZE) -#define MEM_NCRISC_FIRMWARE_SIZE (16 * 1024 + MEM_NCRISC_LOCAL_SIZE) -#define MEM_TRISC0_FIRMWARE_SIZE (16 * 1024 + MEM_TRISC_LOCAL_SIZE) -#define MEM_TRISC1_FIRMWARE_SIZE (16 * 1024 + MEM_TRISC_LOCAL_SIZE) -#define MEM_TRISC2_FIRMWARE_SIZE (16 * 1024 + MEM_TRISC_LOCAL_SIZE) +#define MEM_BRISC_FIRMWARE_SIZE (5 * 1024) +// TODO: perhaps put NCRISC FW in the scratch area and free 1.5K after init (GS/WH) +#define MEM_NCRISC_FIRMWARE_SIZE 1536 +#define MEM_TRISC0_FIRMWARE_SIZE 1536 +#define MEM_TRISC1_FIRMWARE_SIZE 1536 +#define MEM_TRISC2_FIRMWARE_SIZE 1536 + +#define MEM_BRISC_KERNEL_SIZE (24 * 1024) +#define MEM_NCRISC_KERNEL_SIZE (24 * 1024) +#define MEM_TRISC0_KERNEL_SIZE (24 * 1024) +#define MEM_TRISC1_KERNEL_SIZE (24 * 1024) +#define MEM_TRISC2_KERNEL_SIZE (24 * 1024) + #define MEM_ZEROS_SIZE 512 #define MEM_BOOT_CODE_BASE 0 diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h index 7c771096094..56ba958d3cf 100644 --- a/tt_metal/hw/inc/dataflow_api.h +++ b/tt_metal/hw/inc/dataflow_api.h @@ -1621,9 +1621,9 @@ inline void RISC_POST_HEARTBEAT(uint32_t &heartbeat) { FORCE_INLINE uint32_t min(uint32_t a, uint32_t b) { return (a < b) ? a: b; } -template +template FORCE_INLINE -uint32_t noc_async_read_tile_dram_sharded_set_state(uint32_t bank_base_address, uint32_t bank_id = 0, const uint32_t vc = 0, uint8_t noc = noc_index) { +uint32_t noc_async_read_tile_dram_sharded_set_state(uint32_t bank_base_address, uint32_t page_size, uint32_t bank_id = 0, const uint32_t vc = 0, uint8_t noc = noc_index) { uint32_t src_addr_; uint32_t src_noc_xy; diff --git a/tt_metal/hw/inc/dev_msgs.h b/tt_metal/hw/inc/dev_msgs.h index 0b027259c6a..60a0030110b 100644 --- a/tt_metal/hw/inc/dev_msgs.h +++ b/tt_metal/hw/inc/dev_msgs.h @@ -90,8 +90,6 @@ struct kernel_config_msg_t { volatile uint16_t watcher_kernel_ids[DISPATCH_CLASS_MAX]; volatile uint16_t ncrisc_kernel_size16; // size in 16 byte units - volatile uint16_t host_assigned_id; - // Ring buffer of kernel configuration data volatile uint32_t kernel_config_base[static_cast(ProgrammableCoreType::COUNT)]; volatile uint16_t sem_offset[static_cast(ProgrammableCoreType::COUNT)]; @@ -99,6 +97,8 @@ struct kernel_config_msg_t { rta_offset_t rta_offset[DISPATCH_CLASS_MAX]; volatile uint32_t kernel_text_offset[MaxProcessorsPerCoreType]; + volatile uint16_t host_assigned_id; + volatile uint8_t mode; // dispatch mode host/dev volatile uint8_t brisc_noc_id; volatile uint8_t brisc_noc_mode; diff --git a/tt_metal/hw/inc/firmware_common.h b/tt_metal/hw/inc/firmware_common.h index 64d55851522..fd048640f3c 100644 --- a/tt_metal/hw/inc/firmware_common.h +++ b/tt_metal/hw/inc/firmware_common.h @@ -21,8 +21,8 @@ extern uint32_t __ldm_data_end[]; extern void (* __init_array_start[])(); extern void (* __init_array_end[])(); -extern void kernel_init(); -extern void kernel_launch(); +extern void kernel_init(uint32_t kernel_init); +extern void kernel_launch(uint32_t kernel_base_addr); inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) { // Cover L1 load latency of 6 cycles for the bulk of the copy diff --git a/tt_metal/hw/inc/grayskull/core_config.h b/tt_metal/hw/inc/grayskull/core_config.h index 
f347da707d4..a2e261a1de3 100644 --- a/tt_metal/hw/inc/grayskull/core_config.h +++ b/tt_metal/hw/inc/grayskull/core_config.h @@ -22,17 +22,13 @@ enum class AddressableCoreType : uint8_t { }; enum class TensixProcessorTypes : uint8_t { - BRISC = 0, - NCRISC = 1, - TRISC0 = 2, - TRISC1 = 3, - TRISC2 = 4, - DM0 = 0, DM1 = 1, MATH0 = 2, MATH1 = 3, MATH2 = 4, + COUNT = 5 }; constexpr uint8_t MaxProcessorsPerCoreType = 5; +constexpr uint8_t NumTensixDispatchClasses = 3; diff --git a/tt_metal/hw/inc/grayskull/dev_mem_map.h b/tt_metal/hw/inc/grayskull/dev_mem_map.h index b7e92831929..793bd1ba789 100644 --- a/tt_metal/hw/inc/grayskull/dev_mem_map.h +++ b/tt_metal/hw/inc/grayskull/dev_mem_map.h @@ -46,11 +46,18 @@ ///////////// // Firmware/kernel code holes -#define MEM_BRISC_FIRMWARE_SIZE (10 * 1024 + MEM_BRISC_LOCAL_SIZE) -#define MEM_NCRISC_FIRMWARE_SIZE (16 * 1024) -#define MEM_TRISC0_FIRMWARE_SIZE (16 * 1024 + MEM_TRISC_LOCAL_SIZE) -#define MEM_TRISC1_FIRMWARE_SIZE (16 * 1024 + MEM_TRISC_LOCAL_SIZE) -#define MEM_TRISC2_FIRMWARE_SIZE (16 * 1024 + MEM_TRISC_LOCAL_SIZE) +#define MEM_BRISC_FIRMWARE_SIZE (5 * 1024) +// TODO: perhaps put NCRISC FW in the scratch area and free 1.5K after init (GS/WH) +#define MEM_NCRISC_FIRMWARE_SIZE 1536 +#define MEM_TRISC0_FIRMWARE_SIZE 1536 +#define MEM_TRISC1_FIRMWARE_SIZE 1536 +#define MEM_TRISC2_FIRMWARE_SIZE 1536 + +#define MEM_BRISC_KERNEL_SIZE (24 * 1024) +#define MEM_NCRISC_KERNEL_SIZE MEM_NCRISC_IRAM_SIZE +#define MEM_TRISC0_KERNEL_SIZE (24 * 1024) +#define MEM_TRISC1_KERNEL_SIZE (24 * 1024) +#define MEM_TRISC2_KERNEL_SIZE (24 * 1024) #define MEM_ZEROS_SIZE 512 diff --git a/tt_metal/hw/inc/wormhole/core_config.h b/tt_metal/hw/inc/wormhole/core_config.h index 497ff5512e5..4d30c98584b 100644 --- a/tt_metal/hw/inc/wormhole/core_config.h +++ b/tt_metal/hw/inc/wormhole/core_config.h @@ -24,21 +24,19 @@ enum class AddressableCoreType : uint8_t { }; enum class TensixProcessorTypes : uint8_t { - BRISC = 0, - NCRISC = 1, - TRISC0 = 2, - TRISC1 = 3, - TRISC2 = 4, - DM0 = 0, DM1 = 1, MATH0 = 2, MATH1 = 3, MATH2 = 4, + COUNT = 5 }; enum class EthProcessorTypes : uint8_t { DM0 = 0, + COUNT = 1 }; constexpr uint8_t MaxProcessorsPerCoreType = 5; +constexpr uint8_t NumTensixDispatchClasses = 3; +constexpr uint8_t NumEthDispatchClasses = 1; diff --git a/tt_metal/hw/inc/wormhole/dev_mem_map.h b/tt_metal/hw/inc/wormhole/dev_mem_map.h index 1f6e55da51e..5f78ec1b810 100644 --- a/tt_metal/hw/inc/wormhole/dev_mem_map.h +++ b/tt_metal/hw/inc/wormhole/dev_mem_map.h @@ -47,11 +47,18 @@ ///////////// // Firmware/kernel code holes -#define MEM_BRISC_FIRMWARE_SIZE (10 * 1024 + MEM_BRISC_LOCAL_SIZE) -#define MEM_NCRISC_FIRMWARE_SIZE (16 * 1024) -#define MEM_TRISC0_FIRMWARE_SIZE (16 * 1024 + MEM_TRISC_LOCAL_SIZE) -#define MEM_TRISC1_FIRMWARE_SIZE (16 * 1024 + MEM_TRISC_LOCAL_SIZE) -#define MEM_TRISC2_FIRMWARE_SIZE (16 * 1024 + MEM_TRISC_LOCAL_SIZE) +#define MEM_BRISC_FIRMWARE_SIZE (5 * 1024) +// TODO: perhaps put NCRISC FW in the scratch area and free 1.5K after init (GS/WH) +#define MEM_NCRISC_FIRMWARE_SIZE 1536 +#define MEM_TRISC0_FIRMWARE_SIZE 1536 +#define MEM_TRISC1_FIRMWARE_SIZE 1536 +#define MEM_TRISC2_FIRMWARE_SIZE 1536 + +#define MEM_BRISC_KERNEL_SIZE (24 * 1024) +#define MEM_NCRISC_KERNEL_SIZE MEM_NCRISC_IRAM_SIZE +#define MEM_TRISC0_KERNEL_SIZE (24 * 1024) +#define MEM_TRISC1_KERNEL_SIZE (24 * 1024) +#define MEM_TRISC2_KERNEL_SIZE (24 * 1024) #define MEM_ZEROS_SIZE 512 @@ -122,6 +129,7 @@ #define MEM_IERISC_MAILBOX_END (MEM_IERISC_MAILBOX_BASE + MEM_IERISC_MAILBOX_SIZE) #define 
MEM_IERISC_FIRMWARE_BASE MEM_IERISC_MAILBOX_END #define MEM_IERISC_MAP_END (MEM_IERISC_FIRMWARE_BASE + MEM_IERISC_FIRMWARE_SIZE) +#define MEM_IERISC_KERNEL_SIZE (24 * 1024) #define MEM_IERISC_INIT_LOCAL_L1_BASE_SCRATCH MEM_IERISC_MAP_END #define MEM_IERISC_STACK_SIZE 1024 #define MEM_IERISC_STACK_BASE (MEM_LOCAL_BASE + MEM_IERISC_LOCAL_SIZE - MEM_IERISC_STACK_SIZE) diff --git a/tt_metal/hw/toolchain/erisc-b0-app.ld b/tt_metal/hw/toolchain/erisc-b0-app.ld index 05f7949596e..4a82d3f2f17 100644 --- a/tt_metal/hw/toolchain/erisc-b0-app.ld +++ b/tt_metal/hw/toolchain/erisc-b0-app.ld @@ -19,4 +19,4 @@ __firmware_global_pointer = ORIGIN(ERISC_DATA) + 0x7f0; INCLUDE "erisc-b0-app-sections.ld" INCLUDE "tensix-address.ld" -_Z11kernel_initv = ORIGIN(REGION_APP_KERNEL_CODE); +_Z11kernel_initm = ORIGIN(REGION_APP_KERNEL_CODE); diff --git a/tt_metal/hw/toolchain/main.ld b/tt_metal/hw/toolchain/main.ld index 4bdc1d8148e..cf62ec943c3 100644 --- a/tt_metal/hw/toolchain/main.ld +++ b/tt_metal/hw/toolchain/main.ld @@ -18,6 +18,8 @@ REGION_ALIAS("REGION_DATA", TARGET_LOCAL_DATA_MEM(LD_TARGET)) REGION_ALIAS("REGION_STACK", TARGET_STACK_MEM(LD_TARGET)) #define FIRMWARE_STACK_SIZE TARGET_STACK_SIZE(LD_TARGET) -#define KERNEL_ENTRY_SYMBOL _Z11kernel_initv +#if defined(TARGET_NCRISC) +#define KERNEL_ENTRY_SYMBOL _Z11kernel_initm +#endif #include "sections.ld" diff --git a/tt_metal/hw/toolchain/memory.ld b/tt_metal/hw/toolchain/memory.ld index 29c4ced9588..d0dcf3bc58b 100644 --- a/tt_metal/hw/toolchain/memory.ld +++ b/tt_metal/hw/toolchain/memory.ld @@ -2,27 +2,51 @@ MEMORY { BRISC_LOCAL_DATA_MEM : ORIGIN = MEM_LOCAL_BASE, LENGTH = MEM_BRISC_LOCAL_SIZE - MEM_BRISC_STACK_SIZE BRISC_STACK_MEM : ORIGIN = MEM_BRISC_STACK_BASE, LENGTH = MEM_BRISC_STACK_SIZE +#if defined(TYPE_FIRMWARE) BRISC_FIRMWARE_CODE : ORIGIN = MEM_BRISC_FIRMWARE_BASE, LENGTH = MEM_BRISC_FIRMWARE_SIZE +#else + BRISC_FIRMWARE_CODE : ORIGIN = MEM_BRISC_FIRMWARE_BASE, LENGTH = MEM_BRISC_KERNEL_SIZE +#endif TRISC0_LOCAL_DATA_MEM : ORIGIN = MEM_LOCAL_BASE, LENGTH = MEM_TRISC_LOCAL_SIZE - MEM_TRISC0_STACK_SIZE TRISC0_STACK_MEM : ORIGIN = MEM_TRISC0_STACK_BASE, LENGTH = MEM_TRISC0_STACK_SIZE +#if defined(TYPE_FIRMWARE) TRISC0_FIRMWARE_CODE : ORIGIN = MEM_TRISC0_FIRMWARE_BASE, LENGTH = MEM_TRISC0_FIRMWARE_SIZE +#else + TRISC0_FIRMWARE_CODE : ORIGIN = MEM_TRISC0_FIRMWARE_BASE, LENGTH = MEM_TRISC0_KERNEL_SIZE +#endif TRISC1_LOCAL_DATA_MEM : ORIGIN = MEM_LOCAL_BASE, LENGTH = MEM_TRISC_LOCAL_SIZE - MEM_TRISC1_STACK_SIZE TRISC1_STACK_MEM : ORIGIN = MEM_TRISC1_STACK_BASE, LENGTH = MEM_TRISC1_STACK_SIZE +#if defined(TYPE_FIRMWARE) TRISC1_FIRMWARE_CODE : ORIGIN = MEM_TRISC1_FIRMWARE_BASE, LENGTH = MEM_TRISC1_FIRMWARE_SIZE +#else + TRISC1_FIRMWARE_CODE : ORIGIN = MEM_TRISC1_FIRMWARE_BASE, LENGTH = MEM_TRISC1_KERNEL_SIZE +#endif TRISC2_LOCAL_DATA_MEM : ORIGIN = MEM_LOCAL_BASE, LENGTH = MEM_TRISC_LOCAL_SIZE - MEM_TRISC2_STACK_SIZE TRISC2_STACK_MEM : ORIGIN = MEM_TRISC2_STACK_BASE, LENGTH = MEM_TRISC2_STACK_SIZE +#if defined(TYPE_FIRMWARE) TRISC2_FIRMWARE_CODE : ORIGIN = MEM_TRISC2_FIRMWARE_BASE, LENGTH = MEM_TRISC2_FIRMWARE_SIZE +#else + TRISC2_FIRMWARE_CODE : ORIGIN = MEM_TRISC2_FIRMWARE_BASE, LENGTH = MEM_TRISC2_KERNEL_SIZE +#endif NCRISC_LOCAL_DATA_MEM : ORIGIN = MEM_LOCAL_BASE, LENGTH = MEM_NCRISC_LOCAL_SIZE - MEM_NCRISC_STACK_SIZE NCRISC_STACK_MEM : ORIGIN = MEM_NCRISC_STACK_BASE, LENGTH = MEM_NCRISC_STACK_SIZE +#if defined(TYPE_FIRMWARE) NCRISC_FIRMWARE_CODE : ORIGIN = MEM_NCRISC_FIRMWARE_BASE, LENGTH = MEM_NCRISC_FIRMWARE_SIZE +#else + NCRISC_FIRMWARE_CODE :
ORIGIN = MEM_NCRISC_FIRMWARE_BASE, LENGTH = MEM_NCRISC_KERNEL_SIZE +#endif #ifdef COMPILE_FOR_IERISC IERISC_LOCAL_DATA_MEM : ORIGIN = MEM_LOCAL_BASE, LENGTH = MEM_IERISC_LOCAL_SIZE - MEM_IERISC_STACK_SIZE IERISC_STACK_MEM : ORIGIN = MEM_IERISC_STACK_BASE, LENGTH = MEM_IERISC_STACK_SIZE +#if defined(TYPE_FIRMWARE) IERISC_FIRMWARE_CODE : ORIGIN = MEM_IERISC_FIRMWARE_BASE, LENGTH = MEM_IERISC_FIRMWARE_SIZE +#else + IERISC_FIRMWARE_CODE : ORIGIN = MEM_IERISC_FIRMWARE_BASE, LENGTH = MEM_IERISC_KERNEL_SIZE +#endif #endif } diff --git a/tt_metal/hw/toolchain/sections.ld b/tt_metal/hw/toolchain/sections.ld index def3cf52669..3c8c487ed24 100644 --- a/tt_metal/hw/toolchain/sections.ld +++ b/tt_metal/hw/toolchain/sections.ld @@ -73,8 +73,10 @@ SECTIONS #if defined(TYPE_FIRMWARE) __fw_export_end_text = .; +#if defined(TARGET_NCRISC) PROVIDE (KERNEL_ENTRY_SYMBOL = __fw_export_end_text); #endif +#endif #if defined(TYPE_KERNEL) __kernel_init_local_l1_base = .; diff --git a/tt_metal/hw/toolchain/tmu-crt0k.S b/tt_metal/hw/toolchain/tmu-crt0k.S index f5d4ec04215..177d79cdb84 100644 --- a/tt_metal/hw/toolchain/tmu-crt0k.S +++ b/tt_metal/hw/toolchain/tmu-crt0k.S @@ -3,5 +3,5 @@ .type _start, @function _start: - tail _Z13kernel_launchv + tail _Z13kernel_launchm .size _start, .-_start diff --git a/tt_metal/impl/allocator/allocator.hpp b/tt_metal/impl/allocator/allocator.hpp index 7d27f5a4ee7..ecb31dfb5c8 100644 --- a/tt_metal/impl/allocator/allocator.hpp +++ b/tt_metal/impl/allocator/allocator.hpp @@ -11,7 +11,7 @@ #include "allocator_types.hpp" #include "tt_metal/common/assert.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "tt_metal/impl/allocator/algorithms/allocator_algorithm.hpp" #include "llrt/hal.hpp" diff --git a/tt_metal/impl/allocator/allocator_types.hpp b/tt_metal/impl/allocator/allocator_types.hpp index d0c9ce0dcd7..572ab2974e5 100644 --- a/tt_metal/impl/allocator/allocator_types.hpp +++ b/tt_metal/impl/allocator/allocator_types.hpp @@ -7,7 +7,7 @@ #include #include #include -#include "common/core_coord.h" +#include "common/core_coord.hpp" #include "hostdevcommon/common_values.hpp" #include "hostdevcommon/common_runtime_address_map.h" #include "dev_mem_map.h" diff --git a/tt_metal/impl/buffers/buffer.hpp b/tt_metal/impl/buffers/buffer.hpp index c77cb98d189..91dd0fd846a 100644 --- a/tt_metal/impl/buffers/buffer.hpp +++ b/tt_metal/impl/buffers/buffer.hpp @@ -9,7 +9,7 @@ #include #include "common/bfloat16.hpp" -#include "common/core_coord.h" +#include "common/core_coord.hpp" #include "common/tt_backend_api_types.hpp" #include "hostdevcommon/common_values.hpp" #include "tt_metal/common/base.hpp" diff --git a/tt_metal/impl/buffers/circular_buffer.hpp b/tt_metal/impl/buffers/circular_buffer.hpp index cac5ad99918..851620f7abd 100644 --- a/tt_metal/impl/buffers/circular_buffer.hpp +++ b/tt_metal/impl/buffers/circular_buffer.hpp @@ -4,7 +4,7 @@ #pragma once -#include "common/core_coord.h" +#include "common/core_coord.hpp" #include "common/tt_backend_api_types.hpp" #include "tt_metal/impl/buffers/circular_buffer_types.hpp" diff --git a/tt_metal/impl/buffers/semaphore.hpp b/tt_metal/impl/buffers/semaphore.hpp index 077fe72a769..d4bdf24a75f 100644 --- a/tt_metal/impl/buffers/semaphore.hpp +++ b/tt_metal/impl/buffers/semaphore.hpp @@ -5,7 +5,7 @@ #pragma once #include "llrt/hal.hpp" -#include "common/core_coord.h" +#include "common/core_coord.hpp" #include "hostdevcommon/common_runtime_address_map.h" #include "tt_metal/third_party/umd/device/tt_soc_descriptor.h" 
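Two details in the toolchain changes above are worth a gloss. The renamed entry symbols (_Z11kernel_initv to _Z11kernel_initm, and _start tail-calling _Z13kernel_launchm) are plain Itanium name mangling tracking the new uint32_t parameter; the comments below assume the rv32 newlib convention where uint32_t aliases unsigned long, whose mangling code is 'm'. The rebase arithmetic restated afterwards is the one each kernel_launch variant in this patch applies:

// Entry-point signatures and the mangled names the .ld files must now reference:
void kernel_init();            // _Z11kernel_initv   (trailing v = empty parameter list)
void kernel_init(uint32_t);    // _Z11kernel_initm   (m = unsigned long on rv32)
void kernel_launch(uint32_t);  // _Z13kernel_launchm (tail-called from tmu-crt0k.S)

// Kernels are linked against a nominal text base but now execute from the kernel
// config region, so the local-L1 init image is found by rebasing the link-time
// symbol by the actual load address:
extern uint32_t __kernel_init_local_l1_base[];  // link-time address of the init image
extern uint32_t __fw_export_end_text[];         // link-time start of the kernel text
static inline uint32_t local_init_image(uint32_t kernel_base_addr) {
    return kernel_base_addr + (uint32_t)__kernel_init_local_l1_base - (uint32_t)__fw_export_end_text;
}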
diff --git a/tt_metal/impl/debug/watcher_device_reader.cpp b/tt_metal/impl/debug/watcher_device_reader.cpp index d9756078879..e5446268717 100644 --- a/tt_metal/impl/debug/watcher_device_reader.cpp +++ b/tt_metal/impl/debug/watcher_device_reader.cpp @@ -5,7 +5,7 @@ #include #include -#include "common/core_coord.h" +#include "common/core_coord.hpp" #include "hw/inc/debug/ring_buffer.h" #include "hw/inc/dev_msgs.h" #include "impl/device/device.hpp" diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 0defda2a3d1..89b50cd1b69 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -223,8 +223,8 @@ void Device::initialize_allocator(size_t l1_small_size, size_t trace_region_size .worker_grid_size = this->logical_grid_size(), .worker_l1_size = static_cast<size_t>(soc_desc.worker_l1_size), .storage_core_bank_size = get_storage_core_bank_size(id_, num_hw_cqs_, dispatch_core_type), - .l1_small_size = l1_small_size, - .trace_region_size = trace_region_size, + .l1_small_size = align(l1_small_size, hal.get_alignment(HalMemType::L1)), + .trace_region_size = align(trace_region_size, hal.get_alignment(HalMemType::DRAM)), .core_type_from_noc_coord_table = {}, // Populated later .worker_log_to_physical_routing_x = soc_desc.worker_log_to_physical_routing_x, .worker_log_to_physical_routing_y = soc_desc.worker_log_to_physical_routing_y, @@ -279,34 +279,61 @@ void Device::initialize_build() { uint32_t dispatch_message_addr = dispatch_constants::get(dispatch_core_type, this->num_hw_cqs_).get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_MESSAGE); - auto init_helper = [this, dispatch_message_addr] (bool is_fw) -> JitBuildStateSet { + // TODO now: total number of processor types should be pulled from the HAL + uint32_t num_build_states = this->arch() == tt::ARCH::GRAYSKULL ? 5 : 7; + + auto init_helper = [this, dispatch_message_addr, num_build_states] (bool is_fw) -> JitBuildStateSet { std::vector<std::shared_ptr<JitBuildState>> build_states; - build_states.resize(arch() == tt::ARCH::GRAYSKULL ? 5 : 7); - - build_states[build_processor_type_to_index(JitBuildProcessorType::DATA_MOVEMENT).first + 0] = - std::make_shared<JitBuildDataMovement>( - this->build_env_, JitBuiltStateConfig{.processor_id = 0, .is_fw=is_fw, .dispatch_message_addr=dispatch_message_addr}); - build_states[build_processor_type_to_index(JitBuildProcessorType::DATA_MOVEMENT).first + 1] = - std::make_shared<JitBuildDataMovement>( - this->build_env_, JitBuiltStateConfig{.processor_id = 1, .is_fw=is_fw, .dispatch_message_addr=dispatch_message_addr}); - build_states[build_processor_type_to_index(JitBuildProcessorType::COMPUTE).first + 0] = - std::make_shared<JitBuildCompute>( - this->build_env_, JitBuiltStateConfig{.processor_id = 0, .is_fw=is_fw, .dispatch_message_addr=dispatch_message_addr}); - build_states[build_processor_type_to_index(JitBuildProcessorType::COMPUTE).first + 1] = - std::make_shared<JitBuildCompute>( - this->build_env_, JitBuiltStateConfig{.processor_id = 1, .is_fw=is_fw, .dispatch_message_addr=dispatch_message_addr}); - build_states[build_processor_type_to_index(JitBuildProcessorType::COMPUTE).first + 2] = - std::make_shared<JitBuildCompute>( - this->build_env_, JitBuiltStateConfig{.processor_id = 2, .is_fw=is_fw, .dispatch_message_addr=dispatch_message_addr}); - - if (arch() != tt::ARCH::GRAYSKULL) { - build_states[build_processor_type_to_index(JitBuildProcessorType::ETHERNET).first + 0] = - std::make_shared<JitBuildEthernet>( - this->build_env_, JitBuiltStateConfig{.processor_id = 0, .is_fw=is_fw, .dispatch_message_addr=dispatch_message_addr}); - build_states[build_processor_type_to_index(JitBuildProcessorType::ETHERNET).first + 1] = - std::make_shared<JitBuildEthernet>( - this->build_env_, JitBuiltStateConfig{.processor_id = 1, .is_fw=is_fw, .dispatch_message_addr=dispatch_message_addr}); + build_states.resize(num_build_states); + uint32_t programmable_core_type_count = hal.get_programmable_core_type_count(); + if (is_fw) { + this->build_state_indices_.resize(programmable_core_type_count); + } + + uint32_t index = 0; + for (uint32_t programmable_core = 0; programmable_core < programmable_core_type_count; programmable_core++) { + HalProgrammableCoreType core_type = magic_enum::enum_value<HalProgrammableCoreType>(programmable_core); + uint32_t processor_class_count = hal.get_processor_classes_count(programmable_core); + if (is_fw) { + this->build_state_indices_[programmable_core].resize(processor_class_count); + } + for (uint32_t processor_class = 0; processor_class < processor_class_count; processor_class++) { + auto compute_proc_class = magic_enum::enum_cast<HalProcessorClassType>(processor_class); + bool is_compute_processor = compute_proc_class.has_value() and compute_proc_class.value() == HalProcessorClassType::COMPUTE; + uint32_t processor_types_count = hal.get_processor_types_count(programmable_core, processor_class); + if (is_fw) { + this->build_state_indices_[programmable_core][processor_class] = {index, processor_types_count}; + } + for (uint32_t processor_type = 0; processor_type < processor_types_count; processor_type++) { + switch (core_type) { + case HalProgrammableCoreType::TENSIX: { + if (is_compute_processor) { + build_states[index] = std::make_shared<JitBuildCompute>( + this->build_env_, JitBuiltStateConfig{.processor_id = processor_type, .is_fw=is_fw, .dispatch_message_addr=dispatch_message_addr}); + } else { + // TODO: Make .processor_id = processor_type when brisc and ncrisc are considered one processor class + build_states[index] = std::make_shared<JitBuildDataMovement>( + this->build_env_, JitBuiltStateConfig{.processor_id = processor_class, .is_fw=is_fw, .dispatch_message_addr=dispatch_message_addr}); + } + break; + } + case HalProgrammableCoreType::ACTIVE_ETH: { + build_states[index] = std::make_shared<JitBuildActiveEthernet>( + this->build_env_, JitBuiltStateConfig{.processor_id = processor_type, .is_fw=is_fw, .dispatch_message_addr=dispatch_message_addr}); + break; + } + case HalProgrammableCoreType::IDLE_ETH: { + build_states[index] = std::make_shared<JitBuildIdleEthernet>( + this->build_env_, JitBuiltStateConfig{.processor_id = processor_type, .is_fw=is_fw, .dispatch_message_addr=dispatch_message_addr}); + break; + } + default: + TT_THROW("Unsupported programable core type {} to initialize build states", magic_enum::enum_name(core_type)); + } + index++; + } + } } return build_states; @@ -324,71 +351,81 @@ void Device::build_firmware() { jit_build_set(this->firmware_build_states_, nullptr); } -void Device::initialize_firmware(CoreCoord phys_core, launch_msg_t *launch_msg, go_msg_t* go_msg) { +void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreCoord phys_core, launch_msg_t *launch_msg, go_msg_t* go_msg) { ZoneScoped; - if (llrt::is_ethernet_core(phys_core, this->id())) { //ethernet core. //Determine if its a connected or unconnected ethernet core. //Unconnected ethernet cores will get idle_erisc fw. - auto active_eth_cores = this->get_active_ethernet_cores(); + uint32_t core_type_idx = hal.get_programmable_core_type_index(core_type); + uint32_t processor_class_count = hal.get_processor_classes_count(core_type); - if (active_eth_cores.find(logical_core_from_ethernet_core(phys_core)) != active_eth_cores.end()) { - if (not llrt::OptionsG.get_skip_loading_fw()) { - int eriscv_id = build_processor_type_to_index(JitBuildProcessorType::ETHERNET).first + 0; - ll_api::memory binary_mem = llrt::get_risc_binary(firmware_build_states_[eriscv_id]->get_target_out_path(""), eriscv_id); - uint32_t fw_size = binary_mem.get_text_size(); - log_debug(LogDevice, "ERISC fw binary size: {} in bytes", fw_size); - llrt::test_load_write_read_risc_binary(binary_mem, this->id(), phys_core, eriscv_id); + switch (core_type) { + case HalProgrammableCoreType::TENSIX: { + llrt::program_risc_startup_addr(this->id(), phys_core); + for (uint32_t processor_class = 0; processor_class < processor_class_count; processor_class++) { + auto [build_idx, num_build_states] = this->build_processor_type_to_index(core_type_idx, processor_class); + for (uint32_t riscv_id = build_idx; riscv_id < (build_idx + num_build_states); riscv_id++) { + ll_api::memory binary_mem = llrt::get_risc_binary(firmware_build_states_[riscv_id]->get_target_out_path(""), riscv_id); + uint32_t fw_size = binary_mem.get_text_size(); + if (riscv_id == 1) { // TODO: clean up how brisc/ncrisc are handled + // In this context, ncrisc_kernel_size16 is the size of the fw + launch_msg->kernel_config.ncrisc_kernel_size16 = (fw_size + 15) >> 4; + } + log_debug(LogDevice, "RISC {} fw binary size: {} in bytes", riscv_id, fw_size); + if (not llrt::OptionsG.get_skip_loading_fw()) { + llrt::test_load_write_read_risc_binary(binary_mem, this->id(), phys_core, riscv_id); + } + } } - llrt::launch_erisc_app_fw_on_core(this->id(), phys_core); - // Ethernet worker core. Launch messages will be sent by FD infra if it's enabled - launch_msg->kernel_config.mode = this->using_slow_dispatch() ? DISPATCH_MODE_HOST : DISPATCH_MODE_DEV; - } else { - tt::Cluster::instance().assert_risc_reset_at_core(tt_cxy_pair(this->id(), phys_core)); - if (not llrt::OptionsG.get_skip_loading_fw()) { - int eriscv_id = build_processor_type_to_index(JitBuildProcessorType::ETHERNET).first + 1; - ll_api::memory binary_mem = llrt::get_risc_binary(firmware_build_states_[eriscv_id]->get_target_out_path(""), eriscv_id); - uint32_t fw_size = binary_mem.get_text_size(); - log_debug(LogDevice, "ERISC fw binary size: {} in bytes", fw_size); - llrt::test_load_write_read_risc_binary(binary_mem, this->id(), phys_core, eriscv_id); + + if (this->using_slow_dispatch()) { + // Host always writes launch messages + launch_msg->kernel_config.mode = DISPATCH_MODE_HOST; + } else { + std::vector<CoreCoord> physical_dispatch_cores = {}; + if (dispatch_core_manager::instance().get_dispatch_core_type(this->id()) == CoreType::WORKER) { + physical_dispatch_cores = this->worker_cores_from_logical_cores(dispatch_core_manager::instance().get_all_logical_dispatch_cores(this->id())); + } + if (std::find(physical_dispatch_cores.begin(), physical_dispatch_cores.end(), phys_core) != physical_dispatch_cores.end()) { + // Dispatch cores - Host writes launch messages + launch_msg->kernel_config.mode = DISPATCH_MODE_HOST; + } else { + // Worker cores - Dispatcher will write launch messages + launch_msg->kernel_config.mode = DISPATCH_MODE_DEV; + } } - llrt::program_risc_startup_addr(this->id(), phys_core); - // Idle ethernet core. Used by FD infra. Host will write launch messages during init. - launch_msg->kernel_config.mode = DISPATCH_MODE_HOST; + + break; } - } else { - llrt::program_risc_startup_addr(this->id(), phys_core); - for (int riscv_id = 0; riscv_id < 5; riscv_id++) { - ll_api::memory binary_mem = - llrt::get_risc_binary(firmware_build_states_[riscv_id]->get_target_out_path(""), riscv_id); - uint32_t fw_size = binary_mem.get_text_size(); - if (riscv_id == 1) { - // In this context, ncrisc_kernel_size16 is the size of the fw - launch_msg->kernel_config.ncrisc_kernel_size16 = (fw_size + 15) >> 4; + case HalProgrammableCoreType::ACTIVE_ETH: + case HalProgrammableCoreType::IDLE_ETH: { + bool is_idle_eth = core_type == HalProgrammableCoreType::IDLE_ETH; + if (is_idle_eth) { + tt::Cluster::instance().assert_risc_reset_at_core(tt_cxy_pair(this->id(), phys_core)); } - log_debug(LogDevice, "RISC {} fw binary size: {} in bytes", riscv_id, fw_size); if (not llrt::OptionsG.get_skip_loading_fw()) { - llrt::test_load_write_read_risc_binary(binary_mem, this->id(), phys_core, riscv_id); - } - } - if (this->using_slow_dispatch()) { - // Host always writes launch messages - launch_msg->kernel_config.mode = DISPATCH_MODE_HOST; - } else { - std::vector<CoreCoord> physical_dispatch_cores = {}; - if (dispatch_core_manager::instance().get_dispatch_core_type(this->id()) == CoreType::WORKER) { - physical_dispatch_cores = this->worker_cores_from_logical_cores(dispatch_core_manager::instance().get_all_logical_dispatch_cores(this->id())); + for (uint32_t processor_class = 0; processor_class < processor_class_count; processor_class++) { + auto [build_idx, num_build_states] = this->build_processor_type_to_index(core_type_idx, processor_class); + for (uint32_t eriscv_id = build_idx; eriscv_id < (build_idx + num_build_states); eriscv_id++) { + ll_api::memory binary_mem = llrt::get_risc_binary(firmware_build_states_[eriscv_id]->get_target_out_path(""), eriscv_id); + uint32_t fw_size = binary_mem.get_text_size(); + log_debug(LogDevice, "ERISC fw binary size: {} in bytes", fw_size); + llrt::test_load_write_read_risc_binary(binary_mem, this->id(), phys_core, eriscv_id); + } + } } - if (std::find(physical_dispatch_cores.begin(), physical_dispatch_cores.end(), phys_core) != physical_dispatch_cores.end()) { - // Dispatch cores - Host writes launch messages - launch_msg->kernel_config.mode = DISPATCH_MODE_HOST; + if (is_idle_eth) { + llrt::program_risc_startup_addr(this->id(), phys_core); } else { - // Worker cores - Dispatcher will write launch messages - launch_msg->kernel_config.mode = DISPATCH_MODE_DEV; + llrt::launch_erisc_app_fw_on_core(this->id(), phys_core); } + // Ethernet worker core. Launch messages will be sent by FD infra if it's enabled + // Idle ethernet core. Used by FD infra. Host will write launch messages during init. + launch_msg->kernel_config.mode = (this->using_slow_dispatch() or is_idle_eth) ? DISPATCH_MODE_HOST : DISPATCH_MODE_DEV; + break; } + default: + TT_THROW("Unsupported programable core type {} to initialize build states", magic_enum::enum_name(core_type)); } + // Initialize each entry in the launch_msg ring buffer with the correct dispatch mode - Cores that don't get a valid // launch_message during program execution need to at least have the correct dispatch mode. // When using Fast Dispatch on Tensix: @@ -584,7 +621,7 @@ void Device::initialize_and_launch_firmware() { CoreCoord worker_core = this->worker_core_from_logical_core(logical_core); tt::llrt::write_hex_vec_to_core( this->id(), worker_core, core_info_vec, this->get_dev_addr(worker_core, HalL1MemAddrType::CORE_INFO)); - this->initialize_firmware(worker_core, &launch_msg, &go_msg); + this->initialize_firmware(HalProgrammableCoreType::TENSIX, worker_core, &launch_msg, &go_msg); not_done_cores.insert(worker_core); } } @@ -604,14 +641,14 @@ void Device::initialize_and_launch_firmware() { CoreCoord phys_eth_core = this->ethernet_core_from_logical_core(eth_core); tt::llrt::write_hex_vec_to_core( this->id(), phys_eth_core, core_info_vec, this->get_dev_addr(phys_eth_core, HalL1MemAddrType::CORE_INFO)); - this->initialize_firmware(phys_eth_core, &launch_msg, &go_msg); + this->initialize_firmware(HalProgrammableCoreType::ACTIVE_ETH, phys_eth_core, &launch_msg, &go_msg); } for (const auto &eth_core : this->get_inactive_ethernet_cores()) { CoreCoord phys_eth_core = this->ethernet_core_from_logical_core(eth_core); tt::llrt::write_hex_vec_to_core( this->id(), phys_eth_core, core_info_vec, this->get_dev_addr(phys_eth_core, HalL1MemAddrType::CORE_INFO)); - this->initialize_firmware(phys_eth_core, &launch_msg, &go_msg); + this->initialize_firmware(HalProgrammableCoreType::IDLE_ETH, phys_eth_core, &launch_msg, &go_msg); not_done_cores.insert(phys_eth_core); } @@ -2718,12 +2755,14 @@ void Device::configure_command_queue_programs() { } } + command_queue_program.finalize(this); detail::ConfigureDeviceWithProgram(this, command_queue_program, true); tt::Cluster::instance().l1_barrier(this->id()); if (device_id != mmio_device_id) { if (tt::Cluster::instance().get_device_tunnel_depth(device_id) == 1) { //first or only remote device on the tunnel, launch fd2 kernels on mmio device for all remote devices.
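// Condensed view of the launch-message mode selection implemented above (a
// paraphrase of the patch, not additional behavior): the host writes launch
// messages whenever the dispatcher cannot or does not manage the core.
//   mode = DISPATCH_MODE_HOST  if using_slow_dispatch(), or the core is an idle
//                              ethernet core, or the core is itself a dispatch core
//   mode = DISPATCH_MODE_DEV   otherwise (tensix workers and active-eth cores
//                              under fast dispatch)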
Program& mmio_command_queue_program = *this->command_queue_programs[1]; + mmio_command_queue_program.finalize(mmio_device); detail::ConfigureDeviceWithProgram(mmio_device, mmio_command_queue_program, true); tt::Cluster::instance().l1_barrier(mmio_device_id); } @@ -2771,7 +2810,6 @@ void Device::init_command_queue_device() { } this->configure_command_queue_programs(); Program& command_queue_program = *this->command_queue_programs[0]; - command_queue_program.finalize(this); // TODO: should get a const ref std::vector>logical_cores = command_queue_program.logical_cores(); @@ -2791,7 +2829,6 @@ void Device::init_command_queue_device() { chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->id()); Device *mmio_device = tt::DevicePool::instance().get_active_device(mmio_device_id); Program& mmio_command_queue_program = *this->command_queue_programs[1]; - mmio_command_queue_program.finalize(mmio_device); std::vector>logical_cores = mmio_command_queue_program.logical_cores(); for (uint32_t index = 0; index < hal.get_programmable_core_type_count(); index++) { const auto& logical_dispatch_cores = logical_cores[index]; @@ -3181,33 +3218,25 @@ float Device::sfpu_inf() const{ return std::numeric_limits::infinity(); } -pair Device::build_processor_type_to_index(JitBuildProcessorType t) const { - constexpr int DataMovementBuildCount = 2; - constexpr int ComputeBuildCount = 3; - constexpr int EthernetBuildCount = 2; - - switch (t) { - case JitBuildProcessorType::DATA_MOVEMENT: return pair(0, DataMovementBuildCount); - case JitBuildProcessorType::COMPUTE: return pair(DataMovementBuildCount, ComputeBuildCount); - case JitBuildProcessorType::ETHERNET: return pair(DataMovementBuildCount + ComputeBuildCount, EthernetBuildCount); - default: TT_THROW("Bad processor type: {}", static_cast::type>(t)); - } - - // shh the warnings - return pair(0, 0); +std::pair Device::build_processor_type_to_index(uint32_t programmable_core, uint32_t processor_class) const { + TT_ASSERT(programmable_core < this->build_state_indices_.size(), + "Programmable core type {} is not included in the FW or Kernel build state", programmable_core); + TT_ASSERT(processor_class < this->build_state_indices_[programmable_core].size(), + "Processor class type {} is not included in the FW or Kernel build state", processor_class); + return this->build_state_indices_[programmable_core][processor_class]; } // Ideally the firmware getter would be private to the device, however, tests look for this -const JitBuildState& Device::build_firmware_state(JitBuildProcessorType t, int i) const { - return *(this->firmware_build_states_[build_processor_type_to_index(t).first + i]); +const JitBuildState& Device::build_firmware_state(uint32_t programmable_core, uint32_t processor_class, int i) const { + return *(this->firmware_build_states_[build_processor_type_to_index(programmable_core, processor_class).first + i]); } -const JitBuildState& Device::build_kernel_state(JitBuildProcessorType t, int i) const { - return *(this->kernel_build_states_[build_processor_type_to_index(t).first + i]); +const JitBuildState& Device::build_kernel_state(uint32_t programmable_core, uint32_t processor_class, int i) const { + return *(this->kernel_build_states_[build_processor_type_to_index(programmable_core, processor_class).first + i]); } -const JitBuildStateSubset Device::build_kernel_states(JitBuildProcessorType t) const { - pair bptti = build_processor_type_to_index(t); +const JitBuildStateSubset Device::build_kernel_states(uint32_t 
programmable_core, uint32_t processor_class) const { + std::pair bptti = build_processor_type_to_index(programmable_core, processor_class); JitBuildStateSubset subset = { &this->kernel_build_states_[bptti.first], bptti.second @@ -3215,13 +3244,13 @@ const JitBuildStateSubset Device::build_kernel_states(JitBuildProcessorType t) c return subset; } -const string Device::build_firmware_target_path(JitBuildProcessorType t, int i) const { - const JitBuildState& bs = build_firmware_state(t, i); +const string Device::build_firmware_target_path(uint32_t programmable_core, uint32_t processor_class, int i) const { + const JitBuildState& bs = build_firmware_state(programmable_core, processor_class, i); return bs.get_target_out_path(""); } -const string Device::build_kernel_target_path(JitBuildProcessorType t, int i, const string& kernel_name) const { - const JitBuildState& bs = build_kernel_state(t, i); +const string Device::build_kernel_target_path(uint32_t programmable_core, uint32_t processor_class, int i, const string& kernel_name) const { + const JitBuildState& bs = build_kernel_state(programmable_core, processor_class, i); return bs.get_target_out_path(kernel_name); } diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp index 6644e1f6e1d..5ee266d37b7 100644 --- a/tt_metal/impl/device/device.hpp +++ b/tt_metal/impl/device/device.hpp @@ -6,6 +6,7 @@ #include #include +#include #include "hostdevcommon/common_values.hpp" #include "impl/dispatch/work_executor.hpp" @@ -209,11 +210,11 @@ class Device { void generate_device_headers(const std::string &path) const; const JitBuildEnv& build_env() const { return this->build_env_; } - const string build_firmware_target_path(JitBuildProcessorType t, int i) const; - const string build_kernel_target_path(JitBuildProcessorType t, int i, const string& kernel_name) const; - const JitBuildState& build_firmware_state(JitBuildProcessorType t, int i) const; - const JitBuildState& build_kernel_state(JitBuildProcessorType t, int i) const; - const JitBuildStateSubset build_kernel_states(JitBuildProcessorType t) const; + const string build_firmware_target_path(uint32_t programmable_core, uint32_t processor_class, int i) const; + const string build_kernel_target_path(uint32_t programmable_core, uint32_t processor_class, int i, const string& kernel_name) const; + const JitBuildState& build_firmware_state(uint32_t programmable_core, uint32_t processor_class, int i) const; + const JitBuildState& build_kernel_state(uint32_t programmable_core, uint32_t processor_class, int i) const; + const JitBuildStateSubset build_kernel_states(uint32_t programmable_core, uint32_t processor_class) const; SystemMemoryManager& sysmem_manager() { return *sysmem_manager_; } HWCommandQueue& hw_command_queue(size_t cq_id = 0); CommandQueue& command_queue(size_t cq_id = 0); @@ -235,7 +236,7 @@ class Device { void initialize_allocator(size_t l1_small_size, size_t trace_region_size, const std::vector &l1_bank_remap = {}); void initialize_build(); void build_firmware(); - void initialize_firmware(CoreCoord phys_core, launch_msg_t *launch_msg, go_msg_t* go_msg); + void initialize_firmware(const HalProgrammableCoreType &core_type, CoreCoord phys_core, launch_msg_t *launch_msg, go_msg_t* go_msg); void reset_cores(); void initialize_and_launch_firmware(); void init_command_queue_host(); @@ -249,7 +250,7 @@ class Device { void get_associated_dispatch_phys_cores( std::unordered_map> &my_dispatch_cores, std::unordered_map> &other_dispatch_cores); - std::pair 
build_processor_type_to_index(JitBuildProcessorType t) const; + std::pair build_processor_type_to_index(uint32_t programmable_core, uint32_t processor_class) const; // Puts device into reset bool close(); @@ -280,6 +281,7 @@ class Device { JitBuildEnv build_env_; JitBuildStateSet firmware_build_states_; JitBuildStateSet kernel_build_states_; + std::vector>> build_state_indices_; std::set compute_cores_; std::set storage_only_cores_; @@ -333,7 +335,7 @@ class Device { T get_base_allocator_addr(const HalMemType &mem_type) const; template - std::vector> extract_dst_noc_multicast_info(const CoreRangeContainer& ranges, const CoreType core_type); + std::vector> extract_dst_noc_multicast_info(const CoreRangeContainer& ranges, const CoreType core_type); bool dispatch_s_enabled() const; bool distributed_dispatcher() const; @@ -373,9 +375,9 @@ inline T Device::get_base_allocator_addr(const HalMemType &mem_type) const { // TODO: Find a better home for this function template -std::vector> Device::extract_dst_noc_multicast_info(const CoreRangeContainer& ranges, const CoreType core_type) { +std::vector> Device::extract_dst_noc_multicast_info(const CoreRangeContainer& ranges, const CoreType core_type) { // This API extracts all the pairs of noc multicast encodings given a set of core ranges - std::vector> dst_noc_multicast_info; + std::vector> dst_noc_multicast_info; dst_noc_multicast_info.reserve(ranges.size()); for (const CoreRange& core_range : ranges) { CoreCoord physical_start = this->physical_core_from_logical_core(core_range.start_coord, core_type); diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 2f5e239eb9f..010ad9e4359 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -545,13 +545,14 @@ void EnqueueProgramCommand::assemble_runtime_args_commands(ProgramCommandSequenc for (uint32_t programmable_core_type_index = 0; programmable_core_type_index < hal.get_programmable_core_type_count(); programmable_core_type_index++) { + uint32_t processor_classes = hal.get_processor_classes_count(programmable_core_type_index); for (auto& kg : program.get_kernel_groups(programmable_core_type_index)) { if (kg.total_rta_size != 0) { // Reserve 2x for unique rtas as we pontentially split the cmds due to not fitting in one prefetch cmd command_count += 2; } } - for (int dispatch_class = 0; dispatch_class < DISPATCH_CLASS_MAX; dispatch_class++) { + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { uint32_t common_size = program.get_program_config(programmable_core_type_index).crta_sizes[dispatch_class]; if (common_size != 0) { command_count++; @@ -568,6 +569,7 @@ void EnqueueProgramCommand::assemble_runtime_args_commands(ProgramCommandSequenc continue; } CoreType core_type = hal.get_core_type(index); + uint32_t processor_classes = hal.get_processor_classes_count(index); for (auto& kg : program.get_kernel_groups(index)) { if (kg.total_rta_size != 0) { @@ -578,7 +580,7 @@ void EnqueueProgramCommand::assemble_runtime_args_commands(ProgramCommandSequenc unique_rt_args_data.resize(unique_rt_args_data.size() + 1); unique_rt_data_and_sizes.resize(unique_rt_data_and_sizes.size() + 1); - for (int dispatch_class = 0; dispatch_class < DISPATCH_CLASS_MAX; dispatch_class++) { + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { auto& optional_id = kg.kernel_ids[dispatch_class]; if (optional_id) { auto kernel = detail::GetKernel(program, optional_id.value()); 
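// How the flattened build-state array is addressed after this change
// (synthesized from the device.cpp/device.hpp hunks above): initialize_build
// records, per programmable core type and processor class,
//   build_state_indices_[programmable_core][processor_class] = {first_index, count}
// so a lookup such as
//   auto [build_idx, num_build_states] = build_processor_type_to_index(core_idx, class_idx);
//   for (uint32_t id = build_idx; id < build_idx + num_build_states; id++) { ... }
// replaces the old fixed JitBuildProcessorType partition (DATA_MOVEMENT at 0,
// COMPUTE at 2, ETHERNET at 5).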
@@ -626,7 +628,7 @@ void EnqueueProgramCommand::assemble_runtime_args_commands(ProgramCommandSequenc } } - for (int dispatch_class = 0; dispatch_class < DISPATCH_CLASS_MAX; dispatch_class++) { + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { uint32_t common_size = program.get_program_config(index).crta_sizes[dispatch_class]; for (size_t kernel_id = 0; kernel_id < program.num_kernels(); kernel_id++) { auto kernel = detail::GetKernel(program, kernel_id); @@ -661,7 +663,7 @@ void EnqueueProgramCommand::assemble_runtime_args_commands(ProgramCommandSequenc .noc_xy_addr = this->device->get_noc_unicast_encoding(this->noc_index, physical_core)}); } } else { - vector> dst_noc_multicast_info = + vector> dst_noc_multicast_info = device->extract_dst_noc_multicast_info>( kernel->logical_coreranges(), core_type); common_sub_cmds.emplace>( @@ -932,7 +934,9 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro } else { uint32_t base_address = this->program.kernels_buffer->address(); uint32_t page_offset = kg_transfer_info.page_offsets[kernel_idx]; - uint32_t dst_addr = kg_transfer_info.dst_base_addrs[kernel_idx]; + + // TODO: pack all these writes into 1 linear write + uint32_t kernel_config_buffer_offset = kg_transfer_info.dst_base_addrs[kernel_idx]; uint32_t aligned_length = align(kg_transfer_info.lengths[kernel_idx], hal.get_alignment(HalMemType::DRAM)); uint32_t padding = aligned_length - kg_transfer_info.lengths[kernel_idx]; while (aligned_length != 0) { @@ -953,13 +957,13 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro } kernel_bins_dispatch_subcmds.back().emplace_back(CQDispatchWritePackedLargeSubCmd{ .noc_xy_addr = noc_encoding, - .addr = dst_addr, + .addr = kernel_config_buffer_offset, .length = (uint16_t)write_length, .num_mcast_dests = (uint8_t)num_mcast_dests, .flags = CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_NONE}); RecordDispatchData( program, DISPATCH_DATA_BINARY, write_length, kg_transfer_info.riscvs[kernel_idx]); - dst_addr += write_length; + kernel_config_buffer_offset += write_length; kernel_bins_prefetch_subcmds.back().emplace_back(CQPrefetchRelayPagedPackedSubCmd{ .start_page = (uint16_t)page_offset, @@ -1174,7 +1178,11 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro uint32_t dram_alignment = hal.get_alignment(HalMemType::DRAM); for (uint32_t i = 0; i < kernel_bins_dispatch_subcmds.size(); ++i) { device_command_sequence.add_dispatch_write_packed_large( - dram_alignment, kernel_bins_dispatch_subcmds[i].size(), kernel_bins_dispatch_subcmds[i]); + dram_alignment, + kernel_bins_dispatch_subcmds[i].size(), + kernel_bins_dispatch_subcmds[i], + 0, + DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE); device_command_sequence.add_prefetch_relay_paged_packed( kernel_bins_write_packed_large_data_aligned_sizeB[i], kernel_bins_prefetch_subcmds[i], @@ -1455,11 +1463,6 @@ void EnqueueProgramCommand::write_program_command_sequence(const ProgramCommandS void EnqueueProgramCommand::process() { - bool is_finalized = program.is_finalized(); - if (not is_finalized) { - program.finalize(device); - } - const std::pair&> reservation = this->manager.get_config_buffer_mgr().reserve(program.program_config_sizes_); bool stall_first = reservation.first.need_sync; @@ -1484,7 +1487,7 @@ void EnqueueProgramCommand::process() { // Currently this is mapped by device, but will be mapped by multiple values in the future uint64_t command_hash = this->device->id(); auto cached_cmd_iter = 
this->program.cached_program_command_sequences_.find(command_hash); - bool is_cached = is_finalized && cached_cmd_iter != this->program.cached_program_command_sequences_.end(); + bool is_cached = program.is_cached() && cached_cmd_iter != this->program.cached_program_command_sequences_.end(); // Calculate all commands size and determine how many fetch q entries to use // Preamble, some waits and stalls @@ -1505,6 +1508,7 @@ void EnqueueProgramCommand::process() { this->write_program_command_sequence(program_command_sequence, stall_first); this->assemble_stall_commands(program_command_sequence, false); this->program.cached_program_command_sequences_.insert({command_hash, std::move(program_command_sequence)}); + program.set_cached(); } else { static constexpr uint32_t wait_count_offset = (sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, wait.count)); static constexpr uint32_t tensix_l1_write_offset_offset = @@ -2227,12 +2231,14 @@ void HWCommandQueue::enqueue_write_buffer(Buffer& buffer, const void* src, bool void HWCommandQueue::enqueue_program(Program& program, bool blocking) { ZoneScopedN("HWCommandQueue_enqueue_program"); if (not program.is_finalized()) { + program.finalize(device); TT_FATAL(!this->manager.get_bypass_mode(), "Tracing should only be used when programs have been cached"); if (program.kernels_buffer != nullptr) { this->enqueue_write_buffer( *program.kernels_buffer, program.program_transfer_info.binary_data.data(), false); } } + #ifdef DEBUG if (tt::llrt::OptionsG.get_validate_kernel_binaries()) { TT_FATAL(!this->manager.get_bypass_mode(), "Tracing cannot be used while validating program binaries"); diff --git a/tt_metal/impl/dispatch/data_collection.cpp b/tt_metal/impl/dispatch/data_collection.cpp index 516f7a27912..c5c400e01c3 100644 --- a/tt_metal/impl/dispatch/data_collection.cpp +++ b/tt_metal/impl/dispatch/data_collection.cpp @@ -5,7 +5,7 @@ #include "data_collection.hpp" #include "llrt/rtoptions.hpp" #include "tt_metal/impl/kernels/kernel.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include @@ -123,7 +123,7 @@ class DataCollector { private: map> program_id_to_dispatch_data; - map>>> program_id_to_kernel_groups; + map>>> program_id_to_kernel_groups; map program_id_to_call_count; }; @@ -202,7 +202,7 @@ void DataCollector::DumpData() { // Dump kernel ids for each kernel group in this program for (auto &core_type_and_kernel_groups : program_id_to_kernel_groups[program_id]) { CoreType core_type = core_type_and_kernel_groups.first; - vector> &kernel_groups = core_type_and_kernel_groups.second; + vector> &kernel_groups = core_type_and_kernel_groups.second; outfile << fmt::format("\t{} Kernel Groups: {}\n", core_type, kernel_groups.size()); for (auto &ids_and_ranges : kernel_groups) { // Dump kernel ids in this group diff --git a/tt_metal/impl/dispatch/debug_tools.cpp b/tt_metal/impl/dispatch/debug_tools.cpp index 14ec9b706d3..f8c54fa3573 100644 --- a/tt_metal/impl/dispatch/debug_tools.cpp +++ b/tt_metal/impl/dispatch/debug_tools.cpp @@ -23,7 +23,7 @@ void match_device_program_data_with_host_program_data(const char* host_file, con host_dispatch_dump_file.open(host_file); device_dispatch_dump_file.open(device_file); - vector>> host_map; + vector>> host_map; string line; diff --git a/tt_metal/impl/dispatch/dispatch_core_manager.hpp b/tt_metal/impl/dispatch/dispatch_core_manager.hpp index 19c1f85f75c..bc700ac6219 100644 --- a/tt_metal/impl/dispatch/dispatch_core_manager.hpp +++ b/tt_metal/impl/dispatch/dispatch_core_manager.hpp @@ -5,7 
+5,7 @@ #pragma once #include "common/core_descriptor.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include namespace tt::tt_metal { diff --git a/tt_metal/impl/kernels/kernel.cpp b/tt_metal/impl/kernels/kernel.cpp index a3c91aa0d8a..0d521ce3194 100644 --- a/tt_metal/impl/kernels/kernel.cpp +++ b/tt_metal/impl/kernels/kernel.cpp @@ -15,7 +15,7 @@ #include "tt_metal/impl/debug/watcher_server.hpp" #include "tt_metal/third_party/tracy/public/tracy/Tracy.hpp" #include "tt_metal/common/utils.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "tt_metal/jit_build/genfiles.hpp" namespace tt { @@ -337,20 +337,27 @@ void ComputeKernel::set_build_options(JitBuildOptions &build_options) const { void DataMovementKernel::generate_binaries(Device *device, JitBuildOptions &build_options) const { jit_build_genfiles_kernel_include(device->build_env(), *this, this->kernel_src_); device->generate_device_headers(build_options.path); + uint32_t tensix_core_type = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); + uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); int riscv_id = static_cast::type>(this->config_.processor); - jit_build(device->build_kernel_state(JitBuildProcessorType::DATA_MOVEMENT, riscv_id), this); + jit_build(device->build_kernel_state(tensix_core_type, dm_class_idx, riscv_id), this); } void EthernetKernel::generate_binaries(Device *device, JitBuildOptions &build_options) const { jit_build_genfiles_kernel_include(device->build_env(), *this, this->kernel_src_); device->generate_device_headers(build_options.path); - int erisc_id = this->config_.eth_mode == Eth::IDLE ? 1 : 0; - jit_build(device->build_kernel_state(JitBuildProcessorType::ETHERNET, erisc_id), this); + uint32_t erisc_core_type = hal.get_programmable_core_type_index( + this->config_.eth_mode == Eth::IDLE ? HalProgrammableCoreType::IDLE_ETH : HalProgrammableCoreType::ACTIVE_ETH + ); + uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); + jit_build(device->build_kernel_state(erisc_core_type, dm_class_idx, 0), this); } void ComputeKernel::generate_binaries(Device *device, JitBuildOptions &build_options) const { jit_build_genfiles_triscs_src(device->build_env(), *this, this->kernel_src_); - JitBuildStateSubset build_states = device->build_kernel_states(JitBuildProcessorType::COMPUTE); + uint32_t tensix_core_type = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); + uint32_t compute_class_idx = magic_enum::enum_integer(HalProcessorClassType::COMPUTE); + JitBuildStateSubset build_states = device->build_kernel_states(tensix_core_type, compute_class_idx); jit_build_subset(build_states, this);
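The three generate_binaries overloads above illustrate the new lookup: a JIT build state is now addressed by (programmable core type index, processor class, processor index) instead of a single JitBuildProcessorType enum value. A hedged sketch of one resolution, using only names from the hunks above:

// Sketch: resolving the build state for the second Tensix data-movement
// processor; the class index comes from HalProcessorClassType::DM.
uint32_t tensix_core_type = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX);
uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM);
int riscv_id = 1;  // processor index within the class (e.g. NCRISC)
const JitBuildState &build_state = device->build_kernel_state(tensix_core_type, dm_class_idx, riscv_id);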
@@ -368,9 +375,15 @@ void DataMovementKernel::read_binaries(Device *device) { // TODO(pgk): move the processor types into the build system or just use integer indices // TODO(pgk): consolidate read_binaries where possible + uint32_t tensix_core_type = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); + uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); int riscv_id = static_cast::type>(this->config_.processor); - const JitBuildState &build_state = device->build_kernel_state(JitBuildProcessorType::DATA_MOVEMENT, riscv_id); - ll_api::memory binary_mem = llrt::get_risc_binary(build_state.get_target_out_path(this->kernel_full_name_), riscv_id, llrt::PackSpans::PACK); + const JitBuildState &build_state = device->build_kernel_state(tensix_core_type, dm_class_idx, riscv_id); + // TODO: from HAL + ll_api::memory::Relocate relo_type = + (riscv_id == 1 && (device->arch() == tt::ARCH::GRAYSKULL || device->arch() == tt::ARCH::WORMHOLE_B0)) ? + ll_api::memory::Relocate::NONE : ll_api::memory::Relocate::XIP; + ll_api::memory binary_mem = llrt::get_risc_binary(build_state.get_target_out_path(this->kernel_full_name_), riscv_id, ll_api::memory::PackSpans::PACK, relo_type); binaries.push_back(binary_mem); uint32_t binary_size = binary_mem.get_packed_size(); log_debug(LogLoader, "RISC {} kernel binary size: {} in bytes", riscv_id, binary_size); @@ -381,10 +394,17 @@ void EthernetKernel::read_binaries(Device *device) { // untested TT_ASSERT(!binary_path_.empty(), "Path to Kernel binaries not set!"); std::vector binaries; + uint32_t erisc_core_type = hal.get_programmable_core_type_index( + this->config_.eth_mode == Eth::IDLE ? HalProgrammableCoreType::IDLE_ETH : HalProgrammableCoreType::ACTIVE_ETH + ); + uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); + const JitBuildState &build_state = device->build_kernel_state(erisc_core_type, dm_class_idx, 0); int erisc_id = this->config_.eth_mode == Eth::IDLE ? 1 : 0; - const JitBuildState &build_state = device->build_kernel_state(JitBuildProcessorType::ETHERNET, erisc_id); - ll_api::memory binary_mem = llrt::get_risc_binary(build_state.get_target_out_path(this->kernel_full_name_), erisc_id + 5, llrt::PackSpans::PACK); - binaries.push_back(binary_mem); + // TODO: fix when active eth supports relo + ll_api::memory::Relocate relo_type = (this->config_.eth_mode == Eth::IDLE) ?
+ ll_api::memory::Relocate::XIP : ll_api::memory::Relocate::NONE; + ll_api::memory binary_mem = llrt::get_risc_binary(build_state.get_target_out_path(this->kernel_full_name_), erisc_id + 5, ll_api::memory::PackSpans::PACK, relo_type); + binaries.push_back(binary_mem); uint32_t binary_size = binary_mem.get_packed_size(); log_debug(LogLoader, "ERISC {} kernel binary size: {} in bytes", erisc_id, binary_size); this->set_binaries(device->build_key(), std::move(binaries)); @@ -393,9 +413,11 @@ void EthernetKernel::read_binaries(Device *device) { void ComputeKernel::read_binaries(Device *device) { TT_ASSERT(!binary_path_.empty(), "Path to Kernel binaries not set!"); std::vector binaries; + uint32_t tensix_core_type = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); + uint32_t compute_class_idx = magic_enum::enum_integer(HalProcessorClassType::COMPUTE); for (int trisc_id = 0; trisc_id <= 2; trisc_id++) { - const JitBuildState &build_state = device->build_kernel_state(JitBuildProcessorType::COMPUTE, trisc_id); - ll_api::memory binary_mem = llrt::get_risc_binary(build_state.get_target_out_path(this->kernel_full_name_), trisc_id + 2, llrt::PackSpans::PACK); + const JitBuildState &build_state = device->build_kernel_state(tensix_core_type, compute_class_idx, trisc_id); + ll_api::memory binary_mem = llrt::get_risc_binary(build_state.get_target_out_path(this->kernel_full_name_), trisc_id + 2, ll_api::memory::PackSpans::PACK, ll_api::memory::Relocate::XIP); binaries.push_back(binary_mem); uint32_t binary_size = binary_mem.get_packed_size(); log_debug(LogLoader, "RISC {} kernel binary size: {} in bytes", trisc_id + 2, binary_size); @@ -416,41 +438,35 @@ RISCV EthernetKernel::processor() const { return RISCV::ERISC; } RISCV ComputeKernel::processor() const { return RISCV::COMPUTE; } -bool DataMovementKernel::configure(Device *device, const CoreCoord &logical_core) const { - bool pass = true; +bool DataMovementKernel::configure(Device *device, const CoreCoord &logical_core, uint32_t base_address, const uint32_t offsets[]) const { if (not is_on_logical_core(logical_core)) { TT_THROW("Cannot configure kernel because it is not on core {}", logical_core.str()); } auto device_id = device->id(); auto worker_core = device->worker_core_from_logical_core(logical_core); ll_api::memory binary_mem = this->binaries(device->build_key()).at(0); + int riscv_id = static_cast::type>(this->config_.processor); + llrt::write_binary_to_address(binary_mem, device_id, worker_core, base_address + offsets[riscv_id]); - int riscv_id; - switch (this->config_.processor) { - case (DataMovementProcessor::RISCV_0): { - riscv_id = 0; - } break; - case (DataMovementProcessor::RISCV_1): { - riscv_id = 1; - } break; - default: TT_THROW("Unsupported data movement processor!"); - } - - pass &= tt::llrt::test_load_write_read_risc_binary(binary_mem, device_id, worker_core, riscv_id); - return pass; + return true; } -bool EthernetKernel::configure(Device *device, const CoreCoord &logical_core) const { - bool pass = true; +bool EthernetKernel::configure(Device *device, const CoreCoord &logical_core, uint32_t base_address, const uint32_t offsets[]) const { auto device_id = device->id(); auto ethernet_core = device->ethernet_core_from_logical_core(logical_core); ll_api::memory binary_mem = this->binaries(device->build_key()).at(0); - int riscv_id = this->config_.eth_mode == Eth::IDLE ? 
6 : 5; - pass &= tt::llrt::test_load_write_read_risc_binary(binary_mem, device_id, ethernet_core, riscv_id); - return pass; + + if (this->config_.eth_mode == Eth::IDLE) { + llrt::write_binary_to_address(binary_mem, device_id, ethernet_core, base_address + offsets[0]); + } else { + int riscv_id = 5; + tt::llrt::test_load_write_read_risc_binary(binary_mem, device_id, ethernet_core, riscv_id); + } + + return true; } -bool ComputeKernel::configure(Device *device, const CoreCoord &logical_core) const { +bool ComputeKernel::configure(Device *device, const CoreCoord &logical_core, uint32_t base_address, const uint32_t offsets[]) const { bool pass = true; if (not is_on_logical_core(logical_core)) { TT_THROW("Cannot configure kernel because it is not on core {}", logical_core.str()); @@ -460,7 +476,7 @@ bool ComputeKernel::configure(Device *device, const CoreCoord &logical_core) con std::vector binaries = this->binaries(device->build_key()); for (int trisc_id = 0; trisc_id <= 2; trisc_id++) { - pass &= tt::llrt::test_load_write_read_trisc_binary(binaries.at(trisc_id), device_id, worker_core, trisc_id); + llrt::write_binary_to_address(binaries.at(trisc_id), device_id, worker_core, base_address + offsets[2 + trisc_id]); } return pass; diff --git a/tt_metal/impl/kernels/kernel.hpp b/tt_metal/impl/kernels/kernel.hpp index ccb908f3816..1c8488ab815 100644 --- a/tt_metal/impl/kernels/kernel.hpp +++ b/tt_metal/impl/kernels/kernel.hpp @@ -95,9 +95,9 @@ class Kernel : public JitBuildSettings { std::map defines() const { return defines_; } virtual RISCV processor() const = 0; - dispatch_core_processor_classes dispatch_class() { return this->dispatch_class_; } + uint32_t dispatch_class() { return this->dispatch_class_; } - virtual bool configure(Device *device, const CoreCoord &logical_core) const = 0; + virtual bool configure(Device *device, const CoreCoord &logical_core, uint32_t base_address, const uint32_t offsets[]) const = 0; virtual Config config() const = 0; @@ -134,7 +134,7 @@ class Kernel : public JitBuildSettings { // Different set of binaries per device because kernel compilation is device dependent // TODO: break this dependency by https://github.com/tenstorrent/tt-metal/issues/3381 std::unordered_map> binaries_; - dispatch_core_processor_classes dispatch_class_; + uint8_t dispatch_class_; std::vector compile_time_args_; std::vector< std::vector< std::vector> > core_to_runtime_args_; std::vector< std::vector< RuntimeArgsData> > core_to_runtime_args_data_; @@ -159,8 +159,7 @@ class DataMovementKernel : public Kernel { public: DataMovementKernel(const KernelSource &kernel_src, const CoreRangeSet &cr_set, const DataMovementConfig &config) : Kernel(kernel_src, cr_set, config.compile_args, config.defines), config_(config) { - this->dispatch_class_ = (config.processor == DataMovementProcessor::RISCV_0) ? 
DISPATCH_CLASS_TENSIX_DM0 - : DISPATCH_CLASS_TENSIX_DM1; + this->dispatch_class_ = magic_enum::enum_integer(HalProcessorClassType::DM) + magic_enum::enum_integer(config.processor); } ~DataMovementKernel() {} @@ -171,7 +170,7 @@ class DataMovementKernel : public Kernel { void generate_binaries(Device *device, JitBuildOptions& build_options) const override; void read_binaries(Device *device) override; - bool configure(Device *device, const CoreCoord &logical_core) const override; + bool configure(Device *device, const CoreCoord &logical_core, uint32_t base_address, const uint32_t offsets[]) const override; Config config() const override { return this->config_; } @@ -189,7 +188,7 @@ class EthernetKernel : public Kernel { public: EthernetKernel(const KernelSource &kernel_src, const CoreRangeSet &cr_set, const EthernetConfig &config) : Kernel(kernel_src, cr_set, config.compile_args, config.defines), config_(config) { - this->dispatch_class_ = DISPATCH_CLASS_ETH_DM0; + this->dispatch_class_ = magic_enum::enum_integer(HalProcessorClassType::DM); } ~EthernetKernel() {} @@ -200,7 +199,7 @@ class EthernetKernel : public Kernel { void generate_binaries(Device *device, JitBuildOptions &build_options) const override; void read_binaries(Device *device) override; - bool configure(Device *device, const CoreCoord &logical_core) const override; + bool configure(Device *device, const CoreCoord &logical_core, uint32_t base_address, const uint32_t offsets[]) const override; Config config() const override { return this->config_; } @@ -218,7 +217,7 @@ class ComputeKernel : public Kernel { public: ComputeKernel(const KernelSource &kernel_src, const CoreRangeSet &cr_set, const ComputeConfig &config) : Kernel(kernel_src, cr_set, config.compile_args, config.defines), config_(config) { - this->dispatch_class_ = DISPATCH_CLASS_TENSIX_COMPUTE; + this->dispatch_class_ = magic_enum::enum_integer(HalProcessorClassType::COMPUTE); } ~ComputeKernel() {} @@ -229,7 +228,7 @@ class ComputeKernel : public Kernel { void generate_binaries(Device *device, JitBuildOptions& build_options) const override; void read_binaries(Device *device) override; - bool configure(Device *device, const CoreCoord &logical_core) const override; + bool configure(Device *device, const CoreCoord &logical_core, uint32_t base_address, const uint32_t offsets[]) const override; Config config() const override { return this->config_; } diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index 2b6f61047a3..ffb8fb6a499 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -107,7 +107,12 @@ void DisablePersistentKernelCache() { enable_persistent_kernel_cache = false; } std::atomic Program::program_counter = 0; Program::Program() : - id(program_counter++), runtime_id(0), worker_crs_({}), local_circular_buffer_allocation_needed_(false), finalized_(false) { + id(program_counter++), + runtime_id(0), + worker_crs_(), + local_circular_buffer_allocation_needed_(false), + finalized_(false), + cached_(false) { uint32_t programmable_core_count = hal.get_programmable_core_type_count(); for (uint32_t i = 0; i < programmable_core_count; i++) { @@ -146,7 +151,7 @@ std::shared_ptr Program::get_kernel(KernelHandle kernel_id) const { return nullptr; } -KernelGroup::KernelGroup() : core_ranges({}) {} +KernelGroup::KernelGroup() : core_ranges(CoreRangeSet()) {} KernelGroup::KernelGroup( const Program &program, @@ -155,8 +160,7 @@ KernelGroup::KernelGroup( bool erisc_is_idle, int last_cb_index, const CoreRangeSet 
&new_ranges) : - core_ranges({}) { - + core_ranges(CoreRangeSet()) { this->programmable_core_type_index = programmable_core_type_index; this->core_ranges = this->core_ranges.merge(new_ranges); this->kernel_ids = kernel_ids; @@ -171,7 +175,8 @@ KernelGroup::KernelGroup( hal.get_dev_addr(index, HalL1MemAddrType::KERNEL_CONFIG); } - for (int class_id = 0; class_id < DISPATCH_CLASS_MAX; class_id++) { + uint32_t processor_classes = hal.get_processor_classes_count(programmable_core_type_index); + for (int class_id = 0; class_id < processor_classes; class_id++) { auto& optional_id = kernel_ids[class_id]; if (optional_id) { const auto kernel = program.get_kernel(optional_id.value()); @@ -181,14 +186,14 @@ KernelGroup::KernelGroup( if (programmable_core_type_index == hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)) { // The code below sets the brisc_noc_id for use by the device firmware // Use 0 if neither brisc nor ncrisc specify a noc - if (class_id == DISPATCH_CLASS_TENSIX_DM0) { + if (class_id == utils::underlying_type(DataMovementProcessor::RISCV_0)) { // weird? // Use brisc's noc if brisc specifies a noc this->launch_msg.kernel_config.brisc_noc_id = std::get(kernel->config()).noc; // if noc mode is already set to DM_DYNAMIC_NOC then we can't change back to DM_DEDICATED_NOC if (std::get(kernel->config()).noc_mode == NOC_MODE::DM_DYNAMIC_NOC) { this->launch_msg.kernel_config.brisc_noc_mode = NOC_MODE::DM_DYNAMIC_NOC; } - } else if (class_id == DISPATCH_CLASS_TENSIX_DM1) { + } else if (class_id == utils::underlying_type(DataMovementProcessor::RISCV_1)) { // weird? // Use 1-ncrisc's noc (the other noc) if ncrisc specifies a noc // If both brisc and ncrisc set the noc, then this is safe due to prior correctness validation this->launch_msg.kernel_config.brisc_noc_id = 1 - std::get(kernel->config()).noc; @@ -203,6 +208,7 @@ KernelGroup::KernelGroup( for (uint32_t index = 0; index < MaxProcessorsPerCoreType; index ++) { this->kernel_bin_sizes[index] = 0; + this->kernel_text_offsets[index] = 0; this->launch_msg.kernel_config.kernel_text_offset[index] = 0; } this->launch_msg.kernel_config.ncrisc_kernel_size16 = 0; @@ -234,6 +240,7 @@ struct KernelGroupInt { kernel_id_array_t kernel_ids; bool operator==(const KernelGroupInt &b) const; + // fix this void update(dispatch_core_processor_classes proc_class, size_t kernel_idx) { this->kernel_ids[proc_class] = static_cast(kernel_idx); } @@ -288,7 +295,7 @@ void Program::update_kernel_groups(uint32_t programmable_core_type_index) { for (auto core : kernel->logical_cores()) { int core_index = core.y * grid_extent_[programmable_core_type_index].x + core.x; grid[core_index].valid = true; - grid[core_index].update(kernel->dispatch_class(), id); + grid[core_index].update(magic_enum::enum_cast(kernel->dispatch_class()).value(), id); } } @@ -644,27 +651,10 @@ void Program::set_cb_tile_dims(Device *device, const std::vector &crs } void Program::populate_dispatch_data(Device *device) { - static const uint32_t processor_to_firmware_base[] = { - MEM_BRISC_FIRMWARE_BASE, - MEM_NCRISC_FIRMWARE_BASE, - MEM_TRISC0_FIRMWARE_BASE, - MEM_TRISC1_FIRMWARE_BASE, - MEM_TRISC2_FIRMWARE_BASE, - eth_l1_mem::address_map::FIRMWARE_BASE - }; - static const uint32_t processor_to_firmware_size[] = { - MEM_BRISC_FIRMWARE_SIZE, - MEM_NCRISC_INIT_IRAM_L1_SIZE, - MEM_TRISC0_FIRMWARE_SIZE, - MEM_TRISC1_FIRMWARE_SIZE, - MEM_TRISC2_FIRMWARE_SIZE, - eth_l1_mem::address_map::FIRMWARE_SIZE - }; - auto extract_dst_noc_unicast_info = - [&device](const std::set &ranges, const 
CoreType core_type) -> std::vector> { + [&device](const auto &ranges, const CoreType core_type) -> std::vector> { // This API extracts all the pairs of noc multicast encodings given a set of core ranges - vector> dst_noc_unicast_info; + vector> dst_noc_unicast_info; for (const CoreRange &core_range : ranges) { for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { @@ -684,8 +674,8 @@ void Program::populate_dispatch_data(Device *device) { // TODO: use semaphore.core_type from main if (semaphore.core_type() == CoreType::WORKER) { uint32_t index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); - vector> dst_noc_multicast_info = - device->extract_dst_noc_multicast_info>( + vector> dst_noc_multicast_info = + device->extract_dst_noc_multicast_info>( semaphore.core_range_set().ranges(), CoreType::WORKER); transfer_info transfer_info = { .dst_base_addr = semaphore.offset(), @@ -696,7 +686,7 @@ void Program::populate_dispatch_data(Device *device) { } else if (semaphore.core_type() == CoreType::ETH) { // TODO: we only fast dispatch to active eth... uint32_t index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH); - vector> dst_noc_unicast_info = + vector> dst_noc_unicast_info = extract_dst_noc_unicast_info(semaphore.core_range_set().ranges(), CoreType::ETH); transfer_info transfer_info = { .dst_base_addr = semaphore.offset(), @@ -746,16 +736,11 @@ void Program::populate_dispatch_data(Device *device) { TT_ASSERT(kernel_bin.num_spans() == 1); - uint32_t max_kernel_bin_size = processor_to_firmware_size[sub_kernels[sub_kernel_index]]; - + // TODO: spans are packed into 1 now, just grab it and go kernel_bin.process_spans([&](vector::const_iterator mem_ptr, uint64_t dst, uint32_t len) { - max_kernel_bin_size -= dst - processor_to_firmware_base[sub_kernels[sub_kernel_index]]; - - uint64_t relo_addr = - tt::llrt::relocate_dev_addr(dst); - - dst_base_addrs[transfer_info_index] = (uint32_t)relo_addr; + // Set dst for eth kernels until they move to ring buffer + dst_base_addrs[transfer_info_index] = dst; page_offsets[transfer_info_index] = binaries_data.size() * sizeof(uint32_t) / HostMemDeviceCommand::PROGRAM_PAGE_SIZE; lengths[transfer_info_index] = len * sizeof(uint32_t); @@ -766,12 +751,6 @@ void Program::populate_dispatch_data(Device *device) { align(binaries_data.size(), HostMemDeviceCommand::PROGRAM_PAGE_SIZE / sizeof(uint32_t)), 0); transfer_info_index++; }); - - uint32_t bin_size = kernel_bin.size() * sizeof(uint32_t); - // TODO: remove this check when the ring buffer is in place (checked there) - TT_FATAL(bin_size <= max_kernel_bin_size, - "Kernel binary size, {}, overflowed kernel binary storage size, {}", - bin_size, max_kernel_bin_size); } kernel_bins_transfer_info kb_transfer_info = { @@ -794,14 +773,21 @@ void Program::populate_dispatch_data(Device *device) { for (KernelGroup &kernel_group : this->get_kernel_groups(index)) { // TODO: add a bit in the hal that says if this core type is unicast/multicast if (core_type == CoreType::WORKER) { - std::vector> dst_noc_multicast_info = - device->extract_dst_noc_multicast_info>( + std::vector> dst_noc_multicast_info = + device->extract_dst_noc_multicast_info>( kernel_group.core_ranges.ranges(), core_type); vector kernel_ids; - for (auto &optional_id : kernel_group.kernel_ids) { + for (int dispatch_class = 0; dispatch_class < kernel_group.kernel_ids.size(); dispatch_class++) { + auto &optional_id = 
kernel_group.kernel_ids[dispatch_class]; if (optional_id) { kernel_ids.push_back(optional_id.value()); + int proc_sub_class = 0; + for (uint32_t& dst_addr : kernel_transfer_info.at(optional_id.value()).dst_base_addrs) { + // TODO: ditch this w/ linear writes based on program config kernel_text_offset and size + dst_addr = kernel_group.kernel_text_offsets[dispatch_class + proc_sub_class]; + proc_sub_class++; + } } } @@ -813,7 +799,7 @@ void Program::populate_dispatch_data(Device *device) { } } else { TT_ASSERT(core_type == CoreType::ETH); - vector> dst_noc_unicast_info = + vector> dst_noc_unicast_info = extract_dst_noc_unicast_info(kernel_group.core_ranges.ranges(), core_type); vector kernel_ids; @@ -842,8 +828,10 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32 // Iterate over kernels in the program and "level" the number of RTAs based on the max // Unique RTAs are packed across dispatch classes // Common RTAs come after unique RTAs - vector max_rtas(DISPATCH_CLASS_MAX); - vector max_crtas(DISPATCH_CLASS_MAX); + uint32_t processor_classes = hal.get_processor_classes_count(programmable_core_type_index); + + vector max_rtas(processor_classes); + vector max_crtas(processor_classes); uint32_t max_unique_rta_size = 0; uint32_t total_crta_size = 0; @@ -854,7 +842,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32 uint32_t l1_alignment = hal.get_alignment(HalMemType::L1); for (auto& kg : this->get_kernel_groups(programmable_core_type_index)) { - for (int dispatch_class = 0; dispatch_class < DISPATCH_CLASS_MAX; dispatch_class++) { + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { max_rtas[dispatch_class] = 0; auto& optional_id = kg.kernel_ids[dispatch_class]; if (optional_id) { @@ -872,7 +860,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32 } uint32_t offset = 0; - for (int dispatch_class = 0; dispatch_class < DISPATCH_CLASS_MAX; dispatch_class++) { + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { auto& optional_id = kg.kernel_ids[dispatch_class]; kg.rta_sizes[dispatch_class] = max_rtas[dispatch_class] * sizeof(uint32_t); if (optional_id) { @@ -890,7 +878,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32 max_unique_rta_size = std::max(offset, max_unique_rta_size); } - for (int dispatch_class = 0; dispatch_class < DISPATCH_CLASS_MAX; dispatch_class++) { + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { max_crtas[dispatch_class] = 0; } // Find the max # common RTAs across all kernels for each dispatch class @@ -907,7 +895,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32 // Calculate the address offset and size for common RTAs for each dispatch class uint32_t offset = 0; - for (int dispatch_class = 0; dispatch_class < DISPATCH_CLASS_MAX; dispatch_class++) { + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { uint32_t size = max_crtas[dispatch_class] * sizeof(uint32_t); this->get_program_config(programmable_core_type_index).crta_offsets[dispatch_class] = base_offset + max_unique_rta_size + offset; this->get_program_config(programmable_core_type_index).crta_sizes[dispatch_class] = size; @@ -929,7 +917,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32 // Set the kernel group common runtime arg offsets use in the launch message for (auto& kg : 
this->get_kernel_groups(programmable_core_type_index)) { - for (int dispatch_class = 0; dispatch_class < DISPATCH_CLASS_MAX; dispatch_class++) { + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { kg.launch_msg.kernel_config.rta_offset[dispatch_class].crta_offset = this->get_program_config(programmable_core_type_index).crta_offsets[dispatch_class]; } } @@ -1009,29 +997,33 @@ uint32_t Program::finalize_kernel_bins(Device *device, uint32_t programmable_cor auto& optional_id = kg.kernel_ids[class_id]; if (optional_id) { const auto kernel = this->get_kernel(optional_id.value()); + std::vector const &binaries = kernel->binaries(device->build_key()); // TODO: this is really ugly, save me future-HAL! if (programmable_core_type_index == hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)) { uint32_t binary_packed_size = kernel->get_binary_packed_size(device, 0); if (class_id == DISPATCH_CLASS_TENSIX_DM0) { kg.kernel_bin_sizes[0] = binary_packed_size; + kg.kernel_text_offsets[0] = offset; kg.launch_msg.kernel_config.kernel_text_offset[0] = offset; offset += binary_packed_size; offset = align(offset, l1_alignment); } else if (class_id == DISPATCH_CLASS_TENSIX_DM1) { kg.kernel_bin_sizes[1] = binary_packed_size; + kg.kernel_text_offsets[1] = offset; kg.launch_msg.kernel_config.kernel_text_offset[1] = offset; offset += binary_packed_size; + offset = align(offset, l1_alignment); uint32_t binary_text_size = kernel->get_binary_text_size(device, 0); TT_ASSERT(binary_text_size >> 4 <= std::numeric_limits::max()); kg.launch_msg.kernel_config.ncrisc_kernel_size16 = (binary_text_size + 15) >> 4; - offset = align(offset, l1_alignment); } else { constexpr uint32_t max_math_processors_count = 3; for (uint32_t proc_type_index = 0; proc_type_index < max_math_processors_count; proc_type_index++) { uint32_t binary_packed_size = kernel->get_binary_packed_size(device, proc_type_index); kg.kernel_bin_sizes[2 + proc_type_index] = binary_packed_size; + kg.kernel_text_offsets[2 + proc_type_index] = offset; kg.launch_msg.kernel_config.kernel_text_offset[2 + proc_type_index] = offset; offset += binary_packed_size; offset = align(offset, l1_alignment); @@ -1040,9 +1032,18 @@ uint32_t Program::finalize_kernel_bins(Device *device, uint32_t programmable_cor } else { uint32_t binary_packed_size = kernel->get_binary_packed_size(device, 0); kg.kernel_bin_sizes[0] = binary_packed_size; - kg.launch_msg.kernel_config.kernel_text_offset[0] = offset; - offset += binary_packed_size; - offset = align(offset, l1_alignment); + + // No kernel config buffer on active eth yet + if (hal.get_programmable_core_type(kg.programmable_core_type_index) == + HalProgrammableCoreType::IDLE_ETH) { + kg.kernel_text_offsets[0] = offset; + kg.launch_msg.kernel_config.kernel_text_offset[0] = offset; + offset += binary_packed_size; + offset = align(offset, l1_alignment); + } else { + kg.kernel_text_offsets[0] = binaries[0].get_text_addr(); + kg.launch_msg.kernel_config.kernel_text_offset[0] = binaries[0].get_text_addr(); + } } } } @@ -1061,6 +1062,9 @@ uint32_t& Program::get_program_config_size(uint32_t programmable_core_type_index } void Program::finalize(Device *device) { + + this->construct_core_range_set_for_worker_cores(); + // Store the number of tensix "go signals" for use by CQ // CQ iterates over these to update runtime addresses, needs to know when eth begins (after tensix) // TODO: should store all the counts @@ -1075,6 +1079,7 @@ void Program::finalize(Device *device) { } for (uint32_t index = 0; 
index < hal.get_programmable_core_type_count(); index++) { + HalProgrammableCoreType programmable_core_type = static_cast(index); uint32_t offset = 0; offset = finalize_rt_args(index, offset); @@ -1086,16 +1091,25 @@ void Program::finalize(Device *device) { offset = finalize_cbs(index, offset); TT_ASSERT(offset == align(offset, hal.get_alignment(HalMemType::L1))); - // TODO: update the offset when kernel bins are moved into the kernel config buffer - (void)finalize_kernel_bins(device, index, offset); + offset = finalize_kernel_bins(device, index, offset); TT_ASSERT(offset == align(offset, hal.get_alignment(HalMemType::L1))); this->get_program_config_size(index) = offset; + + auto max_size = hal.get_dev_size(programmable_core_type, HalL1MemAddrType::KERNEL_CONFIG); + TT_FATAL(offset < max_size, + "Program size ({}) too large for kernel config buffer ({}) on {}", + offset, max_size, magic_enum::enum_name(programmable_core_type)); } // The sem offsets cross programmable_core_types so must be set after the loop above this->set_launch_msg_sem_offsets(); + // TODO: This check is wrong - it populates dispatch data for dispatch kernels + if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) { + this->populate_dispatch_data(device); // TODO: maybe rename + } + finalized_ = true; } @@ -1196,11 +1210,6 @@ void Program::compile(Device *device, bool fd_bootloader_mode) { sync_build_step(events); - this->construct_core_range_set_for_worker_cores(); - if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) { - this->populate_dispatch_data(device); // TODO: maybe rename - } - if (detail::CompilationReporter::enabled()) { detail::CompilationReporter::inst().flush_program_entry(*this, enable_persistent_kernel_cache); } diff --git a/tt_metal/impl/program/program.hpp b/tt_metal/impl/program/program.hpp index 05aa822d787..8e1dbb587ad 100644 --- a/tt_metal/impl/program/program.hpp +++ b/tt_metal/impl/program/program.hpp @@ -50,6 +50,7 @@ struct KernelGroup { kernel_id_array_t kernel_ids; uint32_t rta_sizes[DISPATCH_CLASS_MAX]; uint32_t total_rta_size; + uint32_t kernel_text_offsets[MaxProcessorsPerCoreType]; uint32_t kernel_bin_sizes[MaxProcessorsPerCoreType]; launch_msg_t launch_msg; go_msg_t go_msg; @@ -149,6 +150,8 @@ class Program { void allocate_circular_buffers(const Device *device); bool is_finalized() const { return this->finalized_; } + bool is_cached() const { return this->cached_; } + void set_cached() { this->cached_ = true; } void finalize(Device *device); std::shared_ptr get_kernel(KernelHandle kernel_id) const; @@ -171,6 +174,7 @@ class Program { ProgramTransferInfo program_transfer_info; bool finalized_; + bool cached_; struct CircularBufferAllocator { CircularBufferAllocator(const CoreRange &core_range_) : core_range(core_range_) {} diff --git a/tt_metal/impl/program/program_device_map.hpp b/tt_metal/impl/program/program_device_map.hpp index 964f56e478a..66f679b62e9 100644 --- a/tt_metal/impl/program/program_device_map.hpp +++ b/tt_metal/impl/program/program_device_map.hpp @@ -9,7 +9,7 @@ #include #include -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "tt_metal/common/tt_backend_api_types.hpp" namespace tt::tt_metal { diff --git a/tt_metal/jit_build/build.cpp b/tt_metal/jit_build/build.cpp index 09ecc49d7a0..083a30f2377 100644 --- a/tt_metal/jit_build/build.cpp +++ b/tt_metal/jit_build/build.cpp @@ -196,6 +196,11 @@ void JitBuildState::finish_init() { // Note the preceding slash which defies convention as this gets appended to // the kernel 
name used as a path which doesn't have a slash this->target_full_path_ = "/" + this->target_name_ + "/" + this->target_name_ + ".elf"; + + if (not this->is_fw_) { + // Emit relocations, so we can relocate the resulting binary + this->lflags_ += "-Wl,--emit-relocs "; + } } JitBuildDataMovement::JitBuildDataMovement(const JitBuildEnv& env, const JitBuiltStateConfig &build_config) : @@ -351,8 +356,8 @@ JitBuildCompute::JitBuildCompute(const JitBuildEnv& env, const JitBuiltStateConf finish_init(); } -JitBuildEthernet::JitBuildEthernet(const JitBuildEnv& env, const JitBuiltStateConfig &build_config) : JitBuildState(env, build_config) { - TT_ASSERT(this->core_id_ >= 0 && this->core_id_ < 2, "Invalid ethernet processor"); +JitBuildActiveEthernet::JitBuildActiveEthernet(const JitBuildEnv& env, const JitBuiltStateConfig &build_config) : JitBuildState(env, build_config) { + TT_ASSERT(this->core_id_ >= 0 && this->core_id_ < 1, "Invalid active ethernet processor"); this->out_path_ = this->is_fw_ ? env_.out_firmware_root_ : env_.out_kernel_root_; this->includes_ = env_.includes_ + "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + @@ -403,7 +408,31 @@ JitBuildEthernet::JitBuildEthernet(const JitBuildEnv& env, const JitBuiltStateCo env_.root_ + linker_str; break; } - case 1: + default: + TT_THROW("Invalid processor ID {} for Active Ethernet core.", this->core_id_); + } + this->process_defines_at_compile = true; + + finish_init(); +} + +JitBuildIdleEthernet::JitBuildIdleEthernet(const JitBuildEnv& env, const JitBuiltStateConfig &build_config) : JitBuildState(env, build_config) { + TT_ASSERT(this->core_id_ >= 0 && this->core_id_ < 1, "Invalid idle ethernet processor"); + this->out_path_ = this->is_fw_ ? env_.out_firmware_root_ : env_.out_kernel_root_; + + this->includes_ = env_.includes_ + "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + + "/metal/common " + "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + + "/metal/llk_io "; + + this->defines_ = env_.defines_; + uint32_t l1_cache_disable_mask = + tt::llrt::OptionsG.get_feature_riscv_mask(tt::llrt::RunTimeDebugFeatureDisableL1DataCache); + if ((l1_cache_disable_mask & tt::llrt::DebugHartFlags::RISCV_ER) == tt::llrt::DebugHartFlags::RISCV_ER) { + this->defines_ += "-DDISABLE_L1_DATA_CACHE "; + } + + switch (this->core_id_) { + case 0: { this->target_name_ = "idle_erisc"; this->cflags_ = env_.cflags_ + "-Os " + "-fno-tree-loop-distribute-patterns "; // don't use memcpy for cpy loops @@ -429,6 +458,9 @@ JitBuildEthernet::JitBuildEthernet(const JitBuildEnv& env, const JitBuiltStateCo } break; + } + default: + TT_THROW("Invalid processor ID {} for Idle Ethernet core.", this->core_id_); } this->process_defines_at_compile = true; diff --git a/tt_metal/jit_build/build.hpp b/tt_metal/jit_build/build.hpp index d1e96a23b6c..2962cccc8af 100644 --- a/tt_metal/jit_build/build.hpp +++ b/tt_metal/jit_build/build.hpp @@ -10,7 +10,7 @@ #include "common/tt_backend_api_types.hpp" #include "common/executor.hpp" #include "common/utils.hpp" -#include "common/core_coord.h" +#include "common/core_coord.hpp" #include "jit_build/data_format.hpp" #include "jit_build/settings.hpp" #include "hostdevcommon/common_values.hpp" @@ -28,12 +28,6 @@ using vector_cache_aligned = std::vector #include -#include "common/core_coord.h" +#include "common/core_coord.hpp" #include "impl/kernels/kernel.hpp" namespace tt::tt_metal { diff --git a/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp b/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp index 
15d2aa56148..566eff03691 100644 --- a/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp +++ b/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp @@ -8,12 +8,15 @@ #include "llrt/hal.hpp" #include "llrt/blackhole/bh_hal.hpp" +#include "hw/inc/blackhole/core_config.h" #include "hw/inc/blackhole/dev_mem_map.h" #include "hw/inc/blackhole/eth_l1_address_map.h" #include "hostdevcommon/common_runtime_address_map.h" #include "tt_metal/third_party/umd/device/tt_soc_descriptor.h" #include "hw/inc/dev_msgs.h" +#include + #define GET_ETH_MAILBOX_ADDRESS_HOST(x) \ ((uint64_t) & (((mailboxes_t *)eth_l1_mem::address_map::ERISC_MEM_MAILBOX_BASE)->x)) @@ -23,8 +26,6 @@ namespace tt_metal { HalCoreInfoType create_active_eth_mem_map() { - constexpr uint32_t num_proc_per_idle_eth_core = 1; - std::vector mem_map_bases; mem_map_bases.resize(utils::underlying_type(HalL1MemAddrType::COUNT)); @@ -51,7 +52,13 @@ HalCoreInfoType create_active_eth_mem_map() { mem_map_sizes[utils::underlying_type(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[utils::underlying_type(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(uint32_t); - return {HalProgrammableCoreType::IDLE_ETH, CoreType::ETH, num_proc_per_idle_eth_core, mem_map_bases, mem_map_sizes, false}; + std::vector> processor_classes(NumEthDispatchClasses); + std::vector processor_types{0}; + for (uint8_t processor_class_idx = 0; processor_class_idx < NumEthDispatchClasses; processor_class_idx++) { + processor_classes[processor_class_idx] = processor_types; + } + + return {HalProgrammableCoreType::IDLE_ETH, CoreType::ETH, processor_classes, mem_map_bases, mem_map_sizes, false}; } } // namespace tt_metal diff --git a/tt_metal/llrt/blackhole/bh_hal_idle_eth.cpp b/tt_metal/llrt/blackhole/bh_hal_idle_eth.cpp index 8ee36ee8752..09b5c77af7f 100644 --- a/tt_metal/llrt/blackhole/bh_hal_idle_eth.cpp +++ b/tt_metal/llrt/blackhole/bh_hal_idle_eth.cpp @@ -8,12 +8,15 @@ #include "llrt/hal.hpp" #include "llrt/blackhole/bh_hal.hpp" +#include "hw/inc/blackhole/core_config.h" #include "hw/inc/blackhole/dev_mem_map.h" #include "hw/inc/blackhole/eth_l1_address_map.h" #include "hostdevcommon/common_runtime_address_map.h" #include "tt_metal/third_party/umd/device/tt_soc_descriptor.h" #include "hw/inc/dev_msgs.h" +#include + #define GET_IERISC_MAILBOX_ADDRESS_HOST(x) ((uint64_t) & (((mailboxes_t *)MEM_IERISC_MAILBOX_BASE)->x)) namespace tt { @@ -22,7 +25,6 @@ namespace tt_metal { HalCoreInfoType create_idle_eth_mem_map() { - constexpr uint32_t num_proc_per_idle_eth_core = 1; uint32_t max_alignment = std::max(DRAM_ALIGNMENT, L1_ALIGNMENT); static_assert(MEM_IERISC_MAP_END % L1_ALIGNMENT == 0); @@ -53,7 +55,13 @@ HalCoreInfoType create_idle_eth_mem_map() { mem_map_sizes[utils::underlying_type(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[utils::underlying_type(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(uint32_t); - return {HalProgrammableCoreType::IDLE_ETH, CoreType::ETH, num_proc_per_idle_eth_core, mem_map_bases, mem_map_sizes, false}; + std::vector> processor_classes(NumEthDispatchClasses); + std::vector processor_types{0}; + for (uint8_t processor_class_idx = 0; processor_class_idx < NumEthDispatchClasses; processor_class_idx++) { + processor_classes[processor_class_idx] = processor_types; + } + + return {HalProgrammableCoreType::IDLE_ETH, CoreType::ETH, processor_classes, mem_map_bases, mem_map_sizes, false}; } } // namespace tt_metal diff --git a/tt_metal/llrt/blackhole/bh_hal_tensix.cpp b/tt_metal/llrt/blackhole/bh_hal_tensix.cpp index 
6d87c007b1e..29fac3daf27 100644 --- a/tt_metal/llrt/blackhole/bh_hal_tensix.cpp +++ b/tt_metal/llrt/blackhole/bh_hal_tensix.cpp @@ -6,12 +6,16 @@ #include "llrt/hal.hpp" #include "llrt/blackhole/bh_hal.hpp" +#include "hw/inc/blackhole/core_config.h" #include "hw/inc/blackhole/dev_mem_map.h" #include "hw/inc/blackhole/eth_l1_address_map.h" // XXXX FIXME #include "hostdevcommon/common_runtime_address_map.h" #include "tt_metal/third_party/umd/device/tt_soc_descriptor.h" #include "hw/inc/dev_msgs.h" +#include +#include + #define GET_MAILBOX_ADDRESS_HOST(x) ((uint64_t) & (((mailboxes_t *)MEM_MAILBOX_BASE)->x)) namespace tt { @@ -20,7 +24,6 @@ namespace tt_metal { HalCoreInfoType create_tensix_mem_map() { - constexpr uint32_t num_proc_per_tensix_core = 5; uint32_t max_alignment = std::max(DRAM_ALIGNMENT, L1_ALIGNMENT); std::vector mem_map_bases; @@ -50,7 +53,16 @@ HalCoreInfoType create_tensix_mem_map() { mem_map_sizes[utils::underlying_type(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[utils::underlying_type(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(uint32_t); - return {HalProgrammableCoreType::TENSIX, CoreType::WORKER, num_proc_per_tensix_core, mem_map_bases, mem_map_sizes, true}; + std::vector> processor_classes(NumTensixDispatchClasses); + std::vector processor_types; + for (uint8_t processor_class_idx = 0; processor_class_idx < NumTensixDispatchClasses; processor_class_idx++) { + uint32_t num_processors = processor_class_idx == (NumTensixDispatchClasses - 1) ? 3 : 1; + processor_types.resize(num_processors); + std::iota(processor_types.begin(), processor_types.end(), 0); + processor_classes[processor_class_idx] = processor_types; + } + + return {HalProgrammableCoreType::TENSIX, CoreType::WORKER, processor_classes, mem_map_bases, mem_map_sizes, true}; } } // namespace tt_metal diff --git a/tt_metal/llrt/grayskull/gs_hal.cpp b/tt_metal/llrt/grayskull/gs_hal.cpp index e1f27aa34f7..061843e96e0 100644 --- a/tt_metal/llrt/grayskull/gs_hal.cpp +++ b/tt_metal/llrt/grayskull/gs_hal.cpp @@ -9,11 +9,15 @@ #if defined (ARCH_GRAYSKULL) +#include "hw/inc/grayskull/core_config.h" #include "hw/inc/grayskull/dev_mem_map.h" #include "hw/inc/grayskull/eth_l1_address_map.h" // TODO remove when commonruntimeaddressmap is gone #include "hostdevcommon/common_runtime_address_map.h" #include "hw/inc/dev_msgs.h" +#include +#include + #endif #define GET_MAILBOX_ADDRESS_HOST(x) ((uint64_t) & (((mailboxes_t *)MEM_MAILBOX_BASE)->x)) @@ -32,7 +36,6 @@ void Hal::initialize_gs() { static_assert(static_cast(HalProgrammableCoreType::TENSIX) == static_cast(ProgrammableCoreType::TENSIX)); - constexpr uint32_t num_proc_per_tensix_core = 5; uint32_t max_alignment = std::max(DRAM_ALIGNMENT, L1_ALIGNMENT); std::vector mem_map_bases; @@ -60,7 +63,16 @@ void Hal::initialize_gs() { mem_map_sizes[utils::underlying_type(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[utils::underlying_type(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(uint32_t); - this->core_info_.push_back({HalProgrammableCoreType::TENSIX, CoreType::WORKER, num_proc_per_tensix_core, mem_map_bases, mem_map_sizes, true}); + std::vector> processor_classes(NumTensixDispatchClasses); + std::vector processor_types; + for (uint8_t processor_class_idx = 0; processor_class_idx < NumTensixDispatchClasses; processor_class_idx++) { + uint32_t num_processors = processor_class_idx == (NumTensixDispatchClasses - 1) ? 
3 : 1; + processor_types.resize(num_processors); + std::iota(processor_types.begin(), processor_types.end(), 0); + processor_classes[processor_class_idx] = processor_types; + } + + this->core_info_.push_back({HalProgrammableCoreType::TENSIX, CoreType::WORKER, processor_classes, mem_map_bases, mem_map_sizes, true}); this->dram_bases_.resize(utils::underlying_type(HalDramMemAddrType::COUNT)); this->dram_sizes_.resize(utils::underlying_type(HalDramMemAddrType::COUNT)); diff --git a/tt_metal/llrt/hal.cpp b/tt_metal/llrt/hal.cpp index f3c3b9e0a33..b75b3806084 100644 --- a/tt_metal/llrt/hal.cpp +++ b/tt_metal/llrt/hal.cpp @@ -56,13 +56,13 @@ uint32_t Hal::get_programmable_core_type_index(HalProgrammableCoreType programma HalCoreInfoType::HalCoreInfoType(HalProgrammableCoreType programmable_core_type, CoreType core_type, - uint32_t core_proc_count, + const std::vector> &processor_classes, const std::vector& mem_map_bases, const std::vector& mem_map_sizes, bool supports_cbs) : programmable_core_type_(programmable_core_type), core_type_(core_type), - proc_count_(core_proc_count), + processor_classes_(processor_classes), mem_map_bases_(mem_map_bases), mem_map_sizes_(mem_map_sizes), supports_cbs_(supports_cbs) { diff --git a/tt_metal/llrt/hal.hpp b/tt_metal/llrt/hal.hpp index 080e259d4a7..4b6682333c2 100644 --- a/tt_metal/llrt/hal.hpp +++ b/tt_metal/llrt/hal.hpp @@ -10,6 +10,7 @@ // #include +#include #include #include #include "tt_metal/common/assert.hpp" @@ -30,6 +31,13 @@ enum class HalProgrammableCoreType { COUNT = 3 }; +enum class HalProcessorClassType : uint8_t { + DM = 0, + // Setting this to 2 because we currently treat brisc and ncrisc as two unique processor classes on Tensix + // TODO: Uplift view of Tensix processor classes to be 1 DM class with 2 processor types + COMPUTE = 2 +}; + enum class HalL1MemAddrType : uint8_t { BARRIER = 0, LAUNCH = 1, @@ -67,18 +75,21 @@ class HalCoreInfoType { private: HalProgrammableCoreType programmable_core_type_; CoreType core_type_; - std::uint32_t proc_count_; // eventually a vector of attributes? 
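The nested vector that replaces the flat processor count encodes, per processor class, the processor types that class contains; the outer index is the class position, the inner vector the type indices. For a Tensix core the iota-filled construction loops above yield, illustratively (the uint8_t element type is an assumption here, since the template arguments are not shown):

#include <cstdint>
#include <vector>

// Illustrative Tensix contents, assuming NumTensixDispatchClasses == 3:
// two data-movement classes with one processor each, then the compute class.
std::vector<std::vector<uint8_t>> processor_classes = {
    {0},        // DM class 0: BRISC
    {0},        // DM class 1: NCRISC
    {0, 1, 2},  // COMPUTE class: TRISC0, TRISC1, TRISC2
};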
+ // index represents processor class position, value is the specific processor class + std::vector> processor_classes_; std::vector mem_map_bases_; std::vector mem_map_sizes_; bool supports_cbs_; public: - HalCoreInfoType(HalProgrammableCoreType programmable_core_type, CoreType core_type, uint32_t core_proc_count, + HalCoreInfoType(HalProgrammableCoreType programmable_core_type, CoreType core_type, const std::vector> &processor_classes, const std::vector& mem_map_bases, const std::vector& mem_map_sizes, bool supports_cbs); template T get_dev_addr(HalL1MemAddrType addr_type) const; uint32_t get_dev_size(HalL1MemAddrType addr_type) const; + uint32_t get_processor_classes_count() const; + uint32_t get_processor_types_count(uint32_t processor_class_idx) const; }; template @@ -94,6 +105,15 @@ inline uint32_t HalCoreInfoType::get_dev_size(HalL1MemAddrType addr_type) const return this->mem_map_sizes_[index]; } +inline uint32_t HalCoreInfoType::get_processor_classes_count() const { + return this->processor_classes_.size(); +} + +inline uint32_t HalCoreInfoType::get_processor_types_count(uint32_t processor_class_idx) const { + TT_ASSERT(processor_class_idx < this->processor_classes_.size()); + return this->processor_classes_[processor_class_idx].size(); +} + class Hal { private: std::mutex lock; @@ -116,8 +136,11 @@ class Hal { HalProgrammableCoreType get_programmable_core_type(uint32_t core_type_index) const; uint32_t get_programmable_core_type_index(HalProgrammableCoreType programmable_core_type_index) const; CoreType get_core_type(uint32_t programmable_core_type_index) const; - - uint32_t get_processor_count(uint32_t core_type_index) const; + uint32_t get_processor_classes_count(std::variant programmable_core_type) const; + uint32_t get_processor_class_type_index(HalProcessorClassType processor_class); + uint32_t get_processor_types_count( + std::variant programmable_core_type, + uint32_t processor_class_idx) const; template T get_dev_addr(HalProgrammableCoreType programmable_core_type, HalL1MemAddrType addr_type) const; @@ -139,6 +162,39 @@ inline uint32_t Hal::get_programmable_core_type_count() const { return core_info_.size(); } +inline uint32_t Hal::get_processor_classes_count(std::variant programmable_core_type) const { + return std::visit( + [&](auto &&core_type_specifier) -> uint32_t { + using T = std::decay_t; + uint32_t index = this->core_info_.size(); + if constexpr (std::is_same_v) { + index = utils::underlying_type(core_type_specifier); + } else if constexpr (std::is_same_v) { + index = core_type_specifier; + } + TT_ASSERT(index < this->core_info_.size()); + return this->core_info_[index].get_processor_classes_count(); + }, + programmable_core_type); +} + +inline uint32_t Hal::get_processor_types_count( + std::variant programmable_core_type, uint32_t processor_class_idx) const { + return std::visit( + [&](auto &&core_type_specifier) -> uint32_t { + using T = std::decay_t; + uint32_t index = this->core_info_.size(); + if constexpr (std::is_same_v) { + index = utils::underlying_type(core_type_specifier); + } else if constexpr (std::is_same_v) { + index = core_type_specifier; + } + TT_ASSERT(index < this->core_info_.size()); + return this->core_info_[index].get_processor_types_count(processor_class_idx); + }, + programmable_core_type); +} + inline HalProgrammableCoreType Hal::get_programmable_core_type(uint32_t core_type_index) const { return core_info_[core_type_index].programmable_core_type_; } diff --git a/tt_metal/llrt/llrt.cpp b/tt_metal/llrt/llrt.cpp index 
335b75e7481..6edf9d38853 100644 --- a/tt_metal/llrt/llrt.cpp +++ b/tt_metal/llrt/llrt.cpp @@ -26,7 +26,8 @@ using std::uint16_t; using std::uint32_t; using std::uint64_t; -ll_api::memory get_risc_binary(string const &path, uint32_t riscv_id, PackSpans pack_spans) { +ll_api::memory get_risc_binary(string const &path, uint32_t riscv_id, + ll_api::memory::PackSpans span_type, ll_api::memory::Relocate relo_type) { static const uint32_t processor_to_fw_base_addr[] = { MEM_BRISC_FIRMWARE_BASE, @@ -49,13 +50,15 @@ ll_api::memory get_risc_binary(string const &path, uint32_t riscv_id, PackSpans if (inserted) { // We're the first with PATH. Create and insert. lock.unlock(); - auto *ptr = new ll_api::memory(path); + auto *ptr = new ll_api::memory(path, relo_type); // TODO: pass pack_spans into reader, generate text/data sizes // from segment sizes and pack there - if (pack_spans == PackSpans::PACK) { + if (span_type == ll_api::memory::PackSpans::PACK) { uint64_t data_start = MEM_LOCAL_BASE; - uint64_t text_start = processor_to_fw_base_addr[riscv_id]; + uint64_t text_start = (relo_type == ll_api::memory::Relocate::XIP) ? + 0 : + processor_to_fw_base_addr[riscv_id]; ptr->pack_data_into_text(text_start, data_start); } @@ -203,6 +206,14 @@ bool test_load_write_read_trisc_binary(ll_api::memory &mem, chip_id_t chip_id, c return test_load_write_read_risc_binary(mem, chip_id, core, triscv_id + 2); } +void write_binary_to_address(ll_api::memory &mem, chip_id_t chip_id, const CoreCoord &core, uint32_t address) { + + log_debug(tt::LogLLRuntime, "vec size = {}, size_in_bytes = {}", mem.size(), mem.size() * sizeof(uint32_t)); + mem.process_spans([&](std::vector::const_iterator mem_ptr, uint64_t addr, uint32_t len_words) { + tt::Cluster::instance().write_core(&*mem_ptr, len_words * sizeof(uint32_t), tt_cxy_pair(chip_id, core), address); + }); +} + CoreCoord get_core_for_dram_channel(int dram_channel_id, chip_id_t chip_id) { return tt::Cluster::instance().get_soc_desc(chip_id).get_preferred_worker_core_for_dram_channel(dram_channel_id); } diff --git a/tt_metal/llrt/llrt.hpp b/tt_metal/llrt/llrt.hpp index a788668790e..0fc3004f551 100644 --- a/tt_metal/llrt/llrt.hpp +++ b/tt_metal/llrt/llrt.hpp @@ -53,8 +53,10 @@ using NUM_REPETITIONS = std::uint32_t; using WorkerCore = tt_cxy_pair; using WorkerCores = std::vector; -enum class PackSpans { PACK, NO_PACK }; -ll_api::memory get_risc_binary(string const &path, uint32_t riscv_id = 0, PackSpans pack_spans = PackSpans::NO_PACK); +ll_api::memory get_risc_binary(string const &path, uint32_t riscv_id = 0, + ll_api::memory::PackSpans span_type = ll_api::memory::PackSpans::NO_PACK, + ll_api::memory::Relocate relo_type = ll_api::memory::Relocate::NONE); + // TODO: try using "stop" method from device instead, it's the proper way of asserting reset @@ -94,8 +96,8 @@ uint32_t generate_risc_startup_addr(bool is_eth_core); void program_risc_startup_addr(chip_id_t chip_id, const CoreCoord &core); bool test_load_write_read_risc_binary(ll_api::memory &mem, chip_id_t chip_id, const CoreCoord &core, int riscv_id); - bool test_load_write_read_trisc_binary(ll_api::memory &mem, chip_id_t chip_id, const CoreCoord &core, int triscv_id); +void write_binary_to_address(ll_api::memory &mem, chip_id_t chip_id, const CoreCoord &core, uint32_t address); // subchannel hard-coded to 0 for now CoreCoord get_core_for_dram_channel(int dram_channel_id, chip_id_t chip_id = 0); diff --git a/tt_metal/llrt/rtoptions.hpp b/tt_metal/llrt/rtoptions.hpp index 8bcfb7acadb..8ebbc3d2380 100644 --- 
a/tt_metal/llrt/rtoptions.hpp +++ b/tt_metal/llrt/rtoptions.hpp @@ -15,7 +15,7 @@ #include #include -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "tt_metal/impl/dispatch/dispatch_core_manager.hpp" #include "tt_metal/third_party/umd/device/tt_soc_descriptor.h" // For CoreType diff --git a/tt_metal/llrt/tlb_config.cpp b/tt_metal/llrt/tlb_config.cpp index 335ed83e63c..b3deadbc88b 100644 --- a/tt_metal/llrt/tlb_config.cpp +++ b/tt_metal/llrt/tlb_config.cpp @@ -7,6 +7,7 @@ #include "third_party/umd/device/blackhole/blackhole_implementation.h" #include "third_party/umd/device/grayskull/grayskull_implementation.h" #include "third_party/umd/device/wormhole/wormhole_implementation.h" +#include "tt_metal/common/assert.hpp" namespace ll_api { @@ -208,21 +209,4 @@ void configure_static_tlbs(tt::ARCH arch, chip_id_t mmio_device_id, const metal_ device_driver.setup_core_to_tlb_map([get_static_tlb_index](CoreCoord core) { return get_static_tlb_index(core); }); } -std::unordered_map get_dynamic_tlb_config(tt::ARCH arch) { - std::unordered_map dynamic_tlb_config; - switch (arch) { - case tt::ARCH::GRAYSKULL: - dynamic_tlb_config["REG_TLB"] = tt::umd::grayskull::REG_TLB; - break; - case tt::ARCH::WORMHOLE_B0: - dynamic_tlb_config["REG_TLB"] = tt::umd::wormhole::REG_TLB; - break; - case tt::ARCH::BLACKHOLE: - dynamic_tlb_config["REG_TLB"] = tt::umd::blackhole::REG_TLB; - break; - default: TT_THROW("Configuring dynamic TLBs is not supported for {}", tt::get_string(arch)); - } - return dynamic_tlb_config; -} - } // namespace ll_api diff --git a/tt_metal/llrt/tlb_config.hpp b/tt_metal/llrt/tlb_config.hpp index 099b530a04b..4dca5560b89 100644 --- a/tt_metal/llrt/tlb_config.hpp +++ b/tt_metal/llrt/tlb_config.hpp @@ -14,6 +14,4 @@ namespace ll_api { void configure_static_tlbs(tt::ARCH arch, chip_id_t mmio_device_id, const metal_SocDescriptor &sdesc, tt_device &device_driver); -std::unordered_map get_dynamic_tlb_config(tt::ARCH arch); - } // namespace ll_api diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index e45e438941e..70cf9f4774d 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -19,7 +19,7 @@ #include "tt_metal/impl/debug/sanitize_noc_host.hpp" #include "tt_metal/llrt/rtoptions.hpp" #include "tt_metal/llrt/tlb_config.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" static constexpr uint32_t HOST_MEM_CHANNELS = 4; static constexpr uint32_t HOST_MEM_CHANNELS_MASK = HOST_MEM_CHANNELS - 1; @@ -260,7 +260,6 @@ void Cluster::open_driver( if (is_tg_cluster_) { num_host_mem_ch_per_mmio_device = HOST_MEM_CHANNELS; } - std::unordered_map dynamic_tlb_config = ll_api::get_dynamic_tlb_config(this->arch_); // This will remove harvested rows from the soc descriptor const bool perform_harvesting = true; const bool clean_system_resources = true; @@ -269,7 +268,6 @@ void Cluster::open_driver( this->cluster_desc_path_, controlled_device_ids, num_host_mem_ch_per_mmio_device, - dynamic_tlb_config, skip_driver_allocs, clean_system_resources, perform_harvesting); diff --git a/tt_metal/llrt/tt_elffile.cpp b/tt_metal/llrt/tt_elffile.cpp index 70be05904fe..89c2cf2d943 100644 --- a/tt_metal/llrt/tt_elffile.cpp +++ b/tt_metal/llrt/tt_elffile.cpp @@ -8,6 +8,8 @@ #include #include "common/assert.hpp" +// C++ +#include // C #include // OS @@ -44,6 +46,19 @@ enum { #define EM_RISCV_WORMHOLE 0x5151 #define EM_RISCV_BLACKHOLE 0x6151 +// We have to translate these two instructions +static constexpr 
uint32_t insn_opc_auipc = 0x00000017; +static constexpr uint32_t insn_opc_lui = 0x00000037; +static constexpr uint32_t insn_mask_u = 0x0000007f; +static constexpr uint32_t mask_hi20 = 0x00000fff; +static constexpr unsigned mask_hi20_shift = 12; +static constexpr uint32_t mask_lo12_i = 0x000fffff; +static constexpr unsigned mask_lo12_i_shift = 20; +static constexpr uint32_t mask_lo12_s = 0x01fff07f; +static constexpr unsigned mask_lo12_s_split = 5; +static constexpr unsigned mask_lo12_s_shift_1 = 7; +static constexpr unsigned mask_lo12_s_shift_2 = 25; + using namespace ll_api; class ElfFile::Impl { @@ -63,6 +78,7 @@ class ElfFile::Impl { public: void LoadImage(); void WeakenDataSymbols(std::span<std::string_view const> strong_names); + void XIPify(); private: [[nodiscard]] auto GetHeader() const -> Elf32_Ehdr const & { return *ByteOffset<Elf32_Ehdr const>(GetContents().data()); } @@ -127,6 +143,13 @@ class ElfFile::Impl { [[nodiscard]] static T const *ByteOffset(std::byte const *base, size_t offset = 0) { return reinterpret_cast<T const *>(base + offset); } + + uint32_t Read32(Elf32_Shdr const &shdr, address_t addr) { + return *ByteOffset<uint32_t const>(GetContents(shdr).data(), addr - shdr.sh_addr); + } + void Write32(Elf32_Shdr const &shdr, address_t addr, uint32_t value) { + *ByteOffset<uint32_t>(GetContents(shdr).data(), addr - shdr.sh_addr) = value; + } }; ElfFile::~ElfFile() { @@ -175,6 +198,8 @@ void ElfFile::WriteImage(std::string const &path) { void ElfFile::WeakenDataSymbols(std::span<std::string_view const> strong) { pimpl_->WeakenDataSymbols(strong); } +void ElfFile::MakeExecuteInPlace() { pimpl_->XIPify(); } + void ElfFile::Impl::LoadImage() { auto &hdr = GetHeader(); @@ -341,3 +366,309 @@ void ElfFile::Impl::WeakenDataSymbols(std::span<std::string_view const> strong) weakener.RewriteSymbols(); } } + +void ElfFile::Impl::XIPify() { + // In general there can be several lo12 relocs for a hi20 + // reloc. This is particularly true for lui/{addi,lw,sw,etc} + // pairs -- a load and a store might share a single lui, as + // the compiler now emits those insns separately. Thus we have + // to build a work list and then process it. Furthermore, + // although auipc/lo12 pairings are clear because the lo12 + // part directly points at the auipc, that is not true of + // lui/lo12 pairings. We have to use heuristics to locate the + // matching relocs and that could get arbitrarily hard. We + // presume (a) the compiler doesn't duplicate lui insns, and + // (b) the lui precedes the lo12 in program counter + // order. Thus we look for a hi20 reloc matching the symbol at + // a lower offset than the lo12 in question. Fortunately we + // only need to do this for relocs that need translating, and + // those happen to be rare when all data-like sections are in + // the data segment (so putting .rodata in text is + // problematic). If that proves insufficient here are some + // ideas: + + // * Insert fn boundaries from symbols of FNtype -- you'll + // need to tweak the fn address to not cause collisions in + // the reloc map. this might fail with hot/cold block + // splitting. + + // * Construct the CFG by examining R_RISCV_BRANCH + // relocs. Then walk it (backwards) from each lo12 to find + // the reachable hi20. This would be able to deal with + // hot/cold splitting, if one constructed the complete + // section CFG, not as a per-fn entity. One might get away + // with not disassembling to discover ret instructions that + // terminate the CFG.
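+ // Editor's illustration (not part of this change): the shared-lui case
+ // that forces the work-list approach looks like
+ //   lui   a5, %hi(var)       # R_RISCV_HI20 against var
+ //   lw    a0, %lo(var)(a5)   # R_RISCV_LO12_I, pairs with the lui above
+ //   sw    a1, %lo(var)(a5)   # R_RISCV_LO12_S, reusing the same lui
+ // XIP translation rewrites the lui to an auipc and both lo12 users to
+ // their pcrel forms, which is what the code below implements.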
+ + struct ComposedReloc { + std::vector<Elf32_Rela *> lo_relocs; + Elf32_Rela *hi_reloc = nullptr; // the high part + + ComposedReloc(Elf32_Rela *hi) : hi_reloc(hi) {} + }; + + enum { ABS, PCREL, HWM }; + static char const *const r_names[][2] = { + {"R_RISCV_HI20", "R_RISCV_LO12"}, {"R_RISCV_PCREL_HI20", "R_RISCV_PCREL_LO12"}}; + + auto check_relaxed = [&](Elf32_Rela const &reloc) { + // If RELOC is the final reloc, this will + // be out of bounds (and probably fail), + // but we kind of want that anyway + if (ELF32_R_TYPE((&reloc)[1].r_info) != R_RISCV_RELAX) + log_debug(tt::LogLLRuntime, "{}: Relocation at {x} is not relaxed", path_, reloc.r_offset); + }; + + unsigned num_reloc_sections = 0; + for (auto const &relocHdr : GetShdrs()) { + if (relocHdr.sh_type != SHT_RELA) + continue; + + // Is this relocating a section of interest? + unsigned section_ix = relocHdr.sh_info; + auto &section = GetShdr(section_ix); + if (!(section.sh_flags & SHF_ALLOC && section.sh_type != SHT_NOBITS)) + continue; + + int segment_ix = GetSegmentIx(section); + if (segment_ix < 0) + continue; + + num_reloc_sections++; + std::map<address_t, ComposedReloc> composed[HWM]; + std::vector<Elf32_Rela *> lo[HWM]; + + auto symbols = GetSymbols(GetShdr(relocHdr.sh_link)); + auto relocs = GetRelocations(relocHdr); + bool is_from_text = !segment_ix; + + // ADD32/SUB32 pairs are used for switch tables. Make sure + // they're consistent. + Elf32_Rela const *sub_reloc = nullptr; // Active sub reloc. + for (auto ix = relocs.size(); ix--;) { + auto &reloc = relocs[ix]; + if (reloc.r_offset & 3 || reloc.r_offset - section.sh_addr >= section.sh_size) + TT_THROW( + "{}: relocation @ {x} is {} section {}", + path_, + reloc.r_offset, + reloc.r_offset & 3 ? "misaligned in" : "outside of", + GetName(section)); + + auto type = ELF32_R_TYPE(reloc.r_info); + auto sym_ix = ELF32_R_SYM(reloc.r_info); + auto const *symbol = &symbols[sym_ix]; + bool is_to_text = IsTextSymbol(*symbol); + + // Check add/sub relocs are paired and do not cross text/non-text boundary. + if (bool(sub_reloc) != (type == R_RISCV_ADD32) || (sub_reloc && sub_reloc->r_offset != reloc.r_offset)) + unpaired_sub: + TT_THROW( + "{}: unpaired {} reloc at {x}", + path_, + sub_reloc ? "sub32" : "add32", + (sub_reloc ? sub_reloc : &reloc)->r_offset); + if (type == R_RISCV_ADD32) { + auto const *sub_symbol = &symbols[ELF32_R_SYM(sub_reloc->r_info)]; + bool sub_is_to_text = IsTextSymbol(*sub_symbol); + if (is_to_text != sub_is_to_text) + TT_THROW( + "{}: mismatched add32/sub32 relocs at {x} & {x}", path_, reloc.r_offset, sub_reloc->r_offset); + } + sub_reloc = nullptr; + if (type == R_RISCV_SUB32) { + sub_reloc = &reloc; + if (!ix) + goto unpaired_sub; + } + + unsigned kind = PCREL; + switch (type) { + // Abs relocs to text will need fixing up + case R_RISCV_LO12_I: + case R_RISCV_LO12_S: + if (!is_to_text) + break; + kind = ABS; + [[fallthrough]]; + + // PCrel relocs not to text will need fixing up. At + // this point we don't know the symbol from the LO12 + // relocs, as that points at the hi20 reloc.
+ case R_RISCV_PCREL_LO12_I: + case R_RISCV_PCREL_LO12_S: lo[kind].push_back(&reloc); break; + + case R_RISCV_HI20: kind = ABS; [[fallthrough]]; + + case R_RISCV_PCREL_HI20: + if (is_to_text && !is_from_text) + TT_THROW( + "{}: segment-crossing {} relocation found at {x}", path_, r_names[kind][0], reloc.r_offset); + + if (!is_to_text && kind == ABS) + break; + composed[kind].emplace(reloc.r_offset, ComposedReloc(&reloc)); + break; + + case R_RISCV_32: { + if (!is_to_text) + break; + // Emit dynamic reloc + log_debug( + tt::LogLLRuntime, "{}: emitting dynamic R_RISCV_32 relocation at {x}", path_, reloc.r_offset); + address_t value = + (symbol->st_value + reloc.r_addend - GetSegments().front().address); + Write32(section, reloc.r_offset, value); + auto &seg = GetSegments()[segment_ix]; + seg.relocs.push_back(reloc.r_offset - seg.address); + } break; + + case R_RISCV_JAL: + if (is_from_text != is_to_text) + TT_THROW("{}: segment-crossing R_RISCV_JAL relocation found at {x}", path_, reloc.r_offset); + break; + + case R_RISCV_CALL: + case R_RISCV_CALL_PLT: + TT_THROW("{}: R_RISCV_CALL{,_PLT} relocation found at {x}", path_, reloc.r_offset); + break; + + case R_RISCV_32_PCREL: + TT_THROW("{}: R_RISCV_32_PCREL relocation found at {x}", path_, reloc.r_offset); + break; + } + } + + // Combine hi/lo relocs + + // We can't do abs ones in general with complete accuracy, + // because there could be multiple possible matching hi + // relocs. If we construct the CFG then it becomes more + // accurate, but it's always going to be somewhat + // heuristic. Let's hope CFG construction is unnecessary. A + // first step in that direction might be to insert function + // boundaries, to stop the search. + for (unsigned kind = HWM; kind--;) { + for (auto *lo_reloc : lo[kind]) { + // Find the matching hi-reloc by searching backwards. This + // presumes block reordering hasn't done something to + // break that.
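+ // (Editor's note, summarizing the lookup below: only the ABS case needs
+ // this backwards scan. For PCREL the lo12's symbol names the auipc
+ // itself, so symbols[sym_ix].st_value + r_addend gives the hi20's
+ // offset and composed[PCREL] is probed with an exact find().)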
+ unsigned sym_ix = ELF32_R_SYM(lo_reloc->r_info); + auto hi_reloc = composed[kind].begin(); + + if (kind == ABS) { + hi_reloc = composed[kind].lower_bound(lo_reloc->r_offset); + while (hi_reloc != composed[kind].begin()) { + --hi_reloc; + if (ELF32_R_SYM(hi_reloc->second.hi_reloc->r_info) == sym_ix) + goto found; + } + } else { + uint32_t hi_offset = symbols[sym_ix].st_value + lo_reloc->r_addend; + hi_reloc = composed[kind].find(hi_offset); + if (hi_reloc != composed[kind].end()) + goto found; + } + TT_THROW( + "{}: {} relocation at {x} has no matching {}", + path_, + r_names[kind][true], + lo_reloc->r_offset, + r_names[kind][false]); + found: + hi_reloc->second.lo_relocs.push_back(lo_reloc); + } + } + + // Process composed relocations + for (unsigned kind = HWM; kind--;) { + for (auto &slot : composed[kind]) { + if (slot.second.lo_relocs.empty()) + TT_THROW( + "{}: R_RISCV_{}HI20 relocation at {x} has no matching R_RISCV_{}LO12", + path_, + r_names[kind][false], + r_names[kind][true], + slot.first); + + auto hi_reloc = slot.second.hi_reloc; + unsigned sym_ix = ELF32_R_SYM(hi_reloc->r_info); + auto const &symbol = symbols[sym_ix]; + bool is_to_text = IsTextSymbol(symbol); + if (is_to_text == is_from_text) + continue; + + address_t value = symbol.st_value + hi_reloc->r_addend; + if (kind == ABS) { + value -= slot.first; + sym_ix = 0; + } + + // translate hi + check_relaxed(*hi_reloc); + uint32_t insn = Read32(section, hi_reloc->r_offset); + log_debug( + tt::LogLLRuntime, + "{}: translating {} at {x} to {}", + path_, + r_names[kind][false], + hi_reloc->r_offset, + r_names[HWM - 1 - kind][false]); + if ((insn & insn_mask_u) != (kind == ABS ? insn_opc_lui : insn_opc_auipc)) + TT_THROW( + "{}: translating instruction at {x} is not `{}'", + path_, + hi_reloc->r_offset, + kind == ABS ? "lui" : "auipc"); + insn &= mask_hi20; // Remove old immediate + insn ^= insn_opc_auipc ^ insn_opc_lui; // Convert opcode + // Insert new immediate + insn |= ((value + (1 << 11)) >> 12) << mask_hi20_shift; + Write32(section, hi_reloc->r_offset, insn); + hi_reloc->r_info ^= ELF32_R_INFO(0, R_RISCV_HI20 ^ R_RISCV_PCREL_HI20); + + // translate lo + for (auto *lo_reloc : slot.second.lo_relocs) { + unsigned type = ELF32_R_TYPE(lo_reloc->r_info); + bool is_form_i = type == (kind == PCREL ? R_RISCV_PCREL_LO12_I : R_RISCV_LO12_I); + check_relaxed(*lo_reloc); + uint32_t insn = Read32(section, lo_reloc->r_offset); + log_debug( + tt::LogLLRuntime, + "{}: translating R_RISCV{}_LO12 at {x} to R_RISCV{}_LO12", + path_, + r_names[kind][true], + lo_reloc->r_offset, + r_names[HWM - 1 - kind][true]); + if (is_form_i) { + insn &= mask_lo12_i; + insn |= (value & 0x0fff) << mask_lo12_i_shift; + } else { + // S form splits the immediate + insn &= mask_lo12_s; + insn |= (value & ((1 << mask_lo12_s_split) - 1)) << mask_lo12_s_shift_1; + insn |= ((value & 0x0fff) >> mask_lo12_s_split) << mask_lo12_s_shift_2; + } + Write32(section, lo_reloc->r_offset, insn); + + // We can't convert to PCREL with fidelity, as + // that involves adding a symbol. Instead, let's + // use a null symbol and an addend. + lo_reloc->r_info = ELF32_R_INFO( + sym_ix, + type ^ (is_form_i ? (R_RISCV_LO12_I ^ R_RISCV_PCREL_LO12_I) + : (R_RISCV_LO12_S ^ R_RISCV_PCREL_LO12_S))); + lo_reloc->r_addend = kind == PCREL ?
slot.second.hi_reloc->r_addend + : slot.second.hi_reloc->r_offset - lo_reloc->r_offset; + } + } + } + } + + if (!num_reloc_sections) + // Hm, that's suspicious + TT_THROW("{}: there are no relocation sections", path_); + + // The text segment is now XIP + GetSegments().front().address = 0; +} diff --git a/tt_metal/llrt/tt_elffile.hpp b/tt_metal/llrt/tt_elffile.hpp index b93bfd8d0f5..7c1b09ff034 100644 --- a/tt_metal/llrt/tt_elffile.hpp +++ b/tt_metal/llrt/tt_elffile.hpp @@ -25,6 +25,7 @@ class ElfFile { using word_t = std::uint32_t; // Contents struct Segment { + std::vector<offset_t> relocs; // 32-bit relocs to apply std::span<word_t const> contents; // Non-owning span address_t address = 0; // byte address or 0 for XIP offset_t bss = 0; // words of BSS @@ -75,6 +76,9 @@ class ElfFile { // globs ending in '*'. void WeakenDataSymbols(std::span<std::string_view const> strong_names); + // XIPify + void MakeExecuteInPlace(); + private: class Impl; // We can't use unique_ptr here, because the above move semantics diff --git a/tt_metal/llrt/tt_memory.cpp b/tt_metal/llrt/tt_memory.cpp index 588d80c342e..0c8a30549e0 100644 --- a/tt_metal/llrt/tt_memory.cpp +++ b/tt_metal/llrt/tt_memory.cpp @@ -24,16 +24,20 @@ memory::memory() { packed_size_ = 0; } -memory::memory(std::string const &path) : memory() { +memory::memory(std::string const &path, Relocate relo_type) : memory() { ElfFile elf; elf.ReadImage(path); + if (relo_type == Relocate::XIP) { + elf.MakeExecuteInPlace(); + } // The ELF file puts the text segment first, but memory wants // ordered spans. // FIXME: Perhaps we can relax that? uint32_t total_size = 0; auto emit_segment = [&](ElfFile::Segment const& segment) { + TT_ASSERT(segment.relocs.empty(), "Unexpected dynamic relocations"); link_spans_.emplace_back( segment.address, segment.contents.size()); data_.insert(data_.end(), segment.contents.begin(), segment.contents.end()); @@ -50,7 +54,7 @@ if (text) emit_segment(*text); - set_text_size(elf.GetSegments()[0].contents.size() * sizeof(uint32_t)); + set_text_size(elf.GetSegments()[0].contents.size() * sizeof(word_t)); set_packed_size(total_size * sizeof(uint32_t)); } @@ -145,6 +149,7 @@ void memory::pack_data_into_text(std::uint64_t text_start, std::uint64_t data_st this->link_spans_.resize(1); this->link_spans_[0] = new_span; this->data_ = new_data; + this->text_addr_ = new_span.addr; } } // namespace ll_api diff --git a/tt_metal/llrt/tt_memory.h b/tt_metal/llrt/tt_memory.h index b39e899e0a3..98eda2331c8 100644 --- a/tt_metal/llrt/tt_memory.h +++ b/tt_metal/llrt/tt_memory.h @@ -20,6 +20,8 @@ class memory { public: typedef std::uint64_t address_t; typedef std::uint32_t word_t; + enum class PackSpans { PACK, NO_PACK }; + enum class Relocate { XIP, NONE }; private: static constexpr uint32_t initial_data_space_ = 0x400; @@ -37,10 +39,11 @@ class memory { std::vector<span> link_spans_; uint32_t text_size_; uint32_t packed_size_; + uint32_t text_addr_; public: memory(); - memory(std::string const &path); + memory(std::string const &path, Relocate relo_type); public: const std::vector<uint32_t>& data() const { return this->data_; } @@ -52,6 +55,7 @@ class memory { void set_packed_size(uint32_t size) { this->packed_size_ = size; } uint32_t get_text_size() const { return this->text_size_; } uint32_t get_packed_size() const { return this->packed_size_; } + uint32_t get_text_addr() const { return this->text_addr_; } size_t size() const { return data_.size(); } diff --git a/tt_metal/llrt/wormhole/wh_hal_active_eth.cpp
b/tt_metal/llrt/wormhole/wh_hal_active_eth.cpp index dc1d16aedf7..400bfe2dc1f 100644 --- a/tt_metal/llrt/wormhole/wh_hal_active_eth.cpp +++ b/tt_metal/llrt/wormhole/wh_hal_active_eth.cpp @@ -8,12 +8,15 @@ #include "llrt/hal.hpp" #include "llrt/wormhole/wh_hal.hpp" +#include "hw/inc/wormhole/core_config.h" #include "hw/inc/wormhole/dev_mem_map.h" #include "hw/inc/wormhole/eth_l1_address_map.h" #include "hostdevcommon/common_runtime_address_map.h" #include "tt_metal/third_party/umd/device/tt_soc_descriptor.h" #include "hw/inc/dev_msgs.h" +#include + #define GET_ETH_MAILBOX_ADDRESS_HOST(x) \ ((uint64_t) & (((mailboxes_t *)eth_l1_mem::address_map::ERISC_MEM_MAILBOX_BASE)->x)) @@ -23,8 +26,6 @@ namespace tt_metal { HalCoreInfoType create_active_eth_mem_map() { - constexpr uint32_t num_proc_per_active_eth_core = 1; - std::vector<DeviceAddr> mem_map_bases; mem_map_bases.resize(utils::underlying_type<HalL1MemAddrType>(HalL1MemAddrType::COUNT)); @@ -51,7 +52,13 @@ HalCoreInfoType create_active_eth_mem_map() { mem_map_sizes[utils::underlying_type<HalL1MemAddrType>(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[utils::underlying_type<HalL1MemAddrType>(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(uint32_t); - return {HalProgrammableCoreType::ACTIVE_ETH, CoreType::ETH, num_proc_per_active_eth_core, mem_map_bases, mem_map_sizes, false}; + std::vector<std::vector<uint8_t>> processor_classes(NumEthDispatchClasses); + std::vector<uint8_t> processor_types{0}; + for (uint8_t processor_class_idx = 0; processor_class_idx < NumEthDispatchClasses; processor_class_idx++) { + processor_classes[processor_class_idx] = processor_types; + } + + return {HalProgrammableCoreType::ACTIVE_ETH, CoreType::ETH, processor_classes, mem_map_bases, mem_map_sizes, false}; } } // namespace tt_metal diff --git a/tt_metal/llrt/wormhole/wh_hal_idle_eth.cpp b/tt_metal/llrt/wormhole/wh_hal_idle_eth.cpp index 3236dcfdf87..c87a90bc46e 100644 --- a/tt_metal/llrt/wormhole/wh_hal_idle_eth.cpp +++ b/tt_metal/llrt/wormhole/wh_hal_idle_eth.cpp @@ -8,12 +8,15 @@ #include "llrt/hal.hpp" #include "llrt/wormhole/wh_hal.hpp" +#include "hw/inc/wormhole/core_config.h" #include "hw/inc/wormhole/dev_mem_map.h" #include "hw/inc/wormhole/eth_l1_address_map.h" #include "hostdevcommon/common_runtime_address_map.h" #include "tt_metal/third_party/umd/device/tt_soc_descriptor.h" #include "hw/inc/dev_msgs.h" +#include + #define GET_IERISC_MAILBOX_ADDRESS_HOST(x) ((uint64_t) & (((mailboxes_t *)MEM_IERISC_MAILBOX_BASE)->x)) namespace tt { @@ -22,7 +25,6 @@ namespace tt_metal { HalCoreInfoType create_idle_eth_mem_map() { - constexpr uint32_t num_proc_per_idle_eth_core = 1; uint32_t max_alignment = std::max(DRAM_ALIGNMENT, L1_ALIGNMENT); static_assert(MEM_IERISC_MAP_END % L1_ALIGNMENT == 0); @@ -53,7 +55,13 @@ HalCoreInfoType create_idle_eth_mem_map() { mem_map_sizes[utils::underlying_type<HalL1MemAddrType>(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[utils::underlying_type<HalL1MemAddrType>(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(uint32_t); - return {HalProgrammableCoreType::IDLE_ETH, CoreType::ETH, num_proc_per_idle_eth_core, mem_map_bases, mem_map_sizes, false}; + std::vector<std::vector<uint8_t>> processor_classes(NumEthDispatchClasses); + std::vector<uint8_t> processor_types{0}; + for (uint8_t processor_class_idx = 0; processor_class_idx < NumEthDispatchClasses; processor_class_idx++) { + processor_classes[processor_class_idx] = processor_types; + } + + return {HalProgrammableCoreType::IDLE_ETH, CoreType::ETH, processor_classes, mem_map_bases, mem_map_sizes, false}; } } // namespace tt_metal diff --git a/tt_metal/llrt/wormhole/wh_hal_tensix.cpp
b/tt_metal/llrt/wormhole/wh_hal_tensix.cpp index 2dec8efbda3..2fff4835305 100644 --- a/tt_metal/llrt/wormhole/wh_hal_tensix.cpp +++ b/tt_metal/llrt/wormhole/wh_hal_tensix.cpp @@ -6,12 +6,16 @@ #include "llrt/hal.hpp" #include "llrt/wormhole/wh_hal.hpp" +#include "hw/inc/wormhole/core_config.h" #include "hw/inc/wormhole/dev_mem_map.h" #include "hw/inc/wormhole/eth_l1_address_map.h" // XXXX FIXME #include "hostdevcommon/common_runtime_address_map.h" #include "tt_metal/third_party/umd/device/tt_soc_descriptor.h" #include "hw/inc/dev_msgs.h" +#include +#include + #define GET_MAILBOX_ADDRESS_HOST(x) ((uint64_t) & (((mailboxes_t *)MEM_MAILBOX_BASE)->x)) namespace tt { @@ -20,7 +24,6 @@ namespace tt_metal { HalCoreInfoType create_tensix_mem_map() { - constexpr uint32_t num_proc_per_tensix_core = 5; uint32_t max_alignment = std::max(DRAM_ALIGNMENT, L1_ALIGNMENT); std::vector<DeviceAddr> mem_map_bases; @@ -49,7 +52,16 @@ HalCoreInfoType create_tensix_mem_map() { mem_map_sizes[utils::underlying_type<HalL1MemAddrType>(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[utils::underlying_type<HalL1MemAddrType>(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(uint32_t); - return {HalProgrammableCoreType::TENSIX, CoreType::WORKER, num_proc_per_tensix_core, mem_map_bases, mem_map_sizes, true}; + std::vector<std::vector<uint8_t>> processor_classes(NumTensixDispatchClasses); + std::vector<uint8_t> processor_types; + for (uint8_t processor_class_idx = 0; processor_class_idx < NumTensixDispatchClasses; processor_class_idx++) { + uint32_t num_processors = processor_class_idx == (NumTensixDispatchClasses - 1) ? 3 : 1; + processor_types.resize(num_processors); + std::iota(processor_types.begin(), processor_types.end(), 0); + processor_classes[processor_class_idx] = processor_types; + } + + return {HalProgrammableCoreType::TENSIX, CoreType::WORKER, processor_classes, mem_map_bases, mem_map_sizes, true}; } } // namespace tt_metal diff --git a/tt_metal/programming_examples/contributed/vecadd/vecadd.cpp b/tt_metal/programming_examples/contributed/vecadd/vecadd.cpp index a37f49fd725..9e3fc07776a 100644 --- a/tt_metal/programming_examples/contributed/vecadd/vecadd.cpp +++ b/tt_metal/programming_examples/contributed/vecadd/vecadd.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "common/core_coord.h" +#include "common/core_coord.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/impl/device/device.hpp" #include "common/bfloat16.hpp" diff --git a/tt_metal/python_env/requirements-dev.txt b/tt_metal/python_env/requirements-dev.txt index 5c57c68b0c0..f1761202d68 100644 --- a/tt_metal/python_env/requirements-dev.txt +++ b/tt_metal/python_env/requirements-dev.txt @@ -58,3 +58,4 @@ docopt==0.6.2 tabulate==0.9.0 blobfile==2.1.1 # Required for llama3 numpy>=1.24.4,<2 +huggingface-hub==0.25.2 diff --git a/tt_metal/third_party/umd b/tt_metal/third_party/umd index f7b1ce0f6ed..6deb8d7d2c6 160000 --- a/tt_metal/third_party/umd +++ b/tt_metal/third_party/umd @@ -1 +1 @@ -Subproject commit f7b1ce0f6ed3101bdd4f1367145378a59996f07b +Subproject commit 6deb8d7d2c6513af090d91c58e3ace53b4564b4e diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index b78a6a09f2c..2a863b4e499 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -40,10 +40,10 @@ CoreRangeSet GetCoreRangeSet(const std::variant<CoreCoord, CoreRange, CoreRangeSet>; if constexpr (std::is_same_v<T, CoreCoord>) { - return CoreRangeSet({CoreRange(core_spec, core_spec)}); + return CoreRangeSet(CoreRange(core_spec, core_spec)); } else if constexpr (std::is_same_v<T, CoreRange>) { - return CoreRangeSet({core_spec}); + return CoreRangeSet(core_spec); } else if
constexpr (std::is_same_v<T, CoreRangeSet>) { return core_spec; @@ -113,11 +118,18 @@ DataMovementConfigStatus CheckDataMovementConfig(Program &program, const CoreRan } void ConfigureKernelGroup( - const Program &program, const KernelGroup *kernel_group, Device *device, const CoreCoord &logical_core) { + Program &program, + uint32_t programmable_core_type_index, + const KernelGroup *kernel_group, + Device *device, + const CoreCoord &logical_core) { + uint32_t kernel_config_base = hal.get_dev_addr(programmable_core_type_index, HalL1MemAddrType::KERNEL_CONFIG); for (auto& optional_id : kernel_group->kernel_ids) { if (optional_id) { - detail::GetKernel(program, optional_id.value())->configure(device, logical_core); + // Need the individual offsets of each bin + detail::GetKernel(program, optional_id.value())->configure(device, logical_core, + kernel_config_base, kernel_group->kernel_text_offsets); } } } @@ -756,7 +763,7 @@ bool ConfigureDeviceWithProgram(Device *device, Program &program, bool fd_bootlo KernelGroup *kernel_group = program.kernels_on_core(logical_core, index); CoreCoord physical_core = device->physical_core_from_logical_core(logical_core, core_type); - ConfigureKernelGroup(program, kernel_group, device, logical_core); + ConfigureKernelGroup(program, index, kernel_group, device, logical_core); // TODO: add support for CB for ethernet cores if (core_type == CoreType::WORKER) { // CircularBufferConfigVec -- common across all kernels, so written once to the core @@ -798,6 +805,7 @@ void WriteRuntimeArgsToDevice(Device *device, Program &program) { for (uint32_t index = 0; index < hal.get_programmable_core_type_count(); index++) { CoreType core_type = hal.get_core_type(index); + uint32_t processor_classes = hal.get_processor_classes_count(index); for (auto& kg : program.get_kernel_groups(index)) { uint32_t kernel_config_base = kg.launch_msg.kernel_config.kernel_config_base[index]; for (const CoreRange &core_range : kg.core_ranges.ranges()) { @@ -805,7 +813,7 @@ void WriteRuntimeArgsToDevice(Device *device, Program &program) { for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { CoreCoord logical_core(x, y); auto physical_core = device->physical_core_from_logical_core(logical_core, core_type); - for (int dispatch_class = 0; dispatch_class < DISPATCH_CLASS_MAX; dispatch_class++) { + for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { auto& optional_id = kg.kernel_ids[dispatch_class]; if (optional_id) { const auto &kernel = detail::GetKernel(program, optional_id.value()); @@ -1059,9 +1067,9 @@ uint32_t CreateSemaphore( return std::visit( [&](auto &&c) -> uint32_t { using T = std::decay_t<decltype(c)>; - CoreRangeSet crs({}); + CoreRangeSet crs; if constexpr (std::is_same_v<T, CoreRange>) { - crs = CoreRangeSet({c}); + crs = CoreRangeSet(c); } else { crs = c; } diff --git a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.hpp b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.hpp index 607bb80af49..29b13c0655e 100644 --- a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.hpp @@ -5,7 +5,7 @@ #pragma once #include -#include "common/core_coord.h" +#include "common/core_coord.hpp" #include "impl/buffers/buffer.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/operations/ccl/shared_with_host/hetergeneous_data_structs.hpp" diff --git a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/multi_core/all_gather_op_multi_core.cpp
b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/multi_core/all_gather_op_multi_core.cpp index 78f2d45a023..ddbca87932e 100644 --- a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/multi_core/all_gather_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/multi_core/all_gather_op_multi_core.cpp @@ -4,7 +4,7 @@ /// #include -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "eth_l1_address_map.h" #include "impl/buffers/buffer.hpp" #include "ttnn/tensor/tensor_impl.hpp" diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp index 7c8cbda4539..f0f80e7e3f2 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 /// -#include "common/core_coord.h" +#include "common/core_coord.hpp" #include "impl/buffers/buffer.hpp" #include "ttnn/operation.hpp" #include "ttnn/operations/ccl/ccl_host_types.hpp" @@ -336,8 +336,10 @@ static std::pair<CoreRangeSet, std::optional<CoreRangeSet>> select_worker_cores_ TT_ASSERT(num_edm_channels % 2 == 0, "For line topologies, we expect a multiple of 2 number of channels for the algorithm and worker kernels to work."); const std::size_t workers_per_direction = num_edm_channels / num_directions_per_line; - auto const& lower_half_of_cores = CoreRangeSet({CoreRange(CoreCoord(0, 0), CoreCoord(workers_per_direction - 1, num_links - 1))}); - auto const& upper_half_of_cores = CoreRangeSet({CoreRange(CoreCoord(workers_per_direction, 0), CoreCoord(num_edm_channels - 1, num_links - 1))}); + auto const& lower_half_of_cores = + CoreRangeSet(CoreRange(CoreCoord(0, 0), CoreCoord(workers_per_direction - 1, num_links - 1))); + auto const& upper_half_of_cores = CoreRangeSet( + CoreRange(CoreCoord(workers_per_direction, 0), CoreCoord(num_edm_channels - 1, num_links - 1))); if (topology_config.ring_index == 0) { log_trace(tt::LogOp, "Start of line, putting CCL send cores in lower half"); return {upper_half_of_cores, lower_half_of_cores}; @@ -348,7 +350,9 @@ static std::pair<CoreRangeSet, std::optional<CoreRangeSet>> select_worker_cores_ return {lower_half_of_cores, upper_half_of_cores}; } else { log_trace(tt::LogOp, "Middle of line - no CCL kernel"); - return {CoreRangeSet({CoreRange(CoreCoord(0, 0), CoreCoord(num_edm_channels - 1, num_links - 1))}), std::nullopt}; + return { + CoreRangeSet(CoreRange(CoreCoord(0, 0), CoreCoord(num_edm_channels - 1, num_links - 1))), + std::nullopt}; } } @@ -376,9 +380,11 @@ static std::pair<CoreRangeSet, std::optional<CoreRangeSet>> select_worker_cores( } case ttnn::ccl::Topology::Ring: - return {CoreRangeSet({CoreRange(CoreCoord(0, 0), CoreCoord(num_edm_channels - 1, num_links - 1))}), std::nullopt}; + return { + CoreRangeSet(CoreRange(CoreCoord(0, 0), CoreCoord(num_edm_channels - 1, num_links - 1))), + std::nullopt}; - default: TT_ASSERT(false, "Unsupported topology"); return {CoreRangeSet({}), std::nullopt}; + default: TT_ASSERT(false, "Unsupported topology"); return {CoreRangeSet(), std::nullopt}; }; } diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/host/reduce_scatter_common.hpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/host/reduce_scatter_common.hpp index f2976fef86e..867699338ee 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/host/reduce_scatter_common.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/host/reduce_scatter_common.hpp @@ -4,7 +4,7 @@
#pragma once -#include "common/core_coord.h" +#include "common/core_coord.hpp" #include #include diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp index af0fd9a46c4..b32bea449e2 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp @@ -94,7 +94,7 @@ ParallelConfig determine_parallel_config( auto grid_size = device->compute_with_storage_grid_size(); uint32_t max_num_cores = grid_size.x * grid_size.y; uint32_t num_cores_nhw = 0; - CoreRangeSet grid = {{}}; + CoreRangeSet grid; if (shard_layout == TensorMemoryLayout::HEIGHT_SHARDED) { num_cores_nhw = find_closest_largest_divisor(out_nhw_ntiles, max_num_cores); if (num_cores_nhw < grid_size.x && out_nhw_ntiles > grid_size.x) { @@ -102,7 +102,9 @@ ParallelConfig determine_parallel_config( } grid = num_cores_to_corerange_set(num_cores_nhw, grid_size, true); } else if (shard_layout == TensorMemoryLayout::BLOCK_SHARDED) { - num_cores_nhw = find_closest_largest_divisor_with_num_padding(out_nhw_ntiles, grid_size.x); + uint32_t start_divisor = + block_shard_orientation == ShardOrientation::COL_MAJOR ? grid_size.x : grid_size.y; + num_cores_nhw = find_closest_largest_divisor_with_num_padding(out_nhw_ntiles, start_divisor); uint32_t num_cores_c = find_closest_common_largest_divisor(out_c_ntiles, std::ceil((float)input_channels / effective_tile_width), block_shard_orientation == ShardOrientation::COL_MAJOR ? grid_size.y : grid_size.x); uint32_t cores_x = block_shard_orientation == ShardOrientation::COL_MAJOR ? num_cores_nhw : num_cores_c; uint32_t cores_y = block_shard_orientation == ShardOrientation::COL_MAJOR ? num_cores_c : num_cores_nhw; diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp index 692e015c546..06b493cded3 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp @@ -24,7 +24,7 @@ namespace optimized_conv_op_utils { using namespace tt; using namespace tt::tt_metal; -pair<vector<uint32_t>, vector<uint32_t>> compute_opt_conv_activation_as_mm_shape(const tt::tt_metal::LegacyShape& conv_activation_shape, ttnn::operations::sliding_window::SlidingWindowConfig sliding_window_config, uint32_t act_block_h_ntiles) { +std::pair<vector<uint32_t>, vector<uint32_t>> compute_opt_conv_activation_as_mm_shape(const tt::tt_metal::LegacyShape& conv_activation_shape, ttnn::operations::sliding_window::SlidingWindowConfig sliding_window_config, uint32_t act_block_h_ntiles) { uint32_t filter_h = (uint32_t)sliding_window_config.window_hw.first; // filter_h uint32_t filter_w = (uint32_t)sliding_window_config.window_hw.second; // filter_W @@ -71,8 +71,7 @@ Tensor optimized_conv_new(const Tensor& a, const Tensor &b, std::optional{ashape[0], ashape[1], ashape[2], tt::round_up(ashape[3], 16)}); FormatParams input_a_format_params = {.pad_shape=padded_a_shape.value, .pad_value=0.0, .target_layout=Layout::ROW_MAJOR}; @@ -98,8 +97,8 @@ Tensor optimized_conv_new(const Tensor& a, const Tensor &b, std::optional& input_tensors, const std::vector<std::optional<const Tensor>>& optional_input_tensors) const { const auto& input_tensor_a = input_tensors.at(0); const auto& input_tensor_b = input_tensors.at(1); - // TODO: ...
- TT_FATAL(!input_tensor_b.memory_config().is_sharded(), "Error"); + TT_FATAL(input_tensor_a.memory_config().is_sharded(), "Activation tensor should be sharded."); + TT_FATAL(!input_tensor_b.memory_config().is_sharded(), "Weights tensor should not be sharded."); if (this->untilize_out) { TT_FATAL((this->dtype == DataType::BFLOAT16) || (this->dtype == DataType::FLOAT32), "Error"); } @@ -190,7 +189,8 @@ std::vector<Tensor> OptimizedConvNew::create_output_tensors(const std::vectorparallelization_config.grid_size.str()); uint32_t num_cores_x = this->parallelization_config.grid_size.x; uint32_t num_cores_y = this->parallelization_config.grid_size.y; - CoreRangeSet shard_grid = CoreRangeSet({{{0, 0}, {num_cores_x - 1, num_cores_y - 1}}}); + CoreRangeSet shard_grid = + CoreRangeSet(CoreRange({0, 0}, {num_cores_x - 1, num_cores_y - 1})); log_debug(tt::LogOp, "Calculated shard_grid: {}", shard_grid.str()); std::array<uint32_t, 2> shard_shape = {this->parallelization_config.per_core_out_matrix_height_ntiles * TILE_HEIGHT, this->parallelization_config.per_core_out_matrix_width_ntiles * TILE_WIDTH}; auto shard_spec = ShardSpec{shard_grid, shard_shape, this->memory_config.shard_spec.value().orientation}; @@ -212,7 +212,6 @@ operation::ProgramWithCallbacks OptimizedConvNew::create_program(const std::vect const auto& input_tensor_b = input_tensors.at(1); const auto& input_tensor_bias = optional_input_tensors.at(0); auto& output_tensor = output_tensors.at(0); - TT_ASSERT(input_tensor_a.memory_config().is_sharded()); // TODO: move this check to validate_input_tensors return multi_core_optimized_conv_sharded_v2_new( input_tensor_a, input_tensor_b, input_tensor_bias, sliding_window_config, diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp index cf938e1da13..a22885832a8 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp @@ -172,6 +172,6 @@ using namespace tt; using namespace tt::tt_metal; -pair<vector<uint32_t>, vector<uint32_t>> compute_opt_conv_activation_as_mm_shape(const tt::tt_metal::LegacyShape& conv_activation_shape, ttnn::operations::sliding_window::SlidingWindowConfig sliding_window_config, uint32_t act_block_h_ntiles); +std::pair<vector<uint32_t>, vector<uint32_t>> compute_opt_conv_activation_as_mm_shape(const tt::tt_metal::LegacyShape& conv_activation_shape, ttnn::operations::sliding_window::SlidingWindowConfig sliding_window_config, uint32_t act_block_h_ntiles); } // optimized_conv_op_utils diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp index a8a7b9e4714..d60aaf02115 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp @@ -862,7 +862,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( auto bottom_right_core_physical = device->worker_core_from_logical_core(bottom_right_core); CoreRange mcast_sender_cores(top_left_core, top_left_core); // If single core, this kernel doesn't do mcasting - CoreRangeSet mcast_receiver_cores{{}}; + CoreRangeSet mcast_receiver_cores; uint32_t weights_mcast_sender_semaphore_id{}; uint32_t weights_mcast_receiver_semaphore_id{}; uint32_t act_mcast_sender_semaphore_id = 0; @@ -872,10 +872,10 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( // 2D mcast if (transpose_mcast) {
mcast_sender_cores = CoreRange(top_left_core, CoreCoord(0, num_cores_y - 1)); - mcast_receiver_cores = {{CoreRange(CoreCoord(1, 0), bottom_right_core)}}; + mcast_receiver_cores = CoreRange(CoreCoord(1, 0), bottom_right_core); } else { mcast_sender_cores = CoreRange(top_left_core, CoreCoord(num_cores_x - 1, 0)); - mcast_receiver_cores = {{CoreRange(CoreCoord(0, 1), bottom_right_core)}}; + mcast_receiver_cores = CoreRange(CoreCoord(0, 1), bottom_right_core); } weights_mcast_sender_semaphore_id = tt_metal::CreateSemaphore(program, all_cores, INVALID); weights_mcast_receiver_semaphore_id = tt_metal::CreateSemaphore(program, all_cores, INVALID); diff --git a/ttnn/cpp/ttnn/operations/core/work_split/work_split_tilize.hpp b/ttnn/cpp/ttnn/operations/core/work_split/work_split_tilize.hpp index ec4a3654628..851ea144a61 100644 --- a/ttnn/cpp/ttnn/operations/core/work_split/work_split_tilize.hpp +++ b/ttnn/cpp/ttnn/operations/core/work_split/work_split_tilize.hpp @@ -9,7 +9,7 @@ #pragma once #include "ttnn/tensor/types.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" namespace ttnn { diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/bcast_device_operation.cpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/bcast_device_operation.cpp index 2f890e9bb71..3c38f88ed64 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/bcast_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/bcast_device_operation.cpp @@ -124,7 +124,7 @@ std::vector<Tensor> EltwiseBinaryBroadcast::create_output_tensors(const std::vec } const auto& input_tensor = input_tensors.at(0); if (this->output_mem_config.is_sharded()) { - ShardSpec shard_spec{CoreRangeSet({}), {0, 0}}; + ShardSpec shard_spec{CoreRangeSet(), {0, 0}}; if (input_tensor.memory_config().is_sharded()) { // Derive output shard_spec based on input shard_spec = input_tensor.shard_spec().value(); diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_hw/bcast_op_multi_core_hw.cpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_hw/bcast_op_multi_core_hw.cpp index aac9e4d8e4c..7b684507703 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_hw/bcast_op_multi_core_hw.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_hw/bcast_op_multi_core_hw.cpp @@ -81,7 +81,7 @@ operation::ProgramWithCallbacks bcast_multi_core_hw(const Tensor &a, const Tenso num_tiles_per_core_group_2 = 0; all_cores = shard_spec.value().grid; core_group_1 = all_cores; - core_group_2 = CoreRangeSet({}); + core_group_2 = CoreRangeSet(); } uint32_t num_input_tiles_cb0 = src0_sharded ?
num_tiles_per_shard : num_input_tiles; @@ -270,7 +270,7 @@ operation::ProgramWithCallbacks bcast_multi_core_hw(const Tensor &a, const Tenso num_tiles_per_core_group_2 = 0; all_cores = shard_spec.value().grid; core_group_1 = all_cores; - core_group_2 = CoreRangeSet({}); + core_group_2 = CoreRangeSet(); } auto& cached_reader_args = GetRuntimeArgs(program, binary_reader_kernel_id); diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.cpp index 8d2a175fd6e..346e55d3820 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.cpp @@ -41,7 +41,7 @@ ttnn::Tensor InterleavedToShardedOperation::invoke( bool row_wise = shard_orientation == ShardOrientation::ROW_MAJOR; CoreCoord grid_size; - CoreRangeSet grid_set({}); + CoreRangeSet grid_set; std::visit( [&](const auto &grid) { using GridType = std::decay_t<decltype(grid)>; diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.hpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.hpp index e37eb7b87bb..97faf6d8fad 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.hpp @@ -5,7 +5,7 @@ #pragma once #include "ttnn/decorators.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" namespace ttnn { diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded_pybind.cpp index 1ceccb41701..86d003e1802 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded_pybind.cpp @@ -9,7 +9,7 @@ #include "ttnn/cpp/pybind11/decorators.hpp" #include "interleaved_to_sharded.hpp" #include "ttnn/types.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" namespace ttnn::operations::data_movement { diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard.hpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard.hpp index 89e7015af2e..dcc26f7e440 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard.hpp @@ -5,7 +5,7 @@ #pragma once #include "ttnn/decorators.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" namespace ttnn { diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard_pybind.cpp index 20cd064dd0a..df454ff329f 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard_pybind.cpp @@ -9,7 +9,7 @@ #include "ttnn/cpp/pybind11/decorators.hpp" #include "reshard.hpp" #include "ttnn/types.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" namespace ttnn::operations::data_movement {
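Reviewer note (editor's addition, not part of the patch): the CoreRangeSet edits throughout this PR follow one mechanical pattern -- brace-initializing an empty set becomes default construction, and a single range is passed to the constructor directly. A minimal sketch of the two patterns, assuming only the updated tt_metal constructors:

    #include "tt_metal/common/core_coord.hpp"
    void core_range_set_usage() {
        CoreRangeSet empty;                              // was: CoreRangeSet empty({});
        CoreRangeSet single(CoreRange({0, 0}, {3, 3}));  // was: CoreRangeSet({CoreRange({0, 0}, {3, 3})});
        (void)empty; (void)single;                       // silence unused-variable warnings in this sketch
    }

The same shape applies to aggregate uses such as ShardSpec{CoreRangeSet(), {0, 0}} replacing ShardSpec{CoreRangeSet({}), {0, 0}}.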
diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/device/interleaved_to_sharded_partial_op.hpp b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/device/interleaved_to_sharded_partial_op.hpp index 66f56ef8522..d9231a3c361 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/device/interleaved_to_sharded_partial_op.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/device/interleaved_to_sharded_partial_op.hpp @@ -7,7 +7,7 @@ #include "ttnn/tensor/tensor.hpp" #include "ttnn/run_operation.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "tt_metal/impl/buffers/buffer.hpp" namespace ttnn::operations::data_movement { diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial.cpp index 5ad85a81e03..297f7872d48 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial.cpp @@ -29,7 +29,7 @@ ttnn::Tensor InterleavedToShardedPartialOperation::invoke( bool row_wise = shard_orientation == ShardOrientation::ROW_MAJOR; CoreCoord grid_size; - CoreRangeSet grid_set({}); + CoreRangeSet grid_set; std::visit( [&](const auto &grid) { using GridType = std::decay_t<decltype(grid)>; diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial.hpp b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial.hpp index 2f1b4923a40..1d5f2acc967 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial.hpp @@ -5,7 +5,7 @@ #pragma once #include "ttnn/decorators.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" namespace ttnn { diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial_pybind.cpp index f0603ea7b8f..6c5150ebecf 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial_pybind.cpp @@ -9,7 +9,7 @@ #include "ttnn/cpp/pybind11/decorators.hpp" #include "interleaved_to_sharded_partial.hpp" #include "ttnn/types.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" namespace ttnn::operations::data_movement { diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp index 1570010fae0..2d9c90c2eb8 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp +++
b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp @@ -328,7 +328,7 @@ operation::ProgramWithCallbacks untilize_multi_core( uint32_t num_cores = all_cores.num_cores(); ncores = num_cores; core_range = all_cores; - core_range_cliff = CoreRangeSet({}); + core_range_cliff = CoreRangeSet(); ntiles_per_block = shard_spec.shape[1] / TILE_WIDTH; nblocks_per_core = shard_spec.shape[0] / TILE_HEIGHT; nblocks_per_core_cliff = 0; diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_device_operation.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_device_operation.cpp index 5ff657eaf8c..e42f7e72d70 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_device_operation.cpp @@ -204,7 +204,7 @@ BinaryDeviceOperation::tensor_return_value_t BinaryDeviceOperation::create_outpu auto program_factory = select_program_factory(operation_attributes, tensor_args); if (std::holds_alternative<ElementWiseMultiCore>(program_factory)) { if (operation_attributes.memory_config.is_sharded()) { - ShardSpec shard_spec{CoreRangeSet({}), {0, 0}}; + ShardSpec shard_spec{CoreRangeSet(), {0, 0}}; if (input_tensor_a.memory_config().is_sharded()) { shard_spec = input_tensor_a.shard_spec().value(); } else if (input_tensor_b.memory_config().is_sharded()) { @@ -219,7 +219,7 @@ BinaryDeviceOperation::tensor_return_value_t BinaryDeviceOperation::create_outpu } } else { if (operation_attributes.memory_config.is_sharded()) { - ShardSpec shard_spec{CoreRangeSet({}), {0, 0}}; + ShardSpec shard_spec{CoreRangeSet(), {0, 0}}; if (input_tensor_a.memory_config().is_sharded()) { // Derive output shard_spec based on input shard_spec = input_tensor_a.shard_spec().value(); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp index b94004b4dd7..a964c226bd6 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp @@ -111,7 +111,7 @@ BinaryDeviceOperation::BroadcastHeightAndWidthMultiCore::create( num_tiles_per_core_group_2 = 0; all_cores = shard_spec.value().grid; core_group_1 = all_cores; - core_group_2 = CoreRangeSet({}); + core_group_2 = CoreRangeSet(); } uint32_t num_input_tiles_cb0 = src0_sharded ?
num_tiles_per_shard : num_input_tiles; @@ -319,7 +319,7 @@ void BinaryDeviceOperation::BroadcastHeightAndWidthMultiCore::override_runtime_a num_tiles_per_core_group_2 = 0; all_cores = shard_spec.value().grid; core_group_1 = all_cores; - core_group_2 = CoreRangeSet({}); + core_group_2 = CoreRangeSet(); } auto& cached_reader_args = GetRuntimeArgs(program, binary_reader_kernel_id); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp index 40ad1a58097..eb57eb345b9 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp @@ -39,7 +39,7 @@ inline __attribute__((always_inline)) void set_eltwise_binary_runtime_args( auto src_buffer_b = b.buffer(); auto dst_buffer = output.buffer(); - CoreRangeSet all_cores({}), core_group_1({}), core_group_2({}); + CoreRangeSet all_cores, core_group_1, core_group_2; std::optional<ShardSpec> shard_spec = std::nullopt; std::optional<TensorMemoryLayout> sharded_layout = std::nullopt; @@ -83,7 +83,7 @@ inline __attribute__((always_inline)) void set_eltwise_binary_runtime_args( all_cores = shard_spec.value().grid; num_cores = all_cores.num_cores(); core_group_1 = all_cores; - core_group_2 = CoreRangeSet({}); + core_group_2 = CoreRangeSet(); num_tiles_per_core_group_1 = shard_spec.value().shape[0] * shard_spec.value().shape[1] / TILE_HW; num_tiles_per_core_group_2 = 0; block_size_per_core_group_1 = find_max_block_size(num_tiles_per_core_group_1); diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/all_gather_matmul.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/all_gather_matmul.hpp index 2e5aef2f420..d28e474e81d 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/all_gather_matmul.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/all_gather_matmul.hpp @@ -5,7 +5,7 @@ #pragma once #include "ttnn/decorators.hpp" -#include "common/core_coord.h" +#include "common/core_coord.hpp" #include "ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.hpp" #include "ttnn/cpp/ttnn/distributed/api.hpp" diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp index 29e8456ce91..586faeb2952 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "common/core_coord.h" +#include "common/core_coord.hpp" #include "ttnn/operations/ccl/all_gather/device/all_gather_op.hpp" #include "ttnn/operations/math.hpp" #include "tt_metal/host_api.hpp" diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.hpp index 6dc88b1086d..249f92ff20e 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.hpp @@ -5,7 +5,7 @@ #pragma once #include -#include "common/core_coord.h" +#include "common/core_coord.hpp" #include
"impl/buffers/buffer.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/operations/ccl/shared_with_host/hetergeneous_data_structs.hpp" diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/multi_core/all_gather_matmul_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/multi_core/all_gather_matmul_op_multi_core.cpp index b81acd79fa7..792d60a2d6e 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/multi_core/all_gather_matmul_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/multi_core/all_gather_matmul_op_multi_core.cpp @@ -4,7 +4,7 @@ /// #include -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "eth_l1_address_map.h" #include "impl/buffers/buffer.hpp" #include "ttnn/tensor/tensor_impl.hpp" diff --git a/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_fill_cache_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_fill_cache_program_factory.cpp index 7a39357cfb5..bf65a8965b3 100644 --- a/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_fill_cache_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_fill_cache_program_factory.cpp @@ -57,7 +57,7 @@ operation::ProgramWithCallbacks paged_fill_cache_multi_core(const Tensor& cache_ bool row_major; uint32_t num_cores, num_blocks_per_core_group_1, num_blocks_per_core_group_2; - CoreRangeSet all_cores({}), core_group_1({}), core_group_2({}); + CoreRangeSet all_cores, core_group_1, core_group_2; row_major = true; std::tie(num_cores, all_cores, core_group_1, core_group_2, num_blocks_per_core_group_1, num_blocks_per_core_group_2) = tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_blocks_of_work, row_major); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_program_factory.cpp index 8e63c55b980..4f83def6d87 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_program_factory.cpp @@ -45,7 +45,7 @@ operation::ProgramWithCallbacks multi_core_nlp_concat_heads(const Tensor &a, Ten // Block is a unit of work; ie. 
num of per_tensor_tiles per core uint32_t num_blocks = ashape[0] * ashape[2] / TILE_HEIGHT; uint32_t num_cores = 0, num_blocks_per_core_group_1 = 0, num_blocks_per_core_group_2 = 0; - CoreRangeSet all_cores = CoreRangeSet({}), core_group_1 = CoreRangeSet({}), core_group_2 = CoreRangeSet({}); + CoreRangeSet all_cores = CoreRangeSet(), core_group_1 = CoreRangeSet(), core_group_2 = CoreRangeSet(); bool row_major = false; if (in_sharded) { all_cores = a.shard_spec().value().grid; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_device_operation.cpp index 1fa6578efcc..c174d458fdf 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_device_operation.cpp @@ -72,7 +72,7 @@ std::vector<Tensor> RotaryEmbedding::create_output_tensors(const std::vectorcompute_output_shapes(input_tensors)[0]; if (this->output_mem_config.is_sharded()) { - ShardSpec shard_spec{CoreRangeSet({}), {0, 0}}; + ShardSpec shard_spec{CoreRangeSet(), {0, 0}}; if (input_tensor.is_sharded()) { shard_spec = input_tensor.shard_spec().value(); } else { diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp index faeb4eefe39..f9ed0492f56 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp @@ -64,7 +64,7 @@ operation::ProgramWithCallbacks rotary_embedding_multi_core( bool row_major; uint32_t num_cores, num_rows_per_core_group_1, num_rows_per_core_group_2; - CoreRangeSet all_cores({}), core_group_1({}), core_group_2({}); + CoreRangeSet all_cores, core_group_1, core_group_2; bool in_sharded = input.shard_spec().has_value(); bool out_sharded = output.shard_spec().has_value(); @@ -77,7 +77,7 @@ operation::ProgramWithCallbacks rotary_embedding_multi_core( all_cores = shard_spec.value().grid; num_cores = all_cores.num_cores(); core_group_1 = all_cores; - core_group_2 = CoreRangeSet({}); + core_group_2 = CoreRangeSet(); num_rows_per_core_group_1 = shard_spec.value().shape[0] / TILE_HEIGHT; num_rows_per_core_group_2 = 0; num_input_tiles = diff --git a/ttnn/cpp/ttnn/operations/kv_cache/device/update_cache_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/kv_cache/device/update_cache_op_multi_core.cpp index 95aaf14358a..4e7719e53ef 100644 --- a/ttnn/cpp/ttnn/operations/kv_cache/device/update_cache_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/kv_cache/device/update_cache_op_multi_core.cpp @@ -63,7 +63,7 @@ operation::ProgramWithCallbacks update_cache_multi_core(const Tensor& cache_tens bool row_major; uint32_t num_cores, num_batched_heads_per_core_group_1, num_batched_heads_per_core_group_2; - CoreRangeSet all_cores({}), core_group_1({}), core_group_2({}); + CoreRangeSet all_cores, core_group_1, core_group_2; std::optional<ShardSpec> shard_spec = input_tensor.shard_spec(); @@ -73,7 +73,7 @@ operation::ProgramWithCallbacks update_cache_multi_core(const Tensor& cache_tens all_cores = shard_spec.value().grid; num_cores = all_cores.num_cores(); core_group_1 = all_cores; -
core_group_2 = CoreRangeSet({}); + core_group_2 = CoreRangeSet(); num_batched_heads_per_core_group_1 = shard_spec.value().shape[0] / TILE_HEIGHT; num_batched_heads_per_core_group_2 = 0; num_input_tiles = shard_spec.value().shape[0] * shard_spec.value().shape[1] / TILE_HW; @@ -324,7 +324,7 @@ operation::ProgramWithCallbacks fill_cache_multi_core(const Tensor& cache_tensor bool row_major; uint32_t num_cores, num_blocks_per_core_group_1, num_blocks_per_core_group_2; - CoreRangeSet all_cores({}), core_group_1({}), core_group_2({}); + CoreRangeSet all_cores, core_group_1, core_group_2; std::optional shard_spec = input_tensor.shard_spec(); @@ -334,7 +334,7 @@ operation::ProgramWithCallbacks fill_cache_multi_core(const Tensor& cache_tensor all_cores = shard_spec.value().grid; num_cores = all_cores.num_cores(); core_group_1 = all_cores; - core_group_2 = CoreRangeSet({}); + core_group_2 = CoreRangeSet(); num_blocks_per_core_group_1 = shard_spec.value().shape[0] / TILE_HEIGHT; num_blocks_per_core_group_2 = 0; num_input_tiles = shard_spec.value().shape[0] * shard_spec.value().shape[1] / TILE_HW; diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_dram_sharded.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_dram_sharded.cpp index 0a7f90d5b9d..01ebc270bbc 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_dram_sharded.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_dram_sharded.cpp @@ -61,7 +61,7 @@ void kernel_main() { constexpr DataFormat in1_data_format = get_dataformat(cb_id_in1); uint32_t in1_base_addr = - noc_async_read_tile_dram_sharded_set_state(in1_tensor_addr, dram_bank_id, vc); + noc_async_read_tile_dram_sharded_set_state(in1_tensor_addr, in1_page_size, dram_bank_id, vc); #ifdef ARCH_GRAYSKULL for (uint32_t block = 0; block < num_blocks; ++block) { @@ -131,7 +131,7 @@ void kernel_main() { uint32_t l1_read_addr_in3 = 0; uint32_t in3_base_addr = - noc_async_read_tile_dram_sharded_set_state(in3_tensor_addr, dram_bank_id, vc); + noc_async_read_tile_dram_sharded_set_state(in3_tensor_addr, in3_page_size, dram_bank_id, vc); for (uint32_t h = 0; h < in3_num_pages; ++h) { noc_async_read_tile_dram_sharded_with_state(in3_base_addr, l1_read_addr_in3, l1_write_addr_in3); diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp index 94fced2de44..73664b05953 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp @@ -207,8 +207,8 @@ void kernel_main() { uint32_t next_bank_id_and_dram_stride_index = 0; for (uint32_t i = 0; i < num_dram_shards_to_read; ++i) { - uint32_t in1_base_addr = noc_async_read_tile_dram_sharded_set_state( - in1_tensor_addr, current_dram_bank_id[next_bank_id_and_dram_stride_index], vc); + uint32_t in1_base_addr = noc_async_read_tile_dram_sharded_set_state( + in1_tensor_addr, in1_single_tile_size_bytes, current_dram_bank_id[next_bank_id_and_dram_stride_index], vc); if (i == 0) { in1_base_addr += dram_tensor_start_offset; @@ -313,8 +313,8 @@ void kernel_main() { uint32_t next_bank_id_and_dram_stride_index = 0; for (uint32_t i = 0; i < 
num_dram_shards_to_read; ++i) { - uint32_t in3_base_addr = noc_async_read_tile_dram_sharded_set_state( - in3_tensor_addr, current_dram_bank_id[next_bank_id_and_dram_stride_index], vc); + uint32_t in3_base_addr = noc_async_read_tile_dram_sharded_set_state( + in3_tensor_addr, bias_single_tile_size_bytes, current_dram_bank_id[next_bank_id_and_dram_stride_index], vc); if (i == 0) { in3_base_addr += dram_tensor_start_offset; diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp index 8414733611b..764ab25f3be 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp @@ -1421,7 +1421,7 @@ std::vector Matmul::create_output_tensors(const std::vector& inp uint32_t num_blocks_x = (N - 1) / per_core_N + 1; uint32_t num_blocks_total = num_blocks_y * num_blocks_x; uint32_t num_cores = num_blocks_x * num_blocks_y; - CoreRangeSet all_cores({}); + CoreRangeSet all_cores; ShardOrientation shard_orientation; if (program_config.transpose_mcast) { all_cores = CoreRangeSet({CoreRange({0, 0}, {num_blocks_y - 1, num_blocks_x - 1})}); diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp index 23db6670f04..a292a31be18 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp @@ -146,10 +146,10 @@ operation::ProgramWithCallbacks create_program_mcast_in0( in0_mcast_receiver_num_cores, num_cores); // should always be number of cores in receiver grid up to number of active cores - CoreRangeSet in0_mcast_cores_with_work_and_in_receiver_grid({}); - CoreRangeSet in0_mcast_cores_without_work_and_in_receiver_grid({}); - CoreRangeSet in0_mcast_cores_without_work_and_not_in_receiver_grid({}); - CoreRangeSet in0_mcast_receivers({}); + CoreRangeSet in0_mcast_cores_with_work_and_in_receiver_grid; + CoreRangeSet in0_mcast_cores_without_work_and_in_receiver_grid; + CoreRangeSet in0_mcast_cores_without_work_and_not_in_receiver_grid; + CoreRangeSet in0_mcast_receivers; std::vector in0_mcast_noc_x; std::vector in0_mcast_noc_y; if (in0_is_sharded) { @@ -987,7 +987,7 @@ operation::ProgramWithCallbacks create_program_mcast_in1( uint32_t in1_mcast_receiver_num_cores = in1_mcast_receiver_cores_bounding_box.size(); // always mcast to full grid CoreRange in1_mcast_sender(start_core, start_core); - CoreRangeSet in1_mcast_receivers({}); + CoreRangeSet in1_mcast_receivers; if (in1_mcast_receiver_num_cores > 1) { auto receiver_start_core = start_core.x != (compute_with_storage_grid_size.x - 1) ? 
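In both matmul reader kernels above, `noc_async_read_tile_dram_sharded_set_state` now receives the page size explicitly (`in1_page_size`, `in1_single_tile_size_bytes`, `bias_single_tile_size_bytes`) rather than inferring it, which lets the in1 and in3 (bias) streams use different tile sizes. A hedged sketch of the call-site shape; the helper below is a mock, not the real dataflow API, which may be templated differently:

```cpp
#include <cstdint>

// Mock of the dataflow helper, for illustration only: the real kernel API
// latches bank/vc/page-size state and returns a base address.
static uint32_t noc_async_read_tile_dram_sharded_set_state(
    uint32_t tensor_addr, uint32_t page_size, uint32_t bank_id, uint32_t vc) {
    (void)page_size; (void)bank_id; (void)vc;  // state would be latched here
    return tensor_addr;
}

void example() {
    constexpr uint32_t in1_tensor_addr = 0x10000;  // illustrative values
    constexpr uint32_t in1_page_size = 2048;       // bytes per page; now explicit
    constexpr uint32_t dram_bank_id = 0;
    constexpr uint32_t vc = 2;
    // The updated signature threads the page size through per input stream.
    uint32_t in1_base_addr = noc_async_read_tile_dram_sharded_set_state(
        in1_tensor_addr, in1_page_size, dram_bank_id, vc);
    (void)in1_base_addr;
}
```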
CoreCoord{start_core.x + 1, start_core.y} diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp index 240f9aff9a0..96d914fbff2 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp @@ -459,7 +459,7 @@ operation::ProgramWithCallbacks create_program_dram_sharded( tt_metal::Program program{}; // get the dram readers - CoreRangeSet all_worker_cores = CoreRangeSet{{}}; + CoreRangeSet all_worker_cores; std::vector all_worker_cores_ordered; if (device->arch() == tt::ARCH::WORMHOLE_B0) { @@ -1031,7 +1031,8 @@ operation::ProgramWithCallbacks create_program_dram_sharded( for (uint32_t i = 0; i < all_cores_in_rect_grid_vec.size(); ++i) { auto core = all_cores_in_rect_grid_vec[i]; - if (all_worker_cores.ranges().find(core) == all_worker_cores.ranges().end()) { // not worker + if (std::find(all_worker_cores.ranges().begin(), all_worker_cores.ranges().end(), core) == + all_worker_cores.ranges().end()) { // not worker // in1 reader rt args bool is_worker_core = false; std::vector mm_in1_sender_writer_args; diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_optimized_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_optimized_program_factory.cpp index 09e649bb368..4a6fd50be09 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_optimized_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_optimized_program_factory.cpp @@ -123,7 +123,7 @@ operation::ProgramWithCallbacks create_program( } uint32_t num_cores = 0, num_blocks_per_core_group_1 = 0, num_blocks_per_core_group_2 = 0; - CoreRangeSet all_cores({}), core_group_1({}), core_group_2({}); + CoreRangeSet all_cores, core_group_1, core_group_2; if (shard_spec.has_value()) { all_cores = shard_spec.value().grid; diff --git a/ttnn/cpp/ttnn/operations/matmul/matmul.hpp b/ttnn/cpp/ttnn/operations/matmul/matmul.hpp index 54b120fc658..8559de2d18d 100644 --- a/ttnn/cpp/ttnn/operations/matmul/matmul.hpp +++ b/ttnn/cpp/ttnn/operations/matmul/matmul.hpp @@ -4,7 +4,7 @@ #pragma once -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "tt_metal/impl/dispatch/command_queue.hpp" #include "ttnn/operations/data_movement/bcast/bcast.hpp" #include "ttnn/operations/eltwise/unary/common/unary_op_types.hpp" diff --git a/ttnn/cpp/ttnn/operations/matmul/matmul_pybind.cpp b/ttnn/cpp/ttnn/operations/matmul/matmul_pybind.cpp index 165c2e2a4fe..9cadd7d53dd 100644 --- a/ttnn/cpp/ttnn/operations/matmul/matmul_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/matmul_pybind.cpp @@ -8,7 +8,7 @@ #include #include "pybind11/decorators.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "ttnn/cpp/pybind11/json_class.hpp" #include "ttnn/operations/matmul/matmul.hpp" #include "ttnn/types.hpp" diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/layernorm_types.hpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/layernorm_types.hpp index b512f8deba9..69ee8a9f611 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/layernorm_types.hpp +++ 
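The dram-sharded matmul hunk above also adapts the "not a worker" membership test: with `ranges()` no longer behaving like an ordered set, there is no member `find`, so the check becomes a linear `std::find` over the sequence. A self-contained sketch of the same test, with simplified types standing in for the tt_metal ones:

```cpp
#include <algorithm>
#include <vector>

struct CoreRange {
    int start_x, start_y, end_x, end_y;
    bool operator==(const CoreRange& o) const {
        return start_x == o.start_x && start_y == o.start_y &&
               end_x == o.end_x && end_y == o.end_y;
    }
};

bool is_worker(const std::vector<CoreRange>& worker_ranges, const CoreRange& core) {
    // ranges() now yields a sequence, so membership is a linear scan.
    return std::find(worker_ranges.begin(), worker_ranges.end(), core) !=
           worker_ranges.end();
}
```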
b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/layernorm_types.hpp @@ -6,7 +6,7 @@ #include -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" namespace ttnn::operations::normalization { diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp index cf5db0c6d72..392542fdaad 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp @@ -604,9 +604,9 @@ operation::ProgramWithCallbacks layernorm_multi_core_sharded( CoreCoord start_core = {0, 0}; CoreRangeSet all_cores = shard_spec.grid; CoreRange sender_cores(start_core, start_core); - CoreRangeSet all_to_all_cores({}); - CoreRangeSet all_to_all_workers_except_sender({}); - CoreRangeSet not_all_to_all_workers({}); + CoreRangeSet all_to_all_cores; + CoreRangeSet all_to_all_workers_except_sender; + CoreRangeSet not_all_to_all_workers; uint32_t num_cores_x_mcast, num_cores_y_mcast; if (mcast_1d) { sender_cores = {start_core, start_core}; @@ -707,18 +707,19 @@ operation::ProgramWithCallbacks layernorm_multi_core_sharded( sender_cores = { {(std::size_t) start_core.x, (std::size_t) start_core.y}, {(std::size_t) start_core.x, (std::size_t) start_core.y + num_cores_y - 1}}; - all_to_all_cores = CoreRangeSet({CoreRange( - {(std::size_t) start_core.x, (std::size_t) start_core.y}, - {(std::size_t) start_core.x + num_cores_all_to_all - 1, (std::size_t) start_core.y + num_cores_y - 1})}); + all_to_all_cores = CoreRangeSet(CoreRange( + {(std::size_t)start_core.x, (std::size_t)start_core.y}, + {(std::size_t)start_core.x + num_cores_all_to_all - 1, (std::size_t)start_core.y + num_cores_y - 1})); if (use_mcast && num_cores_all_to_all > 1) { - all_to_all_workers_except_sender = CoreRangeSet({CoreRange( - {(std::size_t) start_core.x + 1, (std::size_t) start_core.y}, - {(std::size_t) start_core.x + num_cores_all_to_all - 1, (std::size_t) start_core.y + num_cores_y - 1})}); + all_to_all_workers_except_sender = CoreRangeSet(CoreRange( + {(std::size_t)start_core.x + 1, (std::size_t)start_core.y}, + {(std::size_t)start_core.x + num_cores_all_to_all - 1, + (std::size_t)start_core.y + num_cores_y - 1})); } if (num_none_all_to_all_workers > 0) { - not_all_to_all_workers = CoreRangeSet({CoreRange( - {(std::size_t) start_core.x + num_cores_all_to_all, (std::size_t) start_core.y}, - {(std::size_t) start_core.x + num_cores_x - 1, (std::size_t) start_core.y + num_cores_y - 1})}); + not_all_to_all_workers = CoreRangeSet(CoreRange( + {(std::size_t)start_core.x + num_cores_all_to_all, (std::size_t)start_core.y}, + {(std::size_t)start_core.x + num_cores_x - 1, (std::size_t)start_core.y + num_cores_y - 1})); } num_cores_x_mcast = num_cores_x; num_cores_y_mcast = 1; @@ -726,18 +727,19 @@ operation::ProgramWithCallbacks layernorm_multi_core_sharded( sender_cores = { {(std::size_t) start_core.x, (std::size_t) start_core.y}, {(std::size_t) start_core.x + num_cores_x - 1, (std::size_t) start_core.y}}; - all_to_all_cores = CoreRangeSet({CoreRange( - {(std::size_t) start_core.x, (std::size_t) start_core.y}, - {(std::size_t) start_core.x + num_cores_x - 1, (std::size_t) start_core.y + num_cores_all_to_all - 1})}); + all_to_all_cores = CoreRangeSet(CoreRange( + {(std::size_t)start_core.x, (std::size_t)start_core.y}, + {(std::size_t)start_core.x + 
num_cores_x - 1, (std::size_t)start_core.y + num_cores_all_to_all - 1})); if (use_mcast && num_cores_all_to_all > 1) { - all_to_all_workers_except_sender = CoreRangeSet({CoreRange( - {(std::size_t) start_core.x, (std::size_t) start_core.y + 1}, - {(std::size_t) start_core.x + num_cores_x - 1, (std::size_t) start_core.y + num_cores_all_to_all - 1})}); + all_to_all_workers_except_sender = CoreRangeSet(CoreRange( + {(std::size_t)start_core.x, (std::size_t)start_core.y + 1}, + {(std::size_t)start_core.x + num_cores_x - 1, + (std::size_t)start_core.y + num_cores_all_to_all - 1})); } if (num_none_all_to_all_workers > 0) { - not_all_to_all_workers = CoreRangeSet({CoreRange( - {(std::size_t) start_core.x, (std::size_t) start_core.y + num_cores_all_to_all}, - {(std::size_t) start_core.x + num_cores_x - 1, (std::size_t) start_core.y + num_cores_y - 1})}); + not_all_to_all_workers = CoreRangeSet(CoreRange( + {(std::size_t)start_core.x, (std::size_t)start_core.y + num_cores_all_to_all}, + {(std::size_t)start_core.x + num_cores_x - 1, (std::size_t)start_core.y + num_cores_y - 1})); } num_cores_x_mcast = 1; num_cores_y_mcast = num_cores_y; diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/layernorm_distributed_types.hpp b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/layernorm_distributed_types.hpp index 4e645e4bde4..d7d7262bf64 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/layernorm_distributed_types.hpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/layernorm_distributed_types.hpp @@ -6,7 +6,7 @@ #include -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" namespace ttnn::operations::normalization { diff --git a/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax_sharded.cpp b/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax_sharded.cpp index 22cb52941a0..3bbee84224c 100644 --- a/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax_sharded.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax_sharded.cpp @@ -23,6 +23,7 @@ ALWI void calc_numeric_stable(uint32_t cb_in, uint32_t cb_bcast_scaler, uint32_t // calculate max val per row ACQ(); reconfig_data_format(cb_in, cb_bcast_scaler); + pack_reconfig_data_format(cb_max); cb_reserve_back(cb_max, 1); reduce_init_delta(); cb_wait_front(cb_bcast_scaler, 1); @@ -197,8 +198,8 @@ void MAIN { index_subblock_w_offset += subblock_w; } cb_pop_front(cb_in0, block_w); - reconfig_data_format(cb_exps, cb_bcast_scaler); #endif + reconfig_data_format(cb_exps, cb_bcast_scaler); #endif // FUSED_SCALE_MASK // sum(exp(x)) diff --git a/ttnn/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp index 66c06d089b0..a0b49812a56 100644 --- a/ttnn/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp @@ -107,6 +107,7 @@ operation::ProgramWithCallbacks scale_mask_softmax_multi_core( uint32_t in5_t = 1; // numeric_stable cb max uint32_t im2_t = 1; + uint32_t im4_t = tt::div_up(Wt, block_size)*block_size; // cb_exps - keeps exps in tt::CB in L1 to avoid recomputing uint32_t im0_t = block_size*tt::div_up(Wt, block_size); @@ -215,10 +216,10 @@ operation::ProgramWithCallbacks 
scale_mask_softmax_multi_core( std::optional cb_intermed4_id; if (numeric_stable) { // cb_max - auto c_intermed2_config = CircularBufferConfig(im2_t * in0_tile_size, {{tt::CB::c_intermed2, in0_cb_data_format}}).set_page_size(tt::CB::c_intermed2, in0_tile_size); + auto c_intermed2_config = CircularBufferConfig(im2_t * im_tile_size, {{tt::CB::c_intermed2, im_cb_data_format}}).set_page_size(tt::CB::c_intermed2, im_tile_size); cb_intermed2_id = CreateCircularBuffer( program, all_device_cores, c_intermed2_config ); // cb_x - auto c_x_config = CircularBufferConfig(in0_t * in0_tile_size, {{tt::CB::c_intermed4, in0_cb_data_format}}).set_page_size(tt::CB::c_intermed4, in0_tile_size); + auto c_x_config = CircularBufferConfig(im4_t * im_tile_size, {{tt::CB::c_intermed4, im_cb_data_format}}).set_page_size(tt::CB::c_intermed4, im_tile_size); cb_intermed4_id = CreateCircularBuffer( program, all_device_cores, c_x_config); } @@ -288,6 +289,7 @@ operation::ProgramWithCallbacks scale_mask_softmax_multi_core( cb_in4_id, causal_mask, numeric_stable, + fp32_dest_acc_en, cb_intermed2_id, cb_intermed4_id ] @@ -322,7 +324,7 @@ operation::ProgramWithCallbacks scale_mask_softmax_multi_core( } int32_t num_tiles = input_tensors.at(0).volume()/TILE_HW; - uint32_t block_size = find_max_divisor(Wt, 8); + uint32_t block_size = fp32_dest_acc_en ? find_max_divisor(Wt, 4) : find_max_divisor(Wt, 8); // These tile capacity counts for CBs need to match the number of tiles expected by the kernel (softmax.cpp) uint32_t in0_t = numeric_stable ? tt::div_up(Wt, block_size)*block_size : block_size*2; @@ -332,6 +334,7 @@ operation::ProgramWithCallbacks scale_mask_softmax_multi_core( uint32_t in3_t = 1; // 1/sqrt() scaler tile cb for fused scale/mask/softmax variant uint32_t in4_t = tt::div_up(Wt, block_size)*block_size; // attention mask (N,C,32,W) - Wt is reused for each Ht, NC is cycled uint32_t im2_t = 1; + uint32_t im4_t = tt::div_up(Wt, block_size)*block_size; // cb_exps - keeps exps in tt::CB in L1 to avoid recomputing uint32_t im0_t = block_size*tt::div_up(Wt, block_size); @@ -366,8 +369,8 @@ operation::ProgramWithCallbacks scale_mask_softmax_multi_core( UpdateCircularBufferTotalSize(program, cb_in4_id.value(), in4_t * mask_tile_size); } if (numeric_stable) { - UpdateCircularBufferTotalSize(program, cb_intermed2_id.value(), im2_t * in0_tile_size); - UpdateCircularBufferTotalSize(program, cb_intermed4_id.value(), in0_t * in0_tile_size); + UpdateCircularBufferTotalSize(program, cb_intermed2_id.value(), im2_t * im_tile_size); + UpdateCircularBufferTotalSize(program, cb_intermed4_id.value(), im4_t * im_tile_size); } uint32_t curr_row = 0; @@ -554,8 +557,8 @@ operation::ProgramWithCallbacks scale_mask_softmax_sharded_multi_core( // output buffer size uint32_t out_CB_size = block_wt * block_ht * out0_tile_size; // numeric_stable cb max - uint32_t max_CB_size = 1 * in0_tile_size; - uint32_t x_CB_size = block_wt * in0_tile_size; + uint32_t max_CB_size = 1 * im_tile_size; + uint32_t x_CB_size = block_wt * im_tile_size; //////////////////////////////////////////////////////////////////////////// // Application Setup @@ -697,12 +700,12 @@ operation::ProgramWithCallbacks scale_mask_softmax_sharded_multi_core( auto cb_intermed1_id = CreateCircularBuffer( program, all_device_cores, c_intermed1_config ); if (numeric_stable) { // cb_max - auto c_intermed3_config = CircularBufferConfig(max_CB_size, {{tt::CB::c_intermed3, in0_cb_data_format}}) - .set_page_size(tt::CB::c_intermed3, in0_tile_size); + auto c_intermed3_config = 
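The host-side `block_size` change above halves the search bound when `fp32_dest_acc_en` is set: fp32 accumulation roughly doubles each tile's footprint in the destination registers, so only half as many tiles fit per block. A sketch of the selection, assuming `find_max_divisor(n, cap)` returns the largest divisor of `n` no greater than `cap`; this is a hypothetical reimplementation, not the tt_metal helper:

```cpp
#include <cstdint>

// Hypothetical reimplementation for illustration: largest divisor of n <= cap.
static uint32_t find_max_divisor(uint32_t n, uint32_t cap) {
    for (uint32_t d = cap; d >= 1; --d) {
        if (n % d == 0) return d;
    }
    return 1;
}

int main() {
    uint32_t Wt = 24;                 // tiles along the row, illustrative value
    bool fp32_dest_acc_en = true;
    // fp32 dest accumulation leaves room for half as many tiles per block.
    uint32_t block_size =
        fp32_dest_acc_en ? find_max_divisor(Wt, 4) : find_max_divisor(Wt, 8);
    return block_size == 4 ? 0 : 1;   // 24 % 4 == 0, so the capped search picks 4
}
```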
CircularBufferConfig(max_CB_size, {{tt::CB::c_intermed3, im_cb_data_format}}) + .set_page_size(tt::CB::c_intermed3, im_tile_size); auto cb_intermed3_id = CreateCircularBuffer( program, all_device_cores, c_intermed3_config ); // cb_x - auto c_intermed4_config = CircularBufferConfig(x_CB_size, {{tt::CB::c_intermed4, in0_cb_data_format}}) - .set_page_size(tt::CB::c_intermed4, in0_tile_size); + auto c_intermed4_config = CircularBufferConfig(x_CB_size, {{tt::CB::c_intermed4, im_cb_data_format}}) + .set_page_size(tt::CB::c_intermed4, im_tile_size); auto cb_intermed4_id = CreateCircularBuffer( program, all_device_cores, c_intermed4_config ); } diff --git a/ttnn/cpp/ttnn/operations/normalization/softmax/device/softmax_op.hpp b/ttnn/cpp/ttnn/operations/normalization/softmax/device/softmax_op.hpp index b546ac0016b..4aacae40e76 100644 --- a/ttnn/cpp/ttnn/operations/normalization/softmax/device/softmax_op.hpp +++ b/ttnn/cpp/ttnn/operations/normalization/softmax/device/softmax_op.hpp @@ -6,7 +6,7 @@ #include #include "common/base_types.hpp" -#include "common/core_coord.h" +#include "common/core_coord.hpp" #include "ttnn/tensor/types.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/operation.hpp" diff --git a/ttnn/cpp/ttnn/operations/normalization/softmax/device/softmax_types.hpp b/ttnn/cpp/ttnn/operations/normalization/softmax/device/softmax_types.hpp index 7e1bf9f5781..1dcbdb86f0c 100644 --- a/ttnn/cpp/ttnn/operations/normalization/softmax/device/softmax_types.hpp +++ b/ttnn/cpp/ttnn/operations/normalization/softmax/device/softmax_types.hpp @@ -4,7 +4,7 @@ #pragma once -#include "common/core_coord.h" +#include "common/core_coord.hpp" namespace ttnn::operations::normalization { diff --git a/ttnn/cpp/ttnn/operations/pool/avgpool/avg_pool.cpp b/ttnn/cpp/ttnn/operations/pool/avgpool/avg_pool.cpp index 77956b3a2c5..d6b67a146ff 100644 --- a/ttnn/cpp/ttnn/operations/pool/avgpool/avg_pool.cpp +++ b/ttnn/cpp/ttnn/operations/pool/avgpool/avg_pool.cpp @@ -10,7 +10,7 @@ namespace tt_metal { template Tensor pool_2d(const Tensor& input, const MemoryConfig& memory_config, const std::optional& output_dtype) { - TT_ASSERT(input.storage_type() == StorageType::DEVICE, "Input tensor needs to be on device"); + TT_FATAL(input.storage_type() == StorageType::DEVICE, "Input tensor needs to be on device"); auto input_shape = input.get_legacy_shape(); switch (pool) { case PoolType::AVG: { @@ -18,17 +18,17 @@ Tensor pool_2d(const Tensor& input, const MemoryConfig& memory_config, const std return ttnn::sum(input, int(input_shape.rank() - 2), true, memory_config, std::nullopt, 1 / float(height_without_padding)); } default: - TT_ASSERT(false && "Undefined pool type"); + TT_THROW("Undefined pool type"); } } Tensor avg_pool2d(const Tensor& input, const MemoryConfig& memory_config, const std::optional& output_dtype) { - TT_ASSERT(input.storage_type() == StorageType::DEVICE, "Input tensor needs to be on device"); + TT_FATAL(input.storage_type() == StorageType::DEVICE, "Input tensor needs to be on device"); auto output = input; tt::tt_metal::LegacyShape in_shape = input.get_legacy_shape(); auto input_padding = in_shape.padding(); - TT_ASSERT(input_padding[1].front == 0 and input_padding[1].back == 0); + TT_FATAL(input_padding[1].front == 0 and input_padding[1].back == 0, "Padding along second dim is not supported"); auto output_padding = Padding({input_padding[0], {0, 0}, {0, input_padding[2].back * in_shape[1]}, input_padding[3]}, input_padding.pad_value()); auto output_shape = tt::tt_metal::LegacyShape({in_shape[0], 1, 
in_shape[1] * in_shape[2], in_shape[3]}, output_padding); output = output.reshape(output_shape); diff --git a/ttnn/cpp/ttnn/operations/pool/maxpool/device/max_pool2d_device_op.cpp b/ttnn/cpp/ttnn/operations/pool/maxpool/device/max_pool2d_device_op.cpp index 050dab5a122..fe49e797a50 100644 --- a/ttnn/cpp/ttnn/operations/pool/maxpool/device/max_pool2d_device_op.cpp +++ b/ttnn/cpp/ttnn/operations/pool/maxpool/device/max_pool2d_device_op.cpp @@ -26,10 +26,15 @@ void validate_maxpool(const Tensor& input, const sliding_window::SlidingWindowCo TT_FATAL(is_pow2, "Row size (nchannels * bytes = {}) should be power of 2 ({}).", in_nbytes_c, is_pow2); TT_FATAL(input.memory_config().is_sharded(), "Input needs to be sharded"); - TT_FATAL(input.memory_config().memory_layout == TensorMemoryLayout::HEIGHT_SHARDED, "Only height sharded tensors are supported."); - TT_FATAL(out_mem_config.is_sharded(), "Output memory config needs to be sharded"); - TT_FATAL(out_mem_config.memory_layout == TensorMemoryLayout::HEIGHT_SHARDED, "Only height sharded tensors are supported."); + + // check that the C dimension is a multiple of num_shards_c for all but height sharding + TensorMemoryLayout in_memory_layout = input.memory_config().memory_layout; + if (in_memory_layout != TensorMemoryLayout::HEIGHT_SHARDED) { + uint32_t num_shards_c = sliding_window_config.num_cores_c; + const tt::tt_metal::LegacyShape input_shape = input.get_legacy_shape(); + TT_FATAL(input_shape[3] % num_shards_c == 0, "For width and block sharding, input channels should be divisible by num_shards_c"); + } } void MaxPool2D::validate_on_program_cache_miss(const operation_attributes_t& op_attr, const tensor_args_t& tensors) { @@ -83,7 +88,7 @@ MaxPool2D::tensor_return_value_t MaxPool2D::create_output_tensors(const operatio Shape output_shape = compute_output_shapes(op_attr, tensors); auto mem_config = out_mem_config; if (mem_config.shard_spec.has_value()) { - mem_config.shard_spec->shape[1] = output_shape[3]; + mem_config.shard_spec->shape[1] = input.shard_spec()->shape[1]; } else { uint32_t ncores = input.shard_spec().value().num_cores(); TT_FATAL(ncores == sliding_window_config.num_cores_nhw, "Number of cores should match"); diff --git a/ttnn/cpp/ttnn/operations/pool/maxpool/device/max_pool2d_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/pool/maxpool/device/max_pool2d_multi_core_program_factory.cpp index 34c3ee69010..ca9ca1b20e1 100644 --- a/ttnn/cpp/ttnn/operations/pool/maxpool/device/max_pool2d_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/pool/maxpool/device/max_pool2d_multi_core_program_factory.cpp @@ -5,6 +5,7 @@ +#include "impl/buffers/buffer_constants.hpp" #include "max_pool2d_device_op.hpp" // #include "max_pool2d_multi_core_program_factory.hpp" #include "ttnn/operations/reduction/generic/device/reduce_op.hpp" // for reduce_op_utils @@ -33,6 +34,7 @@ MaxPool2D::MultiCore::cached_program_t max_pool_2d_multi_core_sharded_with_halo_ uint32_t pad_w, uint32_t dilation_h, uint32_t dilation_w, + uint32_t num_shards_c, const MemoryConfig& out_mem_config, uint32_t nblocks) { @@ -49,8 +51,9 @@ MaxPool2D::MultiCore::cached_program_t max_pool_2d_multi_core_sharded_with_halo_ tt::DataFormat out_df = datatype_to_dataformat_converter(output.get_dtype()); uint32_t in_nbytes = datum_size(in_df); uint32_t out_nbytes = datum_size(out_df); - uint32_t in_nbytes_c = input_shape[3] * in_nbytes; // row of input (channels) - uint32_t out_nbytes_c = output_shape[3] * out_nbytes; // row of output (channels) + + uint32_t in_nbytes_c = input_shape[3] 
/ num_shards_c * in_nbytes; // row of input (channels) + uint32_t out_nbytes_c = output_shape[3] / num_shards_c * out_nbytes; // row of output (channels) TT_ASSERT((in_nbytes_c & (in_nbytes_c - 1)) == 0, "in_nbytes_c should be power of 2"); // in_nbytes_c is power of 2 TT_ASSERT( (out_nbytes_c & (out_nbytes_c - 1)) == 0, "out_nbytes_c should be power of 2"); // out_nbytes_c is power of 2 @@ -60,9 +63,9 @@ MaxPool2D::MultiCore::cached_program_t max_pool_2d_multi_core_sharded_with_halo_ uint32_t kernel_size_hw = kernel_size_w * kernel_size_h; // number of valid rows, to read uint32_t kernel_size_hw_padded = ceil_multiple_of(kernel_size_hw, tt::constants::TILE_HEIGHT); - uint32_t in_ntiles_hw = (uint32_t)std::ceil((float)kernel_size_hw_padded / tt::constants::TILE_HEIGHT); - uint32_t in_ntiles_c = (uint32_t)std::ceil((float)input_shape[3] / tt::constants::TILE_WIDTH); - uint32_t out_ntiles_c = (uint32_t)std::ceil((float)output_shape[3] / tt::constants::TILE_WIDTH); + uint32_t in_ntiles_hw = (uint32_t) std::ceil((float) kernel_size_hw_padded / tt::constants::TILE_HEIGHT); + uint32_t in_ntiles_c = (uint32_t) std::ceil((float) input_shape[3] / num_shards_c / tt::constants::TILE_WIDTH); + uint32_t out_ntiles_c = (uint32_t) std::ceil((float) output_shape[3] / num_shards_c / tt::constants::TILE_WIDTH); // Hardware can do reduction of 8 tiles at a time. // CB sizes can be restricted to this in case input channels are more than 256 to perform reduction iteratively. @@ -71,7 +74,7 @@ MaxPool2D::MultiCore::cached_program_t max_pool_2d_multi_core_sharded_with_halo_ const bool is_large_kernel = kernel_size_hw > MAX_SMALL_KERNEL_SIZE_HW; const bool is_wide_reduction = in_ntiles_c > MAX_TILES_PER_REDUCTION; - TT_ASSERT(nblocks == 1, "Multiple blocks not yet supported"); + TT_FATAL(nblocks == 1, "Multiple blocks not yet supported"); uint32_t tile_w = tt::constants::TILE_WIDTH; if (input_shape[3] < tt::constants::TILE_WIDTH) { @@ -85,7 +88,7 @@ MaxPool2D::MultiCore::cached_program_t max_pool_2d_multi_core_sharded_with_halo_ auto all_cores = input.shard_spec().value().grid; uint32_t ncores = all_cores.num_cores(); auto core_range = all_cores; - auto core_range_cliff = CoreRangeSet({}); + auto core_range_cliff = CoreRangeSet(); uint32_t in_nhw_per_core = input.shard_spec()->shape[0]; uint32_t in_nhw_per_core_cliff = 0; uint32_t out_nhw_per_core = output.shard_spec()->shape[0]; @@ -93,7 +96,7 @@ MaxPool2D::MultiCore::cached_program_t max_pool_2d_multi_core_sharded_with_halo_ uint32_t ncores_w = grid_size.x; // TODO: support generic nblocks - TT_ASSERT( + TT_FATAL( out_nhw_per_core % nblocks == 0, "number of sticks per core ({}) should be divisible by nblocks ({})", out_nhw_per_core, @@ -144,19 +147,19 @@ MaxPool2D::MultiCore::cached_program_t max_pool_2d_multi_core_sharded_with_halo_ .set_globally_allocated_address(*reader_indices_buffer); auto in_reader_indices_cb = tt::tt_metal::CreateCircularBuffer(program, all_cores, in_reader_indices_cb_config); - uint32_t in_cb_sz; + uint32_t in_cb_sz = 0; uint32_t in_nblocks_c = 1; if (is_large_kernel) { - in_cb_sz = (input_shape[3] * kernel_size_hw_padded) > (tt::constants::TILE_HW * MAX_TILES_PER_REDUCTION) + in_cb_sz = (input_shape[3] / num_shards_c * kernel_size_hw_padded) > (tt::constants::TILE_HW * MAX_TILES_PER_REDUCTION) ? 
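Throughout this program-factory hunk, every channel-derived quantity is divided by `num_shards_c`, because under width or block sharding each core only holds `C / num_shards_c` channels. A short worked sketch of the arithmetic, with values chosen purely for illustration:

```cpp
#include <cmath>
#include <cstdint>

int main() {
    // Illustrative values: 256 channels block-sharded across 4 column shards.
    uint32_t channels = 256, num_shards_c = 4;
    uint32_t in_nbytes = 2;  // bfloat16
    constexpr uint32_t TILE_WIDTH = 32;

    // Each core sees only its slice of the channel dimension.
    uint32_t in_nbytes_c = channels / num_shards_c * in_nbytes;  // 64 ch * 2 B = 128 B per stick
    uint32_t in_ntiles_c =
        (uint32_t)std::ceil((float)channels / num_shards_c / TILE_WIDTH);  // 2 tiles

    return (in_nbytes_c == 128 && in_ntiles_c == 2) ? 0 : 1;
}
```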
(tt::constants::TILE_HW * MAX_TILES_PER_REDUCTION) - : input_shape[3] * kernel_size_hw_padded; + : input_shape[3] / num_shards_c * kernel_size_hw_padded; } else { if (is_wide_reduction) { in_cb_sz = MAX_TILES_PER_REDUCTION * tt::constants::TILE_WIDTH * kernel_size_hw_padded; TT_FATAL(in_ntiles_c % MAX_TILES_PER_REDUCTION == 0, "input channels should be multiple of {} tiles. General case TODO.", MAX_TILES_PER_REDUCTION); in_nblocks_c = in_ntiles_c / MAX_TILES_PER_REDUCTION; } else { - in_cb_sz = input_shape[3] * kernel_size_hw_padded; + in_cb_sz = input_shape[3] / num_shards_c * kernel_size_hw_padded; } } // reader output == input to tilize @@ -284,7 +287,7 @@ MaxPool2D::MultiCore::cached_program_t max_pool_2d_multi_core_sharded_with_halo_ in_nbytes_c_log2, in_w, in_cb_page_padded * in_cb_npages / tile_w, - input_shape[3], + input_shape[3] / num_shards_c, nblocks, split_reader, // enable split reader 0, // split reader id @@ -300,7 +303,7 @@ MaxPool2D::MultiCore::cached_program_t max_pool_2d_multi_core_sharded_with_halo_ in_nbytes_c_log2, in_w, in_cb_page_padded * in_cb_npages / tile_w, - input_shape[3], + input_shape[3] / num_shards_c, nblocks, split_reader, // enable split reader 1, // split reader id @@ -339,14 +342,14 @@ MaxPool2D::MultiCore::cached_program_t max_pool_2d_multi_core_sharded_with_halo_ out_h, out_w, tt::div_up(output_shape[2], tt::constants::TILE_HEIGHT), - tt::div_up(output_shape[3], tt::constants::TILE_WIDTH), + tt::div_up(output_shape[3], num_shards_c * tt::constants::TILE_WIDTH), nblocks, out_w_loop_count, 1, out_nhw_per_core, split_reader, // enable split reader out_nhw_per_core / nblocks, // loop count with blocks - input_shape[3], + input_shape[3] / num_shards_c, in_nblocks_c}; auto reduce_op = tt::tt_metal::ReduceOpMath::MAX; @@ -394,13 +397,14 @@ MaxPool2D::MultiCore::cached_program_t MaxPool2D::MultiCore::create(const operat uint32_t out_h = output_shape[1]; uint32_t out_w = output_shape[2]; + bool is_width_sharded = input.memory_config().memory_layout == TensorMemoryLayout::WIDTH_SHARDED; bool is_block_sharded = input.memory_config().memory_layout == TensorMemoryLayout::BLOCK_SHARDED; auto pad_metadata = sliding_window::generate_pad_metadata(sliding_window_config); auto op_trace_metadata = sliding_window::generate_op_trace_metadata(sliding_window_config); auto shard_boundaries = sliding_window::generate_shard_boundaries(sliding_window_config, op_trace_metadata); auto top_left_indices = - sliding_window::generate_sliding_window_op_config(op_trace_metadata, shard_boundaries, false, false); + sliding_window::generate_sliding_window_op_config(op_trace_metadata, shard_boundaries, false, is_block_sharded); // only pad the indices for block sharding auto reader_indices = sliding_window::construct_on_host_config_tensor(top_left_indices, sliding_window_config, parallel_config); log_debug(tt::LogOp, "reader_indices shape: {}", reader_indices.shape()); @@ -420,6 +424,7 @@ MaxPool2D::MultiCore::cached_program_t MaxPool2D::MultiCore::create(const operat auto pad_w = sliding_window_config.pad_hw.second; auto dilation_h = sliding_window_config.dilation_hw.first; auto dilation_w = sliding_window_config.dilation_hw.second; + auto num_shards_c = sliding_window_config.num_cores_c; return max_pool_2d_multi_core_sharded_with_halo_v2_impl_new( program, @@ -439,6 +444,7 @@ MaxPool2D::MultiCore::cached_program_t MaxPool2D::MultiCore::create(const operat pad_w, dilation_h, dilation_w, + num_shards_c, out_mem_config, 1); } diff --git 
a/ttnn/cpp/ttnn/operations/pool/maxpool/max_pool2d.cpp b/ttnn/cpp/ttnn/operations/pool/maxpool/max_pool2d.cpp index 5074f262893..3fe422e7786 100644 --- a/ttnn/cpp/ttnn/operations/pool/maxpool/max_pool2d.cpp +++ b/ttnn/cpp/ttnn/operations/pool/maxpool/max_pool2d.cpp @@ -4,6 +4,7 @@ #include "max_pool2d.hpp" +#include "impl/buffers/buffer_constants.hpp" #include "ttnn/operations/conv/conv2d/conv2d.hpp" #include "ttnn/operations/sliding_window/sliding_window.hpp" #include "tt_metal/common/math.hpp" @@ -12,8 +13,17 @@ namespace ttnn { namespace operations::pool { -Tensor MaxPool2DOp::invoke(uint8_t queue_id, const Tensor& input_tensor, uint32_t batch_size, uint32_t input_h, uint32_t input_w, uint32_t channels, std::array kernel_size, std::array stride, std::array padding, std::array dilation, const std::optional memory_config) { - +Tensor MaxPool2DOp::invoke(uint8_t queue_id, + const Tensor& input_tensor, + uint32_t batch_size, + uint32_t input_h, uint32_t input_w, + uint32_t channels, + std::array kernel_size, + std::array stride, + std::array padding, + std::array dilation, + const std::optional memory_config, + const std::optional applied_shard_scheme) { sliding_window::SlidingWindowConfig sliding_window_config{ .batch_size = batch_size, .input_hw = {input_h, input_w}, @@ -22,7 +32,7 @@ Tensor MaxPool2DOp::invoke(uint8_t queue_id, const Tensor& input_tensor, uint32_ .pad_hw = {padding.at(0), padding.at(1)}, .dilation_hw = {dilation.at(0), dilation.at(1)} }; - auto output_shape = sliding_window_config.get_output_shape(); + auto output_shape = sliding_window_config.get_output_shape(); // last dim/width is 0 auto input_tensor_sharded = input_tensor; // maxpool output is row major @@ -32,38 +42,50 @@ Tensor MaxPool2DOp::invoke(uint8_t queue_id, const Tensor& input_tensor, uint32_ sliding_window::ParallelConfig parallel_config; MemoryConfig out_memory_config = input_tensor_sharded.memory_config(); uint32_t num_cores_nhw = 0; + uint32_t num_cores_c = 0; + TensorMemoryLayout shard_layout = TensorMemoryLayout::HEIGHT_SHARDED; // default to height sharding if (!out_memory_config.shard_spec.has_value()) { // Input is not sharded. Perform sharding. + if (applied_shard_scheme.has_value()) { + TT_FATAL((applied_shard_scheme.value() == TensorMemoryLayout::HEIGHT_SHARDED) || + (applied_shard_scheme.value() == TensorMemoryLayout::WIDTH_SHARDED) || + (applied_shard_scheme.value() == TensorMemoryLayout::BLOCK_SHARDED), + "Only height, width, or block sharding strategies are supported."); + shard_layout = applied_shard_scheme.value(); + } parallel_config = conv::conv2d::determine_parallel_config( - TensorMemoryLayout::HEIGHT_SHARDED, + shard_layout, batch_size, - 0, // in_channels -- not used + channels, output_shape[1], output_shape[2], - 0, // out_channels -- not used + channels, input_tensor.device(), ShardOrientation::ROW_MAJOR, false); num_cores_nhw = conv::conv2d::get_num_cores_nhw_from_parallel_config(parallel_config); + num_cores_c = conv::conv2d::get_num_cores_channels_from_parallel_config(parallel_config); auto sharded_mem_config = conv::conv2d::create_sharded_memory_config_from_parallel_config(input_tensor_sharded.shape(), parallel_config, is_in_tiled ? 
tt::constants::TILE_HEIGHT : 1); - input_tensor_sharded = ttnn::to_memory_config(input_tensor_sharded, sharded_mem_config, std::nullopt); + input_tensor_sharded = ttnn::to_memory_config(input_tensor_sharded, sharded_mem_config, std::nullopt); // this converts interleaved to sharded out_memory_config = input_tensor_sharded.memory_config(); } else { // input is already sharded, use it as is const auto shard_grid = out_memory_config.shard_spec.value().grid; const auto shard_scheme = out_memory_config.memory_layout; const auto shard_orientation = out_memory_config.shard_spec.value().orientation; - TT_FATAL(shard_scheme == TensorMemoryLayout::HEIGHT_SHARDED, "Only height sharded tensors are supported."); + TT_FATAL(!applied_shard_scheme.has_value(), "A sharding scheme should not be specified for a sharded input tensor."); TT_FATAL(shard_orientation == ShardOrientation::ROW_MAJOR, "Only row major orientation is supported."); parallel_config.grid = shard_grid; parallel_config.shard_scheme = shard_scheme; parallel_config.shard_orientation = shard_orientation; num_cores_nhw = conv::conv2d::get_num_cores_nhw_from_parallel_config(parallel_config); + num_cores_c = conv::conv2d::get_num_cores_channels_from_parallel_config(parallel_config); } + // update the shard spec to match the output shape auto shard_spec = out_memory_config.shard_spec.value(); - uint32_t output_shard_width_padded = input_tensor.dtype() == DataType::BFLOAT8_B ? tt::round_up(output_shape[3], tt::constants::TILE_WIDTH) : tt::round_up(output_shape[3] * tt::datum_size(tt::tt_metal::datatype_to_dataformat_converter(input_tensor.dtype())), tt::constants::TILE_WIDTH); + uint32_t output_shard_width_padded = input_tensor.dtype() == DataType::BFLOAT8_B ? tt::round_up(channels / num_cores_c, tt::constants::TILE_WIDTH) : tt::round_up(channels / num_cores_c * tt::datum_size(tt::tt_metal::datatype_to_dataformat_converter(input_tensor.dtype())), tt::constants::TILE_WIDTH); uint32_t output_nhw = output_shape[0] * output_shape[1] * output_shape[2]; uint32_t output_nhw_padded = tt::round_up(output_nhw, num_cores_nhw * (is_out_tiled ? 
tt::constants::TILE_HEIGHT : 1)); uint32_t output_shard_height_padded = output_nhw_padded / num_cores_nhw; @@ -78,6 +100,7 @@ Tensor MaxPool2DOp::invoke(uint8_t queue_id, const Tensor& input_tensor, uint32_ .pad_hw = {padding.at(0), padding.at(1)}, .dilation_hw = {dilation.at(0), dilation.at(1)}, .num_cores_nhw = num_cores_nhw, + .num_cores_c = num_cores_c, .core_range_set = parallel_config.grid, .snap_to_tile = false }; diff --git a/ttnn/cpp/ttnn/operations/pool/maxpool/max_pool2d.hpp b/ttnn/cpp/ttnn/operations/pool/maxpool/max_pool2d.hpp index 4c3ffc9c553..ee04df952c5 100644 --- a/ttnn/cpp/ttnn/operations/pool/maxpool/max_pool2d.hpp +++ b/ttnn/cpp/ttnn/operations/pool/maxpool/max_pool2d.hpp @@ -16,7 +16,17 @@ namespace ttnn { namespace operations::pool { struct MaxPool2DOp { - static Tensor invoke(uint8_t queue_id, const Tensor& input_tensor, uint32_t batch_size, uint32_t input_h, uint32_t input_w, uint32_t channels, std::array kernel_size, std::array stride, std::array padding, std::array dilation, const std::optional memory_config = std::nullopt); + static Tensor invoke(uint8_t queue_id, + const Tensor& input_tensor, + uint32_t batch_size, + uint32_t input_h, uint32_t input_w, + uint32_t channels, + std::array kernel_size, + std::array stride, + std::array padding, + std::array dilation, + const std::optional memory_config, + const std::optional applied_shard_scheme); }; diff --git a/ttnn/cpp/ttnn/operations/pool/maxpool/max_pool2d_pybind.cpp b/ttnn/cpp/ttnn/operations/pool/maxpool/max_pool2d_pybind.cpp index 4e9782e48c1..6a7905ebfea 100644 --- a/ttnn/cpp/ttnn/operations/pool/maxpool/max_pool2d_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/pool/maxpool/max_pool2d_pybind.cpp @@ -19,24 +19,59 @@ void bind_max_pool2d_operation(py::module& module) { module, ttnn::max_pool2d, R"doc( - Max Pool 2D - +-------------------+-------------------------------+---------------+-------------+----------+ - | Argument | Description | Data type | Valid range | Required | - +===================+===============================+===============+=============+==========+ - | input | Input activations tensor | Tensor | | Yes | - | in_n | Input nbatch | Tensor | | Yes | - | in_h | Input height | Tensor | | Yes | - | in_w | Input width | Tensor | | Yes | - | kernel_h | kernel window height | uint32_t | | Yes | - | kernel_w | kernel window width | uint32_t | | Yes | - | stride_h | stride in height dim | uint32_t | | No | - | stride_w | stride in width dim | uint32_t | | No | - | pad_h | padding in height dim | uint32_t | | No | - | pad_w | padding in width dim | uint32_t | | No | - | dilation_h | kernel dilation in height dim | uint32_t | | No | - | dilation_w | kernel dilation in width dim | uint32_t | | No | - | memory_config | Output memory config | MemoryConfig | | No | - +-------------------+-------------------------------+---------------+-------------+----------+ + Applies a 2D max pooling operation to the input tensor. The resulting output Tensor will contain the maximum + value for each channel within a kernel window. The input tensor is expected to be in [NHW, C] format and + should be on the device. Height, width and block sharding schemes are supported. + + Args: + input_tensor (ttnn.Tensor): the tensor to be pooled. + batch_size (int): the number of batches (N in a [N, C, H, W] shaped tensor). + input_h (int): the height of the input tensor (H in a [N, C, H, W] shaped tensor). + input_w (int): the width of the input tensor (W in a [N, C, H, W] shaped tensor). 
+ channels (int): the number of channels (C in a [N, C, H, W] shaped tensor). + kernel_size (List of [int]): the (h, w) size of the kernel window. + stride (List of [int]): the (h, w) stride of the kernel window. + padding (List of [int]): the (h, w) padding of the input tensor. + dilation (List of [int]): the (h, w) dilation of the kernel window. + + Keyword Args: + memory_config (ttnn.MemoryConfig, optional): the memory configuration for the output tensor. Defaults to `None`. + applied_shard_scheme (ttnn.TensorMemoryLayout, optional): the sharding scheme to apply to a non-pre-sharded input tensor. Defaults to `None`, which should be used with pre-sharded input tensors. + queue_id (int, optional): the queue id to use for the operation. Defaults to `0`. + + Returns: + ttnn.Tensor: the max pooled output tensor. + + Example: + >>> import ttnn + >>> import torch + >>> device = ttnn.CreateDevice(0, l1_small_size=8192) + >>> kernel_h, kernel_w = 2, 2 + >>> stride_h, stride_w = 1, 1 + >>> pad_h, pad_w = 0, 0 + >>> dilation_h, dilation_w = 1, 1 + >>> nchw_shape = (4, 256, 40, 40) + >>> in_N, in_C, in_H, in_W = nchw_shape + >>> input_shape = (1, 1, in_N * in_H * in_W, in_C) + >>> input = torch.randn(nchw_shape, dtype=torch.bfloat16) + >>> input_perm = torch.permute(input, (0, 2, 3, 1)) # this op expects a [N, H, W, C] format + >>> input_reshape = input_perm.reshape(input_shape) + >>> tt_input = ttnn.from_torch(input_reshape, ttnn.bfloat16) + >>> tt_input_dev = ttnn.to_device(tt_input, device) + >>> tt_output = ttnn.max_pool2d( input_tensor=tt_input_dev, batch_size=in_N, input_h=in_H, input_w=in_W, channels=in_C, kernel_size=[kernel_h, kernel_w], stride=[stride_h, stride_w], padding=[pad_h, pad_w], dilation=[dilation_h, dilation_w], memory_config=None, applied_shard_scheme=ttnn.TensorMemoryLayout.BLOCK_SHARDED, ) + )doc", ttnn::pybind_overload_t{ [](const decltype(ttnn::max_pool2d)& self, const ttnn::Tensor& input_tensor, @@ -48,19 +83,21 @@ void bind_max_pool2d_operation(py::module& module) { std::array stride, std::array padding, std::array dilation, - const std::optional& memory_config, + const std::optional memory_config, + const std::optional applied_shard_scheme, const uint8_t& queue_id) -> ttnn::Tensor { return self(queue_id, - input_tensor, - batch_size, - input_h, - input_w, - channels, - kernel_size, - stride, - padding, - dilation, - memory_config); }, + input_tensor, + batch_size, + input_h, + input_w, + channels, + kernel_size, + stride, + padding, + dilation, + memory_config, + applied_shard_scheme); }, py::arg("input_tensor"), py::arg("batch_size"), py::arg("input_h"), @@ -72,6 +109,7 @@ void bind_max_pool2d_operation(py::module& module) { py::arg("dilation"), py::kw_only(), py::arg("memory_config") = std::nullopt, + py::arg("applied_shard_scheme") = std::nullopt, py::arg("queue_id") = 0}); } diff --git a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp index 4864d6a34f1..c29bb743c73 100644 --- a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp +++ b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp @@ -33,6 +33,7 @@ Tensor HaloTensorCreation(const Tensor &input){ int input_height = input.get_legacy_shape()[1]; int input_width = input.get_legacy_shape()[2]; int num_cores_nhw = input.shard_spec().value().num_cores(); + int 
num_cores_c = 1; ttnn::Tensor input_tensor = input; // tensor to return SlidingWindowConfig sliding_window_config = SlidingWindowConfig( @@ -43,6 +44,7 @@ Tensor HaloTensorCreation(const Tensor &input){ {1, 0}, //padding {1, 1}, //dilation num_cores_nhw, + num_cores_c, input_tensor.memory_config().shard_spec.value().grid, false, true); diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_h/reduce_op_multi_core_h.cpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_h/reduce_op_multi_core_h.cpp index 3ad22e3eef1..1a5fd46bce4 100644 --- a/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_h/reduce_op_multi_core_h.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_h/reduce_op_multi_core_h.cpp @@ -57,7 +57,7 @@ operation::ProgramWithCallbacks reduce_multi_core_h( all_cores = a.shard_spec().value().grid; num_cores = all_cores.num_cores(); core_group_1 = all_cores; - core_group_2 = CoreRangeSet({}); + core_group_2 = CoreRangeSet(); num_cols_per_core_group_1 = NC * (a.shard_spec().value().shape[1] / TILE_WIDTH); num_cols_per_core_group_2 = 0; } diff --git a/ttnn/cpp/ttnn/operations/sharding_utilities.hpp b/ttnn/cpp/ttnn/operations/sharding_utilities.hpp index 6b58fc9cbdb..1b66f00cb09 100644 --- a/ttnn/cpp/ttnn/operations/sharding_utilities.hpp +++ b/ttnn/cpp/ttnn/operations/sharding_utilities.hpp @@ -9,7 +9,7 @@ #pragma once #include "tt_metal/common/math.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "tt_metal/host_api.hpp" diff --git a/ttnn/cpp/ttnn/operations/sliding_window/halo/device/halo_device_operation.cpp b/ttnn/cpp/ttnn/operations/sliding_window/halo/device/halo_device_operation.cpp index f16b0976dcc..e1b38036ca1 100644 --- a/ttnn/cpp/ttnn/operations/sliding_window/halo/device/halo_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/sliding_window/halo/device/halo_device_operation.cpp @@ -124,8 +124,8 @@ Tensor halo_op(const Tensor& input_tensor, uint32_t reshard_num_cores_nhw, MemoryConfig output_memory_config, bool is_out_tiled) { - TT_ASSERT(input_tensor.memory_config().is_sharded()); - TT_ASSERT(input_tensor.memory_config().memory_layout == TensorMemoryLayout::HEIGHT_SHARDED || input_tensor.memory_config().memory_layout == TensorMemoryLayout::BLOCK_SHARDED || input_tensor.memory_config().memory_layout == TensorMemoryLayout::WIDTH_SHARDED); + TT_FATAL(input_tensor.memory_config().is_sharded(), "Halo expects sharded input tensor"); + TT_FATAL(input_tensor.memory_config().memory_layout == TensorMemoryLayout::HEIGHT_SHARDED || input_tensor.memory_config().memory_layout == TensorMemoryLayout::BLOCK_SHARDED || input_tensor.memory_config().memory_layout == TensorMemoryLayout::WIDTH_SHARDED, "Only height, width or block sharded tensors are supported."); // NOTE: for HEIGHT_SHARDED, ncores_nhw == ncores // for BLOCK_SHARDED, ncores_nhw is just the ncores along height dim (last tensor dim is split along width) bool is_block_sharded = input_tensor.memory_config().memory_layout == TensorMemoryLayout::BLOCK_SHARDED; diff --git a/ttnn/cpp/ttnn/operations/sliding_window/sliding_window.cpp b/ttnn/cpp/ttnn/operations/sliding_window/sliding_window.cpp index 127aeadf718..8286a278ddf 100644 --- a/ttnn/cpp/ttnn/operations/sliding_window/sliding_window.cpp +++ b/ttnn/cpp/ttnn/operations/sliding_window/sliding_window.cpp @@ -18,7 +18,7 @@ Shape SlidingWindowConfig::get_input_shape() const { } bool SlidingWindowConfig::has_parallel_config() const { - return num_cores_nhw > 0 && 
!core_range_set.ranges().empty(); + return num_cores_nhw > 0 && num_cores_c > 0 && !core_range_set.ranges().empty(); } /** * Calculate the window op output shape, excludes the channel dimension since this config is independent of the depth. @@ -371,18 +371,18 @@ std::tuple>, std::vector flattened_remote_config); } -std::vector> generate_sliding_window_op_config(const std::vector& op_trace_metadata, const std::vector>& shard_boundaries, bool pad_tile, bool pad_last_core) { +std::vector> generate_sliding_window_op_config(const std::vector& op_trace_metadata, const std::vector>& shard_boundaries, bool pad_tile, bool pad_cores) { std::vector> sharded_input_top_left_indices; for(const auto& item : shard_boundaries) { const auto& [output_shard_start, output_shard_end] = item.first; const auto& [input_shard_start, input_shard_end] = item.second; + std::vector local_top_left_indices; // sanity check if (output_shard_start >= op_trace_metadata.size()) { // this core has no output continue; } TT_ASSERT(input_shard_start == op_trace_metadata[output_shard_start]); - std::vector local_top_left_indices; for(size_t i = output_shard_start; i < output_shard_end + 1; i++) { local_top_left_indices.push_back(op_trace_metadata[i] - op_trace_metadata[output_shard_start]); } @@ -398,16 +398,20 @@ std::vector> generate_sliding_window_op_config(const std:: } } } - if (pad_last_core) { - // Pad indices for last core if not equal to other cores + if (pad_cores) { uint32_t indices_length_per_core = sharded_input_top_left_indices[0].size(); - uint32_t indices_length_last_core = sharded_input_top_left_indices.back().size(); - TT_ASSERT(indices_length_last_core <= indices_length_per_core, "indices length for last core {} larger than indices length per core {}", indices_length_last_core, indices_length_per_core); - if (indices_length_per_core - indices_length_last_core > 0) { - std::vector extend_v(indices_length_per_core - indices_length_last_core, 0); - sharded_input_top_left_indices.back().insert(sharded_input_top_left_indices.back().end(), extend_v.begin(), extend_v.end()); + for (uint32_t core_idx = 0; core_idx < shard_boundaries.size(); core_idx++) { + // Pad indices for this core if not equal to other cores + if (sharded_input_top_left_indices.size() == core_idx) { + sharded_input_top_left_indices.push_back(std::vector()); + } + TT_FATAL(core_idx < sharded_input_top_left_indices.size(), "Invalid core_idx {} for sharded_input_top_left_indices", core_idx); + uint32_t indices_length_this_core = sharded_input_top_left_indices[core_idx].size(); + if (indices_length_per_core - indices_length_this_core > 0) { + std::vector extend_v(indices_length_per_core - indices_length_this_core, 0); + sharded_input_top_left_indices[core_idx].insert(sharded_input_top_left_indices[core_idx].end(), extend_v.begin(), extend_v.end()); + } } - } return sharded_input_top_left_indices; } @@ -488,7 +492,7 @@ std::string SlidingWindowConfig::to_string() const { + "_" + std::to_string(std::get<0>(stride_hw)) + "_" + std::to_string(std::get<1>(stride_hw)) + "_" + std::to_string(std::get<0>(pad_hw)) + "_" + std::to_string(std::get<1>(pad_hw)) + "_" + std::to_string(std::get<0>(dilation_hw)) + "_" + std::to_string(std::get<1>(dilation_hw)) - + "_" + std::to_string(num_cores_nhw) + "_" + core_range_set.str(); + + "_" + std::to_string(num_cores_nhw) + "_" + std::to_string(num_cores_c) + "_" + core_range_set.str(); } } // namespace ttnn::operations::sliding_window @@ -518,7 +522,7 @@ auto fmt::formatter::format(co } auto fmt::formatter::format(const 
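The `pad_last_core` to `pad_cores` generalization above pads every core's index list, appending missing cores as empty lists first, so that all cores end up reading an equal number of indices, rather than only topping up the final core. A standalone sketch that mirrors the padding step from the hunk:

```cpp
#include <cstdint>
#include <vector>

void pad_core_indices(std::vector<std::vector<uint16_t>>& per_core_indices,
                      size_t num_cores) {
    if (per_core_indices.empty()) return;
    size_t target_len = per_core_indices[0].size();  // first core sets the length
    for (size_t core_idx = 0; core_idx < num_cores; ++core_idx) {
        // Cores beyond the populated range get an empty list first ...
        if (per_core_indices.size() == core_idx) {
            per_core_indices.emplace_back();
        }
        // ... then every core is zero-padded up to the first core's length.
        auto& v = per_core_indices[core_idx];
        if (v.size() < target_len) {
            v.insert(v.end(), target_len - v.size(), 0);
        }
    }
}
```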
ttnn::operations::sliding_window::SlidingWindowConfig& t, format_context& ctx) const -> format_context::iterator { - std::string str = fmt::format("SlidingWindowConfig(batch_size={}, input_hw=({},{}), window_hw=({},{}), stride_hw=({},{}), pad_hw=({},{}), dilation_hw=({},{}), num_cores_nhw={}, core_range_set_={})", + std::string str = fmt::format("SlidingWindowConfig(batch_size={}, input_hw=({},{}), window_hw=({},{}), stride_hw=({},{}), pad_hw=({},{}), dilation_hw=({},{}), num_cores_nhw={}, num_cores_c={}, core_range_set_={})", t.batch_size, t.input_hw.first, t.input_hw.second, @@ -531,6 +535,7 @@ auto fmt::formatter::form t.dilation_hw.first, t.dilation_hw.second, t.num_cores_nhw, + t.num_cores_c, t.core_range_set.str()); return fmt::format_to(ctx.out(), "{}", str); } diff --git a/ttnn/cpp/ttnn/operations/sliding_window/sliding_window.hpp b/ttnn/cpp/ttnn/operations/sliding_window/sliding_window.hpp index 1a22e2cae66..5a55ebbd0c5 100644 --- a/ttnn/cpp/ttnn/operations/sliding_window/sliding_window.hpp +++ b/ttnn/cpp/ttnn/operations/sliding_window/sliding_window.hpp @@ -13,7 +13,7 @@ namespace ttnn::operations::sliding_window { struct ParallelConfig { - CoreRangeSet grid = {{}}; + CoreRangeSet grid = {}; TensorMemoryLayout shard_scheme; ShardOrientation shard_orientation; @@ -45,7 +45,8 @@ struct SlidingWindowConfig { // parallel configuration uint32_t num_cores_nhw = 1; // num cores along collapsed height nhw - CoreRangeSet core_range_set = std::set{CoreRange({0, 0}, {0, 0})}; // active cores + uint32_t num_cores_c = 1; // num cores along width c + CoreRangeSet core_range_set = CoreRangeSet(CoreRange({0, 0}, {0, 0})); // active cores bool snap_to_tile = false; bool is_bilinear = false; @@ -80,7 +81,7 @@ std::vector> generate_shard_boundaries(c std::vector> generate_tensor_metadata(const std::vector& pad_metadata, const SlidingWindowConfig& config, uint32_t reshard_num_cores_nhw = 0, bool is_in_tiled = true); uint32_t generate_max_out_nsticks_per_core(const std::vector>& shard_boundaries); std::tuple>, std::vector>, std::vector>> generate_halo_kernel_config_tensors(const std::vector>& tensor_metadata, const std::vector>& shard_boundaries, bool is_block_sharded, bool transpose_mcast, bool remote_read, Device* device); -std::vector> generate_sliding_window_op_config(const std::vector& op_trace_metadata, const std::vector>& shard_boundaries, bool pad_tile = false, bool pad_last_core = false); +std::vector> generate_sliding_window_op_config(const std::vector& op_trace_metadata, const std::vector>& shard_boundaries, bool pad_tile = false, bool pad_cores = false); std::vector flatten(const std::vector>& input); Tensor construct_on_host_config_tensor(const std::vector>& config, const SlidingWindowConfig& sw_config, const ParallelConfig& p_config); Tensor move_config_tensor_to_device(const Tensor& config_tensor, const ParallelConfig& p_config, bool is_block_sharded, Device* device); diff --git a/ttnn/cpp/ttnn/operations/sliding_window/utils.hpp b/ttnn/cpp/ttnn/operations/sliding_window/utils.hpp index 38c740242c6..9a683f3e0d0 100644 --- a/ttnn/cpp/ttnn/operations/sliding_window/utils.hpp +++ b/ttnn/cpp/ttnn/operations/sliding_window/utils.hpp @@ -13,7 +13,7 @@ namespace tt::tt_metal { namespace utils { inline void init_neighbor_core_xy_mapping(CoreCoord grid_size, std::map& left_neighbor_core, std::map& right_neighbor_core, bool is_twod = false) { - TT_ASSERT((grid_size.x == 12 && grid_size.y == 9) || (grid_size.x == 8 && grid_size.y == 8) || (grid_size.x == 8 && grid_size.y == 7)); + 
TT_FATAL((grid_size.x == 12 && grid_size.y == 9) || (grid_size.x == 8 && grid_size.y == 8) || (grid_size.x == 8 && grid_size.y == 7)); if (is_twod) { // 2d decomposition case (block sharded) // left-right neighbors are calculated along the x dim diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_config.hpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_config.hpp index 6df094a9c98..5269bba72e7 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa_config.hpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa_config.hpp @@ -5,7 +5,7 @@ #pragma once #include -#include "common/core_coord.h" +#include "common/core_coord.hpp" namespace ttnn::operations::transformer { diff --git a/ttnn/cpp/ttnn/tensor/types.hpp b/ttnn/cpp/ttnn/tensor/types.hpp index 7c62798c2dd..77b01638c11 100644 --- a/ttnn/cpp/ttnn/tensor/types.hpp +++ b/ttnn/cpp/ttnn/tensor/types.hpp @@ -12,7 +12,7 @@ #include #include "common/bfloat16.hpp" -#include "tt_metal/common/core_coord.h" +#include "tt_metal/common/core_coord.hpp" #include "tt_metal/impl/buffers/buffer.hpp" #include "tt_metal/impl/device/device.hpp" #include "tt_metal/tt_stl/concepts.hpp" diff --git a/ttnn/ttnn/distributed/distributed.py b/ttnn/ttnn/distributed/distributed.py index fa4abe3c86d..9d7f519daec 100644 --- a/ttnn/ttnn/distributed/distributed.py +++ b/ttnn/ttnn/distributed/distributed.py @@ -344,7 +344,7 @@ def map(self, tensor: "torch.Tensor") -> List["torch.Tensor"]: if len(tensor_shards) != rows * cols: raise ValueError( - "ShardTensor2dMesh: Sharding failed. Number of shards should match the product of the mesh dimensions." + f"ShardTensor2dMesh: Sharding failed. Number of shards should match the product of the mesh dimensions. Got {len(tensor_shards)} shards but expected {rows * cols} ({rows} rows * {cols} cols)." ) return tensor_shards
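A recurring theme in this diff is promoting `TT_ASSERT` to `TT_THROW`/`TT_FATAL` on user-reachable validation paths. My understanding, worth verifying against tt_metal's assert headers, is that `TT_ASSERT` is compiled out of release builds while `TT_FATAL` always evaluates its condition and raises with a message. A simplified sketch of that distinction, using stand-in macros rather than the real tt_metal definitions:

```cpp
#include <stdexcept>
#include <string>

// Simplified stand-ins, NOT the tt_metal macros (those live in tt_metal/common/assert.hpp).
#ifdef DEBUG
#define MY_ASSERT(cond) \
    do { if (!(cond)) throw std::runtime_error("assert failed: " #cond); } while (0)
#else
#define MY_ASSERT(cond) ((void)0)  // debug-only: vanishes in release builds
#endif

#define MY_FATAL(cond, msg) \
    do { if (!(cond)) throw std::runtime_error(std::string("fatal: ") + (msg)); } while (0)

void validate(bool input_is_sharded) {
    // Release builds would silently skip MY_ASSERT, so user-facing input
    // validation is promoted to the always-on, message-carrying form.
    MY_FATAL(input_is_sharded, "Halo expects sharded input tensor");
}
```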