From 3b565c8eea7ece0656bb55469576c3ed07c26d36 Mon Sep 17 00:00:00 2001
From: Nathan Sidwell <nsidwell@tenstorrent.com>
Date: Fri, 25 Oct 2024 09:12:04 -0400
Subject: [PATCH 01/30] #12979: Remove unused data_noinit section (#14242)

---
 tt_metal/hw/toolchain/erisc-b0-app-sections.ld | 5 -----
 tt_metal/hw/toolchain/erisc-b0-kernel.ld       | 5 -----
 2 files changed, 10 deletions(-)

diff --git a/tt_metal/hw/toolchain/erisc-b0-app-sections.ld b/tt_metal/hw/toolchain/erisc-b0-app-sections.ld
index 7e393bf869b..3ec62a2055a 100644
--- a/tt_metal/hw/toolchain/erisc-b0-app-sections.ld
+++ b/tt_metal/hw/toolchain/erisc-b0-app-sections.ld
@@ -89,11 +89,6 @@ SECTIONS
    . += 4;
   } > REGION_LDM
 
-  data_noinit (NOLOAD):
-  {
-    *(data_noinit)
-  } > REGION_APP_DATA
-
   l1_memory :
   {
     *(l1_memory)
diff --git a/tt_metal/hw/toolchain/erisc-b0-kernel.ld b/tt_metal/hw/toolchain/erisc-b0-kernel.ld
index bb2183b3e97..feefb637320 100644
--- a/tt_metal/hw/toolchain/erisc-b0-kernel.ld
+++ b/tt_metal/hw/toolchain/erisc-b0-kernel.ld
@@ -110,11 +110,6 @@ SECTIONS
    . += 4;
   } > REGION_LDM
 
-  data_noinit (NOLOAD):
-  {
-    *(data_noinit)
-  } > REGION_APP_KERNEL_DATA
-
   l1_memory :
   {
     *(l1_memory)

From 6624bd34aaf7f07c0ee070a72d9331c4f14fe4be Mon Sep 17 00:00:00 2001
From: Andrija Malbasa <amalbasa@tenstorrent.com>
Date: Fri, 25 Oct 2024 15:29:07 +0200
Subject: [PATCH 02/30] Add embedding sweep (#14278)

* #11512: Add embedding sweep

* #11512: Add embedding to ttnn-run-sweeps.yaml

* #11512: Remove device_mesh_fixture from embedding sweep
---
 .github/workflows/ttnn-run-sweeps.yaml        |   1 +
 .../sweeps/embedding/embedding.py             | 109 ++++++++++++++++++
 2 files changed, 110 insertions(+)
 create mode 100644 tests/sweep_framework/sweeps/embedding/embedding.py

diff --git a/.github/workflows/ttnn-run-sweeps.yaml b/.github/workflows/ttnn-run-sweeps.yaml
index 40371321e11..0c965eec5cf 100644
--- a/.github/workflows/ttnn-run-sweeps.yaml
+++ b/.github/workflows/ttnn-run-sweeps.yaml
@@ -228,6 +228,7 @@ on:
           - eltwise.ternary.where.where_pytorch2
           - reduction.topk.topk
           - reduction.argmax.argmax
+          - embedding.embedding
           - matmul.full.matmul_default_block_sharded
           - matmul.full.matmul_default_height_sharded
           - matmul.full.matmul_default_interleaved
diff --git a/tests/sweep_framework/sweeps/embedding/embedding.py b/tests/sweep_framework/sweeps/embedding/embedding.py
new file mode 100644
index 00000000000..739f74e854a
--- /dev/null
+++ b/tests/sweep_framework/sweeps/embedding/embedding.py
@@ -0,0 +1,109 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+    "nightly": {
+        "embedding_args": gen_shapes([1, 32, 32, 128], [4, 2080, 4128, 550], [1, 32, 32, 32], 32),
+        "input_dtype": [ttnn.uint32],
+        "weight_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+        "output_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+        "input_layout": [ttnn.ROW_MAJOR_LAYOUT, ttnn.TILE_LAYOUT],
+        "weight_layout": [ttnn.ROW_MAJOR_LAYOUT, ttnn.TILE_LAYOUT],
+        "input_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "weight_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+    },
+}
+
+
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+    if test_vector["input_layout"] == ttnn.TILE_LAYOUT:
+        return True, "Input must be in row major layout"
+    if test_vector["weight_layout"] == ttnn.TILE_LAYOUT:
+        return True, "Weights must in row major layout"
+    if test_vector["output_dtype"] == ttnn.bfloat8_b:
+        return True, "bloat8_b is not supported for output tensor"
+    if test_vector["weight_layout"] == ttnn.ROW_MAJOR_LAYOUT and test_vector["weight_dtype"] == ttnn.bfloat8_b:
+        return True, "bfloat8_b is only supported on tiled layout"
+    return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+    embedding_args,
+    input_dtype,
+    weight_dtype,
+    output_dtype,
+    input_layout,
+    weight_layout,
+    input_memory_config,
+    weight_memory_config,
+    output_memory_config,
+    *,
+    device,
+) -> list:
+    data_seed = random.randint(0, 20000000)
+    torch.manual_seed(data_seed)
+
+    batch_size, seq_length, embeddings_dim, num_embeddings = embedding_args
+
+    input_shape = (batch_size, seq_length)
+    weight_shape = (num_embeddings, embeddings_dim)
+
+    torch_input_tensor = torch_random(input_shape, 0, num_embeddings, torch.int64)
+    torch_weight_tensor = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), weight_dtype
+    )(weight_shape)
+
+    golden_function = ttnn.get_golden_function(ttnn.embedding)
+    torch_output_tensor = golden_function(torch_input_tensor, torch_weight_tensor).squeeze(dim=0)
+    # torch_output_tensor = torch.nn.functional.embedding(torch_input_tensor, torch_weight_tensor)
+
+    input_tensor = ttnn.from_torch(
+        torch_input_tensor,
+        dtype=input_dtype,
+        layout=input_layout,
+        device=device,
+        memory_config=input_memory_config,
+    )
+    weight_tensor = ttnn.from_torch(
+        torch_weight_tensor,
+        dtype=weight_dtype,
+        layout=weight_layout,
+        device=device,
+        memory_config=weight_memory_config,
+    )
+
+    start_time = start_measuring_time()
+    output_tensor = ttnn.embedding(input_tensor, weight_tensor, dtype=output_dtype, memory_config=output_memory_config)
+    e2e_perf = stop_measuring_time(start_time)
+
+    output_tensor = ttnn.to_torch(output_tensor).squeeze()
+
+    return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]

From 2c84ace80a82302607e809ebce5c9514d5edc22a Mon Sep 17 00:00:00 2001
From: Andrija Malbasa <amalbasa@tenstorrent.com>
Date: Fri, 25 Oct 2024 15:43:11 +0200
Subject: [PATCH 03/30] Add nonzero sweep (#14279)

* #11512: Add nonzero sweep and add it to ttnn-run-sweeps.yaml

* #11512: Add licence header to nonzero sweep

* #11512: Update gen_with_zeroes function
---
 .github/workflows/ttnn-run-sweeps.yaml        |  1 +
 tests/sweep_framework/sweep_utils/utils.py    |  2 +
 .../sweeps/data_movement/nonzero/nonzero.py   | 97 +++++++++++++++++++
 3 files changed, 100 insertions(+)
 create mode 100644 tests/sweep_framework/sweeps/data_movement/nonzero/nonzero.py

diff --git a/.github/workflows/ttnn-run-sweeps.yaml b/.github/workflows/ttnn-run-sweeps.yaml
index 0c965eec5cf..8905b948f2a 100644
--- a/.github/workflows/ttnn-run-sweeps.yaml
+++ b/.github/workflows/ttnn-run-sweeps.yaml
@@ -258,6 +258,7 @@ on:
           - data_movement.index_select.index_select_pytorch2
           - data_movement.split.split_with_sizes_pytorch2
           - data_movement.repeat.repeat
+          - data_movement.nonzero.nonzero
           - conv2d.full.conv2d_misc
           - conv2d.full.conv2d_sharding
           - conv2d.full.conv2d_sliding_window
diff --git a/tests/sweep_framework/sweep_utils/utils.py b/tests/sweep_framework/sweep_utils/utils.py
index 89ff4c15aed..9f574d47f88 100644
--- a/tests/sweep_framework/sweep_utils/utils.py
+++ b/tests/sweep_framework/sweep_utils/utils.py
@@ -129,6 +129,8 @@ def gen_rand_bitwise_left_shift(size, shift_bits=None, low=-2147483647, high=214
 
 def gen_with_zeroes(size, probabilityzeroes=0.5, low=-100, high=100, dtype=torch.bfloat16):
     element_count = 1
+    if probabilityzeroes == "random":
+        probabilityzeroes = random.uniform(0.0, 0.9)
     for i in size:
         element_count = element_count * i
     raw = torch.zeros(element_count).to(dtype)
diff --git a/tests/sweep_framework/sweeps/data_movement/nonzero/nonzero.py b/tests/sweep_framework/sweeps/data_movement/nonzero/nonzero.py
new file mode 100644
index 00000000000..ba72a97d98a
--- /dev/null
+++ b/tests/sweep_framework/sweeps/data_movement/nonzero/nonzero.py
@@ -0,0 +1,97 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm, gen_with_zeroes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly" and "xfail") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+    "nightly": {
+        "input_shape": gen_shapes([1, 1, 1, 1], [1, 1, 1, 256], [1, 1, 1, 1], 16),
+        "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+        "input_layout": [ttnn.ROW_MAJOR_LAYOUT, ttnn.TILE_LAYOUT],
+        "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG],
+    },
+    "xfail": {
+        "input_shape": gen_shapes([1, 1, 1, 1], [1, 1, 1, 256], [1, 1, 1, 1], 16),
+        "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+        "input_layout": [ttnn.ROW_MAJOR_LAYOUT, ttnn.TILE_LAYOUT],
+        "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+    },
+}
+
+
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+    if test_vector["input_layout"] == ttnn.TILE_LAYOUT:
+        return True, "Input tensor must be in row major layout"
+    if test_vector["input_layout"] == ttnn.ROW_MAJOR_LAYOUT and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+        return True, "bfloat8_b is only supported on tiled layout"
+    return False, None
+
+
+def run(
+    input_shape,
+    input_a_dtype,
+    input_layout,
+    input_a_memory_config,
+    output_memory_config,
+    *,
+    device,
+) -> list:
+    data_seed = random.randint(0, 20000000)
+    torch.manual_seed(data_seed)
+
+    if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+        input_shape = sanitize_shape_rm(input_shape)
+
+    torch_input_tensor_a = gen_func_with_cast_tt(
+        partial(gen_with_zeroes, probabilityzeroes="random", low=-100, high=100, dtype=torch.float32), input_a_dtype
+    )(input_shape)
+
+    torch_output_tensor = torch.nonzero(torch_input_tensor_a, as_tuple=False)
+    torch_num_nonzero = torch_output_tensor.shape[0]
+    torch_output_tensor = torch_output_tensor[:, 3].reshape(-1, 1)
+
+    input_tensor_a = ttnn.from_torch(
+        torch_input_tensor_a,
+        dtype=input_a_dtype,
+        layout=input_layout,
+        device=device,
+        memory_config=input_a_memory_config,
+    )
+
+    start_time = start_measuring_time()
+    output_indices, output_tensor = ttnn.nonzero(input_tensor_a, memory_config=output_memory_config)
+    e2e_perf = stop_measuring_time(start_time)
+
+    num_nonzero = ttnn.to_torch(output_indices)[0, 0, 0, 0].item()
+    output_tensor = ttnn.to_torch(output_tensor)[0, 0, 0, :num_nonzero].reshape(-1, 1)
+
+    if num_nonzero != torch_num_nonzero:
+        return [
+            (False, f"Expected num of non-zero: {torch_num_nonzero}, actual num of non_zero: {num_nonzero}"),
+            e2e_perf,
+        ]
+
+    return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]

From d6aeb5979dfdc5b59f154a18cdfcaee83bbc28ce Mon Sep 17 00:00:00 2001
From: Pavle Josipovic <pjosipovic@tenstorrent.com>
Date: Fri, 25 Oct 2024 13:51:58 +0000
Subject: [PATCH 04/30] #13541: Fix regression from #01940051

The previous change for conv2d auto sharding didn't properly adjust the
Python binding for get_conv_padded_input_shape_and_mem_config in the
version of the function templated on MeshDevice.
The version of get_conv_padded_input_shape_and_mem_config templated on Device was properly adjusted; with this change the two bindings are the same.
---
 .../operations/conv/conv2d/conv2d_pybind.cpp   | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp
index 0163c3d43a0..e081d6bf44d 100644
--- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp
+++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp
@@ -159,14 +159,7 @@ void py_bind_conv2d(py::module& module) {
            uint32_t height,
            uint32_t width,
            uint32_t in_channels,
-           uint32_t out_channels,
-           std::array<uint32_t, 2> kernel_size,
-           std::array<uint32_t, 2> stride,
-           std::array<uint32_t, 2> padding,
-           std::array<uint32_t, 2> dilation,
-           uint32_t weights_width,
-           uint32_t input_width,
-           uint32_t groups) -> std::tuple<ttnn::Shape, ttnn::MemoryConfig, bool, bool> {
+           uint32_t out_channels) -> std::tuple<ttnn::Shape, ttnn::MemoryConfig, bool, bool> {
             return ttnn::operations::conv::conv2d::get_conv_padded_input_shape_and_mem_config<MeshDevice>(
                 device,
                 input_tensor,
@@ -185,14 +178,7 @@ void py_bind_conv2d(py::module& module) {
         py::arg("height"),
         py::arg("width"),
         py::arg("in_channels"),
-        py::arg("out_channels"),
-        py::arg("kernel_size"),
-        py::arg("stride"),
-        py::arg("padding"),
-        py::arg("dilation"),
-        py::arg("weights_width"),
-        py::arg("input_width"),
-        py::arg("groups"));
+        py::arg("out_channels"));
 
     module.def(
         "convert_conv_weight_tensor_to_tiled_layout",

From 189afd39cb7334e1fe7d01f7725e5126b854e396 Mon Sep 17 00:00:00 2001
From: Nenad Petrovic
 <109360062+npetrovic-tenstorrent@users.noreply.github.com>
Date: Fri, 25 Oct 2024 16:51:34 +0200
Subject: [PATCH 05/30] New backward binary ops sweeps (#14211)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* #11512: Add rsub_bw sweep

* #11512: Add squared difference bw sweep

---------

Co-authored-by: Nenad <npetrovic@tenstorrent.com>
---
 .github/workflows/ttnn-run-sweeps.yaml        |   2 +
 .../binary_backward/rsub_bw/rsub_bw.py        | 134 +++++++++++++++++
 .../squared_difference_bw.py                  | 136 ++++++++++++++++++
 3 files changed, 272 insertions(+)
 create mode 100644 tests/sweep_framework/sweeps/eltwise/binary_backward/rsub_bw/rsub_bw.py
 create mode 100644 tests/sweep_framework/sweeps/eltwise/binary_backward/squared_difference_bw/squared_difference_bw.py

diff --git a/.github/workflows/ttnn-run-sweeps.yaml b/.github/workflows/ttnn-run-sweeps.yaml
index 8905b948f2a..e8b28d15de1 100644
--- a/.github/workflows/ttnn-run-sweeps.yaml
+++ b/.github/workflows/ttnn-run-sweeps.yaml
@@ -211,6 +211,8 @@ on:
           - eltwise.binary_backward.subalpha_bw.subalpha_bw
           - eltwise.binary_backward.xlogy_bw.xlogy_bw
           - eltwise.binary_backward.hypot_bw.hypot_bw
+          - eltwise.binary_backward.rsub_bw.rsub_bw
+          - eltwise.binary_backward.squared_difference_bw.squared_difference_bw
           - eltwise.composite.binary.addalpha.addalpha
           - eltwise.composite.binary.subalpha.subalpha
           - eltwise.composite.binary.minimum.minimum
diff --git a/tests/sweep_framework/sweeps/eltwise/binary_backward/rsub_bw/rsub_bw.py b/tests/sweep_framework/sweeps/eltwise/binary_backward/rsub_bw/rsub_bw.py
new file mode 100644
index 00000000000..c16b5fe8722
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/binary_backward/rsub_bw/rsub_bw.py
@@ -0,0 +1,134 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+    "nightly": {
+        "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 16)
+        + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 16)
+        + gen_shapes([1, 1], [256, 256], [1, 1], 16),
+        "grad_dtype": [ttnn.bfloat16],
+        "input_a_dtype": [ttnn.bfloat16],
+        "input_b_dtype": [ttnn.bfloat16],
+        "grad_layout": [ttnn.TILE_LAYOUT],
+        "input_a_layout": [ttnn.TILE_LAYOUT],
+        "input_b_layout": [ttnn.TILE_LAYOUT],
+        "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "input_b_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+    },
+}
+
+
+def str_to_float(x):
+    try:
+        return float(x)
+    except:
+        return 0.0
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+    input_shape,
+    grad_dtype,
+    input_a_dtype,
+    input_b_dtype,
+    grad_layout,
+    input_a_layout,
+    input_b_layout,
+    grad_memory_config,
+    input_a_memory_config,
+    input_b_memory_config,
+    output_memory_config,
+    *,
+    device,
+) -> list:
+    data_seed = random.randint(0, 20000000)
+    torch.manual_seed(data_seed)
+
+    torch_grad_tensor = gen_func_with_cast_tt(partial(torch_random, low=-10, high=10, dtype=torch.float32), grad_dtype)(
+        input_shape
+    )
+
+    torch_input_tensor_a = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+    )(input_shape)
+    torch_input_tensor_a.requires_grad = True
+    torch_input_tensor_a.retain_grad()
+
+    torch_input_tensor_b = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), input_b_dtype
+    )(input_shape)
+    torch_input_tensor_b.requires_grad = True
+    torch_input_tensor_b.retain_grad()
+
+    golden_function = ttnn.get_golden_function(ttnn.rsub_bw)
+    torch_output_tensor = golden_function(torch_grad_tensor, torch_input_tensor_a, torch_input_tensor_b)
+
+    grad_tensor = ttnn.from_torch(
+        torch_grad_tensor,
+        dtype=grad_dtype,
+        layout=grad_layout,
+        device=device,
+        memory_config=grad_memory_config,
+    )
+
+    input_tensor_a = ttnn.from_torch(
+        torch_input_tensor_a.detach().clone(),
+        dtype=input_a_dtype,
+        layout=input_a_layout,
+        device=device,
+        memory_config=input_a_memory_config,
+    )
+
+    input_tensor_b = ttnn.from_torch(
+        torch_input_tensor_b.detach().clone(),
+        dtype=input_b_dtype,
+        layout=input_b_layout,
+        device=device,
+        memory_config=input_b_memory_config,
+    )
+
+    start_time = start_measuring_time()
+    output_tensor = ttnn.rsub_bw(grad_tensor, input_tensor_a, input_tensor_b, memory_config=output_memory_config)
+
+    for i in range(len(output_tensor)):
+        output_tensor[i] = ttnn.to_torch(output_tensor[i])
+    e2e_perf = stop_measuring_time(start_time)
+
+    pcc = [True, 1.0]
+
+    for i in range(len(output_tensor)):
+        pcc_tmp = check_with_pcc(torch_output_tensor[i], output_tensor[i], 0.99)
+        pcc[0] = pcc[0] and pcc_tmp[0]
+        pcc[1] = min(pcc[1], str_to_float(pcc_tmp[1]))
+
+    pcc[1] = str(pcc[1])
+    # print(f"pcc {pcc}")
+    return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/binary_backward/squared_difference_bw/squared_difference_bw.py b/tests/sweep_framework/sweeps/eltwise/binary_backward/squared_difference_bw/squared_difference_bw.py
new file mode 100644
index 00000000000..61bc0e090ba
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/binary_backward/squared_difference_bw/squared_difference_bw.py
@@ -0,0 +1,136 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+    "nightly": {
+        "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 16)
+        + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 16)
+        + gen_shapes([1, 1], [256, 256], [1, 1], 16),
+        "grad_dtype": [ttnn.bfloat16],
+        "input_a_dtype": [ttnn.bfloat16],
+        "input_b_dtype": [ttnn.bfloat16],
+        "grad_layout": [ttnn.TILE_LAYOUT],
+        "input_a_layout": [ttnn.TILE_LAYOUT],
+        "input_b_layout": [ttnn.TILE_LAYOUT],
+        "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "input_b_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+    },
+}
+
+
+def str_to_float(x):
+    try:
+        return float(x)
+    except:
+        return 0.0
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+    input_shape,
+    grad_dtype,
+    input_a_dtype,
+    input_b_dtype,
+    grad_layout,
+    input_a_layout,
+    input_b_layout,
+    grad_memory_config,
+    input_a_memory_config,
+    input_b_memory_config,
+    output_memory_config,
+    *,
+    device,
+) -> list:
+    data_seed = random.randint(0, 20000000)
+    torch.manual_seed(data_seed)
+
+    torch_grad_tensor = gen_func_with_cast_tt(partial(torch_random, low=-10, high=10, dtype=torch.float32), grad_dtype)(
+        input_shape
+    )
+
+    torch_input_tensor_a = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+    )(input_shape)
+    torch_input_tensor_a.requires_grad = True
+    torch_input_tensor_a.retain_grad()
+
+    torch_input_tensor_b = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), input_b_dtype
+    )(input_shape)
+    torch_input_tensor_b.requires_grad = True
+    torch_input_tensor_b.retain_grad()
+
+    golden_function = ttnn.get_golden_function(ttnn.squared_difference_bw)
+    torch_output_tensor = golden_function(torch_grad_tensor, torch_input_tensor_a, torch_input_tensor_b)
+
+    grad_tensor = ttnn.from_torch(
+        torch_grad_tensor,
+        dtype=grad_dtype,
+        layout=grad_layout,
+        device=device,
+        memory_config=grad_memory_config,
+    )
+
+    input_tensor_a = ttnn.from_torch(
+        torch_input_tensor_a.detach().clone(),
+        dtype=input_a_dtype,
+        layout=input_a_layout,
+        device=device,
+        memory_config=input_a_memory_config,
+    )
+
+    input_tensor_b = ttnn.from_torch(
+        torch_input_tensor_b.detach().clone(),
+        dtype=input_b_dtype,
+        layout=input_b_layout,
+        device=device,
+        memory_config=input_b_memory_config,
+    )
+
+    start_time = start_measuring_time()
+    output_tensor = ttnn.squared_difference_bw(
+        grad_tensor, input_tensor_a, input_tensor_b, memory_config=output_memory_config
+    )
+
+    for i in range(len(output_tensor)):
+        output_tensor[i] = ttnn.to_torch(output_tensor[i])
+    e2e_perf = stop_measuring_time(start_time)
+
+    pcc = [True, 1.0]
+
+    for i in range(len(output_tensor)):
+        pcc_tmp = check_with_pcc(torch_output_tensor[i], output_tensor[i], 0.99)
+        pcc[0] = pcc[0] and pcc_tmp[0]
+        pcc[1] = min(pcc[1], str_to_float(pcc_tmp[1]))
+
+    pcc[1] = str(pcc[1])
+    # print(f"pcc {pcc}")
+    return [pcc, e2e_perf]

From e7f5c97b7190581cd5e6335fb733739fa754b33f Mon Sep 17 00:00:00 2001
From: Uma Devi Selvaraj <umadevi@multicorewareinc.com>
Date: Fri, 25 Oct 2024 21:07:11 +0530
Subject: [PATCH 06/30] Second dimension bcast support for binop (#13673)

* #13646: Add second dimension support in binary op

* #13646: Add unit test for channel bcast using repeat

* #13646: Add assert statement

* #13646: Fix CI failure issue
---
 .../pytests/tt_dnn/test_broadcast.py          |  6 +--
 .../operations/eltwise/test_mul_bcast.py      | 42 +++++++++++++++++++
 .../ttnn/operations/eltwise/binary/binary.cpp |  9 +++-
 3 files changed, 52 insertions(+), 5 deletions(-)
 create mode 100644 tests/ttnn/unit_tests/operations/eltwise/test_mul_bcast.py

diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_broadcast.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_broadcast.py
index 117b1383491..296734f0f4e 100644
--- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_broadcast.py
+++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_broadcast.py
@@ -27,7 +27,7 @@
 @pytest.mark.parametrize("dtype", (ttnn.bfloat16, ttnn.bfloat8_b))
 def test_run_bcast_h_test(input_shapes, bcast_op_type, dtype, device, function_level_defaults):
     datagen_func = [
-        generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.float32)
+        generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.bfloat16)
     ] * 2
     comparison_func = partial(comparison_funcs.comp_pcc)
     run_single_pytorch_test(
@@ -60,7 +60,7 @@ def test_run_bcast_h_test(input_shapes, bcast_op_type, dtype, device, function_l
 @pytest.mark.parametrize("dtype", (ttnn.bfloat16, ttnn.bfloat8_b))
 def test_run_bcast_w_test(input_shapes, bcast_op_type, dtype, device, function_level_defaults):
     datagen_func = [
-        generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.float32)
+        generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.bfloat16)
     ] * 2
     comparison_func = partial(comparison_funcs.comp_pcc)
     run_single_pytorch_test(
@@ -93,7 +93,7 @@ def test_run_bcast_w_test(input_shapes, bcast_op_type, dtype, device, function_l
 @pytest.mark.parametrize("dtype", (ttnn.bfloat16, ttnn.bfloat8_b))
 def test_run_bcast_hw_test(input_shapes, bcast_op_type, dtype, device, function_level_defaults):
     datagen_func = [
-        generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.float32)
+        generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.bfloat16)
     ] * 2
     comparison_func = partial(comparison_funcs.comp_pcc)
     run_single_pytorch_test(
diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_mul_bcast.py b/tests/ttnn/unit_tests/operations/eltwise/test_mul_bcast.py
new file mode 100644
index 00000000000..de4122f29ad
--- /dev/null
+++ b/tests/ttnn/unit_tests/operations/eltwise/test_mul_bcast.py
@@ -0,0 +1,42 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+
+import torch
+
+import ttnn
+
+from tests.ttnn.utils_for_testing import assert_with_pcc
+from torch.nn import functional as F
+
+
+@pytest.mark.parametrize("h", [32])
+@pytest.mark.parametrize("w", [64])
+def test_mul_channel_bcast_repeat(device, h, w):
+    torch_input_tensor_a = torch.rand((16, 16, h, w), dtype=torch.bfloat16)
+    torch_input_tensor_b = torch.rand((16, 1, h, w), dtype=torch.bfloat16)
+
+    input_tensor_a = ttnn.from_torch(torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, device=device)
+    input_tensor_b = ttnn.from_torch(torch_input_tensor_b, layout=ttnn.TILE_LAYOUT, device=device)
+    output = ttnn.mul(input_tensor_a, input_tensor_b)
+    output = ttnn.to_torch(output)
+
+    torch_output_tensor = torch.mul(torch_input_tensor_a, torch_input_tensor_b)
+    assert_with_pcc(torch_output_tensor, output, 0.9999)
+
+
+@pytest.mark.parametrize("h", [32])
+@pytest.mark.parametrize("w", [64])
+def test_mul_batch_bcast_repeat(device, h, w):
+    torch_input_tensor_a = torch.rand((1, 16, h, w), dtype=torch.bfloat16)
+    torch_input_tensor_b = torch.rand((16, 16, h, w), dtype=torch.bfloat16)
+
+    input_tensor_a = ttnn.from_torch(torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, device=device)
+    input_tensor_b = ttnn.from_torch(torch_input_tensor_b, layout=ttnn.TILE_LAYOUT, device=device)
+    output = ttnn.mul(input_tensor_a, input_tensor_b)
+    output = ttnn.to_torch(output)
+
+    torch_output_tensor = torch.mul(torch_input_tensor_a, torch_input_tensor_b)
+    assert_with_pcc(torch_output_tensor, output, 0.9999)
diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp
index 49980f2dc42..af6ee7605c6 100644
--- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp
+++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp
@@ -110,13 +110,18 @@ auto preprocess_inputs(
     auto repeat_smaller = [](const auto &first, auto &second) {
         const auto first_shape = first.get_shape();
         const auto second_shape = second.get_shape();
-
         // repeats second if it is smaller
         if (first_shape.rank() == 4 and second_shape.rank() == 4 and first_shape[0] > second_shape[0]) {
-            tt::log_warning(tt::LogOp, "Using repeat op to broadcast batch dim");
+            TT_FATAL(second_shape[0] == 1, "Dimension trying to broadcast is not equal to 1");
             Shape repeats(std::array<uint32_t, 4>{first_shape[0], 1, 1, 1});
             second = ttnn::repeat(second, repeats);
         }
+        // repeats second along dim 1 if it is smaller there (second's dim 1 must be 1)
+        if (first_shape.rank() == 4 and second_shape.rank() == 4 and first_shape[1] > second_shape[1]) {
+            TT_FATAL(second_shape[1] == 1, "Dimension trying to broadcast is not equal to 1");
+            Shape repeats(std::array<uint32_t, 4>{1, first_shape[1], 1, 1});
+            second = ttnn::repeat(second, repeats);
+        }
     };
     repeat_smaller(input_tensor_a, input_tensor_b);
     repeat_smaller(input_tensor_b, input_tensor_a);

From 39f805f59bdb13f77d1581c997763e6408e46fd7 Mon Sep 17 00:00:00 2001
From: Nenad Petrovic
 <109360062+npetrovic-tenstorrent@users.noreply.github.com>
Date: Fri, 25 Oct 2024 19:02:18 +0200
Subject: [PATCH 07/30] Leaky relu sweeps (#13763)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* #11512: Add leaky relu sweeps

* #11512: Add elu and celu sweeps

* #11512: Refactor sweeps

* #11512: Add silu_bw and selu_bw

* #11512: Add silu_bw and selu_bw

* #11512: Add floor_bw

* #11512: Add hardswish_bw and tanshrink_bw sweeps

* #11512: Update run yaml

* #11512: Add rpow

---------

Co-authored-by: Nenad <npetrovic@tenstorrent.com>
---
 .github/workflows/ttnn-run-sweeps.yaml        |   9 ++
 .../eltwise/unary_backward/celu_bw/celu_bw.py |  97 +++++++++++++++++
 .../eltwise/unary_backward/elu_bw/elu_bw.py   |  97 +++++++++++++++++
 .../unary_backward/floor_bw/floor_bw.py       |  95 +++++++++++++++++
 .../hardswish_bw/hardswish_bw.py              |  95 +++++++++++++++++
 .../leaky_relu_bw/leaky_relu_bw.py            | 100 ++++++++++++++++++
 .../eltwise/unary_backward/rpow_bw/rpow_bw.py |  98 +++++++++++++++++
 .../eltwise/unary_backward/selu_bw/selu_bw.py |  95 +++++++++++++++++
 .../eltwise/unary_backward/silu_bw/silu_bw.py |  95 +++++++++++++++++
 .../tanhshrink_bw/tanhshrink_bw.py            |  95 +++++++++++++++++
 10 files changed, 876 insertions(+)
 create mode 100644 tests/sweep_framework/sweeps/eltwise/unary_backward/celu_bw/celu_bw.py
 create mode 100644 tests/sweep_framework/sweeps/eltwise/unary_backward/elu_bw/elu_bw.py
 create mode 100644 tests/sweep_framework/sweeps/eltwise/unary_backward/floor_bw/floor_bw.py
 create mode 100644 tests/sweep_framework/sweeps/eltwise/unary_backward/hardswish_bw/hardswish_bw.py
 create mode 100644 tests/sweep_framework/sweeps/eltwise/unary_backward/leaky_relu_bw/leaky_relu_bw.py
 create mode 100644 tests/sweep_framework/sweeps/eltwise/unary_backward/rpow_bw/rpow_bw.py
 create mode 100644 tests/sweep_framework/sweeps/eltwise/unary_backward/selu_bw/selu_bw.py
 create mode 100644 tests/sweep_framework/sweeps/eltwise/unary_backward/silu_bw/silu_bw.py
 create mode 100644 tests/sweep_framework/sweeps/eltwise/unary_backward/tanhshrink_bw/tanhshrink_bw.py

diff --git a/.github/workflows/ttnn-run-sweeps.yaml b/.github/workflows/ttnn-run-sweeps.yaml
index e8b28d15de1..f216b98debc 100644
--- a/.github/workflows/ttnn-run-sweeps.yaml
+++ b/.github/workflows/ttnn-run-sweeps.yaml
@@ -145,6 +145,15 @@ on:
           - eltwise.unary_backward.hardsigmoid_bw.hardsigmoid_bw
           - eltwise.unary_backward.lgamma_bw.lgamma_bw
           - eltwise.unary_backward.multigammaln_bw.multigammaln_bw
+          - eltwise.unary_backward.leaky_relu_bw.leaky_relu_bw
+          - eltwise.unary_backward.elu_bw.elu_bw
+          - eltwise.unary_backward.celu_bw.celu_bw
+          - eltwise.unary_backward.selu_bw.selu_bw
+          - eltwise.unary_backward.silu_bw.silu_bw
+          - eltwise.unary_backward.floor_bw.floor_bw
+          - eltwise.unary_backward.tanhshrink_bw.tanhshrink_bw
+          - eltwise.unary_backward.hardswish_bw.hardswish_bw
+          - eltwise.unary_backward.rpow_bw.rpow_bw
           - eltwise.unary.lgamma
           - eltwise.unary.logit
           - eltwise.unary.mish
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/celu_bw/celu_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/celu_bw/celu_bw.py
new file mode 100644
index 00000000000..90b2c6ebe1c
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/celu_bw/celu_bw.py
@@ -0,0 +1,97 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)  # seeds Python's `random`, from which run() draws each data_seed
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+    "nightly": {
+        "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16)
+        + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16)
+        + gen_shapes([32, 32], [256, 256], [32, 32], 16),
+        "grad_dtype": [ttnn.bfloat16],
+        "input_a_dtype": [ttnn.bfloat16],
+        "grad_layout": [ttnn.TILE_LAYOUT],
+        "input_a_layout": [ttnn.TILE_LAYOUT],
+        "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+    },
+}
+
+
+# Run instructions for the celu_bw sweep: the runner calls run() with each test vector and stores the returned results.
+# run() must take the parameters defined above as inputs, plus the keyword-only 'device'.
+# If a mesh_device_fixture is defined above, the object it yields is passed in as 'device'; otherwise it is the default ttnn device opened by the infra.
+# Returns [check_with_pcc result for ttnn.celu_bw against the torch autograd gradient of celu, e2e_perf from stop_measuring_time].
+def run(
+    input_shape,
+    grad_dtype,
+    input_a_dtype,
+    grad_layout,
+    input_a_layout,
+    grad_memory_config,
+    input_a_memory_config,
+    output_memory_config,
+    *,
+    device,
+) -> list:
+    data_seed = random.randint(0, 20000000)  # drawn from the module-level seeded `random` generator
+    torch.manual_seed(data_seed)
+
+    torch_grad_tensor = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype
+    )(input_shape)
+    torch_input_tensor_a = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+    )(input_shape)
+    torch_input_tensor_a.requires_grad = True  # needed so backward() populates .grad
+    torch_input_tensor_a.retain_grad()
+
+    alpha = torch.tensor(1, dtype=torch.bfloat16).uniform_(0.01, 10).item()  # strictly positive scalar sampled in bfloat16
+
+    intermediate_result = torch.nn.functional.celu(torch_input_tensor_a, alpha=alpha)  # celu, not elu: this is the celu_bw sweep
+    intermediate_result.backward(gradient=torch_grad_tensor)
+    torch_output_tensor = torch_input_tensor_a.grad  # reference: upstream grad propagated through celu
+
+    grad_tensor = ttnn.from_torch(
+        torch_grad_tensor,
+        dtype=grad_dtype,
+        layout=grad_layout,
+        device=device,
+        memory_config=grad_memory_config,
+    )
+
+    input_tensor_a = ttnn.from_torch(
+        torch_input_tensor_a.detach().clone(),  # copy detached from the autograd graph
+        dtype=input_a_dtype,
+        layout=input_a_layout,
+        device=device,
+        memory_config=input_a_memory_config,
+    )
+
+    start_time = start_measuring_time()
+    output_tensor = ttnn.celu_bw(grad_tensor, input_tensor_a, alpha=alpha, memory_config=output_memory_config)[0]
+    output_tensor = ttnn.to_torch(output_tensor)
+    e2e_perf = stop_measuring_time(start_time)  # timed span covers the op and the conversion back to torch
+
+    return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/elu_bw/elu_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/elu_bw/elu_bw.py
new file mode 100644
index 00000000000..cb4ab1809d6
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/elu_bw/elu_bw.py
@@ -0,0 +1,97 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)  # seeds Python's `random`, from which run() draws each data_seed
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+    "nightly": {
+        "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16)
+        + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16)
+        + gen_shapes([32, 32], [256, 256], [32, 32], 16),
+        "grad_dtype": [ttnn.bfloat16],
+        "input_a_dtype": [ttnn.bfloat16],
+        "grad_layout": [ttnn.TILE_LAYOUT],
+        "input_a_layout": [ttnn.TILE_LAYOUT],
+        "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+    },
+}
+
+
+# Run instructions for the elu_bw sweep: the runner calls run() with each test vector and stores the returned results.
+# run() must take the parameters defined above as inputs, plus the keyword-only 'device'.
+# If a mesh_device_fixture is defined above, the object it yields is passed in as 'device'; otherwise it is the default ttnn device opened by the infra.
+# Returns [check_with_pcc result for ttnn.elu_bw against the torch autograd gradient of elu, e2e_perf from stop_measuring_time].
+def run(
+    input_shape,
+    grad_dtype,
+    input_a_dtype,
+    grad_layout,
+    input_a_layout,
+    grad_memory_config,
+    input_a_memory_config,
+    output_memory_config,
+    *,
+    device,
+) -> list:
+    data_seed = random.randint(0, 20000000)  # drawn from the module-level seeded `random` generator
+    torch.manual_seed(data_seed)
+
+    torch_grad_tensor = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype
+    )(input_shape)
+    torch_input_tensor_a = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+    )(input_shape)
+    torch_input_tensor_a.requires_grad = True  # needed so backward() populates .grad
+    torch_input_tensor_a.retain_grad()
+
+    alpha = torch.tensor(1, dtype=torch.bfloat16).uniform_(-10, 10).item()  # scalar sampled in bfloat16
+
+    intermediate_result = torch.nn.functional.elu(torch_input_tensor_a, alpha=alpha)
+    intermediate_result.backward(gradient=torch_grad_tensor)
+    torch_output_tensor = torch_input_tensor_a.grad  # reference: upstream grad propagated through elu
+
+    grad_tensor = ttnn.from_torch(
+        torch_grad_tensor,
+        dtype=grad_dtype,
+        layout=grad_layout,
+        device=device,
+        memory_config=grad_memory_config,
+    )
+
+    input_tensor_a = ttnn.from_torch(
+        torch_input_tensor_a.detach().clone(),  # copy detached from the autograd graph
+        dtype=input_a_dtype,
+        layout=input_a_layout,
+        device=device,
+        memory_config=input_a_memory_config,
+    )
+
+    start_time = start_measuring_time()
+    output_tensor = ttnn.elu_bw(grad_tensor, input_tensor_a, alpha=alpha, memory_config=output_memory_config)[0]
+    output_tensor = ttnn.to_torch(output_tensor)
+    e2e_perf = stop_measuring_time(start_time)  # timed span covers the op and the conversion back to torch
+
+    return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/floor_bw/floor_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/floor_bw/floor_bw.py
new file mode 100644
index 00000000000..2d3aa811334
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/floor_bw/floor_bw.py
@@ -0,0 +1,95 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)  # seeds Python's `random`, from which run() draws each data_seed
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+    "nightly": {
+        "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16)
+        + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16)
+        + gen_shapes([32, 32], [256, 256], [32, 32], 16),
+        "grad_dtype": [ttnn.bfloat16],
+        "input_a_dtype": [ttnn.bfloat16],
+        "grad_layout": [ttnn.TILE_LAYOUT],
+        "input_a_layout": [ttnn.TILE_LAYOUT],
+        "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+    },
+}
+
+
+# Run instructions for the floor_bw sweep: the runner calls run() with each test vector and stores the returned results.
+# run() must take the parameters defined above as inputs, plus the keyword-only 'device'.
+# If a mesh_device_fixture is defined above, the object it yields is passed in as 'device'; otherwise it is the default ttnn device opened by the infra.
+# Returns [check_with_pcc result for ttnn.floor_bw against the torch autograd gradient of floor, e2e_perf from stop_measuring_time].
+def run(
+    input_shape,
+    grad_dtype,
+    input_a_dtype,
+    grad_layout,
+    input_a_layout,
+    grad_memory_config,
+    input_a_memory_config,
+    output_memory_config,
+    *,
+    device,
+) -> list:
+    data_seed = random.randint(0, 20000000)  # drawn from the module-level seeded `random` generator
+    torch.manual_seed(data_seed)
+
+    torch_grad_tensor = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype
+    )(input_shape)
+    torch_input_tensor_a = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+    )(input_shape)
+    torch_input_tensor_a.requires_grad = True  # needed so backward() populates .grad
+    torch_input_tensor_a.retain_grad()
+
+    intermediate_result = torch.floor(torch_input_tensor_a)
+    intermediate_result.backward(gradient=torch_grad_tensor)
+    torch_output_tensor = torch_input_tensor_a.grad  # NOTE(review): floor's gradient is all zeros - confirm PCC copes
+
+    grad_tensor = ttnn.from_torch(
+        torch_grad_tensor,
+        dtype=grad_dtype,
+        layout=grad_layout,
+        device=device,
+        memory_config=grad_memory_config,
+    )
+
+    input_tensor_a = ttnn.from_torch(
+        torch_input_tensor_a.detach().clone(),  # copy detached from the autograd graph
+        dtype=input_a_dtype,
+        layout=input_a_layout,
+        device=device,
+        memory_config=input_a_memory_config,
+    )
+
+    start_time = start_measuring_time()
+    output_tensor = ttnn.floor_bw(grad_tensor, input_tensor_a, memory_config=output_memory_config)[0]
+    output_tensor = ttnn.to_torch(output_tensor)
+    e2e_perf = stop_measuring_time(start_time)  # timed span covers the op and the conversion back to torch
+
+    return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/hardswish_bw/hardswish_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/hardswish_bw/hardswish_bw.py
new file mode 100644
index 00000000000..1c9cbf0e84a
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/hardswish_bw/hardswish_bw.py
@@ -0,0 +1,95 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)  # seeds Python's `random`, from which run() draws each data_seed
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+    "nightly": {
+        "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16)
+        + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16)
+        + gen_shapes([32, 32], [256, 256], [32, 32], 16),
+        "grad_dtype": [ttnn.bfloat16],
+        "input_a_dtype": [ttnn.bfloat16],
+        "grad_layout": [ttnn.TILE_LAYOUT],
+        "input_a_layout": [ttnn.TILE_LAYOUT],
+        "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+    },
+}
+
+
+# Run instructions for the hardswish_bw sweep: the runner calls run() with each test vector and stores the returned results.
+# run() must take the parameters defined above as inputs, plus the keyword-only 'device'.
+# If a mesh_device_fixture is defined above, the object it yields is passed in as 'device'; otherwise it is the default ttnn device opened by the infra.
+# Returns [check_with_pcc result for ttnn.hardswish_bw against the torch autograd gradient of hardswish, e2e_perf from stop_measuring_time].
+def run(
+    input_shape,
+    grad_dtype,
+    input_a_dtype,
+    grad_layout,
+    input_a_layout,
+    grad_memory_config,
+    input_a_memory_config,
+    output_memory_config,
+    *,
+    device,
+) -> list:
+    data_seed = random.randint(0, 20000000)  # drawn from the module-level seeded `random` generator
+    torch.manual_seed(data_seed)
+
+    torch_grad_tensor = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype
+    )(input_shape)
+    torch_input_tensor_a = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+    )(input_shape)
+    torch_input_tensor_a.requires_grad = True  # needed so backward() populates .grad
+    torch_input_tensor_a.retain_grad()
+
+    intermediate_result = torch.nn.functional.hardswish(torch_input_tensor_a)
+    intermediate_result.backward(gradient=torch_grad_tensor)
+    torch_output_tensor = torch_input_tensor_a.grad  # reference: upstream grad propagated through hardswish
+
+    grad_tensor = ttnn.from_torch(
+        torch_grad_tensor,
+        dtype=grad_dtype,
+        layout=grad_layout,
+        device=device,
+        memory_config=grad_memory_config,
+    )
+
+    input_tensor_a = ttnn.from_torch(
+        torch_input_tensor_a.detach().clone(),  # copy detached from the autograd graph
+        dtype=input_a_dtype,
+        layout=input_a_layout,
+        device=device,
+        memory_config=input_a_memory_config,
+    )
+
+    start_time = start_measuring_time()
+    output_tensor = ttnn.hardswish_bw(grad_tensor, input_tensor_a, memory_config=output_memory_config)[0]
+    output_tensor = ttnn.to_torch(output_tensor)
+    e2e_perf = stop_measuring_time(start_time)  # timed span covers the op and the conversion back to torch
+
+    return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/leaky_relu_bw/leaky_relu_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/leaky_relu_bw/leaky_relu_bw.py
new file mode 100644
index 00000000000..31c1262b411
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/leaky_relu_bw/leaky_relu_bw.py
@@ -0,0 +1,100 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)  # seeds Python's `random`, from which run() draws each data_seed
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+    "nightly": {
+        "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16)
+        + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16)
+        + gen_shapes([32, 32], [256, 256], [32, 32], 16),
+        "grad_dtype": [ttnn.bfloat16],
+        "input_a_dtype": [ttnn.bfloat16],
+        "grad_layout": [ttnn.TILE_LAYOUT],
+        "input_a_layout": [ttnn.TILE_LAYOUT],
+        "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+    },
+}
+
+
+# Run instructions for the leaky_relu_bw sweep: the runner calls run() with each test vector and stores the returned results.
+# run() must take the parameters defined above as inputs, plus the keyword-only 'device'.
+# If a mesh_device_fixture is defined above, the object it yields is passed in as 'device'; otherwise it is the default ttnn device opened by the infra.
+# Returns [check_with_pcc result for ttnn.leaky_relu_bw against the torch autograd gradient of leaky_relu, e2e_perf from stop_measuring_time].
+def run(
+    input_shape,
+    grad_dtype,
+    input_a_dtype,
+    grad_layout,
+    input_a_layout,
+    grad_memory_config,
+    input_a_memory_config,
+    output_memory_config,
+    *,
+    device,
+) -> list:
+    data_seed = random.randint(0, 20000000)  # drawn from the module-level seeded `random` generator
+    torch.manual_seed(data_seed)
+
+    torch_grad_tensor = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype
+    )(input_shape)
+    torch_input_tensor_a = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+    )(input_shape)
+    torch_input_tensor_a.requires_grad = True  # needed so backward() populates .grad
+    torch_input_tensor_a.retain_grad()
+
+    negative_slope = torch.tensor(1, dtype=torch.bfloat16).uniform_(0, 100).item()  # scalar sampled in bfloat16
+
+    intermediate_result = torch.nn.functional.leaky_relu(torch_input_tensor_a, negative_slope=negative_slope)
+    intermediate_result.backward(gradient=torch_grad_tensor)
+    torch_output_tensor = torch_input_tensor_a.grad  # reference: upstream grad propagated through leaky_relu
+
+    grad_tensor = ttnn.from_torch(
+        torch_grad_tensor,
+        dtype=grad_dtype,
+        layout=grad_layout,
+        device=device,
+        memory_config=grad_memory_config,
+    )
+
+    input_tensor_a = ttnn.from_torch(
+        torch_input_tensor_a.detach().clone(),  # copy detached from the autograd graph
+        dtype=input_a_dtype,
+        layout=input_a_layout,
+        device=device,
+        memory_config=input_a_memory_config,
+    )
+
+    start_time = start_measuring_time()
+    output_tensor = ttnn.leaky_relu_bw(
+        grad_tensor, input_tensor_a, negative_slope=negative_slope, memory_config=output_memory_config
+    )[0]
+    output_tensor = ttnn.to_torch(output_tensor)
+    e2e_perf = stop_measuring_time(start_time)  # timed span covers the op and the conversion back to torch
+
+    return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/rpow_bw/rpow_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/rpow_bw/rpow_bw.py
new file mode 100644
index 00000000000..dd5d6c19ea4
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/rpow_bw/rpow_bw.py
@@ -0,0 +1,98 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)  # seeds Python's `random`, from which run() draws data_seed and factor
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "xfail") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+    "xfail": {
+        "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16)
+        + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16)
+        + gen_shapes([32, 32], [256, 256], [32, 32], 16),
+        "grad_dtype": [ttnn.bfloat16],
+        "input_a_dtype": [ttnn.bfloat16],
+        "grad_layout": [ttnn.TILE_LAYOUT],
+        "input_a_layout": [ttnn.TILE_LAYOUT],
+        "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+    },
+}
+
+
+# Run instructions for this sweep: the runner calls run() with each test vector and stores the returned results.
+# run() must take the parameters defined above as inputs, plus the keyword-only 'device' (the object yielded by a
+# mesh_device_fixture if one is defined above, otherwise the default ttnn device opened by the infra).
+# NOTE(review): the file is named rpow_bw, but the code compares ttnn.pow_bw with torch.pow(input, factor) - confirm this is intended.
+def run(
+    input_shape,
+    grad_dtype,
+    input_a_dtype,
+    grad_layout,
+    input_a_layout,
+    grad_memory_config,
+    input_a_memory_config,
+    output_memory_config,
+    *,
+    device,
+) -> list:
+    data_seed = random.randint(0, 20000000)  # drawn from the module-level seeded `random` generator
+    torch.manual_seed(data_seed)
+
+    torch_grad_tensor = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype
+    )(input_shape)
+    torch_input_tensor_a = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+    )(input_shape)
+    torch_input_tensor_a.requires_grad = True  # needed so backward() populates .grad
+    torch_input_tensor_a.retain_grad()
+
+    factor = random.uniform(0.1, 10.0)  # Python float, used as the exponent below
+
+    intermediate_result = torch.pow(torch_input_tensor_a, factor)  # input ** factor
+    intermediate_result.backward(gradient=torch_grad_tensor)
+    torch_output_tensor = torch_input_tensor_a.grad  # reference: upstream grad propagated through pow
+
+    grad_tensor = ttnn.from_torch(
+        torch_grad_tensor,
+        dtype=grad_dtype,
+        layout=grad_layout,
+        device=device,
+        memory_config=grad_memory_config,
+    )
+
+    input_tensor_a = ttnn.from_torch(
+        torch_input_tensor_a.detach().clone(),  # copy detached from the autograd graph
+        dtype=input_a_dtype,
+        layout=input_a_layout,
+        device=device,
+        memory_config=input_a_memory_config,
+    )
+
+    start_time = start_measuring_time()
+    output_tensor = ttnn.pow_bw(grad_tensor, input_tensor_a, factor, memory_config=output_memory_config)[0]
+    output_tensor = ttnn.to_torch(output_tensor)
+    e2e_perf = stop_measuring_time(start_time)  # timed span covers the op and the conversion back to torch
+
+    return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/selu_bw/selu_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/selu_bw/selu_bw.py
new file mode 100644
index 00000000000..d4eeac59b7e
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/selu_bw/selu_bw.py
@@ -0,0 +1,95 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)  # fixed seed for Python's `random`; run() draws its per-vector torch seed from this stream
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+    "nightly": {
+        "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16)  # one gen_shapes call per bounds rank: 4D,
+        + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16)  # 3D,
+        + gen_shapes([32, 32], [256, 256], [32, 32], 16),  # and 2D; the three lists are concatenated
+        "grad_dtype": [ttnn.bfloat16],
+        "input_a_dtype": [ttnn.bfloat16],
+        "grad_layout": [ttnn.TILE_LAYOUT],
+        "input_a_layout": [ttnn.TILE_LAYOUT],
+        "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+    },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+    input_shape,
+    grad_dtype,
+    input_a_dtype,
+    grad_layout,
+    input_a_layout,
+    grad_memory_config,
+    input_a_memory_config,
+    output_memory_config,
+    *,
+    device,
+) -> list:
+    data_seed = random.randint(0, 20000000)  # torch seed for this vector, drawn from the `random` stream seeded above
+    torch.manual_seed(data_seed)
+    # Host-side reference tensors: generated as float32 with low=-100, high=100, wrapped by gen_func_with_cast_tt.
+    torch_grad_tensor = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype
+    )(input_shape)
+    torch_input_tensor_a = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+    )(input_shape)
+    torch_input_tensor_a.requires_grad = True  # so backward() below populates .grad
+    torch_input_tensor_a.retain_grad()
+    # Golden: run selu forward in torch and backpropagate torch_grad_tensor to get the input gradient.
+    intermediate_result = torch.nn.functional.selu(torch_input_tensor_a)
+    intermediate_result.backward(gradient=torch_grad_tensor)
+    torch_output_tensor = torch_input_tensor_a.grad
+
+    grad_tensor = ttnn.from_torch(
+        torch_grad_tensor,
+        dtype=grad_dtype,
+        layout=grad_layout,
+        device=device,
+        memory_config=grad_memory_config,
+    )
+
+    input_tensor_a = ttnn.from_torch(
+        torch_input_tensor_a.detach().clone(),  # detached copy: no autograd tracking on the converted tensor
+        dtype=input_a_dtype,
+        layout=input_a_layout,
+        device=device,
+        memory_config=input_a_memory_config,
+    )
+    # Timed region covers the backward op and the readback to torch.
+    start_time = start_measuring_time()
+    output_tensor = ttnn.selu_bw(grad_tensor, input_tensor_a, memory_config=output_memory_config)[0]
+    output_tensor = ttnn.to_torch(output_tensor)
+    e2e_perf = stop_measuring_time(start_time)
+
+    return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]  # [PCC check at 0.999, elapsed time]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/silu_bw/silu_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/silu_bw/silu_bw.py
new file mode 100644
index 00000000000..fed2d0d1d08
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/silu_bw/silu_bw.py
@@ -0,0 +1,95 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)  # fixed seed for Python's `random`; run() draws its per-vector torch seed from this stream
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+    "nightly": {
+        "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16)  # one gen_shapes call per bounds rank: 4D,
+        + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16)  # 3D,
+        + gen_shapes([32, 32], [256, 256], [32, 32], 16),  # and 2D; the three lists are concatenated
+        "grad_dtype": [ttnn.bfloat16],
+        "input_a_dtype": [ttnn.bfloat16],
+        "grad_layout": [ttnn.TILE_LAYOUT],
+        "input_a_layout": [ttnn.TILE_LAYOUT],
+        "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+    },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+    input_shape,
+    grad_dtype,
+    input_a_dtype,
+    grad_layout,
+    input_a_layout,
+    grad_memory_config,
+    input_a_memory_config,
+    output_memory_config,
+    *,
+    device,
+) -> list:
+    data_seed = random.randint(0, 20000000)  # torch seed for this vector, drawn from the `random` stream seeded above
+    torch.manual_seed(data_seed)
+    # Host-side reference tensors: generated as float32 with low=-100, high=100, wrapped by gen_func_with_cast_tt.
+    torch_grad_tensor = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype
+    )(input_shape)
+    torch_input_tensor_a = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+    )(input_shape)
+    torch_input_tensor_a.requires_grad = True  # so backward() below populates .grad
+    torch_input_tensor_a.retain_grad()
+    # Golden: run silu forward in torch and backpropagate torch_grad_tensor to get the input gradient.
+    intermediate_result = torch.nn.functional.silu(torch_input_tensor_a)
+    intermediate_result.backward(gradient=torch_grad_tensor)
+    torch_output_tensor = torch_input_tensor_a.grad
+
+    grad_tensor = ttnn.from_torch(
+        torch_grad_tensor,
+        dtype=grad_dtype,
+        layout=grad_layout,
+        device=device,
+        memory_config=grad_memory_config,
+    )
+
+    input_tensor_a = ttnn.from_torch(
+        torch_input_tensor_a.detach().clone(),  # detached copy: no autograd tracking on the converted tensor
+        dtype=input_a_dtype,
+        layout=input_a_layout,
+        device=device,
+        memory_config=input_a_memory_config,
+    )
+    # Timed region covers the backward op and the readback to torch.
+    start_time = start_measuring_time()
+    output_tensor = ttnn.silu_bw(grad_tensor, input_tensor_a, memory_config=output_memory_config)[0]
+    output_tensor = ttnn.to_torch(output_tensor)
+    e2e_perf = stop_measuring_time(start_time)
+
+    return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]  # [PCC check at 0.999, elapsed time]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/tanhshrink_bw/tanhshrink_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/tanhshrink_bw/tanhshrink_bw.py
new file mode 100644
index 00000000000..a2afd5c6e4a
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/tanhshrink_bw/tanhshrink_bw.py
@@ -0,0 +1,95 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)  # fixed seed for Python's `random`; run() draws its per-vector torch seed from this stream
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+    "nightly": {
+        "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16)  # one gen_shapes call per bounds rank: 4D,
+        + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16)  # 3D,
+        + gen_shapes([32, 32], [256, 256], [32, 32], 16),  # and 2D; the three lists are concatenated
+        "grad_dtype": [ttnn.bfloat16],
+        "input_a_dtype": [ttnn.bfloat16],
+        "grad_layout": [ttnn.TILE_LAYOUT],
+        "input_a_layout": [ttnn.TILE_LAYOUT],
+        "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+        "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+    },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+    input_shape,
+    grad_dtype,
+    input_a_dtype,
+    grad_layout,
+    input_a_layout,
+    grad_memory_config,
+    input_a_memory_config,
+    output_memory_config,
+    *,
+    device,
+) -> list:
+    data_seed = random.randint(0, 20000000)  # torch seed for this vector, drawn from the `random` stream seeded above
+    torch.manual_seed(data_seed)
+    # Host-side reference tensors: generated as float32 with low=-100, high=100, wrapped by gen_func_with_cast_tt.
+    torch_grad_tensor = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype
+    )(input_shape)
+    torch_input_tensor_a = gen_func_with_cast_tt(
+        partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+    )(input_shape)
+    torch_input_tensor_a.requires_grad = True  # so backward() below populates .grad
+    torch_input_tensor_a.retain_grad()
+    # Golden: run tanhshrink forward in torch and backpropagate torch_grad_tensor to get the input gradient.
+    intermediate_result = torch.nn.functional.tanhshrink(torch_input_tensor_a)
+    intermediate_result.backward(gradient=torch_grad_tensor)
+    torch_output_tensor = torch_input_tensor_a.grad
+
+    grad_tensor = ttnn.from_torch(
+        torch_grad_tensor,
+        dtype=grad_dtype,
+        layout=grad_layout,
+        device=device,
+        memory_config=grad_memory_config,
+    )
+
+    input_tensor_a = ttnn.from_torch(
+        torch_input_tensor_a.detach().clone(),  # detached copy: no autograd tracking on the converted tensor
+        dtype=input_a_dtype,
+        layout=input_a_layout,
+        device=device,
+        memory_config=input_a_memory_config,
+    )
+    # Timed region covers the backward op and the readback to torch.
+    start_time = start_measuring_time()
+    output_tensor = ttnn.tanhshrink_bw(grad_tensor, input_tensor_a, memory_config=output_memory_config)[0]
+    output_tensor = ttnn.to_torch(output_tensor)
+    e2e_perf = stop_measuring_time(start_time)
+
+    return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]  # [PCC check at 0.999, elapsed time]

From 424f7ed7c2ea1f453d76e14d0a1088602a417d0b Mon Sep 17 00:00:00 2001
From: Sean Nijjar <sean.nijjar@gmail.com>
Date: Fri, 25 Oct 2024 13:32:31 -0400
Subject: [PATCH 08/30] #14250: add cluster-axis API to reduce scatter (#14252)

* #14250: add cluster-axis API to reduce scatter

This commit also extends test coverage to the scatter dimensions used by TG Llama. Correspondingly, it fixes a bug in the CCL send command generator, which incorrectly handled command sequence generation for non-inner dims.

Co-authored-by: Sean Nijjar <snijjar@tenstorrent.com>
---
 .../nightly/tg/ccl/test_all_gather_nightly.py |   1 +
 .../tg/ccl/test_reduce_scatter_nightly.py     |   1 +
 tests/scripts/tg/run_tg_nightly_tests.sh      |   3 +-
 .../ccl/test_reduce_scatter_TG_nightly.py     | 327 ++++++++++++++++++
 ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp   |  10 +-
 .../host/reduce_scatter_full_worker_grid.cpp  |   2 +-
 .../device/reduce_scatter_op.cpp              |  74 +++-
 .../device/reduce_scatter_op.hpp              |  16 +-
 .../ccl/reduce_scatter/reduce_scatter.cpp     |  15 +
 .../ccl/reduce_scatter/reduce_scatter.hpp     |  12 +
 .../reduce_scatter/reduce_scatter_pybind.cpp  |  34 +-
 11 files changed, 485 insertions(+), 10 deletions(-)
 create mode 120000 tests/nightly/tg/ccl/test_all_gather_nightly.py
 create mode 120000 tests/nightly/tg/ccl/test_reduce_scatter_nightly.py
 create mode 100644 tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_TG_nightly.py

diff --git a/tests/nightly/tg/ccl/test_all_gather_nightly.py b/tests/nightly/tg/ccl/test_all_gather_nightly.py
new file mode 120000
index 00000000000..92d5007ada5
--- /dev/null
+++ b/tests/nightly/tg/ccl/test_all_gather_nightly.py
@@ -0,0 +1 @@
+../../../../tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_nightly.py
\ No newline at end of file
diff --git a/tests/nightly/tg/ccl/test_reduce_scatter_nightly.py b/tests/nightly/tg/ccl/test_reduce_scatter_nightly.py
new file mode 120000
index 00000000000..ac93b90f333
--- /dev/null
+++ b/tests/nightly/tg/ccl/test_reduce_scatter_nightly.py
@@ -0,0 +1 @@
+../../../../tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_TG_nightly.py
\ No newline at end of file
diff --git a/tests/scripts/tg/run_tg_nightly_tests.sh b/tests/scripts/tg/run_tg_nightly_tests.sh
index 1bcf876a66e..89e5c253c7c 100755
--- a/tests/scripts/tg/run_tg_nightly_tests.sh
+++ b/tests/scripts/tg/run_tg_nightly_tests.sh
@@ -7,7 +7,8 @@ run_tg_llama3_70b_tests() {
 
   echo "LOG_METAL: Running run_tg_llama3_70b_tests"
 
-  pytest tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_nightly.py ; fail+=$?
+  pytest tests/nightly/tg/ccl/test_all_gather_nightly.py ; fail+=$?
+  pytest tests/nightly/tg/ccl/test_reduce_scatter_nightly.py ; fail+=$?
 
   # Falcon40B prefill 60 layer end to end with 10 loops; we need 8x8 grid size
   pytest tests/nightly/tg/models/demos/tg/llama3_70b ; fail+=$?
diff --git a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_TG_nightly.py b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_TG_nightly.py
new file mode 100644
index 00000000000..2cbe8f5aa29
--- /dev/null
+++ b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_TG_nightly.py
@@ -0,0 +1,327 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import pytest
+from loguru import logger
+import ttnn
+from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_equal, comp_pcc
+from models.utility_functions import skip_for_grayskull
+
+from ttnn import ShardTensor2dMesh, ConcatMesh2dToTensor
+
+
+def report_mismatches(golden, actual, max_printable=None):  # debug helper: prints mismatches, returns nothing
+    printed = 0  # number of mismatches printed so far, capped by max_printable when it is not None
+    for w in range(golden.shape[0]):
+        for z in range(golden.shape[1]):
+            for y in range(0, golden.shape[2], 32):  # stride 32 in y and x: only 32-aligned positions are compared
+                for x in range(0, golden.shape[3], 32):
+                    print_it = (max_printable is None or printed < max_printable) and golden[w, z, y, x] != actual[
+                        w, z, y, x
+                    ]
+                    if print_it:
+                        printed += 1
+                        print(
+                            f"output mismatch for tensor at [{w}, {z}, {y}, {x}]: expected {golden[w, z, y, x]} != actual {actual[w, z, y, x]}"
+                        )
+
+
+def print_tile_corners_of_tensor(t):
+    """Print, for each [w, z] page of 4D tensor `t`, a grid of the int value at every 32-aligned (y, x) position."""
+    for w in range(t.shape[0]):
+        for z in range(t.shape[1]):
+            header = ""  # column header: the x offsets, each padded/truncated to 5 characters
+            for x in range(0, t.shape[3], 32):
+                header += f"{x:<5} "[:5]
+            print(f"     {header}")
+            for y in range(0, t.shape[2], 32):
+                row = f"y={y:<3} "[:5]  # row label, same 5-character cell width as the values
+                for x in range(0, t.shape[3], 32):
+                    # Only the element at the 32-aligned corner is sampled; values are truncated to int.
+                    val = int(t[w, z, y, x].item())
+                    row += f"{val:<5} "[:5]
+                print(f"{row}")
+
+
+def run_line_reduce_scatter_on_TG_with_mesh_tensor_along_rows(
+    mesh_device,
+    num_devices_per_line,
+    per_chip_input_shape,
+    tensor_memory_layout,
+    dim,
+    num_links,
+    math_op,
+    input_dtype,
+    layout,
+    buffer_type: ttnn.BufferType,
+    use_program_cache,
+    function_level_defaults,
+    enable_async,
+    input_shard_spec: ttnn.ShardSpec = None,
+    num_reduce_scatter_instances: int = 1,
+    num_iters: int = 1,
+    cluster_axis: int = 0,
+):
+    """Run `num_reduce_scatter_instances` line reduce-scatters over a 32-device mesh under trace and PCC-check every chip's output against a torch sum; skips unless the mesh has 32 devices."""
+    if len(mesh_device.get_devices()) != 32:
+        pytest.skip("Not TG!")
+    for d in mesh_device.get_devices():
+        ttnn.enable_program_cache(d)
+    mesh_device.enable_async(enable_async)
+
+    per_reduce_scatter_output_shape = list(per_chip_input_shape)
+    per_reduce_scatter_output_shape[dim] *= num_devices_per_line
+    full_mesh_input_shape = list(per_reduce_scatter_output_shape)
+    ## The `reduce_scatter_instances_concat_dim` is the dimension we will split the cluster spanning tensor along in order to split it
+    ## off into per-reduce-scatter tensors
+    reduce_scatter_instances_concat_dim = 1 if dim == 0 else 0
+    full_mesh_input_shape[reduce_scatter_instances_concat_dim] *= num_reduce_scatter_instances
+    logger.info(
+        f"full_mesh_input_shape: {full_mesh_input_shape}, dim: {dim}, reduce_scatter_instances_concat_dim: {reduce_scatter_instances_concat_dim}, num_devices_per_line: {num_devices_per_line}"
+    )
+
+    ##
+    ## Compute golden
+    ##
+
+    # The golden is always the elementwise sum of one instance's per-chip inputs; `math_op` is only forwarded to ttnn.
+    per_reduce_scatter_inputs = []
+    per_reduce_scatter_goldens = []
+    for i in range(num_reduce_scatter_instances):
+        per_chip_inputs = [torch.rand(per_chip_input_shape).bfloat16() for _ in range(num_devices_per_line)]
+        per_reduce_scatter_inputs.append(per_chip_inputs)
+
+        golden_canonical_out_tensor = torch.zeros(per_chip_input_shape).bfloat16()
+        for t in per_chip_inputs:
+            golden_canonical_out_tensor = torch.add(golden_canonical_out_tensor, t).bfloat16()  # accumulate in bfloat16
+        per_reduce_scatter_goldens.append(golden_canonical_out_tensor)
+
+    per_reduce_scatter_concatenated_inputs = [
+        torch.cat(per_reduce_scatter_inputs[i], dim=dim) for i in range(num_reduce_scatter_instances)
+    ]
+
+    full_input_tensor_unfractured = torch.cat(
+        per_reduce_scatter_concatenated_inputs, dim=reduce_scatter_instances_concat_dim
+    )
+
+    input_mem_config = ttnn.MemoryConfig(tensor_memory_layout, buffer_type=buffer_type, shard_spec=input_shard_spec)
+    shard_dims = (
+        (dim, reduce_scatter_instances_concat_dim) if cluster_axis == 0 else (reduce_scatter_instances_concat_dim, dim)
+    )
+    concat_dims = shard_dims  # outputs are gathered back along the same dims the input was sharded on
+
+    mesh_shape = (
+        (num_devices_per_line, num_reduce_scatter_instances)
+        if cluster_axis == 0
+        else (num_reduce_scatter_instances, num_devices_per_line)
+    )
+
+    output_shard_spec = None
+    if input_shard_spec is not None:
+        output_shard_shape = list(input_shard_spec.shape)
+        if dim == 3:
+            output_shard_shape[1] *= num_devices_per_line
+        else:
+            output_shard_shape[0] *= num_devices_per_line
+        output_shard_spec = ttnn.ShardSpec(
+            input_shard_spec.grid,
+            output_shard_shape,
+            input_shard_spec.orientation,
+            False,
+        )
+    output_mem_config = ttnn.MemoryConfig(tensor_memory_layout, buffer_type=buffer_type, shard_spec=output_shard_spec)
+    ttnn_tensor = ttnn.from_torch(
+        full_input_tensor_unfractured,
+        dtype=input_dtype,
+        device=mesh_device,
+        layout=layout,
+        memory_config=input_mem_config,
+        mesh_mapper=ShardTensor2dMesh(mesh_device, mesh_shape=mesh_shape, dims=shard_dims),
+    )
+    ttnn_tensor = ttnn.to_device(ttnn_tensor, mesh_device)
+
+    # ttnn.visualize_mesh_device(mesh_device, tensor=ttnn_tensor)
+    ttnn_tensor_out = ttnn.reduce_scatter(  # one call outside the trace (presumably to compile before capture - confirm)
+        ttnn_tensor,
+        scatter_dim=dim,
+        cluster_axis=cluster_axis,
+        mesh_device=mesh_device,
+        math_op=math_op,
+        num_links=num_links,
+        memory_config=output_mem_config,
+        topology=ttnn.Topology.Linear,
+    )
+    trace_id = ttnn.begin_trace_capture(mesh_device, cq_id=0)
+    # ttnn.visualize_mesh_device(mesh_device, tensor=ttnn_tensor)
+    for _ in range(num_iters):  # every iteration is captured into the same trace
+        ttnn_tensor_out = ttnn.reduce_scatter(
+            ttnn_tensor,
+            scatter_dim=dim,
+            cluster_axis=cluster_axis,
+            mesh_device=mesh_device,
+            math_op=math_op,
+            num_links=num_links,
+            memory_config=output_mem_config,
+            topology=ttnn.Topology.Linear,
+        )
+    ttnn.end_trace_capture(mesh_device, trace_id, cq_id=0)
+    for d in mesh_device.get_devices():
+        ttnn.synchronize_device(d)
+
+    logger.info("Starting Trace perf test...")
+    ttnn.execute_trace(mesh_device, trace_id, blocking=False)
+    ttnn.release_trace(mesh_device, trace_id)  # NOTE(review): released before the synchronize below - confirm ordering is safe
+    for d in mesh_device.get_devices():
+        ttnn.synchronize_device(d)
+
+    # ttnn.visualize_mesh_device(mesh_device, tensor=ttnn_tensor_out)
+    tt_output_tensor = ttnn.to_torch(
+        ttnn_tensor_out, mesh_composer=ConcatMesh2dToTensor(mesh_device, mesh_shape=mesh_shape, dims=concat_dims)
+    )
+    output_tensors_list = torch.chunk(
+        tt_output_tensor, num_reduce_scatter_instances, dim=reduce_scatter_instances_concat_dim
+    )
+
+    failures = []  # one entry per mismatching (instance, device) so the final assert reports all of them
+    for i in range(num_reduce_scatter_instances):
+        # The result of all-chips in the reduce scatter line having their outputs concatenated
+        reduce_scatter_outputs_concatenated = output_tensors_list[i]
+        per_chip_outputs = torch.chunk(reduce_scatter_outputs_concatenated, num_devices_per_line, dim=dim)
+        per_chip_goldens = torch.chunk(per_reduce_scatter_goldens[i], num_devices_per_line, dim=dim)
+
+        assert len(per_chip_outputs) == len(per_chip_goldens)
+        # compare the output and golden (zip)
+        for d, (output, golden) in enumerate(zip(per_chip_outputs, per_chip_goldens)):
+            eq, pcc_msg = comp_pcc(output, golden)  # keep the message separate from the tensor being compared
+
+            if not eq:
+                failures.append(f"reduce_scatter {i}, device {d}: {pcc_msg}")
+                logger.error(f"output mismatch for tensor on reduce_scatter {i}, device {d}: {pcc_msg}")
+
+    assert not failures, f"FAILED: {failures}"
+
+
+# Enumerate the post-commit cases explicitly
+@skip_for_grayskull("Requires eth connected devices to run")
+@pytest.mark.parametrize(
+    "num_devices, num_links, per_chip_output_shape, dim, layout",
+    [
+        (4, 1, [1, 4, 32, 2304], 1, ttnn.TILE_LAYOUT),
+    ],
+)
+@pytest.mark.parametrize(
+    "input_dtype",
+    [
+        ttnn.bfloat16,
+        # ttnn.bfloat8_b,
+    ],
+)
+@pytest.mark.parametrize(
+    "buffer_type",
+    [
+        ttnn.BufferType.DRAM,
+        ttnn.BufferType.L1,
+    ],
+)
+@pytest.mark.parametrize("replication_factor", [8])  # 8 independent instances; the [1, 8] variant is disabled
+@pytest.mark.parametrize("enable_async", [True])
+@pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True)
+@pytest.mark.parametrize("math_op", [ttnn.ReduceType.Sum])
+@pytest.mark.parametrize("device_params", [{"trace_region_size": 10281600}], indirect=True)
+def test_line_reduce_scatter_on_TG_rows_post_commit(
+    mesh_device,
+    num_devices,
+    per_chip_output_shape,
+    dim,
+    num_links,
+    math_op,
+    input_dtype,
+    layout,
+    buffer_type,
+    use_program_cache,
+    function_level_defaults,
+    enable_async,
+    replication_factor,
+    num_iters=16,  # not parametrized: always 16 reduce_scatter calls inside the captured trace
+):
+    run_line_reduce_scatter_on_TG_with_mesh_tensor_along_rows(
+        mesh_device,
+        num_devices,
+        per_chip_output_shape,  # received by the helper as `per_chip_input_shape`
+        ttnn.TensorMemoryLayout.INTERLEAVED,
+        dim,
+        num_links,
+        math_op,
+        input_dtype,
+        layout,
+        buffer_type,
+        use_program_cache,
+        function_level_defaults,
+        enable_async=enable_async,
+        num_iters=num_iters,
+        num_reduce_scatter_instances=replication_factor,
+        cluster_axis=1,  # helper then uses mesh_shape (replication_factor, num_devices) = (8, 4)
+    )
+
+
+@skip_for_grayskull("Requires eth connected devices to run")
+@pytest.mark.parametrize(
+    "num_devices, num_links, per_chip_output_shape, dim, layout",
+    [
+        (8, 1, [1, 8, 32, 1280], 1, ttnn.TILE_LAYOUT),  # scatter on dim 1
+        (8, 1, [8, 1, 32, 1280], 0, ttnn.TILE_LAYOUT),  # scatter on dim 0
+    ],
+)
+@pytest.mark.parametrize(
+    "input_dtype",
+    [
+        ttnn.bfloat16,
+    ],
+)
+@pytest.mark.parametrize(
+    "buffer_type",
+    [
+        ttnn.BufferType.DRAM,
+    ],
+)
+@pytest.mark.parametrize("enable_async", [True])
+@pytest.mark.parametrize("replication_factor", [4])
+@pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True)
+@pytest.mark.parametrize("math_op", [ttnn.ReduceType.Sum])
+@pytest.mark.parametrize("device_params", [{"trace_region_size": 10281600}], indirect=True)
+def test_line_reduce_scatter_on_TG_cols_post_commit(
+    mesh_device,
+    num_devices,
+    per_chip_output_shape,
+    dim,
+    num_links,
+    math_op,
+    input_dtype,
+    layout,
+    buffer_type,
+    use_program_cache,
+    function_level_defaults,
+    enable_async,
+    replication_factor,
+    num_iters=16,  # not parametrized: always 16 reduce_scatter calls inside the captured trace
+):
+    run_line_reduce_scatter_on_TG_with_mesh_tensor_along_rows(
+        mesh_device,
+        num_devices,
+        per_chip_output_shape,  # received by the helper as `per_chip_input_shape`
+        ttnn.TensorMemoryLayout.INTERLEAVED,
+        dim,
+        num_links,
+        math_op,
+        input_dtype,
+        layout,
+        buffer_type,
+        use_program_cache,
+        function_level_defaults,
+        enable_async=enable_async,
+        num_iters=num_iters,
+        num_reduce_scatter_instances=replication_factor,
+        cluster_axis=0,  # helper then uses mesh_shape (num_devices, replication_factor) = (8, 4)
+    )
diff --git a/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp b/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp
index 410f8aaf85c..c66b196eea7 100644
--- a/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp
+++ b/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp
@@ -720,10 +720,14 @@ std::vector<TensorSlice> generate_slice_sequence_on_dim(
     std::size_t worker_index
 ) {
     static_assert(std::is_same_v<TensorSlice::ords_t, tt_xy_pair>, "generate_slice_sequence_on_dim not yet implemented for type not of tt_xy_pair");
-    TT_ASSERT(fracture_dim == 3);
     // We don't support 4D shapes in the CCL kernels yet, which are needed for proper reduction/concatenation in some cases
     // so for now we subtract the outer dims from the fracture_dim since we only support 2D at the moment.
-    fracture_dim -= 2;
+    if (fracture_dim == 3) {
+        fracture_dim -= 2;
+    } else {
+        // Any other fracture_dim is mapped onto axis 0 of the 2D view.
+        fracture_dim = 0;
+    }
 
     TT_ASSERT(worker_slice_shape.y == 1);
 
@@ -743,7 +747,7 @@ std::vector<TensorSlice> generate_slice_sequence_on_dim(
         log_trace(tt::LogOp, "worker_index {}", worker_index);
     }
 
-    auto worker_slice_start_offset = fracture_dim == 0 ? TensorSlice::ords_t{0, worker_index * worker_slice_shape.y} : TensorSlice::ords_t{worker_index * worker_slice_shape.x, 0};
+    auto worker_slice_start_offset = /*fracture_dim == 0 ? TensorSlice::ords_t{0, worker_index * worker_slice_shape.y} :*/ TensorSlice::ords_t{worker_index * worker_slice_shape.x, 0};
 
     auto generate_slice = [forward_direction,incr, &slices, &tensor_shape, &slice_shape, &worker_slice_shape, tensor_slice_offset, &worker_slice_start_offset, fracture_dim, dim_start_offset, slice_size_on_dim](std::int64_t i){
         auto tensor_slice_offset_adjusted = tensor_slice_offset;
diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp
index f0f80e7e3f2..5db23aa52b8 100644
--- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp
+++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp
@@ -253,7 +253,7 @@ static void set_reduce_scatter_worker_rt(
     std::vector<ttnn::ccl::EriscDatamoverBuilder>& cw_edm_builders,
     std::vector<ttnn::ccl::EriscDatamoverBuilder>& ccw_edm_builders,
     EdmInterfaceAddresses const& edm_interface_addresses,
-    WorkerAttributes &worker_attributes,
+    WorkerAttributes const& worker_attributes,
     std::size_t num_edm_channels,
     std::size_t edm_num_buffers_per_channel,
     ttnn::operations::binary::BinaryOpType binary_math_op) {
diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp
index 2c87dd4dd00..8c55e0e68b8 100644
--- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp
+++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp
@@ -24,7 +24,7 @@ std::vector<ttnn::SimpleShape> ReduceScatter::compute_output_shapes(const std::v
     auto shape = input_tensors[0].get_logical_shape();
     TT_FATAL(
         shape[this->scatter_dim] % this->ring_size == 0,
-        "The size of the scatter dimension must be a multiple of the ring size");
+        "The size of the scatter dimension must be a multiple of the ring size. Dimension size: {}, ring Size: {}", shape[this->scatter_dim], this->ring_size);
     shape[this->scatter_dim] /= this->ring_size;
     return std::vector<ttnn::SimpleShape>(input_tensors.size(), shape);
 }
@@ -132,6 +132,78 @@ Tensor reduce_scatter(
     return output_tensors.at(0);
 }
 
+
+
+
+Tensor reduce_scatter(
+    const Tensor &input_tensor,
+    const uint32_t scatter_dim,
+    const uint32_t cluster_axis,
+    const MeshDevice& mesh_device,
+    ttnn::operations::reduction::ReduceType reduce_op,
+    const uint32_t num_links,
+    const std::optional<MemoryConfig>& output_mem_config,
+    ttnn::ccl::Topology topology,
+    const std::optional<size_t> user_defined_num_workers,
+    const std::optional<size_t> user_defined_num_buffers_per_channel) {
+    ttnn::operations::binary::BinaryOpType binary_op_type = convert_reduce_type_to_eltwise_type(reduce_op);
+    // Mesh overload: reduce-scatter along one line of the mesh (varying row if cluster_axis == 0, else column).
+    TT_FATAL(topology == ttnn::ccl::Topology::Linear, "This reduce_scatter API with cluster_axis is currently supported only for the Linear topology");
+    const auto mesh_view = mesh_device.get_view();
+    std::size_t num_devices = (cluster_axis == 0) ? mesh_view->num_rows() : mesh_view->num_cols();
+
+    std::vector<Tensor> output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))};
+
+    operation::launch_op(
+        [scatter_dim, binary_op_type, num_links, output_mem_config, mesh_view, cluster_axis, user_defined_num_workers, user_defined_num_buffers_per_channel, num_devices, topology](
+            const std::vector<Tensor>& input_tensors,
+            const std::vector<std::optional<const Tensor>>& optional_input_tensors,
+            const std::vector<std::optional<Tensor>>& optional_output_tensors) mutable -> std::vector<Tensor> {
+
+            const auto& input_device_tensor = input_tensors.at(0);
+            // Locate this tensor's device in the mesh; its coordinate along cluster_axis is its index in the line.
+            const auto coordinate = mesh_view->find_device(input_device_tensor.device()->id());
+            const auto device_index = (cluster_axis == 0) ? coordinate.row : coordinate.col;
+
+            // Maps an index along the line to a chip id, keeping the other mesh coordinate fixed.
+            auto get_chip_id = [&](std::size_t line_index) -> std::optional<chip_id_t> {
+                auto new_coord = coordinate;
+                if (cluster_axis == 0) {
+                    new_coord.row = line_index % num_devices;
+                } else {
+                    new_coord.col = line_index % num_devices;
+                }
+                return mesh_view->find_device_id(new_coord);
+            };
+            // No wrap-around: the last device in the line has no receiver and the first has no sender.
+            bool is_last_chip_in_clockwise_direction = device_index == (num_devices - 1);
+            bool is_last_chip_in_counter_clockwise_direction = device_index == 0;
+            auto receiver_device_id = is_last_chip_in_clockwise_direction ? std::nullopt : get_chip_id(device_index + 1);
+            auto sender_device_id = is_last_chip_in_counter_clockwise_direction ? std::nullopt : get_chip_id(device_index + num_devices - 1);
+
+            return operation::run(
+                ttnn::ReduceScatter{
+                    binary_op_type,
+                    scatter_dim,
+                    num_links,
+                    num_devices,
+                    device_index,
+                    receiver_device_id,
+                    sender_device_id,
+                    output_mem_config.value_or(input_device_tensor.memory_config()),
+                    topology,
+                    user_defined_num_workers,
+                    user_defined_num_buffers_per_channel},
+                {input_device_tensor});
+        },
+        {input_tensor},
+        output_tensors);
+    return output_tensors.at(0);
+
+}
+
+
+
 } // namespace ccl
 } // namespace operations
 
diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.hpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.hpp
index 996d3078ca0..a1d5ea4f1dc 100644
--- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.hpp
+++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.hpp
@@ -51,12 +51,24 @@ operation::ProgramWithCallbacks reduce_scatter_with_workers(
 
 namespace operations{
 namespace ccl{
-    Tensor reduce_scatter(
+Tensor reduce_scatter(
     const Tensor &input_tensor,
     const uint32_t scatter_split_dim,
     ttnn::operations::reduction::ReduceType reduce_op = ttnn::operations::reduction::ReduceType::Sum,
     const uint32_t num_links = 1,
-    const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG,
+    const MemoryConfig &output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG,
+    ttnn::ccl::Topology topology = ttnn::ccl::Topology::Ring,
+    const std::optional<size_t> user_defined_num_workers = std::nullopt,
+    const std::optional<size_t> user_defined_num_buffers_per_channel = std::nullopt);
+
+Tensor reduce_scatter(
+    const ttnn::Tensor &input_tensor,
+    const uint32_t scatter_dim,
+    const uint32_t cluster_axis,
+    const MeshDevice& mesh_device,
+    ttnn::operations::reduction::ReduceType reduce_op = ttnn::operations::reduction::ReduceType::Sum,
+    const uint32_t num_links = 1,
+    const std::optional<ttnn::MemoryConfig>& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG,
     ttnn::ccl::Topology topology = ttnn::ccl::Topology::Ring,
     const std::optional<size_t> user_defined_num_workers = std::nullopt,
     const std::optional<size_t> user_defined_num_buffers_per_channel = std::nullopt);
diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.cpp
index 3802ef74873..ea28f4bd932 100644
--- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.cpp
+++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.cpp
@@ -21,5 +21,20 @@ ttnn::Tensor ExecuteReduceScatter::invoke(
     MemoryConfig out_memory_config = memory_config.value_or(input_tensor.memory_config());
     return ttnn::operations::ccl::reduce_scatter(input_tensor, scatter_dim, math_op, num_links, out_memory_config, topology, num_workers, num_buffers_per_channel);
 }
+ttnn::Tensor ExecuteReduceScatter::invoke(
+    const ttnn::Tensor& input_tensor,
+    const uint32_t scatter_dim,
+    const uint32_t cluster_axis,
+    const MeshDevice& mesh_device,
+    ttnn::operations::reduction::ReduceType math_op,
+    const uint32_t num_links,
+    const std::optional<ttnn::MemoryConfig>& memory_config,
+    ttnn::ccl::Topology topology,
+    const std::optional<size_t> num_workers,
+    const std::optional<size_t> num_buffers_per_channel) {
+    // Mesh/cluster_axis overload: fall back to the input tensor's memory config when none is supplied, then forward.
+    MemoryConfig out_memory_config = memory_config.value_or(input_tensor.memory_config());
+    return ttnn::operations::ccl::reduce_scatter(input_tensor, scatter_dim, cluster_axis, mesh_device, math_op, num_links, out_memory_config, topology, num_workers, num_buffers_per_channel);
+}
 
 }  // namespace ttnn::operations::ccl
diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.hpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.hpp
index 04a11f1f236..b7acc80e794 100644
--- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.hpp
+++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.hpp
@@ -15,6 +15,18 @@ namespace operations {
 namespace ccl {
 
 struct ExecuteReduceScatter {
+    static ttnn::Tensor invoke(
+        const Tensor &input_tensor,
+        const uint32_t scatter_dim,
+        const uint32_t cluster_axis,
+        const MeshDevice& mesh_device,
+        ttnn::operations::reduction::ReduceType reduce_op = ttnn::operations::reduction::ReduceType::Sum,
+        const uint32_t num_links = 1,
+        const std::optional<ttnn::MemoryConfig>& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG,
+        ttnn::ccl::Topology topology = ttnn::ccl::Topology::Ring,
+        const std::optional<size_t> user_defined_num_workers = std::nullopt,
+        const std::optional<size_t> user_defined_num_buffers_per_channel = std::nullopt);
+
     static ttnn::Tensor invoke(
         const ttnn::Tensor& input_tensor,
         const uint32_t scatter_dim,
diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter_pybind.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter_pybind.cpp
index 10574a7efb9..bfac2f9a1d1 100644
--- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter_pybind.cpp
+++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter_pybind.cpp
@@ -8,7 +8,6 @@
 #include <pybind11/stl.h>
 
 #include "ttnn/cpp/pybind11/decorators.hpp"
-#include "ttnn/operations/ccl/ccl_host_datastructures.hpp"
 #include "ttnn/operations/ccl/reduce_scatter/reduce_scatter.hpp"
 #include "ttnn/types.hpp"
 
@@ -44,7 +43,33 @@ void bind_reduce_scatter(pybind11::module& module, const ccl_operation_t& operat
             py::arg("memory_config") = std::nullopt,
             py::arg("topology") = ttnn::ccl::Topology::Ring,
             py::arg("num_workers") = std::nullopt,
-            py::arg("num_buffers_per_channel") = std::nullopt});
+            py::arg("num_buffers_per_channel") = std::nullopt},
+
+        ttnn::pybind_overload_t{
+            [](const ccl_operation_t& self,
+               const ttnn::Tensor& input_tensor,
+               const uint32_t scatter_dim,
+               const uint32_t cluster_axis,
+               const MeshDevice& mesh_device,
+               ttnn::operations::reduction::ReduceType math_op,
+               const uint32_t num_links,
+               const std::optional<ttnn::MemoryConfig>& output_mem_config,
+               const std::optional<size_t> num_workers,
+               const std::optional<size_t> num_buffers_per_channel,
+               const ttnn::ccl::Topology topology) -> ttnn::Tensor {
+                return self(input_tensor, scatter_dim, cluster_axis, mesh_device, math_op, num_links, output_mem_config, topology, num_workers, num_buffers_per_channel);
+            },
+            py::arg("input_tensor"),
+            py::arg("scatter_dim"),
+            py::arg("cluster_axis"),
+            py::arg("mesh_device"),
+            py::arg("math_op"),
+            py::kw_only(),
+            py::arg("num_links") = 1,
+            py::arg("memory_config") = std::nullopt,
+            py::arg("num_workers") = std::nullopt,
+            py::arg("num_buffers_per_channel") = std::nullopt,
+            py::arg("topology") = ttnn::ccl::Topology::Ring});
 }
 
 }  // namespace detail
@@ -62,6 +87,11 @@ void py_bind_reduce_scatter(pybind11::module& module) {
         Args:
             input_tensor (ttnn.Tensor): multi-device tensor
             dim (int): Dimension to perform operation
+            cluster_axis (int): Provided a MeshTensor, the axis corresponding to MeshDevice to perform the line reduce-scatter operation on.
+            mesh_device (MeshDevice): Device mesh to perform the line reduce-scatter operation on.
+        * cluster_axis and mesh_device parameters are applicable only for Linear Topology.
+
+        Mesh Tensor Programming Guide : https://github.com/tenstorrent/tt-metal/blob/main/tech_reports/Programming%20Mesh%20of%20Devices/Programming%20Mesh%20of%20Devices%20with%20TT-NN.md
 
         Keyword Args:
             num_links (int, optional): Number of links to use for the all-gather operation. Defaults to `1`.

From 047da7a837646071e1e9fcf5dc7223304aa57f46 Mon Sep 17 00:00:00 2001
From: Raymond Kim <109366641+tt-rkim@users.noreply.github.com>
Date: Fri, 25 Oct 2024 13:45:00 -0400
Subject: [PATCH 09/30] #14225: Do not use a log driver to save logs to disk as
 we'll likely not need it for debug since we have GitHub UI + it's killing our
 runners because when tests print out lots of logs, it will take up space
 because default driver is JSON logs saved to disk (#14259)

* #14225: Do not use a log driver to save logs to disk as we'll likely not need it

* #14225: Move up comment as it seems to cause an invalid command
---
 .github/actions/docker-run/action.yml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/actions/docker-run/action.yml b/.github/actions/docker-run/action.yml
index 59a26859289..54f7d431769 100644
--- a/.github/actions/docker-run/action.yml
+++ b/.github/actions/docker-run/action.yml
@@ -38,7 +38,7 @@ runs:
       uses: ./.github/actions/generate-docker-tag
       with:
         image: ${{ inputs.docker_os_arch }}
-    - name: Set 
+    - name: Set
       shell: bash
       run: |
         echo "RUNNER_UID=$(id -u)" >> $GITHUB_ENV
@@ -63,8 +63,10 @@ runs:
         # The most important option below is `--rm`. Otherwise, the machines will fill up with undeleted containers.
         # The mounting of /etc/passwd, /etc/shadow, and /etc/bashrc is required in order for the correct file permissions
         # for newly created files.
-        # Passing HOME variable is necessary to avoid Python lib installation into /home/ubuntu/.local folder which 
+        # Passing HOME variable is necessary to avoid Python lib installation into /home/ubuntu/.local folder which
         # may not be writable by the RUNNER_UID user.
+        # --log-driver none: Do not save logs to disk as we're printing them to GitHub
+        # and it takes up space
         options: |
           -u ${{ env.RUNNER_UID }}:${{ env.RUNNER_GID }}
           --rm
@@ -73,6 +75,7 @@ runs:
           -v /etc/bashrc:/etc/bashrc:ro
           -v ${{ github.workspace }}:${{ github.workspace }}
           --net=host
+          --log-driver none
           ${{ inputs.docker_opts }}
           -e LOGURU_LEVEL=${{ env.LOGURU_LEVEL }}
           -e PYTHONPATH=${{ github.workspace }}

From 59a2d5e522783fd0d0c2a07ad4af9d2203d4935c Mon Sep 17 00:00:00 2001
From: Denys Makoviichuk <dmakoviichuk@tenstorrent.com>
Date: Fri, 25 Oct 2024 11:14:19 -0700
Subject: [PATCH 10/30] #14186: Fixed moreh_adam and moreh_adamw (#14243)

* #14186: Fixed moreh_adam

* #0: fixed adam too
---
 .../device/moreh_adam_device_operation.cpp    | 23 ++++---------------
 .../device/moreh_adamw_device_operation.cpp   | 23 ++++---------------
 2 files changed, 8 insertions(+), 38 deletions(-)

diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.cpp
index cf0faa72b4d..3cc32ff7ed1 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.cpp
@@ -160,25 +160,10 @@ std::tuple<MorehAdamOperation::operation_attributes_t, MorehAdamOperation::tenso
 auto MorehAdamOperation::compute_program_hash(
     const MorehAdamOperation::operation_attributes_t& operation_attributes,
     const MorehAdamOperation::tensor_args_t& tensor_args) -> tt::stl::hash::hash_t {
-    return operation::hash_operation<MorehAdamOperation>(
-        operation_attributes.beta1,
-        operation_attributes.beta2,
-        operation_attributes.eps,
-        operation_attributes.amsgrad,
-        operation_attributes.weight_decay,
-        operation_attributes.memory_config,
-        operation_attributes.compute_kernel_config,
-        tensor_args.param_in.memory_config(),
-        tensor_args.param_in.dtype(),
-        tensor_args.grad.memory_config(),
-        tensor_args.grad.dtype(),
-        tensor_args.exp_avg_in.memory_config(),
-        tensor_args.exp_avg_in.dtype(),
-        tensor_args.exp_avg_sq_in.memory_config(),
-        tensor_args.exp_avg_sq_in.dtype(),
-        tensor_args.max_exp_avg_sq_in.has_value() ? tensor_args.max_exp_avg_sq_in.value().memory_config()
-                                                  : MemoryConfig{},
-        tensor_args.max_exp_avg_sq_in.has_value() ? tensor_args.max_exp_avg_sq_in.value().dtype() : DataType::INVALID);
+    auto operation_attributes_without_step_and_lr = operation_attributes;
+    operation_attributes_without_step_and_lr.step = 0;
+    operation_attributes_without_step_and_lr.lr = 0.0f;
+    return tt::stl::hash::hash_objects_with_default_seed(operation_attributes_without_step_and_lr, tensor_args);
 }
 
 }  // namespace ttnn::operations::moreh::moreh_adam
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/moreh_adamw_device_operation.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/moreh_adamw_device_operation.cpp
index 774d5d63885..1084b7de99e 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/moreh_adamw_device_operation.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/moreh_adamw_device_operation.cpp
@@ -152,24 +152,9 @@ MorehAdamWDeviceOperation::invoke(
 
 tt::stl::hash::hash_t MorehAdamWDeviceOperation::compute_program_hash(
     const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
-    return operation::hash_operation<MorehAdamWDeviceOperation>(
-        operation_attributes.beta1,
-        operation_attributes.beta2,
-        operation_attributes.eps,
-        operation_attributes.amsgrad,
-        operation_attributes.weight_decay,
-        operation_attributes.memory_config,
-        operation_attributes.compute_kernel_config,
-        tensor_args.param_in.memory_config(),
-        tensor_args.param_in.dtype(),
-        tensor_args.grad.memory_config(),
-        tensor_args.grad.dtype(),
-        tensor_args.exp_avg_in.memory_config(),
-        tensor_args.exp_avg_in.dtype(),
-        tensor_args.exp_avg_sq_in.memory_config(),
-        tensor_args.exp_avg_sq_in.dtype(),
-        tensor_args.max_exp_avg_sq_in.has_value() ? tensor_args.max_exp_avg_sq_in.value().memory_config()
-                                                  : MemoryConfig{},
-        tensor_args.max_exp_avg_sq_in.has_value() ? tensor_args.max_exp_avg_sq_in.value().dtype() : DataType::INVALID);
+    auto operation_attributes_without_step_and_lr = operation_attributes;
+    operation_attributes_without_step_and_lr.step = 0;
+    operation_attributes_without_step_and_lr.lr = 0.0f;
+    return tt::stl::hash::hash_objects_with_default_seed(operation_attributes_without_step_and_lr, tensor_args);
 }
 }  // namespace ttnn::operations::moreh::moreh_adamw

From ace76bf76afdfeece6947468dd18af55023f4c32 Mon Sep 17 00:00:00 2001
From: Austin Ho <aho@tenstorrent.com>
Date: Fri, 25 Oct 2024 17:51:14 +0000
Subject: [PATCH 11/30] #14238: Add missing dependency in
 tt_metal/hw/CMakeLists.txt

---
 tt_metal/hw/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tt_metal/hw/CMakeLists.txt b/tt_metal/hw/CMakeLists.txt
index e8c35d24f54..5066024abf0 100644
--- a/tt_metal/hw/CMakeLists.txt
+++ b/tt_metal/hw/CMakeLists.txt
@@ -186,6 +186,8 @@ foreach(ARCH IN LISTS ARCHS)
                 ${CMAKE_COMMAND} -E make_directory ${HW_LIB_DIR}
             COMMAND
                 ${GPP_CMD} ${GPP_FLAGS} ${GPP_DEFINES} ${GPP_INCLUDES} -c -o ${HW_LIB_DIR}/${HWLIB}.o ${${HWLIB}_SOURCE}
+            DEPENDS
+                ${${HWLIB}_SOURCE}
             COMMENT "Building hw lib ${HWLIB}.o"
             VERBATIM
         )

From 808d0da4568d21be759e06c1e6c46754f4ec36f7 Mon Sep 17 00:00:00 2001
From: Bryan Wilder Field Lozano <blozano@tenstorrent.com>
Date: Fri, 25 Oct 2024 11:34:37 -0700
Subject: [PATCH 12/30] #0: tt_cluster.cpp include what you use (#14277)

---
 tt_metal/llrt/tt_cluster.cpp | 43 ++++++++++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 7 deletions(-)

diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp
index d89cfa14e2f..597fb3b2901 100644
--- a/tt_metal/llrt/tt_cluster.cpp
+++ b/tt_metal/llrt/tt_cluster.cpp
@@ -4,18 +4,47 @@
 
 #include "tt_cluster.hpp"
 
-#include <immintrin.h>
-
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
 #include <filesystem>
-#include <iomanip>
 #include <iostream>
+#include <map>                                                       // for map
+#include <memory>
+#include <set>                                                       // for set
+#include <stdexcept>
 #include <string>
+#include <tuple>                                                     // for get
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "fmt/base.h"
+#include "tt_metal/common/base.hpp" // TODO: Eliminate this file, catchall include and is ARCH_NAME dependent
+#include "tt_metal/common/logger.hpp"
+#include "tt_metal/common/metal_soc_descriptor.h"
+#include "tt_metal/common/test_common.hpp"
+#include "tt_metal/common/tt_backend_api_types.hpp"
+#include "third_party/umd/device/tt_arch_types.h"
+#include "third_party/umd/device/tt_cluster_descriptor.h"
+#include "third_party/umd/device/tt_cluster_descriptor_types.h"
+#include "third_party/umd/device/tt_device.h"
+#include "third_party/umd/device/tt_soc_descriptor.h"
+#include "third_party/umd/device/tt_xy_pair.h"
+#include "third_party/umd/device/xy_pair.h"
+
+// TODO: ARCH_NAME specific, must remove
+#include "eth_l1_address_map.h"
+#include "dev_msgs.h"
+#include "tensix.h"
+//
+//
+#include "llrt/hal.hpp"                                              // for Hal
 
-#include "hostdevcommon/dprint_common.h"
-#include "rtoptions.hpp"
-#include "third_party/umd/device/tt_silicon_driver_common.hpp"
+#include "third_party/tracy/public/tracy/Tracy.hpp"
 #include "third_party/umd/device/simulation/tt_simulation_device.h"
-#include "tools/profiler/profiler.hpp"
+
 #include "tt_metal/impl/debug/sanitize_noc_host.hpp"
 #include "tt_metal/llrt/rtoptions.hpp"
 #include "tt_metal/llrt/tlb_config.hpp"

From ebe9cad56193c7d184c29da0327b21e97bbfeb06 Mon Sep 17 00:00:00 2001
From: Patrick Roberts <proberts@tenstorrent.com>
Date: Wed, 23 Oct 2024 21:19:30 +0000
Subject: [PATCH 13/30] #11208: Make v0::Program opaque(ish)

---
 .../detail/reports/compilation_reporter.cpp   |  24 +-
 .../detail/reports/compilation_reporter.hpp   |   4 +-
 tt_metal/detail/reports/memory_reporter.cpp   |   8 +-
 tt_metal/detail/reports/memory_reporter.hpp   |   2 +-
 tt_metal/impl/dispatch/command_queue.cpp      |  61 +--
 tt_metal/impl/program/program.cpp             | 428 +++++++++++++++---
 tt_metal/impl/program/program.hpp             | 146 +-----
 7 files changed, 436 insertions(+), 237 deletions(-)

diff --git a/tt_metal/detail/reports/compilation_reporter.cpp b/tt_metal/detail/reports/compilation_reporter.cpp
index 2940e7fe879..9a681cbca0b 100644
--- a/tt_metal/detail/reports/compilation_reporter.cpp
+++ b/tt_metal/detail/reports/compilation_reporter.cpp
@@ -79,13 +79,13 @@ std::string kernel_attributes_str(std::shared_ptr<Kernel> kernel) {
     return attr_str;
 }
 
-void CompilationReporter::add_kernel_compile_stats(const Program &program, std::shared_ptr<Kernel> kernel, bool cache_hit, size_t kernel_hash) {
+void CompilationReporter::add_kernel_compile_stats(uint64_t program_id, std::shared_ptr<Kernel> kernel, bool cache_hit, size_t kernel_hash) {
     std::unique_lock<std::mutex> lock(mutex_);
 
     if (cache_hit) {
-        this->program_id_to_cache_hit_counter_[program.get_id()].hits++;
+        this->program_id_to_cache_hit_counter_[program_id].hits++;
     } else {
-        this->program_id_to_cache_hit_counter_[program.get_id()].misses++;
+        this->program_id_to_cache_hit_counter_[program_id].misses++;
     }
     std::string kernel_stats = "," + kernel->name() + ",";
     std::string cache_status = cache_hit ? "cache hit" : "cache miss";
@@ -99,13 +99,13 @@ void CompilationReporter::add_kernel_compile_stats(const Program &program, std::
         }
         index++;
     }
-    this->program_id_to_kernel_stats_[program.get_id()].push_back(kernel_stats);
+    this->program_id_to_kernel_stats_[program_id].push_back(kernel_stats);
 }
 
-void CompilationReporter::flush_program_entry(const Program &program, bool persistent_compilation_cache_enabled) {
+void CompilationReporter::flush_program_entry(uint64_t program_id, size_t num_kernels, std::function<std::shared_ptr<Kernel>(size_t)> get_kernel, bool persistent_compilation_cache_enabled) {
     std::unique_lock<std::mutex> lock(mutex_);
-    auto num_cache_misses = this->program_id_to_cache_hit_counter_.at(program.get_id()).misses;
-    auto num_cache_hits = this->program_id_to_cache_hit_counter_.at(program.get_id()).hits;
+    auto num_cache_misses = this->program_id_to_cache_hit_counter_.at(program_id).misses;
+    auto num_cache_hits = this->program_id_to_cache_hit_counter_.at(program_id).hits;
     if (this->total_num_compile_programs_ == 0) {
         this->init_reports();
     }
@@ -113,8 +113,8 @@ void CompilationReporter::flush_program_entry(const Program &program, bool persi
     auto get_num_compute_and_data_movement_kernels = [&]() {
         uint32_t num_compute = 0;
         uint32_t num_data_movement = 0;
-        for (size_t kernel_id = 0; kernel_id < program.num_kernels(); kernel_id++) {
-            const auto kernel = detail::GetKernel(program, kernel_id);
+        for (size_t kernel_id = 0; kernel_id < num_kernels; kernel_id++) {
+            const auto kernel = get_kernel(kernel_id);
             if (kernel->processor() == tt::RISCV::BRISC or kernel->processor() == tt::RISCV::NCRISC) {
                 num_data_movement++;
             } else {
@@ -126,14 +126,14 @@ void CompilationReporter::flush_program_entry(const Program &program, bool persi
 
     auto [num_compute_kernels, num_data_movement_kernels] = get_num_compute_and_data_movement_kernels();
 
-    this->summary_report_ << program.get_id() << ", "
+    this->summary_report_ << program_id << ", "
                             << num_compute_kernels << ", "
                             << num_data_movement_kernels << ", "
                             << (persistent_compilation_cache_enabled ? "Y" : "N") << ", "
                             << num_cache_misses << ", "
                             << num_cache_hits << "\n";
 
-    this->detailed_report_ << "Compiling Program: " << program.get_id() << "\n";
+    this->detailed_report_ << "Compiling Program: " << program_id << "\n";
     this->detailed_report_ << "\n,Kernel Creation Report:\n";
     this->detailed_report_ << ",,Number of Compute CreateKernel API calls: " << num_compute_kernels << "\n";
     this->detailed_report_ << ",,Number of Datamovement CreateKernel API calls: " << num_data_movement_kernels << "\n";
@@ -144,7 +144,7 @@ void CompilationReporter::flush_program_entry(const Program &program, bool persi
     this->detailed_report_ << ",,Total number of kernel compile cache hits: " << num_cache_hits << "\n";
 
     this->detailed_report_ << "\n,Kernel File Name, Core Range, Cache Hit, Kernel Attributes, Hash\n";
-    auto kernel_stats_vec = this->program_id_to_kernel_stats_.at(program.get_id());
+    auto kernel_stats_vec = this->program_id_to_kernel_stats_.at(program_id);
     for (const auto &kernel_stats : kernel_stats_vec) {
         this->detailed_report_ << kernel_stats;
     }
diff --git a/tt_metal/detail/reports/compilation_reporter.hpp b/tt_metal/detail/reports/compilation_reporter.hpp
index c976bf5c8bc..23707b8eff3 100644
--- a/tt_metal/detail/reports/compilation_reporter.hpp
+++ b/tt_metal/detail/reports/compilation_reporter.hpp
@@ -45,9 +45,9 @@ class CompilationReporter {
     CompilationReporter(const CompilationReporter&) = delete;
     CompilationReporter(CompilationReporter&& other) noexcept = delete;
 
-    void add_kernel_compile_stats(const Program &program, std::shared_ptr<Kernel> kernel, bool cache_hit, size_t kernel_hash);
+    void add_kernel_compile_stats(uint64_t program_id, std::shared_ptr<Kernel> kernel, bool cache_hit, size_t kernel_hash);
 
-    void flush_program_entry(const Program &program, bool persistent_compilation_cache_enabled);
+    void flush_program_entry(uint64_t program_id, size_t num_kernels, std::function<std::shared_ptr<Kernel>(size_t)> get_kernel, bool persistent_compilation_cache_enabled);
     static CompilationReporter& inst();
     static void toggle (bool state);
     static bool enabled ();
diff --git a/tt_metal/detail/reports/memory_reporter.cpp b/tt_metal/detail/reports/memory_reporter.cpp
index 1bffc4421dd..5275d438742 100644
--- a/tt_metal/detail/reports/memory_reporter.cpp
+++ b/tt_metal/detail/reports/memory_reporter.cpp
@@ -100,14 +100,14 @@ void populate_reports(const Device *device, std::ofstream &memory_usage_summary_
     write_memory_usage(device, BufferType::L1, memory_usage_summary_report, detailed_memory_usage_report, l1_usage_summary_report);
 }
 
-void MemoryReporter::flush_program_memory_usage(const Program &program, const Device *device) {
+void MemoryReporter::flush_program_memory_usage(uint64_t program_id, const Device *device) {
     if (not this->program_memory_usage_summary_report_.is_open()) {
         this->init_reports();
     }
 
-    this->program_memory_usage_summary_report_ << program.get_id();
-    this->program_l1_usage_summary_report_ << program.get_id();
-    this->program_detailed_memory_usage_report_ << program.get_id();
+    this->program_memory_usage_summary_report_ << program_id;
+    this->program_l1_usage_summary_report_ << program_id;
+    this->program_detailed_memory_usage_report_ << program_id;
 
     populate_reports(device, this->program_memory_usage_summary_report_, this->program_detailed_memory_usage_report_, this->program_l1_usage_summary_report_);
 }
diff --git a/tt_metal/detail/reports/memory_reporter.hpp b/tt_metal/detail/reports/memory_reporter.hpp
index e5138f02a35..217f6490522 100644
--- a/tt_metal/detail/reports/memory_reporter.hpp
+++ b/tt_metal/detail/reports/memory_reporter.hpp
@@ -60,7 +60,7 @@ class MemoryReporter {
     MemoryReporter(const MemoryReporter&) = delete;
     MemoryReporter(MemoryReporter&& other) noexcept = delete;
 
-    void flush_program_memory_usage(const Program &program, const Device *device);
+    void flush_program_memory_usage(uint64_t program_id, const Device *device);
 
     void dump_memory_usage_state(const Device *device, std::string prefix="") const;
 
diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp
index 32c7ac99e73..3c5620b6839 100644
--- a/tt_metal/impl/dispatch/command_queue.cpp
+++ b/tt_metal/impl/dispatch/command_queue.cpp
@@ -730,8 +730,9 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro
     const uint32_t max_prefetch_command_size =
         dispatch_constants::get(dispatch_core_type).max_prefetch_command_size();
 
+    const auto &program_transfer_info = program.get_program_transfer_info();
     // Multicast Semaphore Cmd
-    uint32_t num_multicast_semaphores = program.program_transfer_info.multicast_semaphores.size();
+    uint32_t num_multicast_semaphores = program_transfer_info.multicast_semaphores.size();
     std::vector<std::vector<CQDispatchWritePackedMulticastSubCmd>> multicast_sem_sub_cmds(num_multicast_semaphores);
     std::vector<std::vector<std::pair<const void*, uint32_t>>> multicast_sem_data(num_multicast_semaphores);
     std::vector<std::vector<std::pair<uint32_t, uint32_t>>> multicast_sem_payload(num_multicast_semaphores);
@@ -739,7 +740,7 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro
     multicast_sem_dst_size.reserve(num_multicast_semaphores);
     if (num_multicast_semaphores > 0) {
         uint32_t i = 0;
-        for (const auto& [dst, transfer_info_vec] : program.program_transfer_info.multicast_semaphores) {
+        for (const auto& [dst, transfer_info_vec] : program_transfer_info.multicast_semaphores) {
             // TODO: loop over things inside transfer_info[i]
             uint32_t write_packed_len = transfer_info_vec[0].data.size();
             multicast_sem_dst_size.emplace_back(std::make_pair(dst, write_packed_len * sizeof(uint32_t)));
@@ -768,7 +769,7 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro
     }
 
     // Unicast Semaphore Cmd
-    uint32_t num_unicast_semaphores = program.program_transfer_info.unicast_semaphores.size();
+    uint32_t num_unicast_semaphores = program_transfer_info.unicast_semaphores.size();
     std::vector<std::vector<CQDispatchWritePackedUnicastSubCmd>> unicast_sem_sub_cmds(num_unicast_semaphores);
     std::vector<std::vector<std::pair<const void*, uint32_t>>> unicast_sem_data(num_unicast_semaphores);
     std::vector<std::vector<std::pair<uint32_t, uint32_t>>> unicast_sem_payload(num_unicast_semaphores);
@@ -776,7 +777,7 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro
     unicast_sem_dst_size.reserve(num_unicast_semaphores);
     if (num_unicast_semaphores > 0) {
         uint32_t i = 0;
-        for (const auto& [dst, transfer_info_vec] : program.program_transfer_info.unicast_semaphores) {
+        for (const auto& [dst, transfer_info_vec] : program_transfer_info.unicast_semaphores) {
             // TODO: loop over things inside transfer_info[i]
             uint32_t write_packed_len = transfer_info_vec[0].data.size();
             unicast_sem_dst_size.emplace_back(std::make_pair(dst, write_packed_len * sizeof(uint32_t)));
@@ -876,7 +877,8 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro
     const uint32_t max_length_per_sub_cmd = dispatch_constants::get(this->dispatch_core_type).scratch_db_size() / 2;
     const uint32_t max_paged_length_per_sub_cmd =
         max_length_per_sub_cmd / HostMemDeviceCommand::PROGRAM_PAGE_SIZE * HostMemDeviceCommand::PROGRAM_PAGE_SIZE;
-    for (const auto& [cores, num_mcast_dests, kg_transfer_info] : program.program_transfer_info.kernel_bins) {
+    const auto &kernels_buffer = program.get_kernels_buffer();
+    for (const auto& [cores, num_mcast_dests, kg_transfer_info] : program_transfer_info.kernel_bins) {
         bool write_linear;
         uint32_t noc_encoding;
         std::visit(
@@ -913,14 +915,14 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro
 
                 uint32_t base_address, page_offset;
                 if (kg_transfer_info.page_offsets[kernel_idx] > CQ_PREFETCH_RELAY_PAGED_START_PAGE_MASK) {
-                    const uint32_t num_banks = this->device->num_banks(this->program.kernels_buffer->buffer_type());
+                    const uint32_t num_banks = this->device->num_banks(kernels_buffer->buffer_type());
                     page_offset = kg_transfer_info.page_offsets[kernel_idx] % num_banks;
                     uint32_t num_full_pages_written_per_bank =
                         kg_transfer_info.page_offsets[kernel_idx] / num_banks;
-                    base_address = this->program.kernels_buffer->address() +
-                                    num_full_pages_written_per_bank * this->program.kernels_buffer->page_size();
+                    base_address = kernels_buffer->address() +
+                                    num_full_pages_written_per_bank * kernels_buffer->page_size();
                 } else {
-                    base_address = this->program.kernels_buffer->address();
+                    base_address = kernels_buffer->address();
                     page_offset = kg_transfer_info.page_offsets[kernel_idx];
                 }
 
@@ -928,11 +930,11 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro
                     true,  // is_dram
                     page_offset,
                     base_address,
-                    this->program.kernels_buffer->page_size(),
-                    relayed_bytes / this->program.kernels_buffer->page_size(),
+                    kernels_buffer->page_size(),
+                    relayed_bytes / kernels_buffer->page_size(),
                     length_adjust);
             } else {
-                uint32_t base_address = this->program.kernels_buffer->address();
+                uint32_t base_address = kernels_buffer->address();
                 uint32_t page_offset = kg_transfer_info.page_offsets[kernel_idx];
                 uint32_t dst_addr = kg_transfer_info.dst_base_addrs[kernel_idx];
                 uint32_t aligned_length = align(kg_transfer_info.lengths[kernel_idx], hal.get_alignment(HalMemType::DRAM));
@@ -1068,7 +1070,7 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro
     }
     // if dispatch_s is enabled have dispatch_d send a semaphore update to dispatch_s (this will include a write barrier on dispatch_d if program is active)
     // if not,  check if the program is active on workers. If active, have dispatch_d issue a write barrier
-    cmd_sequence_sizeB += (this->device->dispatch_s_enabled() || program.program_transfer_info.num_active_cores > 0) * CQ_PREFETCH_CMD_BARE_MIN_SIZE;
+    cmd_sequence_sizeB += (this->device->dispatch_s_enabled() || program_transfer_info.num_active_cores > 0) * CQ_PREFETCH_CMD_BARE_MIN_SIZE;
 
     // either dispatch_s or dispatch_d will send the go signal (go_signal_mcast command)
     cmd_sequence_sizeB += CQ_PREFETCH_CMD_BARE_MIN_SIZE;
@@ -1251,11 +1253,11 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro
     DispatcherSelect dispatcher_for_go_signal = DispatcherSelect::DISPATCH_MASTER;
     if (this->device->dispatch_s_enabled()) {
         // dispatch_d signals dispatch_s to send the go signal, use a barrier if there are cores active
-        device_command_sequence.add_notify_dispatch_s_go_signal_cmd(program.program_transfer_info.num_active_cores > 0);
+        device_command_sequence.add_notify_dispatch_s_go_signal_cmd(program_transfer_info.num_active_cores > 0);
         dispatcher_for_go_signal = DispatcherSelect::DISPATCH_SLAVE;
     } else {
         // Wait Noc Write Barrier, wait for binaries/configs and launch_msg to be written to worker cores
-        if (program.program_transfer_info.num_active_cores > 0) {
+        if (program_transfer_info.num_active_cores > 0) {
             device_command_sequence.add_dispatch_wait(true, this->dispatch_message_addr, 0, 0, false, false);
         }
     }
@@ -1463,7 +1465,7 @@ void EnqueueProgramCommand::process() {
     }
 
     const std::pair<ConfigBufferSync, std::vector<ConfigBufferEntry>&> reservation =
-        this->manager.get_config_buffer_mgr().reserve(program.program_config_sizes_);
+        this->manager.get_config_buffer_mgr().reserve(program.get_program_config_sizes());
     bool stall_first = reservation.first.need_sync;
     // Note: since present implementation always stalls, we always free up to "now"
     this->manager.get_config_buffer_mgr().free(reservation.first.sync_count);
@@ -1484,9 +1486,10 @@ void EnqueueProgramCommand::process() {
     // Cache is only usable if caching is enabled and program is finalized
     // If cache has a program entry but the program is not finalized, then the cache is stale
     // Currently this is mapped by device, but will be mapped by multiple values in the future
+    auto &cached_program_command_sequences = program.get_cached_program_command_sequences();
     uint64_t command_hash = this->device->id();
-    auto cached_cmd_iter = this->program.cached_program_command_sequences_.find(command_hash);
-    bool is_cached = is_finalized && cached_cmd_iter != this->program.cached_program_command_sequences_.end();
+    auto cached_cmd_iter = cached_program_command_sequences.find(command_hash);
+    bool is_cached = is_finalized && cached_cmd_iter != cached_program_command_sequences.end();
 
     // Calculate all commands size and determine how many fetch q entries to use
     // Preamble, some waits and stalls
@@ -1506,7 +1509,7 @@ void EnqueueProgramCommand::process() {
         this->assemble_device_commands(program_command_sequence, kernel_config_addrs);
         this->write_program_command_sequence(program_command_sequence, stall_first);
         this->assemble_stall_commands(program_command_sequence, false);
-        this->program.cached_program_command_sequences_.insert({command_hash, std::move(program_command_sequence)});
+        cached_program_command_sequences.insert({command_hash, std::move(program_command_sequence)});
     } else {
         static constexpr uint32_t wait_count_offset = (sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, wait.count));
         static constexpr uint32_t tensix_l1_write_offset_offset =
@@ -2230,20 +2233,19 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) {
     ZoneScopedN("HWCommandQueue_enqueue_program");
     if (not program.is_finalized()) {
         TT_FATAL(!this->manager.get_bypass_mode(), "Tracing should only be used when programs have been cached");
-        if (program.kernels_buffer != nullptr) {
+        if (const auto &kernels_buffer = program.get_kernels_buffer()) {
             this->enqueue_write_buffer(
-                *program.kernels_buffer, program.program_transfer_info.binary_data.data(), false);
+                *kernels_buffer, program.get_program_transfer_info().binary_data.data(), false);
         }
     }
 #ifdef DEBUG
     if (tt::llrt::OptionsG.get_validate_kernel_binaries()) {
         TT_FATAL(!this->manager.get_bypass_mode(), "Tracing cannot be used while validating program binaries");
-        if (program.kernels_buffer != nullptr) {
-            const auto& buffer = program.kernels_buffer;
+        if (const auto &buffer = program.get_kernels_buffer()) {
             std::vector<uint32_t> read_data(buffer->page_size() * buffer->num_pages() / sizeof(uint32_t));
-            this->enqueue_read_buffer(*program.kernels_buffer, read_data.data(), true);
+            this->enqueue_read_buffer(*buffer, read_data.data(), true);
             TT_FATAL(
-                program.program_transfer_info.binary_data == read_data,
+                program.get_program_transfer_info().binary_data == read_data,
                 "Binary for program to be executed is corrupted. Another program likely corrupted this binary");
         }
     }
@@ -2293,12 +2295,11 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) {
 #ifdef DEBUG
     if (tt::llrt::OptionsG.get_validate_kernel_binaries()) {
         TT_FATAL(!this->manager.get_bypass_mode(), "Tracing cannot be used while validating program binaries");
-        if (program.kernels_buffer != nullptr) {
-            const auto& buffer = program.kernels_buffer;
+        if (const auto& buffer = program.get_kernels_buffer()) {
             std::vector<uint32_t> read_data(buffer->page_size() * buffer->num_pages() / sizeof(uint32_t));
-            this->enqueue_read_buffer(*program.kernels_buffer, read_data.data(), true);
+            this->enqueue_read_buffer(*buffer, read_data.data(), true);
             TT_FATAL(
-                program.program_transfer_info.binary_data == read_data,
+                program.get_program_transfer_info().binary_data == read_data,
                 "Binary for program that executed is corrupted. This program likely corrupted its own binary.");
         }
     }
@@ -2307,7 +2308,7 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) {
     log_trace(
         tt::LogMetal,
         "Created EnqueueProgramCommand (active_cores: {} bypass_mode: {} expected_workers_completed: {})",
-        program.program_transfer_info.num_active_cores,
+        program.get_program_transfer_info().num_active_cores,
         this->manager.get_bypass_mode(),
         expected_workers_completed);
 }
diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp
index 4cece79d0c4..25004bbab29 100644
--- a/tt_metal/impl/program/program.cpp
+++ b/tt_metal/impl/program/program.cpp
@@ -78,8 +78,173 @@ size_t KernelCompileHash(const std::shared_ptr<Kernel> kernel, JitBuildOptions &
 }  // namespace
 namespace detail {
 
+// Implementation class behind Program: Program creates one in its constructor (pimpl_) and forwards calls to it.
+class Program_ {
+   public:
+    Program_();
+
+    Program_(const Program_ &other) = delete;  // not copyable
+    Program_& operator=(const Program_ &other) = delete;
+
+    Program_(Program_ &&other) = default;  // movable, member-wise
+    Program_& operator=(Program_ &&other) = default;
+
+    void set_runtime_id(uint64_t id);
+    ~Program_() noexcept = default;
+
+    uint64_t get_id() const;
+    uint64_t get_runtime_id() const;
+
+    size_t num_kernels() const;
+
+    const std::vector<std::shared_ptr<CircularBuffer>> &circular_buffers() const;
+
+    const std::vector< Semaphore > & semaphores() const;
+
+    KernelGroup * kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index);
+    std::vector<KernelGroup>& get_kernel_groups(uint32_t programmable_core_type_index);
+    void add_buffer(std::shared_ptr<Buffer> buf);
+    void release_buffers();
+    std::vector<std::shared_ptr<CircularBuffer>> circular_buffers_on_core(const CoreCoord &core) const;
+
+    std::vector<std::shared_ptr<CircularBuffer>> circular_buffers_on_corerange(const CoreRange &cr) const;
+
+    std::vector<CoreRange> circular_buffers_unique_coreranges() const;
+
+    std::vector<std::reference_wrapper<const Semaphore>> semaphores_on_core(const CoreCoord &core) const;
+
+    size_t num_semaphores () const;
+    void init_semaphores ( const Device & device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const;
+    // TODO: this should return a const reference rather than a copy
+    std::vector<std::vector<CoreCoord>> logical_cores() const;
+
+    void compile(Device * device, bool fd_bootloader_mode = false);
+
+    void invalidate_circular_buffer_allocation();
+
+    void allocate_circular_buffers(const Device *device);
+
+    bool is_finalized() const;
+    void finalize(Device *device);
+    std::shared_ptr<Kernel> get_kernel(KernelHandle kernel_id) const;
+
+    ProgramConfig& get_program_config(uint32_t programmable_core_type_index);
+
+    // Accessors for debug/test use
+    uint32_t get_sem_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const;
+    uint32_t get_cb_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const;
+    uint32_t get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const;
+    uint32_t get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const;
+
+   private:
+    void populate_dispatch_data(Device *device);
+
+    // Buffers temporarily owned by the program
+    std::vector<std::shared_ptr<Buffer>> owned_buffer_pool = {};
+
+    // The buffer that holds the kernel/binaries/etc for this program
+    std::shared_ptr<Buffer> kernels_buffer = nullptr;
+    ProgramTransferInfo program_transfer_info;
+
+    struct CircularBufferAllocator {
+        CircularBufferAllocator(const CoreRange &core_range_) : core_range(core_range_) {}
+
+        // Circular buffers are created and allocated at core range granularity
+        CoreRange core_range;
+
+        // Holds vector of addresses where circular buffers are allocated [start, end)
+        // There are multiple ranges because per core L1 regions are not in lockstep but circular buffers spanning multiple cores must share the same address
+        // To enable this, circular buffer address is the maximum address amongst all of its target cores
+        // This vector is sorted from lower to higher address spaces
+        std::vector<std::pair<uint64_t, uint64_t>> l1_regions;
+
+        // Returns address for next circular buffer
+        // Circular buffers are placed sequentially on a core so the next available address gets appended to the last L1 region
+        uint64_t get_cb_region_end() const {
+            return this->l1_regions.empty() ? 0 : this->l1_regions.back().second;
+        }
+
+        // If address is the end of the last L1 region, the last region is extended by size bytes,
+        //  otherwise address must be higher than existing regions and a new L1 region [address, address + size) is added
+        void mark_address(uint64_t address, uint64_t size, uint64_t base_address);
+
+        // Reset when circular buffer allocation is invalidated
+        void reset_available_addresses() { this->l1_regions.clear(); }
+    };
+
+    uint64_t id; // Not const so the defaulted move assignment stays usable
+    uint64_t runtime_id;
+    static std::atomic<uint64_t> program_counter;  // post-incremented by the constructor to assign each new id
+    std::vector<std::unordered_map<KernelHandle, std::shared_ptr<Kernel> >> kernels_;
+    std::vector<CoreCoord> grid_extent_;
+
+    std::vector<std::shared_ptr<CircularBuffer>> circular_buffers_;
+    std::unordered_map<CBHandle,  std::shared_ptr<CircularBuffer>> circular_buffer_by_id_;
+    // Tracks which circular buffer indices are being used
+    std::unordered_map<CoreCoord, std::bitset<NUM_CIRCULAR_BUFFERS>> per_core_cb_indices_;
+    // Used to generate circular buffer addresses. There is one CircularBufferAllocator per unique CoreRange
+    std::vector<CircularBufferAllocator> cb_allocators_;
+
+    std::vector<Semaphore> semaphores_;
+
+    std::unordered_set<chip_id_t> compiled_;
+    bool local_circular_buffer_allocation_needed_;
+    bool finalized_;  // declared after the members above so declaration order matches the constructor's initializer list
+
+    static constexpr uint8_t core_to_kernel_group_invalid_index = 0xff;  // "no kernel group": kernels_on_core() returns nullptr for it
+    std::vector<std::vector<KernelGroup>> kernel_groups_;
+    std::vector<std::vector<uint8_t>> core_to_kernel_group_index_table_;
+    uint32_t tensix_go_signal_count_;
+
+    std::vector<std::shared_ptr<Buffer>> config_buffers_;
+
+    std::vector<ProgramConfig> program_configs_;
+    std::vector<uint32_t> program_config_sizes_;
+
+    std::unordered_map<uint64_t, ProgramCommandSequence> cached_program_command_sequences_;
+
+    friend std::shared_ptr<CircularBuffer> GetCircularBuffer(const Program &program, CBHandle id);
+    friend void ValidateCircularBufferRegion(const Program &program, const Device *device);
+
+    friend KernelHandle AddKernel(Program &program, std::shared_ptr<Kernel> kernel, const HalProgrammableCoreType core_type);
+
+    KernelHandle add_kernel(std::shared_ptr<Kernel> kernel, const HalProgrammableCoreType &core_type);
+
+    CBHandle add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config);
+    std::shared_ptr<CircularBuffer> get_circular_buffer(CBHandle cb_id) const;
+
+    void add_semaphore(const CoreRangeSet & crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type);
+
+    friend void AddConfigBuffer(Program &program, std::shared_ptr<Buffer> config_buffer);
+    void add_config_buffer(std::shared_ptr<Buffer> config_buffer);
+
+    // Ensures that statically allocated circular buffers do not grow into L1 buffer space
+    void validate_circular_buffer_region(const Device *device) const;
+
+    void set_cb_data_fmt( Device *device, const std::vector<CoreRange> & crs, JitBuildOptions& build_options) const;
+
+    void set_cb_tile_dims( Device *device, const std::vector<CoreRange> & crs, JitBuildOptions& build_options) const;
+
+    void update_kernel_groups(uint32_t programmable_core_type_index);
+
+    uint32_t& get_program_config_size(uint32_t programmable_core_type_index);
+
+    uint32_t finalize_rt_args(uint32_t programmable_core_type_index, uint32_t base_offset);
+    uint32_t finalize_sems(uint32_t programmable_core_type_index, uint32_t base_offset);
+    uint32_t finalize_cbs(uint32_t programmable_core_type_index, uint32_t base_offset);
+    uint32_t finalize_kernel_bins(Device *device, uint32_t programmable_core_type_index, uint32_t base_offset);
+    void set_launch_msg_sem_offsets();
+
+    bool runs_on_noc_unicast_only_cores();
+    bool runs_on_noc_multicast_only_cores();
+
+    friend HWCommandQueue;
+    friend EnqueueProgramCommand;
+    friend Program;
+};
+
 KernelHandle AddKernel (Program &program, std::shared_ptr<Kernel> kernel, const HalProgrammableCoreType core_type) {
-    return program.add_kernel(kernel, core_type);
+    return program.pimpl_->add_kernel(std::move(kernel), core_type);
 }
 
 std::shared_ptr<Kernel> GetKernel(const Program &program, KernelHandle kernel_id) {
@@ -87,16 +252,16 @@ std::shared_ptr<Kernel> GetKernel(const Program &program, KernelHandle kernel_id
 }
 
 std::shared_ptr<CircularBuffer> GetCircularBuffer(const Program &program, CBHandle id) {
-    return program.get_circular_buffer(id);
+    return program.pimpl_->get_circular_buffer(id);
 }
 
 // Checks that circular buffers do not grow into L1 buffer space
 void ValidateCircularBufferRegion(const Program &program, const Device *device) {
-    program.validate_circular_buffer_region(device);
+    program.pimpl_->validate_circular_buffer_region(device);
 }
 
 void AddConfigBuffer(Program &program, std::shared_ptr<Buffer> config_buffer) {
-    program.add_config_buffer(config_buffer);
+    program.pimpl_->add_config_buffer(std::move(config_buffer));
 }
 
 void EnablePersistentKernelCache() { enable_persistent_kernel_cache = true; }
@@ -104,12 +269,11 @@ void EnablePersistentKernelCache() { enable_persistent_kernel_cache = true; }
 void DisablePersistentKernelCache() { enable_persistent_kernel_cache = false; }
 }  // namespace detail
 
-std::atomic<uint64_t> Program::program_counter = 0;
+std::atomic<uint64_t> detail::Program_::program_counter = 0;
 
-Program::Program() :
+detail::Program_::Program_() :
     id(program_counter++),
     runtime_id(0),
-    worker_crs_(),
     local_circular_buffer_allocation_needed_(false),
     finalized_(false) {
     uint32_t programmable_core_count = hal.get_programmable_core_type_count();
@@ -124,7 +288,9 @@ Program::Program() :
     program_config_sizes_.resize(programmable_core_count);
 }
 
-KernelHandle Program::add_kernel(std::shared_ptr<Kernel> kernel, const HalProgrammableCoreType &programmable_core_type) {
+Program::Program() : pimpl_(std::make_unique<detail::Program_>()) {}  // creates the implementation object the forwarding methods delegate to
+
+KernelHandle detail::Program_::add_kernel(std::shared_ptr<Kernel> kernel, const HalProgrammableCoreType &programmable_core_type) {
     TT_FATAL(this->compiled_.empty(), "Cannot add kernel to an already compiled program {}", this->id);
     // Id is unique across all kernels on all core types
     KernelHandle id = this->num_kernels();
@@ -135,7 +301,7 @@ KernelHandle Program::add_kernel(std::shared_ptr<Kernel> kernel, const HalProgra
     return id;
 }
 
-std::shared_ptr<Kernel> Program::get_kernel(KernelHandle kernel_id) const {
+std::shared_ptr<Kernel> detail::Program_::get_kernel(KernelHandle kernel_id) const {
     // TT_ASSERT(kernel_id < this->kernels_.size(), "Expected Kernel with ID {} to be in Program {}", kernel_id,
     // this->id);
     //  find coretype based on kernel_id
@@ -149,10 +315,12 @@ std::shared_ptr<Kernel> Program::get_kernel(KernelHandle kernel_id) const {
     return nullptr;
 }
 
+std::shared_ptr<Kernel> Program::get_kernel(KernelHandle kernel_id) const { return pimpl_->get_kernel(kernel_id); }
+
 KernelGroup::KernelGroup() : core_ranges(CoreRangeSet()) {}
 
 KernelGroup::KernelGroup(
-    const Program &program,
+    const detail::Program_ &program,
     uint32_t programmable_core_type_index,
     kernel_id_array_t kernel_ids,
     bool erisc_is_idle,
@@ -219,12 +387,16 @@ CoreType KernelGroup::get_core_type() const {
     return hal.get_core_type(this->programmable_core_type_index);
 };
 
-std::vector<KernelGroup> &Program::get_kernel_groups(uint32_t programmable_core_type_index) {
+std::vector<KernelGroup> &detail::Program_::get_kernel_groups(uint32_t programmable_core_type_index) {
     update_kernel_groups(programmable_core_type_index);
     return kernel_groups_[programmable_core_type_index];
 }
 
-KernelGroup *Program::kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index) {
+std::vector<KernelGroup> &Program::get_kernel_groups(uint32_t programmable_core_type_index) {
+    return pimpl_->get_kernel_groups(programmable_core_type_index);  // implementation calls update_kernel_groups() before returning its groups
+}
+
+KernelGroup *detail::Program_::kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index) {
     update_kernel_groups(programmable_core_type_index);
     if (core.x >= grid_extent_[programmable_core_type_index].x || core.y >= grid_extent_[programmable_core_type_index].y)
         return nullptr;
@@ -232,6 +404,10 @@ KernelGroup *Program::kernels_on_core(const CoreCoord &core, uint32_t programmab
     return (index == core_to_kernel_group_invalid_index) ? nullptr : &kernel_groups_[programmable_core_type_index].at(index);
 }
 
+KernelGroup *Program::kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index) {
+    return pimpl_->kernels_on_core(core, programmable_core_type_index);  // nullptr if core is outside the grid extent or has no kernel group
+}
+
 struct KernelGroupInt {
     bool valid;
     kernel_id_array_t kernel_ids;
@@ -262,7 +438,7 @@ struct KernelGroupIntHasher {
     }
 };
 
-void Program::update_kernel_groups(uint32_t programmable_core_type_index) {
+void detail::Program_::update_kernel_groups(uint32_t programmable_core_type_index) {
     if (core_to_kernel_group_index_table_[programmable_core_type_index].size() == 0) {
         bool erisc_is_idle = false;
 
@@ -351,7 +527,7 @@ void Program::update_kernel_groups(uint32_t programmable_core_type_index) {
     }
 }
 
-void Program::CircularBufferAllocator::mark_address(uint64_t address, uint64_t size, uint64_t base_address) {
+void detail::Program_::CircularBufferAllocator::mark_address(uint64_t address, uint64_t size, uint64_t base_address) {
     if (this->l1_regions.empty()) {
         this->l1_regions.emplace_back(base_address, base_address);
     }
@@ -370,7 +546,7 @@ void Program::CircularBufferAllocator::mark_address(uint64_t address, uint64_t s
     }
 }
 
-CBHandle Program::add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config) {
+CBHandle detail::Program_::add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config) {
     TT_FATAL(this->compiled_.empty(), "Cannot add circular buffer to an already compiled program {}", this->id);
     std::shared_ptr<CircularBuffer> circular_buffer = std::make_shared<CircularBuffer>(core_range_set, config);
     // Globally allocated circular buffer do not invalidate allocation because their addresses are tracked by memory
@@ -421,14 +597,18 @@ CBHandle Program::add_circular_buffer(const CoreRangeSet &core_range_set, const
     return circular_buffer->id();
 }
 
-std::shared_ptr<CircularBuffer> Program::get_circular_buffer(CBHandle cb_id) const {
+CBHandle Program::add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config) {
+    return pimpl_->add_circular_buffer(core_range_set, config);
+}
+
+std::shared_ptr<CircularBuffer> detail::Program_::get_circular_buffer(CBHandle cb_id) const {
     if (this->circular_buffer_by_id_.find(cb_id) == this->circular_buffer_by_id_.end()) {
         TT_THROW("No circular buffer with id {} exists in Program {}", cb_id, this->id);
     }
     return this->circular_buffer_by_id_.at(cb_id);
 }
 
-const std::vector<std::shared_ptr<CircularBuffer>> Program::circular_buffers_on_core(const CoreCoord &core) const {
+std::vector<std::shared_ptr<CircularBuffer>> detail::Program_::circular_buffers_on_core(const CoreCoord &core) const {
     std::vector<std::shared_ptr<CircularBuffer>> cbs_on_core;
     for (auto circular_buffer : circular_buffers_) {
         if (circular_buffer->is_on_logical_core(core)) {
@@ -438,7 +618,11 @@ const std::vector<std::shared_ptr<CircularBuffer>> Program::circular_buffers_on_
     return cbs_on_core;
 }
 
-const std::vector<std::shared_ptr<CircularBuffer>> Program::circular_buffers_on_corerange(const CoreRange &cr) const {
+std::vector<std::shared_ptr<CircularBuffer>> Program::circular_buffers_on_core(const CoreCoord &core) const {
+    return pimpl_->circular_buffers_on_core(core);
+}
+
+std::vector<std::shared_ptr<CircularBuffer>> detail::Program_::circular_buffers_on_corerange(const CoreRange &cr) const {
     std::vector<std::shared_ptr<CircularBuffer>> cbs_on_core;
     for (auto circular_buffer : circular_buffers_) {
         if (circular_buffer->is_on_logical_corerange(cr)) {
@@ -448,7 +632,11 @@ const std::vector<std::shared_ptr<CircularBuffer>> Program::circular_buffers_on_
     return cbs_on_core;
 }
 
-const std::vector<CoreRange> Program::circular_buffers_unique_coreranges() const {
+std::vector<std::shared_ptr<CircularBuffer>> Program::circular_buffers_on_corerange(const CoreRange &cr) const {
+    return pimpl_->circular_buffers_on_corerange(cr);
+}
+
+std::vector<CoreRange> detail::Program_::circular_buffers_unique_coreranges() const {
     std::vector<CoreRange> core_ranges;
     for (auto circular_buffer : circular_buffers_) {
         for (const CoreRange &core_range : circular_buffer->core_ranges().ranges()) {
@@ -460,7 +648,11 @@ const std::vector<CoreRange> Program::circular_buffers_unique_coreranges() const
     return core_ranges;
 }
 
-void Program::invalidate_circular_buffer_allocation() {
+std::vector<CoreRange> Program::circular_buffers_unique_coreranges() const {
+    return pimpl_->circular_buffers_unique_coreranges();
+}
+
+void detail::Program_::invalidate_circular_buffer_allocation() {
     if (this->local_circular_buffer_allocation_needed_) {
         return;
     }
@@ -470,7 +662,9 @@ void Program::invalidate_circular_buffer_allocation() {
     this->local_circular_buffer_allocation_needed_ = true;
 }
 
-void Program::allocate_circular_buffers(const Device *device) {
+void Program::invalidate_circular_buffer_allocation() { pimpl_->invalidate_circular_buffer_allocation(); }
+
+void detail::Program_::allocate_circular_buffers(const Device *device) {
     ZoneScoped;
     if (not this->local_circular_buffer_allocation_needed_) {
         return;
@@ -512,7 +706,9 @@ void Program::allocate_circular_buffers(const Device *device) {
     this->local_circular_buffer_allocation_needed_ = false;
 }
 
-void Program::validate_circular_buffer_region(const Device *device) const {
+void Program::allocate_circular_buffers(const Device *device) { pimpl_->allocate_circular_buffers(device); }
+
+void detail::Program_::validate_circular_buffer_region(const Device *device) const {
     ZoneScoped;
 
     // Banks are in lockstep so we only need to get lowest L1 address of one compute and storage core
@@ -549,9 +745,11 @@ void Program::validate_circular_buffer_region(const Device *device) const {
 
 size_t Program::num_semaphores(const CoreCoord &core) const { return semaphores_on_core(core).size(); }
 
-size_t Program::num_semaphores() const { return semaphores_.size(); }
+size_t detail::Program_::num_semaphores() const { return semaphores_.size(); }
 
-void Program::init_semaphores(const Device &device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const {
+size_t Program::num_semaphores() const { return pimpl_->num_semaphores(); }
+
+void detail::Program_::init_semaphores(const Device &device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const {
     auto semaphores_on_core = this->semaphores_on_core(logical_core);
 
     uint64_t kernel_config_base = hal.get_dev_addr(programmable_core_type_index, HalL1MemAddrType::KERNEL_CONFIG);
@@ -566,14 +764,22 @@ void Program::init_semaphores(const Device &device, const CoreCoord &logical_cor
     }
 }
 
-void Program::add_semaphore(const CoreRangeSet &crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type) {
+void Program::init_semaphores(const Device &device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const {
+    pimpl_->init_semaphores(device, logical_core, programmable_core_type_index);
+}
+
+void detail::Program_::add_semaphore(const CoreRangeSet &crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type) {
     TT_FATAL(this->compiled_.empty(), "Cannot add semaphore to an already compiled program {}", this->id);
     semaphores_.emplace_back(Semaphore(crs, semaphore_id, init_value, core_type));
 }
 
-void Program::add_config_buffer(std::shared_ptr<Buffer> config_buffer) { config_buffers_.emplace_back(config_buffer); }
+void Program::add_semaphore(const CoreRangeSet &crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type) {
+    pimpl_->add_semaphore(crs, semaphore_id, init_value, core_type);
+}
 
-std::vector<std::vector<CoreCoord>> Program::logical_cores() const {
+void detail::Program_::add_config_buffer(std::shared_ptr<Buffer> config_buffer) { config_buffers_.emplace_back(std::move(config_buffer)); }  // by-value sink parameter: move it into the vector instead of copying
+
+std::vector<std::vector<CoreCoord>> detail::Program_::logical_cores() const {
     std::vector<std::vector<CoreCoord>> cores_in_program;
     std::vector<std::set<CoreCoord>> unique_cores;
     for (uint32_t programmable_core_type_index = 0; programmable_core_type_index < kernels_.size(); programmable_core_type_index++) {
@@ -593,17 +799,9 @@ std::vector<std::vector<CoreCoord>> Program::logical_cores() const {
     return cores_in_program;
 }
 
-void Program::construct_core_range_set_for_worker_cores() {
-    bool found_kernels = false;
-    uint32_t index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX);
-    for (auto [id, kernel] : kernels_[index]) {
-        this->worker_crs_ = this->worker_crs_.merge(kernel->core_range_set());
-        found_kernels = true;
-    }
-    TT_ASSERT(!found_kernels || this->worker_crs_.ranges().size() >= 1, "Invalid core range set");
-}
+std::vector<std::vector<CoreCoord>> Program::logical_cores() const { return pimpl_->logical_cores(); }
 
-void Program::set_cb_data_fmt(Device *device, const std::vector<CoreRange> &crs, JitBuildOptions &build_options) const {
+void detail::Program_::set_cb_data_fmt(Device *device, const std::vector<CoreRange> &crs, JitBuildOptions &build_options) const {
     ZoneScoped;
     for (auto logical_cr : crs) {
         auto cbs_on_core = this->circular_buffers_on_corerange(logical_cr);
@@ -616,7 +814,7 @@ void Program::set_cb_data_fmt(Device *device, const std::vector<CoreRange> &crs,
     }
 }
 
-void Program::set_cb_tile_dims(Device *device, const std::vector<CoreRange> &crs, JitBuildOptions &build_options) const {
+void detail::Program_::set_cb_tile_dims(Device *device, const std::vector<CoreRange> &crs, JitBuildOptions &build_options) const {
     ZoneScoped;
     for (const auto &logical_cr : crs) {
         auto cbs_on_core = this->circular_buffers_on_corerange(logical_cr);
@@ -647,7 +845,7 @@ void Program::set_cb_tile_dims(Device *device, const std::vector<CoreRange> &crs
     }
 }
 
-void Program::populate_dispatch_data(Device *device) {
+void detail::Program_::populate_dispatch_data(Device *device) {
     static const uint32_t processor_to_firmware_base[] = {
         MEM_BRISC_FIRMWARE_BASE,
         MEM_NCRISC_FIRMWARE_BASE,
@@ -841,7 +1039,7 @@ void Program::populate_dispatch_data(Device *device) {
     return;
 }
 
-uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32_t base_offset) {
+uint32_t detail::Program_::finalize_rt_args(uint32_t programmable_core_type_index, uint32_t base_offset) {
 
     // Iterate over kernels in the program and "level" the number of RTAs based on the max
     // Unique RTAs are packed across dispatch classes
@@ -864,7 +1062,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32
             max_rtas[dispatch_class] = 0;
             auto& optional_id = kg.kernel_ids[dispatch_class];
             if (optional_id) {
-                auto kernel = detail::GetKernel(*this, optional_id.value());
+                auto kernel = get_kernel(optional_id.value());
                 for (const CoreRange &core_range : kg.core_ranges.ranges()) {
                     for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) {
                         for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) {
@@ -882,7 +1080,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32
             auto& optional_id = kg.kernel_ids[dispatch_class];
             kg.rta_sizes[dispatch_class] = max_rtas[dispatch_class] * sizeof(uint32_t);
             if (optional_id) {
-                auto kernel = detail::GetKernel(*this, optional_id.value());
+                auto kernel = get_kernel(optional_id.value());
                 kernel->set_runtime_args_count(kg.core_ranges, max_rtas[dispatch_class]);
                 kg.launch_msg.kernel_config.rta_offset[dispatch_class].rta_offset = base_offset + offset;
                 offset += max_rtas[dispatch_class] * sizeof(uint32_t);
@@ -901,7 +1099,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32
     }
     // Find the max # common RTAs across all kernels for each dispatch class
     for (size_t kernel_id = 0; kernel_id < this->num_kernels(); kernel_id++) {
-        auto kernel = detail::GetKernel(*this, kernel_id);
+        auto kernel = get_kernel(kernel_id);
         // TODO: kernels should be stored by programmable core type
         if (core_type == kernel->get_kernel_core_type() &&
             (programmable_core_type == HalProgrammableCoreType::IDLE_ETH) == kernel->is_idle_eth()) {
@@ -924,7 +1122,7 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32
 
     // Set the runtime_args_data sizing info based on the shared max
     for (size_t kernel_id = 0; kernel_id < this->num_kernels(); kernel_id++) {
-        auto kernel = detail::GetKernel(*this, kernel_id);
+        auto kernel = get_kernel(kernel_id);
         // TODO: as above, fix when kernels are stored by programmable core type
         if (core_type == kernel->get_kernel_core_type() &&
             (programmable_core_type == HalProgrammableCoreType::IDLE_ETH) == kernel->is_idle_eth()) {
@@ -947,11 +1145,15 @@ uint32_t Program::finalize_rt_args(uint32_t programmable_core_type_index, uint32
     return max_unique_rta_size + total_crta_size;
 }
 
-ProgramConfig& Program::get_program_config(uint32_t programmable_core_type_index) {
+ProgramConfig& detail::Program_::get_program_config(uint32_t programmable_core_type_index) {
     return this->program_configs_[programmable_core_type_index];
 }
 
-uint32_t Program::finalize_sems(uint32_t programmable_core_type_index, uint32_t base_offset) {
+ProgramConfig& Program::get_program_config(uint32_t programmable_core_type_index) {
+    return pimpl_->get_program_config(programmable_core_type_index);
+}
+
+uint32_t detail::Program_::finalize_sems(uint32_t programmable_core_type_index, uint32_t base_offset) {
 
     int max_id = -1;
     CoreType core_type = hal.get_core_type(programmable_core_type_index);
@@ -969,7 +1171,7 @@ uint32_t Program::finalize_sems(uint32_t programmable_core_type_index, uint32_t
     return base_offset + sem_size;
 }
 
-void Program::set_launch_msg_sem_offsets() {
+void detail::Program_::set_launch_msg_sem_offsets() {
 
     for (uint32_t kg_type_index = 0; kg_type_index < hal.get_programmable_core_type_count(); kg_type_index++) {
         for (auto& kg : this->get_kernel_groups(kg_type_index)) {
@@ -981,7 +1183,7 @@ void Program::set_launch_msg_sem_offsets() {
     }
 }
 
-uint32_t Program::finalize_cbs(uint32_t programmable_core_type_index, uint32_t base_offset) {
+uint32_t detail::Program_::finalize_cbs(uint32_t programmable_core_type_index, uint32_t base_offset) {
 
     int count = 0;
 
@@ -1003,7 +1205,7 @@ uint32_t Program::finalize_cbs(uint32_t programmable_core_type_index, uint32_t b
     return base_offset + cb_size;
 }
 
-uint32_t Program::finalize_kernel_bins(Device *device, uint32_t programmable_core_type_index, uint32_t base_offset) {
+uint32_t detail::Program_::finalize_kernel_bins(Device *device, uint32_t programmable_core_type_index, uint32_t base_offset) {
 
     uint32_t l1_alignment = hal.get_alignment(HalMemType::L1);
 
@@ -1062,11 +1264,11 @@ uint32_t Program::finalize_kernel_bins(Device *device, uint32_t programmable_cor
     return max_offset;
 }
 
-uint32_t& Program::get_program_config_size(uint32_t programmable_core_type_index) {
+uint32_t& detail::Program_::get_program_config_size(uint32_t programmable_core_type_index) {
     return this->program_config_sizes_[programmable_core_type_index];
 }
 
-void Program::finalize(Device *device) {
+void detail::Program_::finalize(Device *device) {
     // Store the number of tensix "go signals" for use by CQ
     // CQ iterates over these to update runtime addresses, needs to know when eth begins (after tensix)
     // TODO: should store all the counts
@@ -1105,7 +1307,9 @@ void Program::finalize(Device *device) {
     finalized_ = true;
 }
 
-void Program::compile(Device *device, bool fd_bootloader_mode) {
+void Program::finalize(Device *device) { pimpl_->finalize(device); }
+
+void detail::Program_::compile(Device *device, bool fd_bootloader_mode) {
     ZoneScoped;
     if (compiled_.contains(device->id())) {
         return;
@@ -1185,7 +1389,7 @@ void Program::compile(Device *device, bool fd_bootloader_mode) {
                     }
                     if (detail::CompilationReporter::enabled()) {
                         detail::CompilationReporter::inst().add_kernel_compile_stats(
-                            *this, kernel, cache_hit, kernel_hash);
+                            get_id(), kernel, cache_hit, kernel_hash);
                     }
                     kernel->set_binary_path(build_options.path);
                 },
@@ -1202,23 +1406,28 @@ void Program::compile(Device *device, bool fd_bootloader_mode) {
 
     sync_build_step(events);
 
-    this->construct_core_range_set_for_worker_cores();
     if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) {
         this->populate_dispatch_data(device);  // TODO: maybe rename
     }
 
     if (detail::CompilationReporter::enabled()) {
-        detail::CompilationReporter::inst().flush_program_entry(*this, enable_persistent_kernel_cache);
+        detail::CompilationReporter::inst().flush_program_entry(get_id(), num_kernels(), [this](size_t kernel_id) {
+            return get_kernel(kernel_id);
+        }, enable_persistent_kernel_cache);
     }
     if (detail::MemoryReporter::enabled()) {
-        detail::MemoryReporter::inst().flush_program_memory_usage(*this, device);
+        detail::MemoryReporter::inst().flush_program_memory_usage(get_id(), device);
     }
     compiled_.insert(device->id());
 }
 
-void Program::set_runtime_id(uint64_t id) { this->runtime_id = id; }
+void Program::compile(Device *device, bool fd_bootloader_mode) { pimpl_->compile(device, fd_bootloader_mode); }
 
-uint32_t Program::get_sem_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const {
+void detail::Program_::set_runtime_id(uint64_t id) { this->runtime_id = id; }
+
+void Program::set_runtime_id(uint64_t id) { pimpl_->set_runtime_id(id); }
+
+uint32_t detail::Program_::get_sem_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const {
 
     CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type);
     HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core);
@@ -1231,7 +1440,11 @@ uint32_t Program::get_sem_base_addr(Device *device, CoreCoord logical_core, Core
     return base_addr + this->program_configs_[index].sem_offset;
 }
 
-uint32_t Program::get_cb_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const {
+uint32_t Program::get_sem_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const {
+    return pimpl_->get_sem_base_addr(device, logical_core, core_type);
+}
+
+uint32_t detail::Program_::get_cb_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const {
 
     CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type);
     HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core);
@@ -1244,7 +1457,11 @@ uint32_t Program::get_cb_base_addr(Device *device, CoreCoord logical_core, CoreT
     return base_addr + this->program_configs_[index].cb_offset;
 }
 
-uint32_t Program::get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const {
+uint32_t Program::get_cb_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const {
+    return pimpl_->get_cb_base_addr(device, logical_core, core_type);
+}
+
+uint32_t detail::Program_::get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const {
 
     CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type);
     HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core);
@@ -1253,7 +1470,11 @@ uint32_t Program::get_sem_size(Device *device, CoreCoord logical_core, CoreType
     return this->program_configs_[index].sem_size;
 }
 
-uint32_t Program::get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const {
+uint32_t Program::get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const {
+    return pimpl_->get_sem_size(device, logical_core, core_type);
+}
+
+uint32_t detail::Program_::get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const {
 
     CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type);
     HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core);
@@ -1262,17 +1483,92 @@ uint32_t Program::get_cb_size(Device *device, CoreCoord logical_core, CoreType c
     return this->program_configs_[index].cb_size;
 }
 
+uint32_t Program::get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const {
+    return pimpl_->get_cb_size(device, logical_core, core_type);
+}
+
 // TODO: Too low level for program.cpp. Move this to HAL, once we have support.
-bool Program::runs_on_noc_unicast_only_cores() {
+bool detail::Program_::runs_on_noc_unicast_only_cores() {
     return (hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH) != -1 and
-            this->get_kernel_groups(hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH)).size());
+            not this->get_kernel_groups(hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH)).empty());
 }
 
+bool Program::runs_on_noc_unicast_only_cores() { return pimpl_->runs_on_noc_unicast_only_cores(); }
+
 // TODO: Too low level for program.cpp. Move this to HAL, once we have support.
-bool Program::runs_on_noc_multicast_only_cores() {
+bool detail::Program_::runs_on_noc_multicast_only_cores() {
     return (hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX) != -1 and
-            this->get_kernel_groups(hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)).size());
+            not this->get_kernel_groups(hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)).empty());
+}
+
+bool Program::runs_on_noc_multicast_only_cores() { return pimpl_->runs_on_noc_multicast_only_cores(); }
+
+// Special members are defaulted out of line: pimpl_ is a std::unique_ptr<detail::Program_>
+// and program.hpp only forward-declares Program_, so they need the complete type visible here.
+Program::Program(Program &&other) noexcept = default;
+Program& Program::operator=(Program &&other) noexcept = default;
+Program::~Program() noexcept = default;
+
+uint64_t detail::Program_::get_id() const { return this->id; }
+
+uint64_t Program::get_id() const { return pimpl_->get_id(); }
+
+uint64_t detail::Program_::get_runtime_id() const { return this->runtime_id; }
+
+uint64_t Program::get_runtime_id() const { return pimpl_->get_runtime_id(); }
+
+size_t detail::Program_::num_kernels() const {
+    size_t total = 0;  // summed over every entry of kernels_ (indexed by programmable core type)
+    for (const auto &per_type_kernels : kernels_) {
+        total += per_type_kernels.size();
+    }
+    return total;
+}
+
+size_t Program::num_kernels() const { return pimpl_->num_kernels(); }
+
+const std::vector<std::shared_ptr<CircularBuffer>> &detail::Program_::circular_buffers() const { return circular_buffers_; }
+
+const std::vector<std::shared_ptr<CircularBuffer>> &Program::circular_buffers() const { return pimpl_->circular_buffers(); }
+
+const std::vector< Semaphore > & detail::Program_::semaphores() const { return semaphores_; }
+
+const std::vector< Semaphore > & Program::semaphores() const { return pimpl_->semaphores(); }
+
+void detail::Program_::add_buffer(std::shared_ptr<Buffer> buf) { owned_buffer_pool.push_back(std::move(buf)); }
+
+void Program::add_buffer(std::shared_ptr<Buffer> buf) { pimpl_->add_buffer(std::move(buf)); }
+
+void detail::Program_::release_buffers() { owned_buffer_pool = {}; }
+
+void Program::release_buffers() { pimpl_->release_buffers(); }
+
+std::vector<std::reference_wrapper<const Semaphore>> detail::Program_::semaphores_on_core(const CoreCoord &core) const {
+    // Gather non-owning references to the semaphores initialized on `core`, in semaphores_ order.
+    std::vector<std::reference_wrapper<const Semaphore>> on_core;
+    for (const Semaphore &sem : this->semaphores_) {
+        if (not sem.initialized_on_logical_core(core)) continue;
+        on_core.emplace_back(std::cref(sem));
+    }
+    return on_core;
+}
+
+std::vector<std::reference_wrapper<const Semaphore>> Program::semaphores_on_core(const CoreCoord &core) const {
+    return pimpl_->semaphores_on_core(core);
+}
+
+bool detail::Program_::is_finalized() const { return this->finalized_; }
+
+bool Program::is_finalized() const { return pimpl_->is_finalized(); }
+
+const ProgramTransferInfo &Program::get_program_transfer_info() const noexcept { return pimpl_->program_transfer_info; }
+
+const std::shared_ptr<Buffer> &Program::get_kernels_buffer() const noexcept { return pimpl_->kernels_buffer; }
+
+const std::vector<uint32_t> &Program::get_program_config_sizes() const noexcept { return pimpl_->program_config_sizes_; }
+
+std::unordered_map<uint64_t, ProgramCommandSequence> &Program::get_cached_program_command_sequences() noexcept {
+    return pimpl_->cached_program_command_sequences_;
 }
 
-Program::~Program() {}
 }  // namespace tt::tt_metal
diff --git a/tt_metal/impl/program/program.hpp b/tt_metal/impl/program/program.hpp
index 05aa822d787..b239ddf93b3 100644
--- a/tt_metal/impl/program/program.hpp
+++ b/tt_metal/impl/program/program.hpp
@@ -35,6 +35,8 @@ class EnqueueProgramCommand;
 class HWCommandQueue;
 class JitBuildOptions;
 namespace detail{
+    class Program_;
+
     void ValidateCircularBufferRegion(const Program &program, const Device *device);
     KernelHandle AddKernel (Program &program, std::shared_ptr<Kernel> kernel, const HalProgrammableCoreType core_type);
     std::shared_ptr<Kernel> GetKernel(const Program &program, KernelHandle kernel_id);
@@ -56,7 +58,7 @@ struct KernelGroup {
 
     KernelGroup();
     KernelGroup(
-        const Program &program,
+        const detail::Program_ &program,
         uint32_t programmable_core_type_index,
         kernel_id_array_t kernel_ids,
         bool erisc_is_idle,
@@ -90,48 +92,32 @@ class Program {
     Program(const Program &other) = delete;
     Program& operator=(const Program &other) = delete;
 
-    Program(Program &&other) = default;
-    Program& operator=(Program &&other) = default;
+    Program(Program &&other) noexcept;
+    Program& operator=(Program &&other) noexcept;
 
     void set_runtime_id(uint64_t id);
-    ~Program();
-
-    void construct_core_range_set_for_worker_cores();
+    ~Program() noexcept;
 
-    const uint64_t get_id() const { return this->id; }
-    const uint64_t get_runtime_id() const { return this->runtime_id; }
+    uint64_t get_id() const;
+    uint64_t get_runtime_id() const;
 
-    size_t num_kernels() const {
-      size_t count = 0;
-      for (const auto& kernels : kernels_) {
-        count += kernels.size();
-      }
-      return count;
-    }
+    size_t num_kernels() const;
 
-    const std::vector<std::shared_ptr<CircularBuffer>> &circular_buffers() const { return circular_buffers_; }
+    const std::vector<std::shared_ptr<CircularBuffer>> &circular_buffers() const;
 
-    const std::vector< Semaphore > & semaphores() const { return semaphores_; }
+    const std::vector< Semaphore > & semaphores() const;
 
     KernelGroup * kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index);
     std::vector<KernelGroup>& get_kernel_groups(uint32_t programmable_core_type_index);
-    inline void add_buffer(std::shared_ptr<Buffer> buf) { owned_buffer_pool.push_back(buf); }
-    inline void release_buffers() { owned_buffer_pool = {}; }
-    const std::vector<std::shared_ptr<CircularBuffer>> circular_buffers_on_core(const CoreCoord &core) const;
+    void add_buffer(std::shared_ptr<Buffer> buf);
+    void release_buffers();
+    std::vector<std::shared_ptr<CircularBuffer>> circular_buffers_on_core(const CoreCoord &core) const;
 
-    const std::vector<std::shared_ptr<CircularBuffer>> circular_buffers_on_corerange(const CoreRange &cr) const;
+    std::vector<std::shared_ptr<CircularBuffer>> circular_buffers_on_corerange(const CoreRange &cr) const;
 
-    const std::vector<CoreRange> circular_buffers_unique_coreranges() const;
+    std::vector<CoreRange> circular_buffers_unique_coreranges() const;
 
-    auto semaphores_on_core(const CoreCoord &core) const {
-        std::vector<std::reference_wrapper<const Semaphore>> semaphores;
-        for ( const Semaphore & s : this->semaphores_) {
-            if (s.initialized_on_logical_core(core)) {
-                semaphores.emplace_back(std::cref(s));
-            }
-        }
-        return semaphores;
-    }
+    std::vector<std::reference_wrapper<const Semaphore>> semaphores_on_core(const CoreCoord &core) const;
 
     size_t num_semaphores ( const CoreCoord & core ) const;
     size_t num_semaphores () const;
@@ -139,16 +125,13 @@ class Program {
     // XXXXX TODO: this should return a const reference
     std::vector<std::vector<CoreCoord>> logical_cores() const;
 
-    // Is worker_crs_ used anywhere?
-    const CoreRangeSet& get_worker_core_range_set() const { return worker_crs_; };
-
     void compile(Device * device, bool fd_bootloader_mode = false);
 
     void invalidate_circular_buffer_allocation();
 
     void allocate_circular_buffers(const Device *device);
 
-    bool is_finalized() const { return this->finalized_; }
+    bool is_finalized() const;
     void finalize(Device *device);
     std::shared_ptr<Kernel> get_kernel(KernelHandle kernel_id) const;
 
@@ -161,73 +144,7 @@ class Program {
     uint32_t get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const;
 
    private:
-    void populate_dispatch_data(Device *device);
-
-    // Buffers temporarily owned by the program
-    std::vector<std::shared_ptr<Buffer>> owned_buffer_pool = {};
-
-    // The buffer that holds the kernel/binaries/etc for this program
-    std::shared_ptr<Buffer> kernels_buffer = nullptr;
-    ProgramTransferInfo program_transfer_info;
-
-    bool finalized_;
-
-    struct CircularBufferAllocator {
-        CircularBufferAllocator(const CoreRange &core_range_) : core_range(core_range_) {}
-
-        // Circular buffers are created and allocated at core range granularity
-        CoreRange core_range;
-
-        // Holds vector of addresses where circular buffers are allocated [start, end)
-        // There are multiple ranges because per core L1 regions are not in lockstep but circular buffers spanning multiple cores must share the same address
-        // To enable this, circular buffer address is the maximum address amongst all of its target cores
-        // This vector is sorted from lower to higher address spaces
-        std::vector<std::pair<uint64_t, uint64_t>> l1_regions;
-
-        // Returns address for next circular buffer
-        // Circular buffers are placed sequentially on a core so the next available address gets appended to the last L1 region
-        uint64_t get_cb_region_end() const {
-            return this->l1_regions.empty() ? 0 : this->l1_regions.back().second;
-        }
-
-        // If address is the end of the last L1 region, the last region is extended by size bytes,
-        //  otherwise address must be higher than existing regions and a new L1 region [address, size) is added
-        void mark_address(uint64_t address, uint64_t size, uint64_t base_address);
-
-        // Reset when circular buffer allocation is invalidated
-        void reset_available_addresses() { this->l1_regions.clear(); }
-    };
-
-    uint64_t id; // Need to make non-const due to move constructor
-    uint64_t runtime_id;
-    static std::atomic<uint64_t> program_counter;
-    std::vector<std::unordered_map<KernelHandle, std::shared_ptr<Kernel> >> kernels_;
-    std::vector<CoreCoord> grid_extent_;
-
-    std::vector<std::shared_ptr<CircularBuffer>> circular_buffers_;
-    std::unordered_map<CBHandle,  std::shared_ptr<CircularBuffer>> circular_buffer_by_id_;
-    // Tracks which circular buffer indices are being used
-    std::unordered_map<CoreCoord, std::bitset<NUM_CIRCULAR_BUFFERS>> per_core_cb_indices_;
-    // Used to generate circular buffer addresses. There is one CircularBufferAllocator per unique CoreRange
-    std::vector<CircularBufferAllocator> cb_allocators_;
-
-    std::vector<Semaphore> semaphores_;
-
-    CoreRangeSet worker_crs_;
-    std::unordered_set<chip_id_t> compiled_;
-    bool local_circular_buffer_allocation_needed_;
-
-    static constexpr uint8_t core_to_kernel_group_invalid_index = 0xff;
-    std::vector<std::vector<KernelGroup>> kernel_groups_;
-    std::vector<std::vector<uint8_t>> core_to_kernel_group_index_table_;
-    uint32_t tensix_go_signal_count_;
-
-    std::vector<std::shared_ptr<Buffer>> config_buffers_;
-
-    std::vector<ProgramConfig> program_configs_;
-    std::vector<uint32_t> program_config_sizes_;
-
-    std::unordered_map<uint64_t, ProgramCommandSequence> cached_program_command_sequences_;
+    std::unique_ptr<detail::Program_> pimpl_;
 
     friend CBHandle CreateCircularBuffer(Program &program, const std::variant<CoreCoord, CoreRange, CoreRangeSet> &core_spec, const CircularBufferConfig &config);
     friend std::shared_ptr<CircularBuffer> detail::GetCircularBuffer(const Program &program, CBHandle id);
@@ -237,38 +154,23 @@ class Program {
     friend std::shared_ptr<Kernel> detail::GetKernel(const Program &program, KernelHandle kernel_id);
 
     friend uint32_t CreateSemaphore(Program &program, const std::variant<CoreRange,CoreRangeSet> &core_spec, uint32_t initial_value, CoreType core_type);
-    KernelHandle add_kernel(std::shared_ptr<Kernel> kernel, const HalProgrammableCoreType &core_type);
 
     CBHandle add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config);
-    std::shared_ptr<CircularBuffer> get_circular_buffer(CBHandle cb_id) const;
 
     void add_semaphore(const CoreRangeSet & crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type);
 
     friend void detail::AddConfigBuffer(Program &program, std::shared_ptr<Buffer> config_buffer);
-    void add_config_buffer(std::shared_ptr<Buffer> config_buffer);
-
-    // Ensures that statically allocated circular buffers do not grow into L1 buffer space
-    void validate_circular_buffer_region(const Device *device) const;
-
-    void set_cb_data_fmt( Device *device, const std::vector<CoreRange> & crs, JitBuildOptions& build_options) const;
-
-    void set_cb_tile_dims( Device *device, const std::vector<CoreRange> & crs, JitBuildOptions& build_options) const;
-
-    void update_kernel_groups(uint32_t programmable_core_type_index);
-
-    uint32_t& get_program_config_size(uint32_t programmable_core_type_index);
-
-    uint32_t finalize_rt_args(uint32_t programmable_core_type_index, uint32_t base_offset);
-    uint32_t finalize_sems(uint32_t programmable_core_type_index, uint32_t base_offset);
-    uint32_t finalize_cbs(uint32_t programmable_core_type_index, uint32_t base_offset);
-    uint32_t finalize_kernel_bins(Device *device, uint32_t programmable_core_type_index, uint32_t base_offset);
-    void set_launch_msg_sem_offsets();
 
     bool runs_on_noc_unicast_only_cores();
     bool runs_on_noc_multicast_only_cores();
 
     friend HWCommandQueue;
     friend EnqueueProgramCommand;
+
+    const ProgramTransferInfo &get_program_transfer_info() const noexcept;
+    const std::shared_ptr<Buffer> &get_kernels_buffer() const noexcept;
+    const std::vector<uint32_t> &get_program_config_sizes() const noexcept;
+    std::unordered_map<uint64_t, ProgramCommandSequence> &get_cached_program_command_sequences() noexcept;
 };
 
 }  // namespace v0

From e0c4924a4d4444ecdd752f4e22b67d3db207912e Mon Sep 17 00:00:00 2001
From: Naif Tarafdar <135640067+ntarafdar@users.noreply.github.com>
Date: Fri, 25 Oct 2024 13:03:59 -0700
Subject: [PATCH 14/30] Revert "Fix Hanging Reshape" (#14295)

Revert "Fix Hanging Reshape (#14120)"

This reverts commit d1a10ea9a9ec91d4730500bc86fa8c4bc398579f.
---
 tests/ttnn/unit_tests/test_reshape.py                          | 3 ---
 .../cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/ttnn/unit_tests/test_reshape.py b/tests/ttnn/unit_tests/test_reshape.py
index 4413fa7b601..4ada4299f60 100644
--- a/tests/ttnn/unit_tests/test_reshape.py
+++ b/tests/ttnn/unit_tests/test_reshape.py
@@ -293,9 +293,6 @@ def test_reshape_tile_layout_only_change_shape(device):
         ((1, 1445, 192), (1445, 192)),
         ((1, 256), (1, 1, 256)),
         ((16, 1, 32), (16, 1, 32)),
-        ((32,), (1, 1, 1, 32)),
-        ((16,), (1, 1, 1, 16)),
-        ((48,), (1, 1, 1, 48)),
     ],
 )
 @pytest.mark.parametrize("layout", [ttnn.ROW_MAJOR_LAYOUT, ttnn.TILE_LAYOUT])
diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp
index db61e2d169e..7f459d8046e 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp
@@ -59,7 +59,7 @@ ttnn::Tensor convert_tensor_to_rm_reshape_convert_back_to_orig_layout(const ttnn
     //Constraint in device kernel
     uint32_t ROW_MAJOR_WIDTH = 8;
     ttnn::Tensor reshaped_rm_tensor;
-    if((tensor_shape[-1] % ROW_MAJOR_WIDTH == 0 && shape[-1] % ROW_MAJOR_WIDTH == 0) and tensor_shape.rank() == 4) {
+    if((tensor_shape[-1] % ROW_MAJOR_WIDTH == 0 && shape[-1] % ROW_MAJOR_WIDTH == 0)) {
         auto rm_tensor = ttnn::to_layout(tensor, ttnn::ROW_MAJOR_LAYOUT, std::nullopt, std::nullopt, (Device *)nullptr);
         if (rm_tensor.is_contiguous()) {
             // Page size depends on the width, so only modify the shape if the width is the same

From 561e2fd349ad4598150190ef531288cf6c21104f Mon Sep 17 00:00:00 2001
From: Austin Ho <aho@tenstorrent.com>
Date: Mon, 21 Oct 2024 13:45:40 +0000
Subject: [PATCH 15/30] #14038: Remove global BUFFER_MAP and make the tracking
 of buffers local to an allocator

---
 tt_metal/detail/tt_metal.hpp             |   2 +-
 tt_metal/graph/graph_tracking.cpp        |   8 +-
 tt_metal/graph/graph_tracking.hpp        |   8 +-
 tt_metal/impl/allocator/allocator.cpp    |  41 ++++---
 tt_metal/impl/allocator/allocator.hpp    |  13 ++-
 tt_metal/impl/buffers/buffer.cpp         |  31 +++--
 tt_metal/impl/buffers/buffer.hpp         |  36 +-----
 tt_metal/impl/device/device.cpp          |  11 +-
 tt_metal/impl/device/device.hpp          |   2 +
 tt_metal/impl/dispatch/command_queue.hpp |   3 -
 tt_metal/tt_metal.cpp                    |  26 ++---
 ttnn/cpp/ttnn/graph/graph_processor.cpp  |   4 +-
 ttnn/cpp/ttnn/graph/graph_processor.hpp  |   4 +-
 ttnn/cpp/ttnn/reports.hpp                | 139 +++++++++++------------
 14 files changed, 156 insertions(+), 172 deletions(-)

diff --git a/tt_metal/detail/tt_metal.hpp b/tt_metal/detail/tt_metal.hpp
index d6168102a5e..e5464e721a6 100644
--- a/tt_metal/detail/tt_metal.hpp
+++ b/tt_metal/detail/tt_metal.hpp
@@ -276,7 +276,7 @@ inline namespace v0 {
 
         void SetLazyCommandQueueMode(bool lazy);
 
-        DeviceAddr AllocateBuffer(const Buffer* buffer, bool bottom_up);
+        DeviceAddr AllocateBuffer(Buffer* buffer);
 
         void DeallocateBuffer(Buffer *buffer);
     }  // namespace detail
diff --git a/tt_metal/graph/graph_tracking.cpp b/tt_metal/graph/graph_tracking.cpp
index 17a72ddd5ee..c12eff0d7ec 100644
--- a/tt_metal/graph/graph_tracking.cpp
+++ b/tt_metal/graph/graph_tracking.cpp
@@ -27,12 +27,12 @@ bool GraphTracker::add_hook(const std::shared_ptr<IGraphHooks>& new_hook) {
     return true;
 }
 
-void GraphTracker::track_allocate(const Buffer* buffer, bool bottom_up) {
+void GraphTracker::track_allocate(const Buffer* buffer) {
     if (processors.empty()) {
         return;
     }
     for (auto& it : processors) {
-        it->track_allocate(buffer, bottom_up);
+        it->track_allocate(buffer);
     }
 }
 
@@ -73,11 +73,11 @@ void GraphTracker::track_program(Program* program) {
     }
 }
 
-bool GraphTracker::hook_allocate(const Buffer* buffer, bool bottom_up) {
+bool GraphTracker::hook_allocate(const Buffer* buffer) {
     if (hook == nullptr)
         return false;
 
-    return hook->hook_allocate(buffer, bottom_up);
+    return hook->hook_allocate(buffer);
 }
 
 bool GraphTracker::hook_deallocate(Buffer* buffer) {
diff --git a/tt_metal/graph/graph_tracking.hpp b/tt_metal/graph/graph_tracking.hpp
index 54ee8eef41d..712373ab005 100644
--- a/tt_metal/graph/graph_tracking.hpp
+++ b/tt_metal/graph/graph_tracking.hpp
@@ -28,7 +28,7 @@ inline namespace v0 {
 
         IGraphProcessor() = default;
 
-        virtual void track_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) {};
+        virtual void track_allocate(const tt::tt_metal::Buffer* buffer) {};
 
         virtual void track_deallocate(tt::tt_metal::Buffer* buffer) {};
 
@@ -54,7 +54,7 @@ inline namespace v0 {
     class IGraphHooks {
     public:
         IGraphHooks() = default;
-        virtual bool hook_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) = 0;
+        virtual bool hook_allocate(const tt::tt_metal::Buffer* buffer) = 0;
 
         virtual bool hook_deallocate(tt::tt_metal::Buffer* buffer) = 0;
 
@@ -77,7 +77,7 @@ inline namespace v0 {
 
         bool add_hook(const std::shared_ptr<IGraphHooks>& hook);
 
-        void track_allocate(const Buffer* buffer, bool bottom_up);
+        void track_allocate(const Buffer* buffer);
 
         void track_deallocate(Buffer* buffer);
 
@@ -118,7 +118,7 @@ inline namespace v0 {
             }
         }
 
-        bool hook_allocate(const Buffer* buffer, bool bottom_up);
+        bool hook_allocate(const Buffer* buffer);
 
         bool hook_deallocate(Buffer* buffer);
 
diff --git a/tt_metal/impl/allocator/allocator.cpp b/tt_metal/impl/allocator/allocator.cpp
index 023826e5cd9..7e760b3bf37 100644
--- a/tt_metal/impl/allocator/allocator.cpp
+++ b/tt_metal/impl/allocator/allocator.cpp
@@ -377,38 +377,45 @@ void verify_safe_allocation(Allocator& allocator) {
     }
 }
 
-uint64_t allocate_buffer(
-    Allocator &allocator,
-    DeviceAddr size,
-    DeviceAddr page_size,
-    const BufferType &buffer_type,
-    bool bottom_up,
-    std::optional<uint32_t> num_shards) {
-    uint64_t address = 0;
+const std::unordered_set<Buffer *> &get_allocated_buffers(const Allocator &allocator) { return allocator.allocated_buffers; }
+
+DeviceAddr allocate_buffer(Allocator &allocator, DeviceAddr size, Buffer *buffer) {
+    DeviceAddr address = 0;
+    auto page_size = buffer->page_size();
+    auto buffer_type = buffer->buffer_type();
+    auto bottom_up = buffer->bottom_up();
+    auto num_shards = buffer->num_cores();
     verify_safe_allocation(allocator);
     switch (buffer_type) {
         case BufferType::DRAM:
-            return allocator.descriptor.dram.alloc(
+            address = allocator.descriptor.dram.alloc(
                 allocator.config, allocator.dram_manager, size, page_size, bottom_up, num_shards);
+            break;
         case BufferType::L1:
-            return allocator.descriptor.l1.alloc(
+            address = allocator.descriptor.l1.alloc(
                 allocator.config, allocator.l1_manager, size, page_size, bottom_up, num_shards);
+            break;
         case BufferType::L1_SMALL: {
             TT_FATAL(num_shards.has_value(), "L1_SMALL only supports sharded allocations, see validate_num_banks");
-            return allocator.descriptor.l1.alloc(
+            address = allocator.descriptor.l1.alloc(
                 allocator.config, allocator.l1_small_manager, size, page_size, bottom_up, num_shards);
-            case BufferType::TRACE:
-                return allocator.descriptor.dram.alloc(
-                    allocator.config, allocator.trace_buffer_manager, size, page_size, bottom_up, num_shards);
+            break;
         }
+        case BufferType::TRACE:
+            address = allocator.descriptor.dram.alloc(
+                allocator.config, allocator.trace_buffer_manager, size, page_size, bottom_up, num_shards);
+            break;
         default: {
             TT_THROW("Unsupported buffer type!");
         }
     }
+    allocator.allocated_buffers.insert(buffer);
     return address;
 }
 
-void deallocate_buffer(Allocator &allocator, DeviceAddr address, const BufferType &buffer_type) {
+void deallocate_buffer(Allocator &allocator, Buffer *buffer) {
+    auto address = buffer->address();
+    auto buffer_type = buffer->buffer_type();
     switch (buffer_type) {
         case BufferType::DRAM: allocator.dram_manager.deallocate_buffer(address); break;
         case BufferType::L1: allocator.l1_manager.deallocate_buffer(address); break;
@@ -418,6 +425,7 @@ void deallocate_buffer(Allocator &allocator, DeviceAddr address, const BufferTyp
             TT_THROW("Unsupported buffer type!");
         }
     }
+    allocator.allocated_buffers.erase(buffer);
 }
 
 void deallocate_buffers(Allocator &allocator) {
@@ -425,6 +433,7 @@ void deallocate_buffers(Allocator &allocator) {
     allocator.l1_manager.deallocate_all();
     allocator.l1_small_manager.deallocate_all();
     allocator.trace_buffer_manager.deallocate_all();
+    allocator.allocated_buffers.clear();
 }
 
 void clear(Allocator &allocator) {
@@ -432,6 +441,7 @@ void clear(Allocator &allocator) {
     allocator.l1_manager.clear();
     allocator.l1_small_manager.clear();
     allocator.trace_buffer_manager.clear();
+    allocator.allocated_buffers.clear();
 }
 
 }  // namespace allocator
@@ -460,6 +470,7 @@ void Allocator::reset() {
     l1_manager.clear();
     l1_small_manager.clear();
     trace_buffer_manager.clear();
+    allocated_buffers.clear();
     config.reset();
 }
 
diff --git a/tt_metal/impl/allocator/allocator.hpp b/tt_metal/impl/allocator/allocator.hpp
index ecb31dfb5c8..60e4c97f0b9 100644
--- a/tt_metal/impl/allocator/allocator.hpp
+++ b/tt_metal/impl/allocator/allocator.hpp
@@ -19,6 +19,12 @@ namespace tt {
 
 namespace tt_metal {
 
+inline namespace v0 {
+
+class Buffer;
+
+}  // namespace v0
+
 // Fwd declares
 enum class BufferType;
 struct Allocator;
@@ -99,15 +105,17 @@ std::optional<DeviceAddr> lowest_occupied_l1_address(const Allocator &allocator,
 
 DeviceAddr base_alloc(const AllocatorConfig & config, BankManager &bank_manager, DeviceAddr size, DeviceAddr page_size, bool bottom_up, std::optional<uint32_t> num_shards);
 
-DeviceAddr allocate_buffer(Allocator &allocator, DeviceAddr size, DeviceAddr page_size, const BufferType &buffer_type, bool bottom_up, std::optional<uint32_t> num_shards = std::nullopt);
+DeviceAddr allocate_buffer(Allocator &allocator, DeviceAddr size, Buffer *buffer);
 
 void mark_allocations_unsafe(Allocator &allocator);
 
 void mark_allocations_safe(Allocator &allocator);
 
-void deallocate_buffer(Allocator &allocator, DeviceAddr address, const BufferType &buffer_type);
+void deallocate_buffer(Allocator &allocator, Buffer *buffer);
 void deallocate_buffers(Allocator &allocator);
 
+const std::unordered_set<Buffer *> &get_allocated_buffers(const Allocator &allocator);
+
 void clear(Allocator &allocatator);
 
 }  // namespace allocator
@@ -127,6 +135,7 @@ struct Allocator {
     std::unordered_map<uint32_t, std::vector<uint32_t>> dram_channel_to_bank_ids;
     std::unordered_map<uint32_t, CoreCoord> bank_id_to_logical_core;
     std::unordered_map<BufferType, std::unordered_map<CoreCoord, std::vector<uint32_t>>> logical_core_to_bank_ids;
+    std::unordered_set<Buffer *> allocated_buffers;
 
     AllocatorConfig config;
     // Callbacks to invoke during initialization and allocation
diff --git a/tt_metal/impl/buffers/buffer.cpp b/tt_metal/impl/buffers/buffer.cpp
index 0403a82af98..d4cfcf88be3 100644
--- a/tt_metal/impl/buffers/buffer.cpp
+++ b/tt_metal/impl/buffers/buffer.cpp
@@ -44,9 +44,15 @@ void validate_buffer_size_and_page_size(
         "Page size must be divisible by sizeof(uint32_t) because buffers hold uint32_t values");
 
     if (is_sharded(buffer_layout)) {
-        TT_FATAL(shard_parameters != std::nullopt, "Sharded buffers must have a core grid assigned");
-    } else if (buffer_layout == TensorMemoryLayout::SINGLE_BANK) {
-        TT_FATAL(page_size == size, "Contiguous buffer must be one contiguous page");
+        TT_FATAL(
+            shard_parameters != std::nullopt,
+            "Buffer was specified as sharded but does not have shard_parameters specified");
+    } else {
+        TT_FATAL(
+            shard_parameters == std::nullopt, "Buffer was specified as not sharded but has shard_parameters specified");
+        if (buffer_layout == TensorMemoryLayout::SINGLE_BANK) {
+            TT_FATAL(page_size == size, "Contiguous buffer must be one contiguous page");
+        }
     }
 }
 
@@ -125,7 +131,7 @@ BufferPageMapping generate_buffer_page_mapping(const Buffer& buffer) {
     auto shard_spec = buffer.shard_spec();
 
     bool row_major = shard_spec.orientation() == ShardOrientation::ROW_MAJOR;
-    uint32_t num_cores = buffer.num_cores();
+    uint32_t num_cores = buffer.num_cores().value();
 
     buffer_page_mapping.all_cores_ = corerange_to_cores(shard_spec.grid(), num_cores, row_major);
     TT_FATAL(num_cores == buffer_page_mapping.all_cores_.size(), "Buffer has {} cores, but page mapping expects {} cores", num_cores, buffer_page_mapping.all_cores_.size());
@@ -196,7 +202,7 @@ Buffer::Buffer(
     buffer_type_(buffer_type),
     buffer_layout_(buffer_layout),
     shard_parameters_(shard_parameters),
-    bottom_up_(bottom_up),
+    bottom_up_(bottom_up.value_or(this->is_dram())),
     buffer_page_mapping_(nullptr) {
     TT_FATAL(this->device_ != nullptr && this->device_->allocator_ != nullptr, "Device and allocator need to not be null.");
 
@@ -223,9 +229,7 @@ std::shared_ptr<Buffer> Buffer::create(
     }
 
     buffer->device_->push_work([buffer] {
-        bool bottom_up = buffer->bottom_up_.value_or(buffer->is_dram());
-        buffer->address_ = detail::AllocateBuffer(buffer.get(), bottom_up);
-        detail::BUFFER_MAP.insert({buffer->device_->id(), buffer->address_}, buffer.get());
+        buffer->address_ = detail::AllocateBuffer(buffer.get());
 
         std::unique_lock lock(buffer->allocation_mutex_);
         buffer->allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::relaxed);
@@ -257,7 +261,6 @@ void Buffer::deallocate_impl() {
 
     if (device_->initialized_ && size_ != 0) {
         // address_ is only modified from this thread, no sync required
-        detail::BUFFER_MAP.erase({device_->id(), address_});
         detail::DeallocateBuffer(this);
     }
 
@@ -306,7 +309,7 @@ uint32_t Buffer::num_dev_pages() const {
         return this->num_pages();
     }
 
-    return this->shard_spec().size() * this->num_cores();
+    return this->shard_spec().size() * this->num_cores().value();
 }
 
 CoreType Buffer::core_type() const {
@@ -399,9 +402,9 @@ void Buffer::set_shard_spec(const ShardSpecBuffer& shard_spec) {
     this->buffer_page_mapping_ = nullptr;
 }
 
-uint32_t Buffer::num_cores() const {
+std::optional<uint32_t> Buffer::num_cores() const {
     if (!is_sharded(this->buffer_layout_))
-        return 1;
+        return std::nullopt;
 
     return this->shard_spec().tensor_shard_spec.grid.num_cores();
 }
@@ -433,10 +436,6 @@ DeviceAddr ShardSpecBuffer::size() const {
     return shape_in_pages_[0] * shape_in_pages_[1];
 }
 
-namespace detail {
-buffer_map_t BUFFER_MAP = {};
-}
-
 }  // namespace tt_metal
 }  // namespace tt
 
diff --git a/tt_metal/impl/buffers/buffer.hpp b/tt_metal/impl/buffers/buffer.hpp
index 8c4332de0cb..ec3cbb22aec 100644
--- a/tt_metal/impl/buffers/buffer.hpp
+++ b/tt_metal/impl/buffers/buffer.hpp
@@ -176,6 +176,8 @@ class Buffer final {
 
     TensorMemoryLayout buffer_layout() const { return buffer_layout_; }
 
+    bool bottom_up() const { return bottom_up_; }
+
     uint32_t dram_channel_from_bank_id(uint32_t bank_id) const;
 
     CoreCoord logical_core_from_bank_id(uint32_t bank_id) const;
@@ -199,7 +201,7 @@ class Buffer final {
     ShardSpecBuffer shard_spec() const;
     void set_shard_spec(const ShardSpecBuffer& shard_spec);
 
-    uint32_t num_cores() const;
+    std::optional<uint32_t> num_cores() const;
 
     const std::shared_ptr<const BufferPageMapping>& get_buffer_page_mapping();
 
@@ -231,7 +233,7 @@ class Buffer final {
     const DeviceAddr size_; // Size in bytes
     const BufferType buffer_type_;
     const TensorMemoryLayout buffer_layout_;
-    const std::optional<bool> bottom_up_;
+    const bool bottom_up_;
 
     std::atomic<AllocationStatus> allocation_status_ = AllocationStatus::ALLOCATION_REQUESTED;
     DeviceAddr address_ = 0;
@@ -252,36 +254,6 @@ class Buffer final {
 
 BufferPageMapping generate_buffer_page_mapping(const Buffer &buffer);
 
-namespace detail {
-using Deviceid = uint32_t;
-
-class buffer_map_t {
-   public:
-    void insert(std::tuple<Deviceid, DeviceAddr> buf_attr, Buffer *buffer) {
-        std::scoped_lock<std::mutex> lock(this->map_mutex);
-        this->map.insert({buf_attr, buffer});
-    }
-
-    void erase(std::tuple<Deviceid, DeviceAddr> buf_attr) {
-        std::scoped_lock<std::mutex> lock(this->map_mutex);
-        this->map.erase(buf_attr);
-    }
-
-    std::map<std::tuple<Deviceid, DeviceAddr>, Buffer *> value() {
-        std::scoped_lock<std::mutex> lock(this->map_mutex);
-        return this->map;
-    }
-
-    ~buffer_map_t() { TT_ASSERT(this->map.empty(), "Not all buffers deallocated by runtime!"); }
-
-   private:
-    std::mutex map_mutex;
-    std::map<std::tuple<Deviceid, DeviceAddr>, Buffer *> map = {};
-};
-
-extern buffer_map_t BUFFER_MAP;
-}  // namespace detail
-
 inline namespace v0 {
 
 using HostDataType = std::variant<
diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp
index 0479807018f..8afff15103b 100644
--- a/tt_metal/impl/device/device.cpp
+++ b/tt_metal/impl/device/device.cpp
@@ -2949,10 +2949,8 @@ bool Device::close() {
     tt::Cluster::instance().l1_barrier(id_);
     allocator::clear(*this->allocator_);
     // After device close, no buffers on this device should be used
-    for (const auto &[buf_attr, buf] : detail::BUFFER_MAP.value()) {
-        if (std::get<0>(buf_attr) == this->id()) {
-            DeallocateBuffer(*buf);
-        }
+    for (const auto &buf : this->get_allocated_buffers()) {
+        DeallocateBuffer(*buf);
     }
 
     this->compute_cores_.clear();
@@ -3174,6 +3172,11 @@ void Device::dump_memory_blocks(const BufferType &buffer_type, std::ofstream &ou
     return allocator::dump_memory_blocks(*this->allocator_, buffer_type, out);
 }
 
+const std::unordered_set<Buffer *> &Device::get_allocated_buffers() const {
+    this->check_allocator_is_initialized();
+    return allocator::get_allocated_buffers(*this->allocator_);
+}
+
 void Device::deallocate_buffers(){
     allocator::deallocate_buffers(*allocator_);
 }
diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp
index 7beb58f3ea8..dce53a1eae8 100644
--- a/tt_metal/impl/device/device.hpp
+++ b/tt_metal/impl/device/device.hpp
@@ -197,6 +197,8 @@ class Device {
     uint32_t get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& physical_core) const;
     uint32_t get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& physical_cores) const;
 
+    const std::unordered_set<Buffer *> &get_allocated_buffers() const;
+
     void deallocate_buffers();
 
     // machine epsilon
diff --git a/tt_metal/impl/dispatch/command_queue.hpp b/tt_metal/impl/dispatch/command_queue.hpp
index 64f6c5407b7..a840fd19b8a 100644
--- a/tt_metal/impl/dispatch/command_queue.hpp
+++ b/tt_metal/impl/dispatch/command_queue.hpp
@@ -478,9 +478,6 @@ using CompletionReaderQueue = LockFreeQueue<CompletionReaderVariant>;
 struct AllocBufferMetadata {
     Buffer* buffer;
     std::reference_wrapper<Allocator> allocator;
-    BufferType buffer_type;
-    uint32_t device_address;
-    bool bottom_up;
 };
 
 struct RuntimeArgsMetadata {
diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp
index f71c5c49302..001cec165e1 100644
--- a/tt_metal/tt_metal.cpp
+++ b/tt_metal/tt_metal.cpp
@@ -794,33 +794,29 @@ void CompileProgram(Device *device, Program &program, bool fd_bootloader_mode) {
     program.compile(device, fd_bootloader_mode);
 }
 
-DeviceAddr AllocateBuffer(const Buffer *buffer, bool bottom_up) {
-    if(GraphTracker::instance().hook_allocate(buffer, bottom_up)) {
-        GraphTracker::instance().track_allocate(buffer, bottom_up);
+DeviceAddr AllocateBuffer(Buffer *buffer) {
+    if(GraphTracker::instance().hook_allocate(buffer)) {
+        GraphTracker::instance().track_allocate(buffer);
         return 0;
     }
 
-    uint32_t allocated_addr;
+    DeviceAddr allocated_addr;
     if (is_sharded(buffer->buffer_layout())) {
         allocated_addr = allocator::allocate_buffer(
             *(buffer->device()->allocator_),
-            buffer->shard_spec().size() * buffer->num_cores() * buffer->page_size(),
-            buffer->page_size(),
-            buffer->buffer_type(),
-            bottom_up,
-            buffer->num_cores());
+            buffer->shard_spec().size() * buffer->num_cores().value() * buffer->page_size(),
+            buffer);
     } else {
         allocated_addr = allocator::allocate_buffer(
             *(buffer->device()->allocator_),
             buffer->size(),
-            buffer->page_size(),
-            buffer->buffer_type(),
-            bottom_up,
-            std::nullopt);
+            buffer);
     }
+    // The buffer class returns a u32 when its address is queried, so assert that the address fits.
+    // Removing this assert requires updating all users of the buffer address to accept a u64.
     TT_ASSERT(allocated_addr <= std::numeric_limits<uint32_t>::max());
 
-    GraphTracker::instance().track_allocate(buffer, bottom_up);
+    GraphTracker::instance().track_allocate(buffer);
 
     return allocated_addr;
 }
@@ -831,7 +827,7 @@ void DeallocateBuffer(Buffer *buffer) {
         return;
     }
 
-    allocator::deallocate_buffer(*buffer->device()->allocator_, buffer->address(), buffer->buffer_type());
+    allocator::deallocate_buffer(*buffer->device()->allocator_, buffer);
 }
 
 }  // namespace detail
diff --git a/ttnn/cpp/ttnn/graph/graph_processor.cpp b/ttnn/cpp/ttnn/graph/graph_processor.cpp
index bebeadebd9d..882f9588a22 100644
--- a/ttnn/cpp/ttnn/graph/graph_processor.cpp
+++ b/ttnn/cpp/ttnn/graph/graph_processor.cpp
@@ -90,7 +90,7 @@ GraphProcessor::GraphProcessor(RunMode mode) : run_mode(mode) {
     end_function_any_map[typeid(std::reference_wrapper<Tensor>)] = [ptr = this] (const std::any& val) mutable {ptr->end_function_process_tensor(val);};
 
 }
-void GraphProcessor::track_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) {
+void GraphProcessor::track_allocate(const tt::tt_metal::Buffer* buffer) {
     const std::lock_guard<std::mutex> lock(mutex);
     auto buf_id = add_buffer(buffer);
 
@@ -478,7 +478,7 @@ nlohmann::json GraphProcessor::end_graph_capture() {
         return res;
 }
 
-bool ProcessorHooks::hook_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) {
+bool ProcessorHooks::hook_allocate(const tt::tt_metal::Buffer* buffer) {
     return do_block;
 }
 
diff --git a/ttnn/cpp/ttnn/graph/graph_processor.hpp b/ttnn/cpp/ttnn/graph/graph_processor.hpp
index 4f7d6f1b6e7..83179dabe59 100644
--- a/ttnn/cpp/ttnn/graph/graph_processor.hpp
+++ b/ttnn/cpp/ttnn/graph/graph_processor.hpp
@@ -22,7 +22,7 @@ namespace ttnn::graph {
 
     public:
         ProcessorHooks() = default;
-        bool hook_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) override;
+        bool hook_allocate(const tt::tt_metal::Buffer* buffer) override;
 
         bool hook_deallocate(tt::tt_metal::Buffer* buffer) override;
 
@@ -40,7 +40,7 @@ namespace ttnn::graph {
         GraphProcessor(tt::tt_metal::IGraphProcessor::RunMode mode);
         ~GraphProcessor() override;
 
-        void track_allocate(const tt::tt_metal::Buffer* buffer, bool bottom_up) override;
+        void track_allocate(const tt::tt_metal::Buffer* buffer) override;
 
         void track_deallocate(tt::tt_metal::Buffer* buffer) override;
 
diff --git a/ttnn/cpp/ttnn/reports.hpp b/ttnn/cpp/ttnn/reports.hpp
index 9392f8eda7c..0eee2efedbc 100644
--- a/ttnn/cpp/ttnn/reports.hpp
+++ b/ttnn/cpp/ttnn/reports.hpp
@@ -7,6 +7,7 @@
 #include <optional>
 
 #include "tt_metal/impl/buffers/buffer.hpp"
+#include "tt_metal/impl/device/device_pool.hpp"
 
 namespace ttnn {
 
@@ -64,49 +65,52 @@ struct BufferInfo {
 
 std::vector<BufferInfo> get_buffers() {
     std::vector<BufferInfo> buffer_infos;
-    for (const auto &[key, buffer] : tt::tt_metal::detail::BUFFER_MAP.value()) {
-        auto [device_id, address] = key;
-        auto device = buffer->device();
-
-        auto num_pages = buffer->num_pages();
-        auto page_size = buffer->page_size();
-        auto num_banks = device->num_banks(buffer->buffer_type());
-
-        std::map<uint32_t, uint32_t> bank_to_num_pages;
-        if (buffer->buffer_layout() == tt::tt_metal::TensorMemoryLayout::INTERLEAVED) {
-            uint32_t bank_id = 0;
-            for (int page_index = 0; page_index < num_pages; page_index++) {
-                if (bank_to_num_pages.find(bank_id) == bank_to_num_pages.end()) {
-                    bank_to_num_pages[bank_id] = 0;
+    for (const auto &device : tt::DevicePool::instance().get_all_active_devices()) {
+        for (const auto &buffer : device->get_allocated_buffers()) {
+            auto device_id = device->id();
+            auto address = buffer->address();
+
+            auto num_pages = buffer->num_pages();
+            auto page_size = buffer->page_size();
+            auto num_banks = device->num_banks(buffer->buffer_type());
+
+            std::map<uint32_t, uint32_t> bank_to_num_pages;
+            if (buffer->buffer_layout() == tt::tt_metal::TensorMemoryLayout::INTERLEAVED) {
+                uint32_t bank_id = 0;
+                for (int page_index = 0; page_index < num_pages; page_index++) {
+                    if (bank_to_num_pages.find(bank_id) == bank_to_num_pages.end()) {
+                        bank_to_num_pages[bank_id] = 0;
+                    }
+                    bank_to_num_pages[bank_id]++;
+                    bank_id = (bank_id + 1) % num_banks;
                 }
-                bank_to_num_pages[bank_id]++;
-                bank_id = (bank_id + 1) % num_banks;
-            }
-        } else {
-            const auto& buffer_page_mapping = *buffer->get_buffer_page_mapping();
-            for (int page_index = 0; page_index < num_pages; page_index++) {
-                auto dev_page_index = buffer_page_mapping.host_page_to_dev_page_mapping_[page_index];
-                auto core = buffer_page_mapping.all_cores_[buffer_page_mapping.dev_page_to_core_mapping_[dev_page_index]];
-                auto bank_id = device->bank_ids_from_logical_core(buffer->buffer_type(), core)[0];
-
-                if (bank_to_num_pages.find(bank_id) == bank_to_num_pages.end()) {
-                    bank_to_num_pages[bank_id] = 0;
+            } else {
+                const auto &buffer_page_mapping = *buffer->get_buffer_page_mapping();
+                for (int page_index = 0; page_index < num_pages; page_index++) {
+                    auto dev_page_index = buffer_page_mapping.host_page_to_dev_page_mapping_[page_index];
+                    auto core =
+                        buffer_page_mapping.all_cores_[buffer_page_mapping.dev_page_to_core_mapping_[dev_page_index]];
+                    auto bank_id = device->bank_ids_from_logical_core(buffer->buffer_type(), core)[0];
+
+                    if (bank_to_num_pages.find(bank_id) == bank_to_num_pages.end()) {
+                        bank_to_num_pages[bank_id] = 0;
+                    }
+                    bank_to_num_pages[bank_id]++;
                 }
-                bank_to_num_pages[bank_id]++;
             }
-        }
 
-        auto max_num_pages =
-            std::max_element(bank_to_num_pages.begin(), bank_to_num_pages.end(), [](const auto &a, const auto &b) {
-                return a.second < b.second;
-            });
-
-        BufferInfo buffer_info = {};
-        buffer_info.device_id = device_id;
-        buffer_info.address = address;
-        buffer_info.max_size_per_bank = (*max_num_pages).second * page_size;
-        buffer_info.buffer_type = buffer->buffer_type();
-        buffer_infos.push_back(buffer_info);
+            auto max_num_pages =
+                std::max_element(bank_to_num_pages.begin(), bank_to_num_pages.end(), [](const auto &a, const auto &b) {
+                    return a.second < b.second;
+                });
+
+            BufferInfo buffer_info = {};
+            buffer_info.device_id = device_id;
+            buffer_info.address = address;
+            buffer_info.max_size_per_bank = (*max_num_pages).second * page_size;
+            buffer_info.buffer_type = buffer->buffer_type();
+            buffer_infos.push_back(buffer_info);
+        }
     }
     return buffer_infos;
 }
@@ -125,23 +129,35 @@ struct BufferPageInfo {
 
 std::vector<BufferPageInfo> get_buffer_pages() {
     std::vector<BufferPageInfo> buffer_page_infos;
-    for (const auto &[key, buffer] : tt::tt_metal::detail::BUFFER_MAP.value()) {
-        if (not buffer->is_l1()) {
-            continue;
-        }
+    for (const auto &device : tt::DevicePool::instance().get_all_active_devices()) {
+        for (const auto &buffer : device->get_allocated_buffers()) {
+            if (not buffer->is_l1()) {
+                continue;
+            }
 
-        auto [device_id, address] = key;
-        auto device = buffer->device();
+            auto device_id = device->id();
+            auto address = buffer->address();
 
-        uint32_t page_size = buffer->page_size();
-        auto num_pages = buffer->num_pages();
-        auto num_banks = device->num_banks(buffer->buffer_type());
+            auto page_size = buffer->page_size();
+            auto num_pages = buffer->num_pages();
+            auto num_banks = device->num_banks(buffer->buffer_type());
 
-        if (buffer->buffer_layout() == tt::tt_metal::TensorMemoryLayout::INTERLEAVED) {
             uint32_t bank_id = 0;
             for (int page_index = 0; page_index < num_pages; page_index++) {
-                auto page_address = buffer->page_address(bank_id, page_index);
-                auto core = buffer->logical_core_from_bank_id(bank_id);
+                CoreCoord core;
+                DeviceAddr page_address = 0;
+
+                if (buffer->buffer_layout() == tt::tt_metal::TensorMemoryLayout::INTERLEAVED) {
+                    page_address = buffer->page_address(bank_id, page_index);
+                    core = buffer->logical_core_from_bank_id(bank_id);
+                    bank_id = (bank_id + 1) % num_banks;
+                } else {
+                    const auto &buffer_page_mapping = *buffer->get_buffer_page_mapping();
+                    auto dev_page_index = buffer_page_mapping.host_page_to_dev_page_mapping_[page_index];
+                    core = buffer_page_mapping.all_cores_[buffer_page_mapping.dev_page_to_core_mapping_[dev_page_index]];
+                    bank_id = device->bank_ids_from_logical_core(buffer->buffer_type(), core)[0];
+                    page_address = buffer->sharded_page_address(bank_id, dev_page_index);
+                }
 
                 BufferPageInfo buffer_page_info = {};
                 buffer_page_info.device_id = device_id;
@@ -153,28 +169,7 @@ std::vector<BufferPageInfo> get_buffer_pages() {
                 buffer_page_info.page_address = page_address;
                 buffer_page_info.page_size = page_size;
                 buffer_page_info.buffer_type = buffer->buffer_type();
-                buffer_page_infos.push_back(buffer_page_info);
 
-                bank_id = (bank_id + 1) % num_banks;
-            }
-        } else {
-            const auto& buffer_page_mapping = *buffer->get_buffer_page_mapping();
-            for (int page_index = 0; page_index < num_pages; page_index++) {
-                auto dev_page_index = buffer_page_mapping.host_page_to_dev_page_mapping_[page_index];
-                auto core = buffer_page_mapping.all_cores_[buffer_page_mapping.dev_page_to_core_mapping_[dev_page_index]];
-                auto bank_id = device->bank_ids_from_logical_core(buffer->buffer_type(), core)[0];
-                auto page_address = buffer->sharded_page_address(bank_id, dev_page_index);
-
-                BufferPageInfo buffer_page_info = {};
-                buffer_page_info.device_id = device_id;
-                buffer_page_info.address = address;
-                buffer_page_info.core_y = core.y;
-                buffer_page_info.core_x = core.x;
-                buffer_page_info.bank_id = bank_id;
-                buffer_page_info.page_index = page_index;
-                buffer_page_info.page_address = page_address;
-                buffer_page_info.page_size = page_size;
-                buffer_page_info.buffer_type = buffer->buffer_type();
                 buffer_page_infos.push_back(buffer_page_info);
             }
         }

From 44b32f0803fc31adb6c639fe1b9276c3c2742d7b Mon Sep 17 00:00:00 2001
From: Abhinav Sarje <asarje@tenstorrent.com>
Date: Thu, 24 Oct 2024 00:05:51 +0000
Subject: [PATCH 16/30] #14188: Bug fix: remapped input shard shape should use div_up

---
 .../device/untilize_with_halo_v2_program_factory.cpp            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.cpp
index 20c55da5bd3..7699c7a3403 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.cpp
@@ -49,7 +49,7 @@ operation::ProgramWithCallbacks untilize_with_halo_multi_core_v2(
     auto output_shard_shape = output_tensor.shard_spec().value().shape;
     TT_ASSERT(input_shard_shape[1] == output_shard_shape[1]);
     uint32_t input_nhw_height = input_shape[0] * input_shape[1] * input_shape[2];
-    uint32_t remapped_input_shard_shape_for_output_grid = input_nhw_height / ncores_nhw;
+    uint32_t remapped_input_shard_shape_for_output_grid = tt::div_up(input_nhw_height, ncores_nhw);
     uint32_t ntiles_per_block = tt::div_up(input_shard_shape[1], TILE_WIDTH);
     uint32_t input_nblocks_per_core = tt::div_up(remapped_input_shard_shape_for_output_grid, TILE_HEIGHT);
     uint32_t input_npages = ntiles_per_block * input_nblocks_per_core;

From c57ed764e3c334d86623ab04f0657000ac1f8abb Mon Sep 17 00:00:00 2001
From: Raymond Kim <109366641+tt-rkim@users.noreply.github.com>
Date: Fri, 25 Oct 2024 18:24:25 -0400
Subject: [PATCH 17/30] #14297: Remove API single card FD nightly tests because
 they don't exist anymore (#14298)

---
 ...atch-full-regressions-and-models-impl.yaml | 21 -------------------
 tests/scripts/run_tests.sh                    |  6 ------
 2 files changed, 27 deletions(-)

diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml
index 8f1777db303..eb55fb592cc 100644
--- a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml
+++ b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml
@@ -54,27 +54,6 @@ jobs:
               cmd: tests/scripts/single_card/nightly/run_gs_only.sh,
               timeout: 40
             },
-            {
-              name: "API tests GS",
-              arch: grayskull,
-              runs-on: ["cloud-virtual-machine", "E150", "in-service"],
-              cmd: ./tests/scripts/run_tests.sh --tt-arch grayskull --pipeline-type frequent_api --dispatch-mode fast,
-              timeout: 10
-            },
-            {
-              name: "API tests N300 WH B0",
-              arch: wormhole_b0,
-              runs-on: ["cloud-virtual-machine", "N300", "in-service"],
-              cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast,
-              timeout: 10
-            },
-            {
-              name: "API tests N150 WH B0",
-              arch: wormhole_b0,
-              runs-on: ["cloud-virtual-machine", "N150", "in-service"],
-              cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast,
-              timeout: 10
-            },
             {
               name: "[Unstable] N150 models",
               arch: wormhole_b0,
diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh
index fd8d580296c..517503b2646 100755
--- a/tests/scripts/run_tests.sh
+++ b/tests/scripts/run_tests.sh
@@ -79,12 +79,6 @@ run_frequent_api_pipeline_tests() {
         TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_frequent
         echo "Running Python API unit tests in SD for frequent..."
         ./tests/scripts/run_python_api_unit_tests.sh
-    else
-        if [[ $tt_arch == "wormhole_b0" ]]; then
-            pytest -n auto tests/ttnn/unit_tests/operations/test_all_gather.py -k nightly
-        else
-            echo "API tests are not available for fast dispatch because they're already covered in post-commit"
-        fi
     fi
 }
 

From 9f987fc99f7d004ad756fd7c90edfe9a53492dcb Mon Sep 17 00:00:00 2001
From: Almeet Bhullar <abhullar@tenstorrent.com>
Date: Thu, 24 Oct 2024 03:09:21 +0000
Subject: [PATCH 18/30] #12349: Enabling pytests on BH that are now working

---
 .../sweep_tests/pytests/tt_dnn/test_div.py                 | 3 +--
 .../sweep_tests/pytests/tt_dnn/test_div_unary.py           | 3 +--
 .../sweep_tests/pytests/tt_dnn/test_fmod.py                | 3 +--
 .../sweep_tests/pytests/tt_dnn/test_frac.py                | 3 +--
 .../misc/test_scaled_dot_product_attention_decode.py       | 7 +++++++
 .../python_api_testing/unit_testing/misc/test_sum.py       | 1 -
 .../python_api_testing/unit_testing/misc/test_transpose.py | 4 ++++
 .../python_api_testing/unit_testing/test_prod_nc.py        | 1 -
 .../eltwise/binary/device/binary_composite_op.cpp          | 2 +-
 .../operations/eltwise/unary/device/unary_composite_op.cpp | 2 +-
 10 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div.py
index 60acfe5f59b..0fc89897817 100644
--- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div.py
+++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div.py
@@ -16,7 +16,7 @@
 from tests.tt_eager.python_api_testing.sweep_tests.run_pytorch_ci_tests import (
     run_single_pytorch_test,
 )
-from models.utility_functions import is_grayskull, skip_for_blackhole
+from models.utility_functions import is_grayskull
 
 mem_configs = [
     ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.DRAM),
@@ -24,7 +24,6 @@
 ]
 
 
-@skip_for_blackhole("Only supported for WH, see #12349")
 @pytest.mark.parametrize("accurate_mode", [False, True])
 @pytest.mark.parametrize("round_mode", ["None", "trunc", "floor"])
 @pytest.mark.parametrize(
diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div_unary.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div_unary.py
index 0a85d67f471..3d85a290137 100644
--- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div_unary.py
+++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div_unary.py
@@ -16,7 +16,7 @@
 from tests.tt_eager.python_api_testing.sweep_tests.run_pytorch_ci_tests import (
     run_single_pytorch_test,
 )
-from models.utility_functions import is_grayskull, skip_for_blackhole
+from models.utility_functions import is_grayskull
 
 mem_configs = [
     ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.DRAM),
@@ -24,7 +24,6 @@
 ]
 
 
-@skip_for_blackhole("Only supported on WH, see #12349")
 @pytest.mark.parametrize("accurate_mode", [True])
 @pytest.mark.parametrize("round_mode", ["None", "trunc", "floor"])
 @pytest.mark.parametrize(
diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_fmod.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_fmod.py
index 28cd0590730..65b45a5ba6b 100644
--- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_fmod.py
+++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_fmod.py
@@ -16,7 +16,7 @@
 from tests.tt_eager.python_api_testing.sweep_tests.run_pytorch_ci_tests import (
     run_single_pytorch_test,
 )
-from models.utility_functions import skip_for_grayskull, skip_for_blackhole
+from models.utility_functions import skip_for_grayskull
 
 mem_configs = [
     ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.DRAM),
@@ -24,7 +24,6 @@
 ]
 
 
-@skip_for_blackhole("Only supported on WH, see #12349")
 @pytest.mark.parametrize(
     "input_shapes",
     [
diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_frac.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_frac.py
index 25e92cbf3fd..9c6907d1fac 100644
--- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_frac.py
+++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_frac.py
@@ -7,7 +7,7 @@
 import random
 from functools import partial
 import ttnn
-from models.utility_functions import skip_for_grayskull, skip_for_blackhole
+from models.utility_functions import skip_for_grayskull
 
 from tests.tt_eager.python_api_testing.sweep_tests import (
     comparison_funcs,
@@ -23,7 +23,6 @@
 ]
 
 
-@skip_for_blackhole("Unsupported on BH, see #12349")
 @pytest.mark.parametrize(
     "input_shapes",
     [
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention_decode.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention_decode.py
index 0783e8c464e..dcaac29bc89 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention_decode.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention_decode.py
@@ -469,6 +469,7 @@ def run_test_sdpa_decode_single_iter(
     assert out_pass
 
 
+@skip_for_blackhole("Unsupported on BH, see #12349")
 @skip_for_grayskull("Unsupported in GS since L1 runs OOM with most configs")
 @pytest.mark.parametrize(
     "dtype, q_dtype",
@@ -518,6 +519,7 @@ def test_sdpa_decode(
         )
 
 
+@skip_for_blackhole("Unsupported on BH, see #12349")
 @skip_for_grayskull("Unsupported in GS since L1 runs OOM with most configs")
 @pytest.mark.parametrize(
     "dtype, q_dtype",
@@ -550,6 +552,7 @@ def test_sdpa_decode_non_causal(device, b, nh, nkv, s, d, dtype, grid_size, q_dt
     assert device.num_program_cache_entries() == 1
 
 
+@skip_for_blackhole("Unsupported on BH, see #12349")
 @skip_for_grayskull("Unsupported in GS since L1 runs OOM with most configs")
 @pytest.mark.parametrize(
     "dtype, q_dtype",
@@ -869,6 +872,7 @@ def test_sdpa_decode_paged_attention(
     assert device.num_program_cache_entries() == 4
 
 
+@skip_for_blackhole("Unsupported on BH, see #12349")
 @skip_for_grayskull("Unsupported in GS since L1 runs OOM with most configs")
 @pytest.mark.parametrize(
     "dtype, q_dtype",
@@ -898,6 +902,7 @@ def test_sdpa_decode_sharded(device, b, nh, nkv, s, d, dtype, grid_size, q_dtype
     )
 
 
+@skip_for_blackhole("Unsupported on BH, see #12349")
 @skip_for_grayskull("Unsupported in GS since L1 runs OOM with most configs")
 @pytest.mark.skip("Skipping Perf Test in CI")
 def test_sdpa_decode_perf(device, use_program_cache):
@@ -952,6 +957,7 @@ def test_sdpa_decode_perf(device, use_program_cache):
         )
 
 
+@skip_for_blackhole("Unsupported on BH, see #12349")
 @skip_for_grayskull("Unsupported in GS since L1 runs OOM with most configs")
 @pytest.mark.parametrize(
     "dtype",
@@ -1171,6 +1177,7 @@ def run_test_sdpa_decode_ndpcc(device, b, nh, nkv, s, d, dtype, grid_size, q_dty
     logger.info(f"PCC failed Start Pos: {failed_start_pos}")
 
 
+@skip_for_blackhole("Unsupported on BH, see #12349")
 @pytest.mark.timeout(600)
 @pytest.mark.skip("Skipping due to causing 45 minutes timeout on tt eager unit tests")
 @skip_for_grayskull("Unsupported in GS since L1 runs OOM with most configs")
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_sum.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_sum.py
index 5bafca9f180..1b204e133fb 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_sum.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_sum.py
@@ -9,7 +9,6 @@
 from models.utility_functions import skip_for_blackhole
 
 
-@skip_for_blackhole("Mismatching on BH, see #12349")
 @pytest.mark.parametrize(
     "shape_dim",
     (
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py
index 7ee7809fbb8..29ac447236e 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py
@@ -631,6 +631,7 @@ def test_transpose_bfloat8_b(device, shape, swap_dims):
     assert_with_pcc(torch_output, tt_output, 0.9999)
 
 
+@skip_for_blackhole("Mismatching on BH, see #12349")
 @pytest.mark.parametrize(
     "dtype",
     (ttnn.bfloat16, ttnn.float32),
@@ -649,6 +650,7 @@ def test_transpose_hc(dtype, shape, device):
     transpose(shape, device, dim0=1, dim1=-2, input_dtype=dtype)
 
 
+@skip_for_blackhole("Mismatching on BH, see #12349")
 @pytest.mark.parametrize(
     "dtype",
     (ttnn.bfloat16, ttnn.float32),
@@ -677,6 +679,7 @@ def test_transpose_2D(dtype, shape, layout, device):
     assert_with_pcc(torch_output, tt_output, 0.9999)
 
 
+@skip_for_blackhole("Mismatching on BH, see #12349")
 @pytest.mark.parametrize(
     "dtype",
     (ttnn.bfloat16, ttnn.float32),
@@ -758,6 +761,7 @@ def test_transpose_failures(config, device):
     assert_with_pcc(torch_output, tt_output, 0.9999)
 
 
+@skip_for_blackhole("Mismatching on BH, see #12349")
 @pytest.mark.parametrize(
     "config",
     [
diff --git a/tests/tt_eager/python_api_testing/unit_testing/test_prod_nc.py b/tests/tt_eager/python_api_testing/unit_testing/test_prod_nc.py
index 71641bc8071..9bf38150854 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/test_prod_nc.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/test_prod_nc.py
@@ -28,7 +28,6 @@ def get_tensors(input_shape, output_shape, device):
     return tt_input, tt_output, torch_input
 
 
-@skip_for_blackhole("Mismatching on BH, see #12349")
 @pytest.mark.parametrize(
     "input_shape",
     (
diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp
index 1303233ab57..96936013a3b 100644
--- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp
+++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp
@@ -280,7 +280,7 @@ Tensor ExecuteBinaryRemainder::invoke(const Tensor& input, float scalar, const s
 // Binary FMOD will be overloaded by unary FMOD in another PR
 Tensor ExecuteBinaryFmod::invoke(const Tensor& input_a, const Tensor& input_b, const std::optional<MemoryConfig>& output_mem_config) {
     auto arch = input_a.device()->arch();
-    TT_FATAL(arch == tt::ARCH::WORMHOLE_B0, "Op is only supported on Wormhole");
+    TT_FATAL(arch == tt::ARCH::WORMHOLE_B0 or arch == tt::ARCH::BLACKHOLE, "Op is only supported on Wormhole or Blackhole");
     DataType input_dtype = input_a.get_dtype();
     Tensor a = typecast(input_a, DataType::FLOAT32);
     Tensor b = typecast(input_b, DataType::FLOAT32);
diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp
index dee368d713c..1ba9dbd7ac5 100644
--- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp
+++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp
@@ -817,7 +817,7 @@ Tensor _normalize_global(const Tensor& y,  const std::optional<MemoryConfig>& ou
 
 Tensor _frac(const Tensor& input, const std::optional<MemoryConfig>& output_mem_config) {
     auto arch = input.device()->arch();
-    TT_FATAL(arch == tt::ARCH::WORMHOLE_B0, "Op is only supported on Wormhole");
+    TT_FATAL(arch == tt::ARCH::WORMHOLE_B0 or arch == tt::ARCH::BLACKHOLE, "Op is only supported on Wormhole or Blackhole");
     Tensor trunc_res = ttnn::trunc(input);
     Tensor result = ttnn::subtract(input, trunc_res, std::nullopt, output_mem_config);
     return result;

From 734c8c1a4394f5c65530a98452f7ef6fb1b617ab Mon Sep 17 00:00:00 2001
From: Bryan Wilder Field Lozano <blozano@tenstorrent.com>
Date: Fri, 25 Oct 2024 21:19:30 -0700
Subject: [PATCH 19/30] #13030: Eliminate namespace pollution (#14312)

---
 .../tests_common/sfpu_helper/sfpu_helper.hpp  | 18 ++---
 tests/tt_eager/ops/test_sfpu.cpp              |  1 +
 .../tt_eager/ops/test_sliding_window_ops.cpp  |  1 +
 .../test_dram_read_remote_cb.cpp              |  1 +
 .../1_compute_mm/test_compute_mm.cpp          |  1 +
 .../7_kernel_launch/test_kernel_launch.cpp    |  1 +
 .../perf_microbenchmark/common/util.hpp       |  4 +-
 .../perf_microbenchmark/dispatch/common.h     | 68 +++++++++----------
 .../dispatch/test_bw_and_latency.cpp          |  1 +
 .../dispatch/test_dispatcher.cpp              |  1 +
 .../dispatch/test_pgm_dispatch.cpp            |  1 +
 .../dispatch/test_prefetcher.cpp              |  1 +
 .../old/matmul/matmul_global_l1.cpp           |  1 +
 .../old/matmul/matmul_local_l1.cpp            |  1 +
 .../routing/test_bi_tunnel.cpp                |  1 +
 .../routing/test_mux_demux.cpp                |  1 +
 .../routing/test_mux_demux_2level.cpp         |  1 +
 .../routing/test_tunnel_1cq.cpp               |  1 +
 .../routing/test_tunnel_2cq.cpp               |  1 +
 .../routing/test_tx_rx.cpp                    |  1 +
 .../routing/test_uni_tunnel.cpp               |  1 +
 .../routing/test_uni_tunnel_single_chip.cpp   |  1 +
 .../routing/test_vc_bi_tunnel.cpp             |  1 +
 .../routing/test_vc_mux_demux.cpp             |  1 +
 .../routing/test_vc_uni_tunnel.cpp            |  1 +
 tests/tt_metal/tt_metal/test_bcast.cpp        |  1 +
 tests/tt_metal/tt_metal/test_bmm.cpp          |  1 +
 tests/tt_metal/tt_metal/test_clean_init.cpp   |  1 +
 tests/tt_metal/tt_metal/test_compile_args.cpp |  1 +
 .../tt_metal/test_compile_program.cpp         |  1 +
 .../test_compile_sets_kernel_binaries.cpp     |  1 +
 .../tt_metal/tt_metal/test_core_range_set.cpp |  1 +
 tests/tt_metal/tt_metal/test_datacopy.cpp     |  1 +
 .../tt_metal/tt_metal/test_datacopy_bfp8b.cpp |  1 +
 .../tt_metal/test_datacopy_output_in_l1.cpp   |  1 +
 .../tt_metal/tt_metal/test_eltwise_binary.cpp |  1 +
 tests/tt_metal/tt_metal/test_flatten.cpp      |  1 +
 ...neric_binary_reader_matmul_large_block.cpp |  1 +
 tests/tt_metal/tt_metal/test_gold_impls.hpp   | 10 +--
 .../tt_metal/test_interleaved_layouts.cpp     |  1 +
 .../tt_metal/test_matmul_large_block.cpp      |  1 +
 .../test_matmul_multi_core_multi_dram.cpp     |  1 +
 ...matmul_multi_core_multi_dram_in0_mcast.cpp |  1 +
 ...ti_core_multi_dram_in0_mcast_in1_mcast.cpp |  1 +
 ...matmul_multi_core_multi_dram_in1_mcast.cpp |  1 +
 .../test_matmul_multi_core_single_dram.cpp    |  1 +
 .../tt_metal/test_matmul_multi_tile.cpp       |  1 +
 .../tt_metal/test_matmul_single_core.cpp      |  1 +
 .../test_matmul_single_core_small.cpp         |  1 +
 .../tt_metal/test_matmul_single_tile.cpp      |  1 +
 .../test_matmul_single_tile_bfp8b.cpp         |  1 +
 .../test_matmul_single_tile_output_in_l1.cpp  |  1 +
 .../tt_metal/test_multi_core_kernel.cpp       |  1 +
 .../tt_metal/test_multiple_programs.cpp       |  1 +
 tests/tt_metal/tt_metal/test_transpose_hc.cpp |  1 +
 .../tt_metal/test_untilize_eltwise_binary.cpp |  1 +
 .../tt_dispatch/test_enqueue_program.cpp      |  1 +
 .../basic/initialize_semaphores.cpp           |  1 +
 .../unit_tests/buffer/test_banked.cpp         |  1 +
 .../test_CircularBuffer_allocation.cpp        |  1 +
 .../test_CircularBuffer_creation.cpp          |  1 +
 .../unit_tests/common/device_fixture.hpp      |  4 +-
 .../unit_tests/common/n300_device_fixture.hpp |  2 +-
 .../unit_tests/compute/test_broadcast.cpp     |  1 +
 .../test_copy_block_matmul_partials.cpp       |  1 +
 .../compute/test_dropout_sfpu_compute.cpp     |  1 +
 .../unit_tests/compute/test_golden_impls.cpp  |  1 +
 .../unit_tests/compute/test_reconfig.cpp      |  1 +
 .../unit_tests/compute/test_reduce.cpp        |  1 +
 .../unit_tests/compute/test_sfpu_compute.cpp  |  2 +
 .../test_single_core_binary_compute.cpp       |  2 +
 .../unit_tests/compute/test_transpose.cpp     |  1 +
 .../compute/test_untilize_tilize.cpp          |  1 +
 .../core_coord/test_CoreRange_iterator.cpp    |  2 +
 .../tt_metal/unit_tests/dram/direct.cpp       |  1 +
 .../fast_dispatch_kernels/test_write_host.cpp |  1 +
 .../multichip/ring_gather_kernels.cpp         |  1 +
 .../basic/test_device_init.cpp                |  1 +
 .../common/common_fixture.hpp                 |  4 +-
 .../common/test_dispatch.cpp                  |  2 +
 .../unit_tests_common/common/test_utils.hpp   |  6 +-
 .../compute/matmul/test_matmul_X_tile.cpp     |  1 +
 .../matmul/test_matmul_large_block.cpp        |  1 +
 .../matmul/test_matmul_multi_core_X_dram.cpp  |  1 +
 ...ti_core_multi_dram_in0_mcast_in1_mcast.cpp |  1 +
 ...matmul_multi_core_multi_dram_inX_mcast.cpp |  1 +
 .../matmul/test_matmul_single_core.cpp        |  1 +
 .../compute/test_flatten.cpp                  |  1 +
 .../dprint/test_print_before_finish.cpp       |  1 +
 .../watcher/test_link_training.cpp            |  1 +
 .../watcher/test_noc_sanitize_delays.cpp      |  1 +
 .../unit_tests_common/watcher/test_pause.cpp  |  1 +
 .../watcher/test_waypoint.cpp                 |  1 +
 .../command_queue/test_EnqueueProgram.cpp     |  1 +
 .../command_queue/test_EnqueueTrace.cpp       |  1 +
 ...queueWriteBuffer_and_EnqueueReadBuffer.cpp |  1 +
 .../command_queue/test_HostAsyncCQ.cpp        |  1 +
 .../command_queue/test_events.cpp             |  1 +
 .../compute/sfpu/sfpu_compute.cpp             |  2 +
 .../multichip/test_eth_EnqueueProgram.cpp     |  1 +
 .../test_eth_ring_gather_EnqueueProgram.cpp   |  1 +
 .../pipelining/basic_pipeline.cpp             |  2 +
 .../command_queue/test_EnqueueProgram.cpp     |  1 +
 .../command_queue/test_EnqueueTrace.cpp       |  1 +
 .../test_EnqueueWaitForEvent.cpp              |  1 +
 ...queueWriteBuffer_and_EnqueueReadBuffer.cpp |  1 +
 .../common/command_queue_test_utils.hpp       |  2 +-
 .../tests/run_many_times.cpp                  |  1 +
 tt_metal/common/base.hpp                      | 62 ++++-------------
 tt_metal/common/test_common.hpp               |  8 +--
 tt_metal/impl/buffers/buffer.hpp              |  1 -
 tt_metal/impl/debug/dprint_server.cpp         | 19 +++---
 tt_metal/impl/debug/noc_logging.cpp           |  2 +-
 tt_metal/impl/debug/sanitize_noc_host.hpp     |  2 +-
 tt_metal/impl/debug/watcher_device_reader.cpp |  6 +-
 tt_metal/impl/debug/watcher_device_reader.hpp | 14 ++--
 tt_metal/impl/debug/watcher_server.cpp        |  4 +-
 tt_metal/impl/device/device.cpp               |  6 +-
 tt_metal/impl/dispatch/command_queue.cpp      | 10 +--
 tt_metal/impl/dispatch/command_queue.hpp      |  1 -
 .../impl/dispatch/command_queue_interface.hpp | 24 +++----
 tt_metal/impl/dispatch/data_collection.cpp    | 24 +++----
 tt_metal/impl/dispatch/debug_tools.cpp        | 16 ++---
 tt_metal/impl/program/program.cpp             | 20 +++---
 tt_metal/impl/trace/trace.cpp                 |  6 +-
 tt_metal/jit_build/build.hpp                  |  2 +-
 tt_metal/jit_build/data_format.cpp            | 16 ++++-
 tt_metal/jit_build/data_format.hpp            |  9 ++-
 tt_metal/llrt/llrt.cpp                        |  2 +-
 tt_metal/llrt/rtoptions.cpp                   |  4 +-
 tt_metal/llrt/tt_cluster.cpp                  | 12 ++--
 tt_metal/llrt/tt_cluster.hpp                  |  8 +--
 .../add_2_integers_in_compute.cpp             |  2 +-
 .../eltwise_binary/eltwise_binary.cpp         |  2 +-
 .../eltwise_sfpu/eltwise_sfpu.cpp             |  2 +-
 .../hello_world_compute_kernel.cpp            |  2 +-
 .../matmul_multi_core/matmul_multi_core.cpp   | 12 ++--
 .../matmul_multicore_reuse.cpp                | 12 ++--
 .../matmul_multicore_reuse_mcast.cpp          |  6 +-
 .../matmul_single_core/matmul_single_core.cpp |  8 +--
 .../test_custom_cycle_count.cpp               |  2 +-
 .../test_custom_cycle_count_slow_dispatch.cpp |  2 +-
 .../test_dispatch_cores.cpp                   |  2 +-
 .../test_full_buffer/test_full_buffer.cpp     |  2 +-
 .../profiler/test_multi_op/test_multi_op.cpp  |  2 +-
 tt_metal/tools/memset.cpp                     |  4 +-
 tt_metal/tools/profiler/op_profiler.hpp       |  6 +-
 tt_metal/tools/profiler/profiler.cpp          |  6 +-
 tt_metal/tools/profiler/profiler.hpp          |  4 +-
 tt_metal/tools/profiler/tt_metal_profiler.cpp |  6 +-
 .../host/reduce_scatter_full_worker_grid.cpp  |  2 +-
 .../host/reduce_scatter_common.cpp            |  1 -
 .../conv/conv2d/device/conv2d_op.cpp          |  2 +-
 .../conv/conv2d/device/conv2d_op.hpp          |  2 +-
 .../conv2d_op_sharded_program_factory.cpp     |  2 +-
 .../clone/device/clone_program_factory.cpp    |  4 +-
 .../concat/device/concat_program_factory.cpp  | 38 +++++------
 .../copy/device/copy_program_factory.cpp      |  4 +-
 .../pad/device/pad_program_factory.cpp        |  6 +-
 .../repeat/device/repeat_program_factory.cpp  |  4 +-
 ...sharded_to_interleaved_program_factory.cpp |  2 +-
 .../slice/device/slice_program_factory.cpp    | 12 ++--
 .../tilize/device/tilize_program_factory.cpp  |  8 +--
 ...ilize_with_val_padding_program_factory.cpp |  8 +--
 .../device/transpose_program_factory.cpp      |  4 +-
 .../device/untilize_program_factory.cpp       | 24 +++----
 ...ntilize_with_unpadding_program_factory.cpp | 12 ++--
 ...lement_wise_multi_core_program_factory.cpp |  2 +-
 .../unary/device/unary_program_factory.cpp    |  6 +-
 .../device/unary_sharded_program_factory.cpp  |  4 +-
 .../device/embedding_program_factory.hpp      |  4 +-
 .../device/multi_core_program_factory.cpp     |  4 +-
 .../device/single_core_program_factory.cpp    |  4 +-
 .../device/single_core_program_factory.cpp    |  4 +-
 .../device/attn_matmul_program_factory.cpp    |  2 +-
 .../group_attn_matmul_program_factory.cpp     |  2 +-
 ...lp_kv_cache_load_slice_program_factory.cpp |  6 +-
 .../rotary_embedding_program_factory.cpp      |  2 +-
 .../device/update_cache_op_multi_core.cpp     |  2 +-
 .../matmul_op_multi_core_program_factory.cpp  |  4 +-
 ...ti_core_reuse_mcast_1d_program_factory.cpp |  4 +-
 ...ti_core_reuse_mcast_2d_program_factory.cpp |  2 +-
 ...use_mcast_dram_sharded_program_factory.cpp |  2 +-
 ...i_core_reuse_optimized_program_factory.cpp |  4 +-
 ...ul_op_multi_core_reuse_program_factory.cpp |  2 +-
 .../device/moreh_dot_program_factory.cpp      |  2 +-
 .../moreh_dot_backward_program_factory.cpp    |  2 +-
 .../device/moreh_getitem_rm_factory.cpp       |  4 +-
 .../device/moreh_getitem_tilized_factory.cpp  |  8 +--
 .../moreh/moreh_helper_functions.cpp          |  4 +-
 .../moreh/moreh_helper_functions.hpp          |  6 +-
 ...r_backward_single_core_program_factory.cpp |  2 +-
 .../device/moreh_matmul_program_factory.cpp   |  2 +-
 .../device/moreh_mean_nc_program_factory.cpp  |  2 +-
 .../device/moreh_mean_w_program_factory.cpp   |  6 +-
 .../moreh_mean_backward_program_factory.cpp   |  2 +-
 .../moreh_nll_loss_step1_program_factory.cpp  |  4 +-
 .../moreh_nll_loss_step2_program_factory.cpp  | 12 ++--
 .../device/moreh_sgd_program_factory.cpp      |  4 +-
 .../softmax_c_large/softmax_c_large.cpp       |  4 +-
 .../softmax_h_large/softmax_h_large.cpp       |  4 +-
 .../softmax_h_small/softmax_h_small.cpp       |  4 +-
 .../softmax_w_large/softmax_w_large.cpp       |  4 +-
 .../softmax_w_small/softmax_w_small.cpp       |  4 +-
 .../softmax_backward_c_large.cpp              |  4 +-
 .../softmax_backward_h_large.cpp              |  4 +-
 .../softmax_backward_h_small.cpp              |  4 +-
 .../softmax_backward_w_large.cpp              |  4 +-
 .../softmax_backward_w_small.cpp              |  4 +-
 .../device/moreh_sum_h_program_factory.cpp    |  6 +-
 .../device/moreh_sum_nc_program_factory.cpp   |  2 +-
 .../multi_core/layernorm_op_multi_core.cpp    |  2 +-
 ...ayernorm_post_all_gather_op_multi_core.cpp |  2 +-
 ...layernorm_pre_all_gather_op_multi_core.cpp |  2 +-
 ttnn/cpp/ttnn/operations/numpy/functions.hpp  |  8 +--
 .../device/downsample_program_factory.cpp     |  6 +-
 ...ple_bilinear_program_factory_multicore.cpp |  2 +-
 .../pool/upsample/device/upsample_op.cpp      |  4 +-
 .../upsample_program_factory_multicore.cpp    |  2 +-
 .../multi_core_h/reduce_op_multi_core_h.cpp   | 10 +--
 .../multi_core_w/reduce_op_multi_core_w.cpp   |  4 +-
 .../reduce_op_single_core_hw.cpp              |  2 +-
 .../prod/device/prod_all_program_factory.cpp  |  2 +-
 .../reference_sliding_window.cpp              | 12 ++--
 .../reference_sliding_window.hpp              | 34 +++++-----
 ttnn/cpp/ttnn/tensor/tensor_impl.cpp          |  2 +-
 ttnn/cpp/ttnn/tensor/tensor_impl.hpp          |  2 +-
 ttnn/cpp/ttnn/tensor/tensor_utils.hpp         |  2 +-
 228 files changed, 531 insertions(+), 453 deletions(-)

diff --git a/tests/tests_common/sfpu_helper/sfpu_helper.hpp b/tests/tests_common/sfpu_helper/sfpu_helper.hpp
index f643d93e478..05ea74d1b5b 100644
--- a/tests/tests_common/sfpu_helper/sfpu_helper.hpp
+++ b/tests/tests_common/sfpu_helper/sfpu_helper.hpp
@@ -74,8 +74,8 @@ float ref_identity(float x) {
     return x;
 }
 
-vector<uint32_t> sfpu(const std::vector<uint32_t> &src, std::function<float(float)> sfpu_func) {
-    vector<uint32_t> dst;
+std::vector<uint32_t> sfpu(const std::vector<uint32_t> &src, std::function<float(float)> sfpu_func) {
+    std::vector<uint32_t> dst;
 
     for (uint32_t el: src) {
 
@@ -98,13 +98,13 @@ vector<uint32_t> sfpu(const std::vector<uint32_t> &src, std::function<float(floa
 }
 
 // Helper functions
-vector<uint32_t> create_random_ones_and_twos_vector_of_bfloat16(uint32_t num_bytes, int seed) {
+std::vector<uint32_t> create_random_ones_and_twos_vector_of_bfloat16(uint32_t num_bytes, int seed) {
     // Used for reciprocal, since binary vectors are filled with 0s and 1s, and recip of 0 is undefined,
     // so then we just generate a vector of ones and twos
 
-    vector<uint32_t> src = create_random_binary_vector_of_bfloat16(num_bytes, seed);
+    std::vector<uint32_t> src = create_random_binary_vector_of_bfloat16(num_bytes, seed);
 
-    vector<uint32_t> dst;
+    std::vector<uint32_t> dst;
 
     for (uint32_t el: src) {
 
@@ -148,7 +148,7 @@ bool is_close_rtol_0p175_atol_0p1(float a, float b) {
 }
 
 // SFPU maps -> relevant kernels, golden functions, comparison functions
-static std::vector<string> sfpu_op =
+static std::vector<std::string> sfpu_op =
     { "relu",
      "exponential",
      "reciprocal",
@@ -165,7 +165,7 @@ static std::vector<string> sfpu_op =
      "identity"
     };
 
-const map<string, std::function<float(float)>> sfpu_op_to_function = {
+const std::map<std::string, std::function<float(float)>> sfpu_op_to_function = {
     {"relu",        relu},
     {"exponential", exponential},
     {"reciprocal",  reciprocal},
@@ -182,7 +182,7 @@ const map<string, std::function<float(float)>> sfpu_op_to_function = {
     {"identity",    ref_identity}
 };
 
-const map<string, std::function<vector<uint32_t>(uint32_t num_bytes, int seed)>> sfpu_op_to_init_func = {
+const std::map<std::string, std::function<std::vector<uint32_t>(uint32_t num_bytes, int seed)>> sfpu_op_to_init_func = {
     {"relu",        create_random_vector_of_bfloat16_1_1},
     {"exponential", create_random_binary_vector_of_bfloat16},
     {"reciprocal",  create_random_ones_and_twos_vector_of_bfloat16},
@@ -199,7 +199,7 @@ const map<string, std::function<vector<uint32_t>(uint32_t num_bytes, int seed)>>
     {"identity",      create_random_vector_of_bfloat16_1_1}
 };
 
-const map<string, std::function<bool(float a, float b)>> sfpu_op_to_comparison_function = {
+const std::map<std::string, std::function<bool(float a, float b)>> sfpu_op_to_comparison_function = {
     {"exponential", equal_within_two_sig_figs},
     {"reciprocal", equal_within_absolute_tolerance_of_0p03},
     {"gelu", is_close_0p015},
diff --git a/tests/tt_eager/ops/test_sfpu.cpp b/tests/tt_eager/ops/test_sfpu.cpp
index 40543974d05..2773327468e 100644
--- a/tests/tt_eager/ops/test_sfpu.cpp
+++ b/tests/tt_eager/ops/test_sfpu.cpp
@@ -18,6 +18,7 @@
 #include "ttnn/operations/eltwise/unary/common/unary_op_utils.hpp"
 // #include "tt_gdb/tt_gdb.hpp"
 
+using std::vector;
 
 // SFPU maps -> relevant kernels, golden functions, comparison functions
 std::map<std::string,std::map<std::string, std::string>> sfpu_op_to_hlk_op_name={};
diff --git a/tests/tt_eager/ops/test_sliding_window_ops.cpp b/tests/tt_eager/ops/test_sliding_window_ops.cpp
index 31db47a4b35..c5583841c7f 100644
--- a/tests/tt_eager/ops/test_sliding_window_ops.cpp
+++ b/tests/tt_eager/ops/test_sliding_window_ops.cpp
@@ -14,6 +14,7 @@
 #include "ttnn/operations/numpy/functions.hpp"
 #include "ttnn/tensor/types.hpp"
 
+using std::vector;
 using tt::tt_metal::LegacyShape;
 using tt::tt_metal::Tensor;
 using namespace ttnn::operations::sliding_window;
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp
index 5bbf0ca25b0..71b845e629d 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp
@@ -24,6 +24,7 @@
 #include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp"
 #include <yaml-cpp/yaml.h>
 
+using std::vector;
 using namespace tt;
 using std::chrono::duration_cast;
 using std::chrono::microseconds;
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp
index 71447cb3e19..0596b2939ba 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp
@@ -28,6 +28,7 @@
 #include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp"
 #include "tt_metal/common/work_split.hpp"
 
+using std::vector;
 using namespace tt;
 ////////////////////////////////////////////////////////////////////////////////
 // This benchmark measures the compute performance of matmul. When in the slow
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch.cpp
index 9bb6a19092a..804c62594d2 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch.cpp
@@ -13,6 +13,7 @@
 #include "tt_metal/impl/dispatch/command_queue.hpp"
 #include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp"
 
+using std::vector;
 using namespace tt;
 using namespace tt::tt_metal;
 using std::chrono::duration_cast;
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/common/util.hpp b/tests/tt_metal/tt_metal/perf_microbenchmark/common/util.hpp
index 879d11a1b4c..af91f5e785a 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/common/util.hpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/common/util.hpp
@@ -28,7 +28,7 @@ inline uint64_t get_t0_to_any_riscfw_end_cycle(tt::tt_metal::Device *device, con
         hal.get_dev_addr<dprint_buf_msg_t *>(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::DPRINT);
 
     // This works for tensix only, will need to be updated for eth
-    vector<uint64_t> print_buffer_addrs = {
+    std::vector<uint64_t> print_buffer_addrs = {
         reinterpret_cast<uint64_t>(dprint_msg->data[DPRINT_RISCV_INDEX_NC]),
         reinterpret_cast<uint64_t>(dprint_msg->data[DPRINT_RISCV_INDEX_BR]),
         reinterpret_cast<uint64_t>(dprint_msg->data[DPRINT_RISCV_INDEX_TR0]),
@@ -37,7 +37,7 @@ inline uint64_t get_t0_to_any_riscfw_end_cycle(tt::tt_metal::Device *device, con
     };
     for (const auto &worker_core : worker_cores_used_in_program) {
         for (const auto &buffer_addr : print_buffer_addrs) {
-            vector<std::uint32_t> profile_buffer;
+            std::vector<std::uint32_t> profile_buffer;
             uint32_t end_index;
             uint32_t dropped_marker_counter;
             profile_buffer = tt::llrt::read_hex_vec_from_core(device_id, worker_core, buffer_addr, DPRINT_BUFFER_SIZE);
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h
index 0959bd24c98..fada32bb47c 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h
@@ -28,8 +28,8 @@ struct one_core_data_t {
     CoreCoord phys_core;
     int bank_id;
     int bank_offset;
-    vector<bool> valid;
-    vector<uint32_t> data;
+    std::vector<bool> valid;
+    std::vector<uint32_t> data;
 };
 
 class DeviceData {
@@ -342,7 +342,7 @@ inline bool DeviceData::validate_one_core(Device *device,
                                           uint32_t result_addr) {
     int fail_count = 0;
     const std::vector<uint32_t>& dev_data = one_core_data.data;
-    const vector<bool>& dev_valid = one_core_data.valid;
+    const std::vector<bool>& dev_valid = one_core_data.valid;
     const CoreCoord logical_core = one_core_data.logical_core;
     const CoreCoord phys_core = one_core_data.phys_core;
     const CoreType core_type = one_core_data.core_type;
@@ -366,7 +366,7 @@ inline bool DeviceData::validate_one_core(Device *device,
 
     // Read results from device and compare to expected for this core.
     result_addr += bank_offset;
-    vector<uint32_t> results = tt::llrt::read_hex_vec_from_core(device->id(), phys_core, result_addr, size_bytes);
+    std::vector<uint32_t> results = tt::llrt::read_hex_vec_from_core(device->id(), phys_core, result_addr, size_bytes);
 
     log_info(tt::LogTest, "Validating {} bytes from {} bank {} log_core {}: phys_core: {} at addr: 0x{:x}",
              size_bytes, core_string, bank_id, logical_core.str(), phys_core.str(), result_addr);
@@ -536,7 +536,7 @@ inline uint32_t get_min_required_buffer_addr(Device *device, bool is_dram){
     return min_required_positive_offset;
 }
 
-inline void generate_random_payload(vector<uint32_t>& cmds,
+inline void generate_random_payload(std::vector<uint32_t>& cmds,
                                     uint32_t length) {
 
     for (uint32_t i = 0; i < length; i++) {
@@ -545,7 +545,7 @@ inline void generate_random_payload(vector<uint32_t>& cmds,
     }
 }
 
-inline void generate_random_payload(vector<uint32_t>& cmds,
+inline void generate_random_payload(std::vector<uint32_t>& cmds,
                                     const CoreRange& workers,
                                     DeviceData& data,
                                     uint32_t length_words,
@@ -579,7 +579,7 @@ inline void generate_random_payload(vector<uint32_t>& cmds,
 // Generate a random payload for a paged write command. Note: Doesn't currently support using the base_addr here.
 inline void generate_random_paged_payload(Device *device,
                                           CQDispatchCmd cmd,
-                                          vector<uint32_t>& cmds,
+                                          std::vector<uint32_t>& cmds,
                                           DeviceData& data,
                                           uint32_t start_page,
                                           bool is_dram) {
@@ -618,8 +618,8 @@ inline void generate_random_paged_payload(Device *device,
     }
 }
 
-inline void generate_random_packed_payload(vector<uint32_t>& cmds,
-                                           vector<CoreCoord>& worker_cores,
+inline void generate_random_packed_payload(std::vector<uint32_t>& cmds,
+                                           std::vector<CoreCoord>& worker_cores,
                                            DeviceData& data,
                                            uint32_t size_words,
                                            bool repeat = false) {
@@ -628,7 +628,7 @@ inline void generate_random_packed_payload(vector<uint32_t>& cmds,
     const uint32_t bank_id = 0; // No interleaved pages here.
 
     bool first_core = true;
-    vector<uint32_t>results;
+    std::vector<uint32_t>results;
     CoreCoord first_worker = worker_cores[0];
     for (uint32_t i = 0; i < size_words; i++) {
         uint32_t datum = (use_coherent_data_g) ? ((first_worker.x << 16) | (first_worker.y << 24) | coherent_count++) : std::rand();
@@ -648,7 +648,7 @@ inline void generate_random_packed_payload(vector<uint32_t>& cmds,
     }
 }
 
-inline void generate_random_packed_large_payload(vector<uint32_t>& generated_data,
+inline void generate_random_packed_large_payload(std::vector<uint32_t>& generated_data,
                                                  CoreRange range,
                                                  DeviceData& data,
                                                  uint32_t size_words) {
@@ -676,7 +676,7 @@ inline void generate_random_packed_large_payload(vector<uint32_t>& generated_dat
     }
 }
 
-inline void add_bare_dispatcher_cmd(vector<uint32_t>& cmds,
+inline void add_bare_dispatcher_cmd(std::vector<uint32_t>& cmds,
                                     CQDispatchCmd cmd) {
     static_assert(sizeof(CQDispatchCmd) % sizeof(uint32_t) == 0, "CQDispatchCmd size must be a multiple of uint32_t size");
     const size_t num_uint32s = sizeof(CQDispatchCmd) / sizeof(uint32_t);
@@ -688,7 +688,7 @@ inline void add_bare_dispatcher_cmd(vector<uint32_t>& cmds,
     }
 }
 
-inline size_t debug_prologue(vector<uint32_t>& cmds) {
+inline size_t debug_prologue(std::vector<uint32_t>& cmds) {
     size_t prior = cmds.size();
 
     if (debug_g) {
@@ -707,7 +707,7 @@ inline size_t debug_prologue(vector<uint32_t>& cmds) {
     return prior;
 }
 
-inline void debug_epilogue(vector<uint32_t>& cmds,
+inline void debug_epilogue(std::vector<uint32_t>& cmds,
                            size_t prior_end) {
     if (debug_g) {
         // Doing a checksum on the full command length is problematic in the kernel
@@ -731,7 +731,7 @@ inline void debug_epilogue(vector<uint32_t>& cmds,
     }
 }
 
-inline void add_dispatcher_cmd(vector<uint32_t>& cmds,
+inline void add_dispatcher_cmd(std::vector<uint32_t>& cmds,
                                CQDispatchCmd cmd,
                                uint32_t length) {
 
@@ -744,7 +744,7 @@ inline void add_dispatcher_cmd(vector<uint32_t>& cmds,
     debug_epilogue(cmds, prior_end);
 }
 
-inline void add_dispatcher_cmd(vector<uint32_t>& cmds,
+inline void add_dispatcher_cmd(std::vector<uint32_t>& cmds,
                                const CoreRange& workers,
                                DeviceData& device_data,
                                CQDispatchCmd cmd,
@@ -762,7 +762,7 @@ inline void add_dispatcher_cmd(vector<uint32_t>& cmds,
 }
 
 inline void add_dispatcher_paged_cmd(Device *device,
-                                     vector<uint32_t>& cmds,
+                                     std::vector<uint32_t>& cmds,
                                      DeviceData& device_data,
                                      CQDispatchCmd cmd,
                                      uint32_t start_page,
@@ -775,8 +775,8 @@ inline void add_dispatcher_paged_cmd(Device *device,
 }
 
 inline void add_dispatcher_packed_cmd(Device *device,
-                                      vector<uint32_t>& cmds,
-                                      vector<CoreCoord>& worker_cores,
+                                      std::vector<uint32_t>& cmds,
+                                      std::vector<CoreCoord>& worker_cores,
                                       DeviceData& device_data,
                                       CQDispatchCmd cmd,
                                       uint32_t size_words,
@@ -798,7 +798,7 @@ inline void add_dispatcher_packed_cmd(Device *device,
 
 // bare: doesn't generate random payload data, for use w/ eg, dram reads
 inline void gen_bare_dispatcher_unicast_write_cmd(Device *device,
-                                                  vector<uint32_t>& cmds,
+                                                  std::vector<uint32_t>& cmds,
                                                   CoreCoord worker_core,
                                                   DeviceData& device_data,
                                                   uint32_t length) {
@@ -821,7 +821,7 @@ inline void gen_bare_dispatcher_unicast_write_cmd(Device *device,
 }
 
 inline void gen_dispatcher_unicast_write_cmd(Device *device,
-                                             vector<uint32_t>& cmds,
+                                             std::vector<uint32_t>& cmds,
                                              CoreCoord worker_core,
                                              DeviceData& device_data,
                                              uint32_t length) {
@@ -842,7 +842,7 @@ inline void gen_dispatcher_unicast_write_cmd(Device *device,
 }
 
 inline void gen_dispatcher_multicast_write_cmd(Device *device,
-                                             vector<uint32_t>& cmds,
+                                             std::vector<uint32_t>& cmds,
                                              CoreRange worker_core_range,
                                              DeviceData& device_data,
                                              uint32_t length) {
@@ -868,7 +868,7 @@ inline void gen_dispatcher_multicast_write_cmd(Device *device,
 }
 
 inline void gen_dispatcher_paged_write_cmd(Device *device,
-                                             vector<uint32_t>& cmds,
+                                             std::vector<uint32_t>& cmds,
                                              DeviceData& device_data,
                                              bool is_dram,
                                              uint32_t start_page,
@@ -913,8 +913,8 @@ inline void gen_dispatcher_paged_write_cmd(Device *device,
 
 
 inline void gen_dispatcher_packed_write_cmd(Device *device,
-                                            vector<uint32_t>& cmds,
-                                            vector<CoreCoord>& worker_cores,
+                                            std::vector<uint32_t>& cmds,
+                                            std::vector<CoreCoord>& worker_cores,
                                             DeviceData& device_data,
                                             uint32_t size_words,
                                             bool repeat = false) {
@@ -938,7 +938,7 @@ inline void gen_dispatcher_packed_write_cmd(Device *device,
 }
 
 inline void gen_rnd_dispatcher_packed_write_cmd(Device *device,
-                                                vector<uint32_t>& cmds,
+                                                std::vector<uint32_t>& cmds,
                                                 DeviceData& device_data) {
 
     // Note: this cmd doesn't clamp to a max size which means it can overflow L1 buffer
@@ -952,7 +952,7 @@ inline void gen_rnd_dispatcher_packed_write_cmd(Device *device,
         if (xfer_size_bytes < min_xfer_size_bytes_g) xfer_size_bytes = min_xfer_size_bytes_g;
     }
 
-    vector<CoreCoord> gets_data;
+    std::vector<CoreCoord> gets_data;
     while (gets_data.size() == 0) {
         for (auto & [core, one_worker] : device_data.get_data()) {
             if (device_data.core_and_bank_present(core, 0) &&
@@ -984,14 +984,14 @@ inline void gen_rnd_dispatcher_packed_write_cmd(Device *device,
 
 inline bool gen_rnd_dispatcher_packed_write_large_cmd(Device *device,
                                                       CoreRange workers,
-                                                      vector<uint32_t>& cmds,
+                                                      std::vector<uint32_t>& cmds,
                                                       DeviceData& device_data,
                                                       uint32_t space_available) {
 
     int ntransactions = perf_test_g ? (CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_MAX_SUB_CMDS / 2) :
         ((std:: rand() % CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_MAX_SUB_CMDS) + 1);
 
-    vector<uint32_t> sizes;
+    std::vector<uint32_t> sizes;
     for (int i = 0; i < ntransactions; i++) {
         constexpr uint32_t max_pages = 4;
         uint32_t xfer_size_16b = (std::rand() % (dispatch_buffer_page_size_g * max_pages / hal.get_alignment(HalMemType::L1))) + 1;
@@ -1022,7 +1022,7 @@ inline bool gen_rnd_dispatcher_packed_write_large_cmd(Device *device,
     cmd.write_packed_large.alignment = hal.get_alignment(HalMemType::L1);
     add_bare_dispatcher_cmd(cmds, cmd);
 
-    vector<uint32_t> data;
+    std::vector<uint32_t> data;
     for (int i = 0; i < ntransactions; i++) {
         uint32_t xfer_size_bytes = sizes[i];
 
@@ -1061,7 +1061,7 @@ inline bool gen_rnd_dispatcher_packed_write_large_cmd(Device *device,
     return false;
 }
 
-inline void gen_dispatcher_host_write_cmd(vector<uint32_t>& cmds,
+inline void gen_dispatcher_host_write_cmd(std::vector<uint32_t>& cmds,
                                           DeviceData& device_data,
                                           uint32_t length) {
 
@@ -1075,7 +1075,7 @@ inline void gen_dispatcher_host_write_cmd(vector<uint32_t>& cmds,
     add_dispatcher_cmd(cmds, device_data.get_host_core(), device_data, cmd, length, false, true);
 }
 
-inline void gen_bare_dispatcher_host_write_cmd(vector<uint32_t>& cmds, uint32_t length) {
+inline void gen_bare_dispatcher_host_write_cmd(std::vector<uint32_t>& cmds, uint32_t length) {
 
     CQDispatchCmd cmd;
     memset(&cmd, 0, sizeof(CQDispatchCmd));
@@ -1087,7 +1087,7 @@ inline void gen_bare_dispatcher_host_write_cmd(vector<uint32_t>& cmds, uint32_t
     add_bare_dispatcher_cmd(cmds, cmd);
 }
 
-inline void gen_dispatcher_set_write_offset_cmd(vector<uint32_t>& cmds, uint32_t wo0, uint32_t wo1 = 0, uint32_t wo2 = 0) {
+inline void gen_dispatcher_set_write_offset_cmd(std::vector<uint32_t>& cmds, uint32_t wo0, uint32_t wo1 = 0, uint32_t wo2 = 0) {
 
     CQDispatchCmd cmd;
     memset(&cmd, 0, sizeof(CQDispatchCmd));
@@ -1100,7 +1100,7 @@ inline void gen_dispatcher_set_write_offset_cmd(vector<uint32_t>& cmds, uint32_t
     add_dispatcher_cmd(cmds, cmd, payload_length);
 }
 
-inline void gen_dispatcher_terminate_cmd(vector<uint32_t>& cmds) {
+inline void gen_dispatcher_terminate_cmd(std::vector<uint32_t>& cmds) {
 
     CQDispatchCmd cmd;
     memset(&cmd, 0, sizeof(CQDispatchCmd));
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp
index b660e49d921..d13994ded6e 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp
@@ -28,6 +28,7 @@ constexpr uint32_t DEFAULT_BATCH_SIZE_K = 512;
 //
 // Test read/write bw and latency from host/dram/l1
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 uint32_t iterations_g = DEFAULT_ITERATIONS;
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp
index 6f25b8c6c04..ae6c2cf33a3 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp
@@ -36,6 +36,7 @@ constexpr uint32_t MIN_PAGED_WRITE_ADDR = 512 * 1024; // Disable randomization b
 //
 // Times dispatching program to M cores, N processors, of various sizes, CBs, runtime args
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 uint32_t iterations_g = DEFAULT_ITERATIONS;
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp
index 510406c08f9..dd5e90f3ac2 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp
@@ -24,6 +24,7 @@ constexpr uint32_t MAX_ARGS = 255;
 //
 // Times dispatching program to M cores, N processors, of various sizes, CBs, runtime args
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 uint32_t iterations_g = DEFAULT_ITERATIONS;
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp
index 866f5193212..ba38f8ac8db 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp
@@ -53,6 +53,7 @@ constexpr uint32_t host_data_dirty_pattern = 0xbaadf00d;
 //
 // Times dispatching program to M cores, N processors, of various sizes, CBs, runtime args
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 uint32_t iterations_g = DEFAULT_ITERATIONS;
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp
index 4541164697f..984cb6e3483 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp
@@ -17,6 +17,7 @@
 #include "tt_metal/impl/debug/dprint_server.hpp"
 #include "tt_metal/test_utils/deprecated/tensor.hpp"
 
+using std::vector;
 using namespace tt;
 
 // took from bmm_op.cpp
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp
index f3e6a893ccd..ecef4cf8d6e 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp
@@ -18,6 +18,7 @@
 
 #define LAUNCH
 
+using std::vector;
 using namespace tt;
 
 // Given a tensor that is row-major datums, make it tilized
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp
index 81b30d3b56f..4bae04746bd 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp
@@ -10,6 +10,7 @@
 #include "kernels/traffic_gen_test.hpp"
 #include "tt_metal/impl/device/device.hpp"
 
+using std::vector;
 using namespace tt;
 
 
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp
index 431fe73700f..07a59aefad8 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp
@@ -10,6 +10,7 @@
 #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp"
 #include "kernels/traffic_gen_test.hpp"
 
+using std::vector;
 using namespace tt;
 
 
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp
index c251a567871..41bdccef04b 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp
@@ -9,6 +9,7 @@
 #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp"
 #include "kernels/traffic_gen_test.hpp"
 
+using std::vector;
 using namespace tt;
 
 
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp
index 7b9b45ba68c..6a8c1753b75 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp
@@ -10,6 +10,7 @@
 #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp"
 #include "kernels/traffic_gen_test.hpp"
 
+using std::vector;
 using namespace tt;
 
 
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp
index 3c501badf88..f311a276896 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp
@@ -10,6 +10,7 @@
 #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp"
 #include "kernels/traffic_gen_test.hpp"
 
+using std::vector;
 using namespace tt;
 
 
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp
index 1babd91d899..fb21f47cf07 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp
@@ -10,6 +10,7 @@
 #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp"
 #include "kernels/traffic_gen_test.hpp"
 
+using std::vector;
 using namespace tt;
 
 
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp
index ee9b74ea473..996b9361564 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp
@@ -10,6 +10,7 @@
 #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp"
 #include "kernels/traffic_gen_test.hpp"
 
+using std::vector;
 using namespace tt;
 
 
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp
index dec4e9ab3ca..1a33b8c655b 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp
@@ -10,6 +10,7 @@
 #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp"
 #include "kernels/traffic_gen_test.hpp"
 
+using std::vector;
 using namespace tt;
 
 
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel.cpp
index b0b8930326c..c111007d735 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel.cpp
@@ -10,6 +10,7 @@
 #include "kernels/traffic_gen_test.hpp"
 #include "tt_metal/impl/device/device.hpp"
 
+using std::vector;
 using namespace tt;
 
 
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp
index ed64139831d..8402cf52f6e 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp
@@ -10,6 +10,7 @@
 #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp"
 #include "kernels/traffic_gen_test.hpp"
 
+using std::vector;
 using namespace tt;
 
 
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp
index bd6708dca7d..5a06741d0df 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp
@@ -10,6 +10,7 @@
 #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp"
 #include "kernels/traffic_gen_test.hpp"
 
+using std::vector;
 using namespace tt;
 
 
diff --git a/tests/tt_metal/tt_metal/test_bcast.cpp b/tests/tt_metal/tt_metal/test_bcast.cpp
index b60816779f6..932390177b8 100644
--- a/tests/tt_metal/tt_metal/test_bcast.cpp
+++ b/tests/tt_metal/tt_metal/test_bcast.cpp
@@ -16,6 +16,7 @@
 #include "test_gold_impls.hpp"
 #include "constants.hpp"
 
+using std::vector;
 using namespace tt;
 using namespace constants;
 
diff --git a/tests/tt_metal/tt_metal/test_bmm.cpp b/tests/tt_metal/tt_metal/test_bmm.cpp
index 21f021714cc..c91b6de5a36 100644
--- a/tests/tt_metal/tt_metal/test_bmm.cpp
+++ b/tests/tt_metal/tt_metal/test_bmm.cpp
@@ -11,6 +11,7 @@
 #include "common/bfloat16.hpp"
 #include "test_gold_impls.hpp"
 
+using std::vector;
 using namespace tt;
 using namespace tt::tt_metal;
 
diff --git a/tests/tt_metal/tt_metal/test_clean_init.cpp b/tests/tt_metal/tt_metal/test_clean_init.cpp
index 770f0f34f4a..1e2ab1c8613 100644
--- a/tests/tt_metal/tt_metal/test_clean_init.cpp
+++ b/tests/tt_metal/tt_metal/test_clean_init.cpp
@@ -14,6 +14,7 @@
  * recover from a "bad" state.
 */
 
+using std::vector;
 using namespace tt::tt_metal;
 
 int main(int argc, char **argv) {
diff --git a/tests/tt_metal/tt_metal/test_compile_args.cpp b/tests/tt_metal/tt_metal/test_compile_args.cpp
index 794404b1a65..5daa58616c4 100644
--- a/tests/tt_metal/tt_metal/test_compile_args.cpp
+++ b/tests/tt_metal/tt_metal/test_compile_args.cpp
@@ -16,6 +16,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 bool test_compile_args(std::vector<uint32_t> compile_args_vec, tt_metal::Device *device) {
diff --git a/tests/tt_metal/tt_metal/test_compile_program.cpp b/tests/tt_metal/tt_metal/test_compile_program.cpp
index dac634833e2..a093bb91fcf 100644
--- a/tests/tt_metal/tt_metal/test_compile_program.cpp
+++ b/tests/tt_metal/tt_metal/test_compile_program.cpp
@@ -17,6 +17,7 @@
 #include "tt_metal/impl/device/device.hpp"
 #include "tt_metal/impl/kernels/kernel.hpp"
 
+using std::vector;
 using namespace tt;
 using namespace tt::tt_metal;
 
diff --git a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp
index 36909cb681d..7727c0a13ff 100644
--- a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp
+++ b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp
@@ -19,6 +19,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 std::string get_latest_kernel_binary_path(uint32_t mask, const std::shared_ptr<Kernel> kernel) {
diff --git a/tests/tt_metal/tt_metal/test_core_range_set.cpp b/tests/tt_metal/tt_metal/test_core_range_set.cpp
index 6140121289d..bb32b100d4a 100644
--- a/tests/tt_metal/tt_metal/test_core_range_set.cpp
+++ b/tests/tt_metal/tt_metal/test_core_range_set.cpp
@@ -17,6 +17,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 void check_program_is_mapped_to_correct_cores(const tt_metal::Program &program, const CoreRangeSet &core_range_set, const std::vector<uint32_t> &compute_kernel_args) {
diff --git a/tests/tt_metal/tt_metal/test_datacopy.cpp b/tests/tt_metal/tt_metal/test_datacopy.cpp
index 809d0a5003a..911916a5d0e 100644
--- a/tests/tt_metal/tt_metal/test_datacopy.cpp
+++ b/tests/tt_metal/tt_metal/test_datacopy.cpp
@@ -15,6 +15,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 namespace unary_datacopy {
diff --git a/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp b/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp
index 16795cccb8b..59f9d90fe27 100644
--- a/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp
+++ b/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp
@@ -16,6 +16,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 int main(int argc, char **argv) {
diff --git a/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp b/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp
index c7577a40435..d66f5441fc4 100644
--- a/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp
+++ b/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp
@@ -15,6 +15,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 namespace unary_datacopy {
diff --git a/tests/tt_metal/tt_metal/test_eltwise_binary.cpp b/tests/tt_metal/tt_metal/test_eltwise_binary.cpp
index 6583d62ecb1..7abd5863f8f 100644
--- a/tests/tt_metal/tt_metal/test_eltwise_binary.cpp
+++ b/tests/tt_metal/tt_metal/test_eltwise_binary.cpp
@@ -12,6 +12,7 @@
 #include "tt_metal/impl/dispatch/command_queue.hpp"
 #include "tt_metal/impl/device/device.hpp"
 
+using std::vector;
 using namespace tt;
 using namespace tt::tt_metal;
 
diff --git a/tests/tt_metal/tt_metal/test_flatten.cpp b/tests/tt_metal/tt_metal/test_flatten.cpp
index 430436cf829..6322da0dc4e 100644
--- a/tests/tt_metal/tt_metal/test_flatten.cpp
+++ b/tests/tt_metal/tt_metal/test_flatten.cpp
@@ -16,6 +16,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 uint32_t prod(vector<uint32_t> &shape) {
diff --git a/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp b/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp
index 8f6269b2c39..6a125cf8c21 100644
--- a/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp
+++ b/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp
@@ -16,6 +16,7 @@
 // This test is similar to test_matmul_large_block.
 // The only difference is that it uses generic_binary_reader_kernel instead of reader_matmul_blocked kernel.
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 // Given a tensor that is row-major datums, make it tilized
diff --git a/tests/tt_metal/tt_metal/test_gold_impls.hpp b/tests/tt_metal/tt_metal/test_gold_impls.hpp
index e2737044df6..e989339bbb7 100644
--- a/tests/tt_metal/tt_metal/test_gold_impls.hpp
+++ b/tests/tt_metal/tt_metal/test_gold_impls.hpp
@@ -61,7 +61,7 @@ struct BcastOp {
     // These constants above map to ops in llk_3c.h:
     // add_tiles_bcast, sub_tiles_bcast, mul_tiles_bcast
 
-    static const vector<Enum> all() { return { ADD, SUB, MUL }; }
+    static const std::vector<Enum> all() { return { ADD, SUB, MUL }; }
 };
 
 
@@ -122,7 +122,7 @@ inline std::vector<uint16_t> gold_bmm(
     const std::vector<uint32_t> shapeA,
     const std::vector<uint16_t>& A,
     const std::vector<uint32_t>& shapeB,
-    const vector<uint16_t>& B,
+    const std::vector<uint16_t>& B,
     bool acc16 = false
     )
 {
@@ -132,12 +132,12 @@ inline std::vector<uint16_t> gold_bmm(
     uint32_t K = shapeA[3]; TT_FATAL(shapeB[2] == K, "Error");
     uint32_t N = shapeB[3];
 
-    vector<uint32_t> shapeC{1, nb, M, N};
+    std::vector<uint32_t> shapeC{1, nb, M, N};
     TensAddr addrC(shapeC);
     TensAddr addrA(shapeA);
     TensAddr addrB(shapeB);
-    vector<uint16_t> result(addrC.numel());
-    vector<float> resultf(addrC.numel());
+    std::vector<uint16_t> result(addrC.numel());
+    std::vector<float> resultf(addrC.numel());
     std::fill(resultf.begin(), resultf.end(), 0);
 
     for (int ib = 0; ib < nb; ib++)
diff --git a/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp b/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp
index ee499eac073..c5677750107 100644
--- a/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp
+++ b/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp
@@ -19,6 +19,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 bool test_write_interleaved_sticks_and_then_read_interleaved_sticks(const tt::ARCH& arch) {
diff --git a/tests/tt_metal/tt_metal/test_matmul_large_block.cpp b/tests/tt_metal/tt_metal/test_matmul_large_block.cpp
index c3da4364db7..c05244e90df 100644
--- a/tests/tt_metal/tt_metal/test_matmul_large_block.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_large_block.cpp
@@ -15,6 +15,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 // Given a tensor that is row-major datums, make it tilized
diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp
index e8c63539ff2..8e0a625fa51 100644
--- a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp
@@ -16,6 +16,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 using namespace tt::tt_metal;
 
diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast.cpp
index d2ffdee3654..2ebe6804c8a 100644
--- a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast.cpp
@@ -16,6 +16,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 
diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp
index 21692b41c63..73f2df50cd8 100644
--- a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp
@@ -16,6 +16,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 // Given a tensor that is row-major datums, make it tilized
diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in1_mcast.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in1_mcast.cpp
index 567c7b2d0e8..ea75884ff97 100644
--- a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in1_mcast.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in1_mcast.cpp
@@ -16,6 +16,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 // Given a tensor that is row-major datums, make it tilized
diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_single_dram.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_single_dram.cpp
index ab87b99b88c..51f703323ff 100644
--- a/tests/tt_metal/tt_metal/test_matmul_multi_core_single_dram.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_single_dram.cpp
@@ -15,6 +15,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 // Given a tensor that is row-major datums, make it tilized
diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp
index 26a8773d4e7..13ad19b9b5d 100644
--- a/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp
@@ -15,6 +15,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 // Given a tensor that is row-major datums, make it tilized
diff --git a/tests/tt_metal/tt_metal/test_matmul_single_core.cpp b/tests/tt_metal/tt_metal/test_matmul_single_core.cpp
index 131e96d497f..f7b7da888f9 100644
--- a/tests/tt_metal/tt_metal/test_matmul_single_core.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_single_core.cpp
@@ -15,6 +15,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 // Given a tensor that is row-major datums, make it tilized
diff --git a/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp b/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp
index 72e910db512..0b68b76326e 100644
--- a/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp
@@ -16,6 +16,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 // Given a tensor that is row-major datums, make it tilized
diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp
index b4f52531989..dcd371a9d3d 100644
--- a/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp
@@ -15,6 +15,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 int main(int argc, char **argv) {
diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp
index bcafffbe44e..df3273a197b 100644
--- a/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp
@@ -14,6 +14,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 int main(int argc, char **argv) {
diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp
index bdfe897259b..4a58e403a1a 100644
--- a/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp
@@ -15,6 +15,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 int main(int argc, char **argv) {
diff --git a/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp b/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp
index 5a1e242b314..7a24596843b 100644
--- a/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp
+++ b/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp
@@ -16,6 +16,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // TODO: explain what test does
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 
 std::tuple<tt_metal::Program, tt_metal::KernelHandle, tt_metal::KernelHandle> create_program(
diff --git a/tests/tt_metal/tt_metal/test_multiple_programs.cpp b/tests/tt_metal/tt_metal/test_multiple_programs.cpp
index a7c0dd7e70f..eef89366d45 100644
--- a/tests/tt_metal/tt_metal/test_multiple_programs.cpp
+++ b/tests/tt_metal/tt_metal/test_multiple_programs.cpp
@@ -12,6 +12,7 @@
 #include "tt_metal/test_utils/deprecated/tensor.hpp"
 #include "test_tiles.hpp"
 
+using std::vector;
 using namespace tt;
 
 struct BinaryOpType {
diff --git a/tests/tt_metal/tt_metal/test_transpose_hc.cpp b/tests/tt_metal/tt_metal/test_transpose_hc.cpp
index 060ec8ac4ca..e3392ac0d23 100644
--- a/tests/tt_metal/tt_metal/test_transpose_hc.cpp
+++ b/tests/tt_metal/tt_metal/test_transpose_hc.cpp
@@ -13,6 +13,7 @@
 
 #include "test_tiles.hpp"
 
+using std::vector;
 using namespace tt;
 
 using std::uint32_t;
diff --git a/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp b/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp
index fcf74dbebfc..6d783347e7b 100644
--- a/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp
+++ b/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp
@@ -11,6 +11,7 @@
 #include "common/bfloat16.hpp"
 #include "test_gold_impls.hpp"
 
+using std::vector;
 using namespace tt;
 
 inline std::vector<uint32_t> gold_standard_untilize(std::vector<uint32_t> src_vec, std::vector<uint32_t> shape) {
diff --git a/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp b/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp
index 4e3ea25ad9f..5b3f3cd7851 100644
--- a/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp
+++ b/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp
@@ -7,6 +7,7 @@
 #include "tt_metal/impl/dispatch/command_queue.hpp"
 #include "tt_metal/detail/tt_metal.hpp"
 
+using std::vector;
 using namespace tt;
 using namespace tt::tt_metal;
 uint32_t NUM_TILES = 2048;
diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp b/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp
index 58689b51aa9..9be219332a0 100644
--- a/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp
@@ -13,6 +13,7 @@
 #include "tt_metal/detail/util.hpp"
 #include "tt_metal/host_api.hpp"
 
+using std::vector;
 using namespace tt;
 
 namespace unit_tests::initialize_semaphores {
diff --git a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp b/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp
index 7aeabc409a2..62418427c21 100644
--- a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp
@@ -13,6 +13,7 @@
 #include "tt_metal/test_utils/print_helpers.hpp"
 #include "tt_metal/test_utils/stimulus.hpp"
 
+using std::vector;
 using namespace tt::tt_metal;
 
 namespace basic_tests::buffer::banked {
diff --git a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp
index b57bff769f8..1df5ec9cdfd 100644
--- a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp
@@ -10,6 +10,7 @@
 #include "tt_metal/detail/tt_metal.hpp"
 #include "common/bfloat16.hpp"
 
+using std::vector;
 using namespace tt::tt_metal;
 
 namespace basic_tests::circular_buffer {
diff --git a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp
index 199aa429f88..1196f802b36 100644
--- a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp
@@ -9,6 +9,7 @@
 #include "tt_metal/host_api.hpp"
 #include "tt_metal/impl/buffers/circular_buffer.hpp"
 
+using std::vector;
 using namespace tt::tt_metal;
 
 namespace basic_tests::circular_buffer {
diff --git a/tests/tt_metal/tt_metal/unit_tests/common/device_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests/common/device_fixture.hpp
index 9ccaa93d4bd..c48639f4f36 100644
--- a/tests/tt_metal/tt_metal/unit_tests/common/device_fixture.hpp
+++ b/tests/tt_metal/tt_metal/unit_tests/common/device_fixture.hpp
@@ -29,7 +29,7 @@ class DeviceFixture : public ::testing::Test {
             num_devices_ = 2;
         }
 
-        vector<chip_id_t> ids;
+        std::vector<chip_id_t> ids;
         for (unsigned int id = 0; id < num_devices_; id++) {
             ids.push_back(id);
         }
@@ -94,7 +94,7 @@ class GalaxyFixture : public ::testing::Test {
     void InitializeDevices()
     {
         const size_t num_devices = tt::tt_metal::GetNumAvailableDevices();
-        vector<chip_id_t> ids;
+        std::vector<chip_id_t> ids;
         for (uint32_t id = 0; id < num_devices; id++)
         {
             ids.push_back(id);
diff --git a/tests/tt_metal/tt_metal/unit_tests/common/n300_device_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests/common/n300_device_fixture.hpp
index 355c0b03ab3..08e57a5cb2a 100644
--- a/tests/tt_metal/tt_metal/unit_tests/common/n300_device_fixture.hpp
+++ b/tests/tt_metal/tt_metal/unit_tests/common/n300_device_fixture.hpp
@@ -23,7 +23,7 @@ class N300DeviceFixture : public ::testing::Test {
         num_devices_ = tt::tt_metal::GetNumAvailableDevices();
         if (arch_ == tt::ARCH::WORMHOLE_B0 and tt::tt_metal::GetNumAvailableDevices() == 2 and
             tt::tt_metal::GetNumPCIeDevices() == 1) {
-            vector<chip_id_t> ids;
+            std::vector<chip_id_t> ids;
             for (unsigned int id = 0; id < num_devices_; id++) {
                 ids.push_back(id);
             }
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp
index ac551ffe7f2..d12d89bd88f 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp
@@ -10,6 +10,7 @@
 #include "tt_metal/test_utils/stimulus.hpp"
 #include "test_golden_impls.hpp"
 
+using std::map;
 using namespace tt;
 using namespace tt::test_utils;
 using namespace tt::test_utils::df;
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp
index cb9f7bbaf69..314bf2fb127 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp
@@ -6,6 +6,7 @@
 #include "tt_metal/test_utils/stimulus.hpp"
 
 
+using std::vector;
 using namespace tt;
 using namespace tt::test_utils;
 
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_dropout_sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_dropout_sfpu_compute.cpp
index c3269675049..655aeb87cfe 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_dropout_sfpu_compute.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_dropout_sfpu_compute.cpp
@@ -8,6 +8,7 @@
 #include "tt_metal/host_api.hpp"
 #include "common/bfloat16.hpp"
 
+using std::vector;
 using namespace tt;
 using namespace tt::tt_metal;
 
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp
index 6988cfb5277..4afc02acaa8 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp
@@ -12,6 +12,7 @@
 #include "tt_metal/detail/tt_metal.hpp"
 #include "tests/tt_metal/test_utils/packing.hpp"
 
+using std::vector;
 
 namespace unit_tests::compute {
 
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp
index 95c902c026f..b55c6329938 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp
@@ -7,6 +7,7 @@
 #include "tt_metal/common/bfloat8.hpp"
 #include "tt_metal/test_utils/comparison.hpp"
 
+using std::vector;
 using namespace tt;
 using namespace tt::test_utils;
 
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp
index 596beb20e8d..8439126997a 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp
@@ -20,6 +20,7 @@
 #include "common/test_tiles.hpp"
 #include "common/bfloat16.hpp"
 
+using std::vector;
 using namespace tt;
 using namespace tt::tt_metal;
 using namespace tt::test_utils;
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp
index 9a80a7d8819..0a7822c6fbf 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp
@@ -17,6 +17,8 @@
 #include "tt_metal/test_utils/print_helpers.hpp"
 #include "tt_metal/test_utils/stimulus.hpp"
 
+using std::map;
+using std::vector;
 using namespace tt;
 using namespace tt::test_utils;
 using namespace tt::test_utils::df;
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp
index 3cfd8a8e7cc..19baa412647 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp
@@ -17,6 +17,8 @@
 #include "tt_metal/test_utils/print_helpers.hpp"
 #include "tt_metal/test_utils/stimulus.hpp"
 
+using std::map;
+using std::vector;
 using namespace tt;
 using namespace tt::test_utils;
 using namespace tt::test_utils::df;
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_transpose.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_transpose.cpp
index c6b0ccc87dd..76736184a2e 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_transpose.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_transpose.cpp
@@ -19,6 +19,7 @@
 #include "test_golden_impls.hpp"
 #include "common/test_tiles.hpp"
 
+using std::vector;
 using namespace tt;
 using namespace tt::test_utils;
 using namespace tt::test_utils::df;
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp
index 91071283a51..5dbdcc7ab6d 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp
@@ -18,6 +18,7 @@
 #include "tt_metal/test_utils/stimulus.hpp"
 #include "test_golden_impls.hpp"
 
+using std::vector;
 using namespace tt;
 using namespace tt::test_utils;
 using namespace tt::tt_metal;
diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp
index d475d3c897b..5729e1a6c4b 100644
--- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp
@@ -7,6 +7,8 @@
 #include "tt_metal/common/core_coord.hpp"
 #include "core_coord_fixture.hpp"
 
+using std::vector;
+
 namespace basic_tests::CoreRange {
 
 TEST_F(CoreCoordHarness, TestCoreRangeIterator)
diff --git a/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp b/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp
index 081d72fb3a1..791f033d127 100644
--- a/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp
@@ -16,6 +16,7 @@
 #include "tt_metal/test_utils/print_helpers.hpp"
 #include "tt_metal/test_utils/stimulus.hpp"
 
+using std::vector;
 using namespace tt;
 using namespace tt::test_utils;
 using namespace tt::test_utils::df;
diff --git a/tests/tt_metal/tt_metal/unit_tests/fast_dispatch_kernels/test_write_host.cpp b/tests/tt_metal/tt_metal/unit_tests/fast_dispatch_kernels/test_write_host.cpp
index e7640e332d0..f70039820a5 100644
--- a/tests/tt_metal/tt_metal/unit_tests/fast_dispatch_kernels/test_write_host.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/fast_dispatch_kernels/test_write_host.cpp
@@ -12,6 +12,7 @@
 #include "tt_metal/test_utils/env_vars.hpp"
 #include "tt_metal/common/math.hpp"
 
+using std::vector;
 using namespace tt::tt_metal;
 
 // TODO: Remove dependency on "tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h" and remove globals
diff --git a/tests/tt_metal/tt_metal/unit_tests/multichip/ring_gather_kernels.cpp b/tests/tt_metal/tt_metal/unit_tests/multichip/ring_gather_kernels.cpp
index 75c8edc0ae3..6cad6f5d625 100644
--- a/tests/tt_metal/tt_metal/unit_tests/multichip/ring_gather_kernels.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/multichip/ring_gather_kernels.cpp
@@ -18,6 +18,7 @@
 #include "tt_metal/test_utils/print_helpers.hpp"
 #include "tt_metal/test_utils/stimulus.hpp"
 
+using std::vector;
 using namespace tt;
 using namespace tt::test_utils;
 using namespace tt::test_utils::df;
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/basic/test_device_init.cpp b/tests/tt_metal/tt_metal/unit_tests_common/basic/test_device_init.cpp
index 6e82d8b87b4..f4dfae4d653 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/basic/test_device_init.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_common/basic/test_device_init.cpp
@@ -17,6 +17,7 @@
 #include "tt_metal/impl/device/device.hpp"
 #include "tt_metal/impl/device/device_pool.hpp"
 
+using std::vector;
 using namespace tt;
 using namespace tt::test_utils;
 
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp
index 510787d324b..5ee4c1fa79c 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp
+++ b/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp
@@ -48,7 +48,7 @@ class CommonFixture: public ::testing::Test {
 
 protected:
     tt::ARCH arch_;
-    vector<tt::tt_metal::v1::DeviceHandle> devices_;
+    std::vector<tt::tt_metal::v1::DeviceHandle> devices_;
     bool slow_dispatch_;
     bool has_remote_devices_;
 
@@ -70,7 +70,7 @@ class CommonFixture: public ::testing::Test {
         // An extra flag for if we have remote devices, as some tests are disabled for fast
         // dispatch + remote devices.
         this->has_remote_devices_ = num_devices > num_pci_devices;
-        vector<chip_id_t> ids;
+        std::vector<chip_id_t> ids;
         for (unsigned int id = 0; id < num_devices; id++) {
             if (SkipTest(id))
                 continue;
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/test_dispatch.cpp b/tests/tt_metal/tt_metal/unit_tests_common/common/test_dispatch.cpp
index 5f3332ebcc9..d8e3a4fefe1 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/common/test_dispatch.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_common/common/test_dispatch.cpp
@@ -6,6 +6,8 @@
 
 #include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp"
 
+using std::vector;
+
 // Test sync w/ semaphores betweeen eth/tensix cores
 // Test will hang in the kernel if the sync doesn't work properly
 static void test_sems_across_core_types(CommonFixture *fixture,
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/test_utils.hpp b/tests/tt_metal/tt_metal/unit_tests_common/common/test_utils.hpp
index 3c3f2fa7d02..bc2356eb19d 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/common/test_utils.hpp
+++ b/tests/tt_metal/tt_metal/unit_tests_common/common/test_utils.hpp
@@ -66,7 +66,7 @@ inline bool StringContainsWithWildcard(const string& s1, const string& s2, char
 
 // Check whether the given file contains a list of strings. Doesn't check for
 // strings between lines in the file.
-inline bool FileContainsAllStrings(string file_name, const vector<string> &must_contain) {
+inline bool FileContainsAllStrings(string file_name, const std::vector<string> &must_contain) {
     std::fstream log_file;
     if (!OpenFile(file_name, log_file, std::fstream::in))
         return false;
@@ -78,7 +78,7 @@ inline bool FileContainsAllStrings(string file_name, const vector<string> &must_
         string line;
         while (getline(log_file, line)) {
             // Check for all target strings in the current line
-            vector<string> found_on_current_line;
+            std::vector<string> found_on_current_line;
             for (const string &s : must_contain_set) {
                 if (StringContainsWithWildcard(s, line, '*'))
                     found_on_current_line.push_back(s);
@@ -110,7 +110,7 @@ inline bool FileContainsAllStrings(string file_name, const vector<string> &must_
 
 // Check whether the given file contains a list of strings (in order). Doesn't check for strings
 // between lines in a file.
-inline bool FileContainsAllStringsInOrder(string file_name, const vector<string> &must_contain) {
+inline bool FileContainsAllStringsInOrder(string file_name, const std::vector<string> &must_contain) {
     std::fstream log_file;
     if (!OpenFile(file_name, log_file, std::fstream::in))
         return false;
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp
index 3a87f5d8365..b7b5d17b241 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp
@@ -17,6 +17,7 @@
 #include "tests/tt_metal/test_utils/print_helpers.hpp"
 #include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp"
 
+using std::vector;
 using namespace tt;
 using namespace tt::test_utils;
 namespace unit_tests_common::matmul::test_matmul_X_tile{
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_large_block.cpp b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_large_block.cpp
index c808649943c..35722184b45 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_large_block.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_large_block.cpp
@@ -16,6 +16,7 @@
 #include "tests/tt_metal/test_utils/print_helpers.hpp"
 #include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp"
 
+using std::vector;
 using namespace tt;
 using namespace tt::test_utils;
 
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp
index 2eb2e6975a4..353e9084340 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp
@@ -16,6 +16,7 @@
 #include "tests/tt_metal/test_utils/tilization.hpp"
 #include "tests/tt_metal/test_utils/print_helpers.hpp"
 #include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp"
+using std::vector;
 using namespace tt;
 
 namespace unit_tests_common::matmul::test_matmul_multi_core_X_dram {
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp
index 1a24e668847..ff60ec1b853 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp
@@ -16,6 +16,7 @@
 #include "tests/tt_metal/test_utils/tilization.hpp"
 #include "tests/tt_metal/test_utils/print_helpers.hpp"
 #include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp"
+using std::vector;
 using namespace tt;
 namespace unit_tests_common::matmul::test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast {
 
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp
index 416a13037bd..6f7aee8b645 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp
@@ -16,6 +16,7 @@
 #include "tests/tt_metal/test_utils/tilization.hpp"
 #include "tests/tt_metal/test_utils/print_helpers.hpp"
 #include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp"
+using std::vector;
 using namespace tt;
 
 namespace unit_tests_common::matmul::test_matmul_multi_core_multi_dram_inX_mcast {
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_single_core.cpp b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_single_core.cpp
index 6c4fe79c023..5db120dd9ba 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_single_core.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_single_core.cpp
@@ -17,6 +17,7 @@
 #include "tests/tt_metal/test_utils/print_helpers.hpp"
 #include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp"
 
+using std::vector;
 using namespace tt;
 
 namespace unit_tests_common::matmul::test_matmul_single_core{
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/test_flatten.cpp b/tests/tt_metal/tt_metal/unit_tests_common/compute/test_flatten.cpp
index 76e7bb007ad..5dbadc80812 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/compute/test_flatten.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_common/compute/test_flatten.cpp
@@ -14,6 +14,7 @@
 #include "llrt/llrt.hpp"
 
 
+using std::vector;
 using namespace tt;
 
 namespace gtest_smoke::test_flatten{
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_before_finish.cpp b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_before_finish.cpp
index 7128ad2dd25..0370b51f3f2 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_before_finish.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_before_finish.cpp
@@ -7,6 +7,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // A test for checking that the finish command can wait for the last dprint.
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 using namespace tt::tt_metal;
 
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_link_training.cpp b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_link_training.cpp
index 8ba4b94cb89..dd23509745b 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_link_training.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_link_training.cpp
@@ -8,6 +8,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // A test for checking watcher polling the eth link training counter.
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 using namespace tt::tt_metal;
 
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize_delays.cpp b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize_delays.cpp
index ef31e02d8a3..600872d58ac 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize_delays.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize_delays.cpp
@@ -13,6 +13,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // A test for checking watcher NOC sanitization.
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 using namespace tt::tt_metal;
 
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp
index 46b720e6508..1a21a43a187 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp
@@ -8,6 +8,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // A test for checking watcher pause feature.
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 using namespace tt::tt_metal;
 
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp
index 4dafe78f539..ffc9fb62e57 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp
@@ -10,6 +10,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////
 // A test for checking watcher waypoints.
 //////////////////////////////////////////////////////////////////////////////////////////
+using std::vector;
 using namespace tt;
 using namespace tt::tt_metal;
 
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp
index ec8238158d1..2ad2bb7842d 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp
@@ -14,6 +14,7 @@
 #include "tt_metal/detail/tt_metal.hpp"
 #include "tt_metal/impl/kernels/kernel.hpp"
 
+using std::vector;
 using namespace tt::tt_metal;
 
 struct CBConfig {
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp
index 65b6b20d57f..755f5892db0 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp
@@ -18,6 +18,7 @@
 #include "tt_metal/host_api.hpp"
 
 
+using std::vector;
 using namespace tt;
 using namespace tt::tt_metal;
 
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
index fc6345f5e9a..e380c41d67c 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
@@ -13,6 +13,7 @@
 #include "tt_metal/test_utils/print_helpers.hpp"
 #include "tt_metal/impl/device/device.hpp"
 
+using std::vector;
 using namespace tt::tt_metal;
 
 struct BufferStressTestConfig {
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp
index 03e6eb1004c..e4eceaffb9c 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp
@@ -16,6 +16,7 @@
 #include "tt_metal/impl/dispatch/command_queue.hpp"
 #include "tt_metal/impl/buffers/circular_buffer.hpp"
 
+using std::vector;
 using namespace tt::tt_metal;
 
 namespace host_cq_test_utils {
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp
index 5a772063742..023462a6cd2 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp
@@ -12,6 +12,7 @@
 #include "tt_metal/impl/event/event.hpp"
 #include "tt_metal/impl/dispatch/command_queue.hpp"
 
+using std::vector;
 using namespace tt::tt_metal;
 
 enum class DataMovementMode: uint8_t {
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp
index 28e0089323f..83746fe8a54 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp
@@ -19,6 +19,8 @@
 #include "tt_metal/test_utils/stimulus.hpp"
 #include "tt_metal/impl/device/device.hpp"
 
+using std::map;
+using std::vector;
 using namespace tt;
 using namespace tt::test_utils;
 using namespace tt::test_utils::df;
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_EnqueueProgram.cpp
index 1dd077cb1b5..ae36623be52 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_EnqueueProgram.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_EnqueueProgram.cpp
@@ -21,6 +21,7 @@
 #include "tt_metal/test_utils/stimulus.hpp"
 
 
+using std::vector;
 using namespace tt;
 using namespace tt::test_utils;
 using namespace tt::test_utils::df;
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp
index 2d67bad56d3..53d9f0d5707 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp
@@ -19,6 +19,7 @@
 #include "tt_metal/test_utils/print_helpers.hpp"
 #include "tt_metal/test_utils/stimulus.hpp"
 
+using std::vector;
 using namespace tt;
 using namespace tt::test_utils;
 using namespace tt::test_utils::df;
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp
index d339c115865..3fc76d32d74 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp
@@ -22,6 +22,8 @@
 #include "tt_metal/test_utils/stimulus.hpp"
 #include "tt_metal/impl/device/device.hpp"
 
+using std::map;
+using std::vector;
 using namespace tt;
 using namespace tt::test_utils;
 using namespace tt::tt_metal;
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp
index 6a6e7517591..4e407df6d4e 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp
@@ -13,6 +13,7 @@
 #include "tt_metal/host_api.hpp"
 #include "tt_metal/detail/tt_metal.hpp"
 
+using std::vector;
 using namespace tt::tt_metal;
 
 struct CBConfig {
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp
index cef5a8d0c18..1a933c8a2f9 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp
@@ -8,6 +8,7 @@
 #include "tt_metal/host_api.hpp"
 #include "tt_metal/impl/device/device.hpp"
 
+using std::vector;
 using namespace tt::tt_metal;
 
 struct TestBufferConfig {
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp
index b6c3e82791c..1c08c86fa15 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp
@@ -13,6 +13,7 @@
 #include "tt_metal/impl/event/event.hpp"
 #include "tt_metal/impl/device/device.hpp"
 
+using std::vector;
 using namespace tt::tt_metal;
 
 namespace local_test_functions {
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
index f294316c6e4..6932ab11955 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
@@ -11,6 +11,7 @@
 #include "tt_metal/test_utils/env_vars.hpp"
 #include "tt_metal/impl/device/device.hpp"
 
+using std::vector;
 using namespace tt::tt_metal;
 
 
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp
index e99ea309d5a..a42dd078797 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp
@@ -29,7 +29,7 @@ struct BufferStressTestConfig {
 
 inline std::vector<uint32_t> generate_arange_vector(uint32_t size_bytes, uint32_t start = 0) {
     TT_FATAL(size_bytes % sizeof(uint32_t) == 0, "Error");
-    vector<uint32_t> src(size_bytes / sizeof(uint32_t), 0);
+    std::vector<uint32_t> src(size_bytes / sizeof(uint32_t), 0);
 
     for (uint32_t i = 0; i < src.size(); i++) {
         src.at(i) = start + i;
diff --git a/tests/tt_metal/tt_metal/unit_tests_frequent/tests/run_many_times.cpp b/tests/tt_metal/tt_metal/unit_tests_frequent/tests/run_many_times.cpp
index 255ce5d72e6..75116172d4d 100644
--- a/tests/tt_metal/tt_metal/unit_tests_frequent/tests/run_many_times.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_frequent/tests/run_many_times.cpp
@@ -7,6 +7,7 @@
 #include "tt_metal/host_api.hpp"
 #include "tt_metal/impl/device/device.hpp"
 
+using std::vector;
 using namespace tt;
 using namespace tt::tt_metal;
 
diff --git a/tt_metal/common/base.hpp b/tt_metal/common/base.hpp
index 7000e13cd9d..6b36d02ee7e 100644
--- a/tt_metal/common/base.hpp
+++ b/tt_metal/common/base.hpp
@@ -7,56 +7,22 @@
  */
 #pragma once
 
-#include <array>
 #include <cstdint>
-#include <iostream>
-#include <array>
-#include <vector>
-#include <map>
 
-#include "tt_metal/common/tt_backend_api_types.hpp" // These are the types exported to frontend team...
-#include "tt_metal/common/assert.hpp"
-#include "hostdevcommon/kernel_structs.h"
-#include "eth_l1_address_map.h"
-#include "common/constants.hpp"
-#include "common/base_types.hpp"
-
-using std::array;
-using std::ostream;
-using std::uint8_t;
-using std::uint32_t;
-using std::uint64_t;
-using std::vector;
-using std::string;
-using std::size_t;
-using std::map;
+// DO NOT ADD MORE CODE TO THIS FILE
+// THIS FILE POLLUTES ALL TRANSLATION UNITS - tt_metal, ttnn, programming examples, tests, customer code
 
+// FIXME: At least put this in tt namespace
 inline constexpr uint32_t align(uint32_t addr, uint32_t alignment) { return ((addr - 1) | (alignment - 1)) + 1; }
 
-
-namespace tt
-{
-
-/**
- * @brief Specifies the target devices on which the graph can be run.
-*/
-enum class TargetDevice : uint8_t
-{
-    Silicon = 0,
-    Simulator = 1,
-    Invalid = 0xFF,
-};
-
-constexpr uint32_t MAX_AVAILABLE_CHIPS = 16;
-
-struct pair_hash {
-    template <class T1, class T2>
-    std::size_t operator()(const std::pair<T1,T2> &p) const
-    {
-        auto h1 = std::hash<T1>{}(p.first);
-        auto h2 = std::hash<T2>{}(p.second);
-        return h1 ^ h2;
-    }
-};
-
-} // end namespace tt
+namespace tt {
+  /**
+   * @brief Specifies the target devices on which the graph can be run.
+  */
+  enum class TargetDevice : std::uint8_t
+  {
+      Silicon = 0,
+      Simulator = 1,
+      Invalid = 0xFF,
+  };
+}
diff --git a/tt_metal/common/test_common.hpp b/tt_metal/common/test_common.hpp
index eeaa8587d64..6cdf825d0dc 100644
--- a/tt_metal/common/test_common.hpp
+++ b/tt_metal/common/test_common.hpp
@@ -18,12 +18,12 @@
 #include <sstream>
 #include "common/metal_soc_descriptor.h"
 
-// Needed for TargetDevice enum
-#include "common/base.hpp"
 
-inline std::string get_soc_description_file(const tt::ARCH &arch, tt::TargetDevice target_device, string output_dir = "") {
+#include "tt_metal/common/base.hpp"
+
+inline std::string get_soc_description_file(const tt::ARCH &arch, tt::TargetDevice target_device, std::string output_dir = "") {
     // Ability to skip this runtime opt, since trimmed SOC desc limits which DRAM channels are available.
-    string tt_metal_home;
+    std::string tt_metal_home;
     if (getenv("TT_METAL_HOME")) {
         tt_metal_home = getenv("TT_METAL_HOME");
     } else {
diff --git a/tt_metal/impl/buffers/buffer.hpp b/tt_metal/impl/buffers/buffer.hpp
index ec3cbb22aec..77c69707853 100644
--- a/tt_metal/impl/buffers/buffer.hpp
+++ b/tt_metal/impl/buffers/buffer.hpp
@@ -13,7 +13,6 @@
 #include "common/core_coord.hpp"
 #include "common/tt_backend_api_types.hpp"
 #include "hostdevcommon/common_values.hpp"
-#include "tt_metal/common/base.hpp"
 #include "tt_metal/common/constants.hpp"
 #include "tt_metal/common/math.hpp"
 #include "tt_metal/impl/allocator/allocator_types.hpp"
diff --git a/tt_metal/impl/debug/dprint_server.cpp b/tt_metal/impl/debug/dprint_server.cpp
index 880f866ae9e..268bc8ad3fb 100644
--- a/tt_metal/impl/debug/dprint_server.cpp
+++ b/tt_metal/impl/debug/dprint_server.cpp
@@ -30,6 +30,7 @@ using std::string;
 using std::to_string;
 using std::cout;
 using std::endl;
+using std::ostream;
 using std::setw;
 using std::flush;
 using std::tuple;
@@ -154,7 +155,7 @@ struct DebugPrintServerContext {
 
     // A map from Device -> Core Range, which is used to determine which cores on which devices
     // to scan for print data. Also a lock for editing it.
-    std::map<Device*, vector<CoreDescriptor>> device_to_core_range_;
+    std::map<Device*, std::vector<CoreDescriptor>> device_to_core_range_;
     std::map<Device*, bool> device_reads_dispatch_cores_;  // True if given device reads any dispatch cores. Used to
                                                            // know whether dprint can be compiled out.
     std::mutex device_to_core_range_lock_;
@@ -326,7 +327,7 @@ void WriteInitMagic(Device *device, const CoreCoord& phys_core, int hart_id, boo
 
     // TODO(AP): this could use a cleanup - need a different mechanism to know if a kernel is running on device.
     // Force wait for first kernel launch by first writing a non-zero and waiting for a zero.
-    vector<uint32_t> initbuf = vector<uint32_t>(DPRINT_BUFFER_SIZE / sizeof(uint32_t), 0);
+    std::vector<uint32_t> initbuf = std::vector<uint32_t>(DPRINT_BUFFER_SIZE / sizeof(uint32_t), 0);
     initbuf[0] = uint32_t(enabled ? DEBUG_PRINT_SERVER_STARTING_MAGIC : DEBUG_PRINT_SERVER_DISABLED_MAGIC);
     tt::llrt::write_hex_vec_to_core(device->id(), phys_core, initbuf, base_addr);
 } // WriteInitMagic
@@ -339,7 +340,7 @@ bool CheckInitMagicCleared(Device *device, const CoreCoord& phys_core, int hart_
     // compute the buffer address for the requested hart
     uint32_t base_addr = GetDprintBufAddr(device, phys_core, hart_id);
 
-    vector<uint32_t> initbuf = { DEBUG_PRINT_SERVER_STARTING_MAGIC };
+    std::vector<uint32_t> initbuf = { DEBUG_PRINT_SERVER_STARTING_MAGIC };
     auto result = tt::llrt::read_hex_vec_from_core(device->id(), phys_core, base_addr, 4);
     return (result[0] != initbuf[0]);
 } // CheckInitMagicCleared
@@ -445,13 +446,13 @@ void DebugPrintServerContext::AttachDevice(Device* device) {
 
     // If RTOptions doesn't enable DPRINT on this device, return here and don't actually attach it
     // to the server.
-    vector<chip_id_t> chip_ids = tt::llrt::OptionsG.get_feature_chip_ids(tt::llrt::RunTimeDebugFeatureDprint);
+    std::vector<chip_id_t> chip_ids = tt::llrt::OptionsG.get_feature_chip_ids(tt::llrt::RunTimeDebugFeatureDprint);
     if (!tt::llrt::OptionsG.get_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint))
         if (std::find(chip_ids.begin(), chip_ids.end(), device->id()) == chip_ids.end())
             return;
 
     // Core range depends on whether dprint_all_cores flag is set.
-    vector<CoreDescriptor> print_cores_sanitized;
+    std::vector<CoreDescriptor> print_cores_sanitized;
     for (CoreType core_type : {CoreType::WORKER, CoreType::ETH}) {
         if (tt::llrt::OptionsG.get_feature_all_cores(tt::llrt::RunTimeDebugFeatureDprint, core_type) ==
             tt::llrt::RunTimeDebugClassAll) {
@@ -494,7 +495,7 @@ void DebugPrintServerContext::AttachDevice(Device* device) {
                 tt::llrt::get_core_type_name(core_type));
         } else {
             // No "all cores" option provided, which means print from the cores specified by the user
-            vector<CoreCoord>& print_cores =
+            std::vector<CoreCoord>& print_cores =
                 tt::llrt::OptionsG.get_feature_cores(tt::llrt::RunTimeDebugFeatureDprint)[core_type];
 
             // We should also validate that the cores the user specified are valid worker cores.
@@ -555,7 +556,7 @@ void DebugPrintServerContext::AttachDevice(Device* device) {
 
 void DebugPrintServerContext::DetachDevice(Device* device) {
     // Don't detach the device if it's disabled by env vars - in this case it wasn't attached.
-    vector<chip_id_t> chip_ids = tt::llrt::OptionsG.get_feature_chip_ids(tt::llrt::RunTimeDebugFeatureDprint);
+    std::vector<chip_id_t> chip_ids = tt::llrt::OptionsG.get_feature_chip_ids(tt::llrt::RunTimeDebugFeatureDprint);
     if (!tt::llrt::OptionsG.get_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint))
         if (std::find(chip_ids.begin(), chip_ids.end(), device->id()) == chip_ids.end())
             return;
@@ -925,7 +926,7 @@ bool DebugPrintServerContext::PeekOneHartNonBlocking(
         // with rpos not aligned to wpos
 
         // write back to device - update rpos only
-        vector<uint32_t> rposbuf;
+        std::vector<uint32_t> rposbuf;
         rposbuf.push_back(rpos);
         uint32_t offs = DebugPrintMemLayout().rpos_offs();
         tt::llrt::write_hex_vec_to_core(chip_id, phys_core, rposbuf, base_addr+offs);
@@ -956,7 +957,7 @@ void DebugPrintServerContext::PollPrintData(uint32_t hart_mask) {
         }
 
         // Make a copy of the device->core map, so that it can be modified while polling.
-        std::map<Device*, vector<CoreDescriptor>> device_to_core_range_copy;
+        std::map<Device*, std::vector<CoreDescriptor>> device_to_core_range_copy;
         device_to_core_range_lock_.lock();
         device_to_core_range_copy = device_to_core_range_;
 
diff --git a/tt_metal/impl/debug/noc_logging.cpp b/tt_metal/impl/debug/noc_logging.cpp
index 2d07fa593fe..2ef251ae58d 100644
--- a/tt_metal/impl/debug/noc_logging.cpp
+++ b/tt_metal/impl/debug/noc_logging.cpp
@@ -97,7 +97,7 @@ void ClearNocData(Device *device) {
         CoreCoord phys_core = device->physical_core_from_logical_core(logical_core);
         for (int risc_id = 0; risc_id < GetNumRiscs(logical_core); risc_id++) {
             uint64_t addr = GetDprintBufAddr(device, phys_core, risc_id);
-            vector<uint32_t> initbuf = vector<uint32_t>(DPRINT_BUFFER_SIZE / sizeof(uint32_t), 0);
+            std::vector<uint32_t> initbuf = std::vector<uint32_t>(DPRINT_BUFFER_SIZE / sizeof(uint32_t), 0);
             tt::llrt::write_hex_vec_to_core(device->id(), phys_core, initbuf, addr);
         }
     }
diff --git a/tt_metal/impl/debug/sanitize_noc_host.hpp b/tt_metal/impl/debug/sanitize_noc_host.hpp
index 9dcd5c8a2d0..58029fe339d 100644
--- a/tt_metal/impl/debug/sanitize_noc_host.hpp
+++ b/tt_metal/impl/debug/sanitize_noc_host.hpp
@@ -24,7 +24,7 @@ namespace tt {
 
 #define DEBUG_VALID_ETH_ADDR(a, l) (((a) >= MEM_ETH_BASE) && ((a) + (l) <= MEM_ETH_BASE + MEM_ETH_SIZE))
 
-static bool coord_found_p(vector<CoreCoord>coords, CoreCoord core) {
+static bool coord_found_p(std::vector<CoreCoord>coords, CoreCoord core) {
     for (CoreCoord item : coords) {
         if (item == core) return true;
     }
diff --git a/tt_metal/impl/debug/watcher_device_reader.cpp b/tt_metal/impl/debug/watcher_device_reader.cpp
index e5446268717..fdcf5638970 100644
--- a/tt_metal/impl/debug/watcher_device_reader.cpp
+++ b/tt_metal/impl/debug/watcher_device_reader.cpp
@@ -114,11 +114,11 @@ const launch_msg_t* get_valid_launch_message(const mailboxes_t *mbox_data) {
 namespace tt::watcher {
 
 WatcherDeviceReader::WatcherDeviceReader(
-    FILE *f, Device *device, vector<string> &kernel_names, void (*set_watcher_exception_message)(const string &)) :
+    FILE *f, Device *device, std::vector<string> &kernel_names, void (*set_watcher_exception_message)(const string &)) :
     f(f), device(device), kernel_names(kernel_names), set_watcher_exception_message(set_watcher_exception_message) {
     // On init, read out eth link retraining register so that we can see if retraining has occurred. WH only for now.
     if (device->arch() == ARCH::WORMHOLE_B0 && tt::llrt::OptionsG.get_watcher_enabled()) {
-        vector<uint32_t> read_data;
+        std::vector<uint32_t> read_data;
         for (const CoreCoord &eth_core : device->get_active_ethernet_cores()) {
             CoreCoord phys_core = device->ethernet_core_from_logical_core(eth_core);
             read_data = tt::llrt::read_hex_vec_from_core(
@@ -131,7 +131,7 @@ WatcherDeviceReader::WatcherDeviceReader(
 WatcherDeviceReader::~WatcherDeviceReader() {
     // On close, read out eth link retraining register so that we can see if retraining has occurred.
     if (device->arch() == ARCH::WORMHOLE_B0 && tt::llrt::OptionsG.get_watcher_enabled()) {
-        vector<uint32_t> read_data;
+        std::vector<uint32_t> read_data;
         for (const CoreCoord &eth_core : device->get_active_ethernet_cores()) {
             CoreCoord phys_core = device->ethernet_core_from_logical_core(eth_core);
             read_data = tt::llrt::read_hex_vec_from_core(
diff --git a/tt_metal/impl/debug/watcher_device_reader.hpp b/tt_metal/impl/debug/watcher_device_reader.hpp
index 7f60ad5d4cf..0e0226eebbc 100644
--- a/tt_metal/impl/debug/watcher_device_reader.hpp
+++ b/tt_metal/impl/debug/watcher_device_reader.hpp
@@ -24,7 +24,7 @@ typedef struct {
 class WatcherDeviceReader {
     public:
      WatcherDeviceReader(
-         FILE *f, Device *device, vector<string> &kernel_names, void (*set_watcher_exception_message)(const string &));
+         FILE *f, Device *device, std::vector<std::string> &kernel_names, void (*set_watcher_exception_message)(const std::string &));
      ~WatcherDeviceReader();
      void Dump(FILE *file = nullptr);
 
@@ -32,9 +32,9 @@ class WatcherDeviceReader {
     // Functions for dumping each watcher feature to the log
     void DumpCore(CoreDescriptor &logical_core, bool is_active_eth_core);
     void DumpL1Status(CoreDescriptor &core, const launch_msg_t *launch_msg);
-    void DumpNocSanitizeStatus(CoreDescriptor &core, const string &core_str, const mailboxes_t *mbox_data, int noc);
-    void DumpAssertStatus(CoreDescriptor &core, const string &core_str, const mailboxes_t *mbox_data);
-    void DumpPauseStatus(CoreDescriptor &core, const string &core_str,const mailboxes_t *mbox_data);
+    void DumpNocSanitizeStatus(CoreDescriptor &core, const std::string &core_str, const mailboxes_t *mbox_data, int noc);
+    void DumpAssertStatus(CoreDescriptor &core, const std::string &core_str, const mailboxes_t *mbox_data);
+    void DumpPauseStatus(CoreDescriptor &core, const std::string &core_str,const mailboxes_t *mbox_data);
     void DumpRingBuffer(CoreDescriptor &core, const mailboxes_t *mbox_data, bool to_stdout);
     void DumpRunState(CoreDescriptor &core, const launch_msg_t *launch_msg, uint32_t state);
     void DumpLaunchMessage(CoreDescriptor &core, const mailboxes_t *mbox_data);
@@ -45,12 +45,12 @@ class WatcherDeviceReader {
 
     // Helper functions
     void LogRunningKernels(CoreDescriptor &core, const launch_msg_t *launch_msg);
-    string GetKernelName(CoreDescriptor &core, const launch_msg_t *launch_msg, uint32_t type);
+    std::string GetKernelName(CoreDescriptor &core, const launch_msg_t *launch_msg, uint32_t type);
 
     FILE *f;
     Device *device;
-    vector<string> &kernel_names;
-    void (* set_watcher_exception_message)(const string &);
+    std::vector<std::string> &kernel_names;
+    void (* set_watcher_exception_message)(const std::string &);
 
     // Information that needs to be kept around on a per-dump basis
     std::set<std::pair<CoreCoord, riscv_id_t>> paused_cores;
diff --git a/tt_metal/impl/debug/watcher_server.cpp b/tt_metal/impl/debug/watcher_server.cpp
index a644e379cf8..06070107aaf 100644
--- a/tt_metal/impl/debug/watcher_server.cpp
+++ b/tt_metal/impl/debug/watcher_server.cpp
@@ -254,7 +254,7 @@ void watcher_init(Device *device) {
     for (tt::llrt::RunTimeDebugFeatures delay_feature = tt::llrt::RunTimeDebugFeatureReadDebugDelay;
          (int)delay_feature <= tt::llrt::RunTimeDebugFeatureAtomicDebugDelay;
          delay_feature = (tt::llrt::RunTimeDebugFeatures)((int)delay_feature + 1)) {
-        vector<chip_id_t> chip_ids = tt::llrt::OptionsG.get_feature_chip_ids(delay_feature);
+        std::vector<chip_id_t> chip_ids = tt::llrt::OptionsG.get_feature_chip_ids(delay_feature);
         bool this_chip_enabled = tt::llrt::OptionsG.get_feature_all_chips(delay_feature) ||
                                  std::find(chip_ids.begin(), chip_ids.end(), device->id()) != chip_ids.end();
         if (this_chip_enabled) {
@@ -275,7 +275,7 @@ void watcher_init(Device *device) {
             }
 
             for (CoreType core_type : {CoreType::WORKER, CoreType::ETH}) {
-                vector<CoreCoord> delayed_cores = tt::llrt::OptionsG.get_feature_cores(delay_feature)[core_type];
+                std::vector<CoreCoord> delayed_cores = tt::llrt::OptionsG.get_feature_cores(delay_feature)[core_type];
                 for (tt_xy_pair logical_core : delayed_cores) {
                     CoreCoord phys_core;
                     bool valid_logical_core = true;
diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp
index 8afff15103b..e536c9e940a 100644
--- a/tt_metal/impl/device/device.cpp
+++ b/tt_metal/impl/device/device.cpp
@@ -556,7 +556,7 @@ void Device::initialize_and_launch_firmware() {
     go_msg.signal = RUN_MSG_INIT;
 
     // Populate core info, which will be written to device
-    vector<uint32_t> core_info_vec(sizeof(core_info_msg_t) / sizeof(uint32_t));
+    std::vector<uint32_t> core_info_vec(sizeof(core_info_msg_t) / sizeof(uint32_t));
     core_info_msg_t *core_info = (core_info_msg_t *) core_info_vec.data();
 
     const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(this->id());
@@ -593,7 +593,7 @@ void Device::initialize_and_launch_firmware() {
 
     // Determine which noc-coords are harvested
     // TODO(PGK/Almeet): fix this w/ new UMD
-    vector<uint32_t> harvested_rows;
+    std::vector<uint32_t> harvested_rows;
     uint32_t harvested_noc_rows = tt::Cluster::instance().get_harvested_rows(this->id());
     for (uint32_t y = 0; y < soc_d.grid_size.y; y++) {
         bool row_harvested = (harvested_noc_rows >> y) & 0x1;
@@ -2735,7 +2735,7 @@ void Device::configure_command_queue_programs() {
         uint32_t issue_queue_size = this->sysmem_manager_->get_issue_queue_size(cq_id);
         uint32_t completion_queue_start_addr = cq_start + issue_queue_size + get_absolute_cq_offset(channel, cq_id, cq_size);
         uint32_t completion_queue_start_addr_16B = completion_queue_start_addr >> 4;
-        vector<uint32_t> completion_queue_wr_ptr = {completion_queue_start_addr_16B};
+        std::vector<uint32_t> completion_queue_wr_ptr = {completion_queue_start_addr_16B};
         detail::WriteToDeviceL1(mmio_device, completion_q_writer_location, completion_q_rd_ptr, completion_queue_wr_ptr, dispatch_core_type);
         detail::WriteToDeviceL1(mmio_device, completion_q_writer_location, completion_q_wr_ptr, completion_queue_wr_ptr, dispatch_core_type);
         detail::WriteToDeviceL1(mmio_device, completion_q_writer_location, completion_q0_last_event_ptr, zero, dispatch_core_type);
diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp
index 3c5620b6839..e4ee5405f07 100644
--- a/tt_metal/impl/dispatch/command_queue.cpp
+++ b/tt_metal/impl/dispatch/command_queue.cpp
@@ -663,7 +663,7 @@ void EnqueueProgramCommand::assemble_runtime_args_commands(ProgramCommandSequenc
                                 .noc_xy_addr = this->device->get_noc_unicast_encoding(this->noc_index, physical_core)});
                         }
                     } else {
-                        vector<std::pair<transfer_info_cores, uint32_t>> dst_noc_multicast_info =
+                        std::vector<std::pair<transfer_info_cores, uint32_t>> dst_noc_multicast_info =
                             device->extract_dst_noc_multicast_info<std::vector<CoreRange>>(
                                 kernel->logical_coreranges(), core_type);
                         common_sub_cmds.emplace<std::vector<CQDispatchWritePackedMulticastSubCmd>>(
@@ -749,7 +749,7 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro
                 for (const auto& dst_noc_info : transfer_info.dst_noc_info) {
                     TT_ASSERT(
                         transfer_info.data.size() == write_packed_len,
-                        "Not all data vectors in write packed semaphore cmd equal in len");
+                        "Not all data vectors in write packed semaphore cmd equal in len");
                     multicast_sem_sub_cmds[i].emplace_back(CQDispatchWritePackedMulticastSubCmd{
                         .noc_xy_addr = this->device->get_noc_multicast_encoding(
                             this->noc_index, std::get<CoreRange>(dst_noc_info.first)),
@@ -786,7 +786,7 @@ void EnqueueProgramCommand::assemble_device_commands(ProgramCommandSequence& pro
                 for (const auto& dst_noc_info : transfer_info.dst_noc_info) {
                     TT_ASSERT(
                         transfer_info.data.size() == write_packed_len,
-                        "Not all data vectors in write packed semaphore cmd equal in len");
+                        "Not all data vectors in write packed semaphore cmd equal in len");
                     unicast_sem_sub_cmds[i].emplace_back(CQDispatchWritePackedUnicastSubCmd{
                         .noc_xy_addr = this->device->get_noc_unicast_encoding(
                             this->noc_index, std::get<CoreCoord>(dst_noc_info.first))});
@@ -2791,7 +2791,7 @@ inline namespace v0 {
 void EnqueueReadBuffer(
     CommandQueue& cq,
     std::variant<std::reference_wrapper<Buffer>, std::shared_ptr<Buffer>> buffer,
-    vector<uint32_t>& dst,
+    std::vector<uint32_t>& dst,
     bool blocking) {
     // TODO(agrebenisan): Move to deprecated
     ZoneScoped;
@@ -2821,7 +2821,7 @@ void EnqueueReadBuffer(
 void EnqueueWriteBuffer(
     CommandQueue& cq,
     std::variant<std::reference_wrapper<Buffer>, std::shared_ptr<Buffer>> buffer,
-    vector<uint32_t>& src,
+    std::vector<uint32_t>& src,
     bool blocking) {
     // TODO(agrebenisan): Move to deprecated
     EnqueueWriteBuffer(cq, buffer, src.data(), blocking);
diff --git a/tt_metal/impl/dispatch/command_queue.hpp b/tt_metal/impl/dispatch/command_queue.hpp
index a840fd19b8a..766346dfe52 100644
--- a/tt_metal/impl/dispatch/command_queue.hpp
+++ b/tt_metal/impl/dispatch/command_queue.hpp
@@ -14,7 +14,6 @@
 #include <vector>
 
 #include "common/env_lib.hpp"
-#include "tt_metal/common/base.hpp"
 #include "tt_metal/impl/dispatch/program_command_sequence.hpp"
 #include "tt_metal/impl/dispatch/command_queue_interface.hpp"
 #include "tt_metal/impl/dispatch/device_command.hpp"
diff --git a/tt_metal/impl/dispatch/command_queue_interface.hpp b/tt_metal/impl/dispatch/command_queue_interface.hpp
index e1a8f3f0b0f..bf8ac017030 100644
--- a/tt_metal/impl/dispatch/command_queue_interface.hpp
+++ b/tt_metal/impl/dispatch/command_queue_interface.hpp
@@ -432,21 +432,21 @@ class SystemMemoryManager {
     chip_id_t device_id;
     uint8_t num_hw_cqs;
     const std::function<void(uint32_t, uint32_t, const uint8_t *)> fast_write_callable;
-    vector<uint32_t> completion_byte_addrs;
+    std::vector<uint32_t> completion_byte_addrs;
     char *cq_sysmem_start;
-    vector<SystemMemoryCQInterface> cq_interfaces;
+    std::vector<SystemMemoryCQInterface> cq_interfaces;
     uint32_t cq_size;
     uint32_t channel_offset;
-    vector<int> cq_to_event;
-    vector<int> cq_to_last_completed_event;
-    vector<std::mutex> cq_to_event_locks;
-    vector<tt_cxy_pair> prefetcher_cores;
-    vector<tt::Writer> prefetch_q_writers;
-    vector<uint32_t> prefetch_q_dev_ptrs;
-    vector<uint32_t> prefetch_q_dev_fences;
+    std::vector<int> cq_to_event;
+    std::vector<int> cq_to_last_completed_event;
+    std::vector<std::mutex> cq_to_event_locks;
+    std::vector<tt_cxy_pair> prefetcher_cores;
+    std::vector<tt::Writer> prefetch_q_writers;
+    std::vector<uint32_t> prefetch_q_dev_ptrs;
+    std::vector<uint32_t> prefetch_q_dev_fences;
 
     bool bypass_enable;
-    vector<uint32_t> bypass_buffer;
+    std::vector<uint32_t> bypass_buffer;
     uint32_t bypass_buffer_write_offset;
 
     WorkerConfigBufferMgr config_buffer_mgr;
@@ -528,7 +528,7 @@ class SystemMemoryManager {
                 prefetch_q_base + dispatch_constants::get(core_type, num_hw_cqs).prefetch_q_entries() *
                                                           sizeof(dispatch_constants::prefetch_q_entry_type);
         }
-        vector<std::mutex> temp_mutexes(num_hw_cqs);
+        std::vector<std::mutex> temp_mutexes(num_hw_cqs);
         cq_to_event_locks.swap(temp_mutexes);
 
         for (uint32_t index = 0; index < hal.get_programmable_core_type_count(); index++) {
@@ -636,7 +636,7 @@ class SystemMemoryManager {
 
     chip_id_t get_device_id() const { return this->device_id; }
 
-    vector<SystemMemoryCQInterface>& get_cq_interfaces() { return this->cq_interfaces; }
+    std::vector<SystemMemoryCQInterface>& get_cq_interfaces() { return this->cq_interfaces; }
 
     void *issue_queue_reserve(uint32_t cmd_size_B, const uint8_t cq_id) {
         if (this->bypass_enable) {
diff --git a/tt_metal/impl/dispatch/data_collection.cpp b/tt_metal/impl/dispatch/data_collection.cpp
index c5c400e01c3..bf18bb2d6f5 100644
--- a/tt_metal/impl/dispatch/data_collection.cpp
+++ b/tt_metal/impl/dispatch/data_collection.cpp
@@ -35,7 +35,7 @@ class DispatchStats {
         Update(other.max_transaction_size, other.min_transaction_size, other.num_writes, other.total_write_size);
     }
 
-    void Dump(std::ofstream &outfile,  map<uint32_t, uint32_t> &raw_data) {
+    void Dump(std::ofstream &outfile,  std::map<uint32_t, uint32_t> &raw_data) {
         outfile << fmt::format("\t\tmax_transaction_size = {}\n", max_transaction_size);
         outfile << fmt::format("\t\tmin_transaction_size = {}\n", min_transaction_size);
         outfile << fmt::format("\t\tnum_writes           = {}\n", num_writes);
@@ -74,7 +74,7 @@ class DispatchData {
 
         // Track stats for all RISCS, as well as per RISC
         DispatchStats total_stats;
-        map<uint32_t, uint32_t> total_data;
+        std::map<uint32_t, uint32_t> total_data;
         for (auto &riscv_and_data : data) {
             // Go through all data and update stats
             DispatchStats riscv_stats;
@@ -98,7 +98,7 @@ class DispatchData {
     }
 
 private:
-    map<RISCV, map<uint32_t, uint32_t>> data; // RISCV -> transaction size -> count
+    std::map<RISCV, std::map<uint32_t, uint32_t>> data; // RISCV -> transaction size -> count
     data_collector_t type;
 };
 
@@ -117,21 +117,21 @@ class DataCollector {
     };
 
     void RecordData(Program &program, data_collector_t type, uint32_t transaction_size, RISCV riscv);
-    void RecordKernelGroups(Program &program, CoreType core_type, vector<KernelGroup> &kernel_groups);
+    void RecordKernelGroups(Program &program, CoreType core_type, std::vector<KernelGroup> &kernel_groups);
     void RecordProgramRun(Program &program);
     void DumpData();
 
 private:
-    map<uint64_t, vector<DispatchData>> program_id_to_dispatch_data;
-    map<uint64_t, map<CoreType, vector<std::pair<kernel_id_array_t, CoreRangeSet>>>> program_id_to_kernel_groups;
-    map<uint64_t, int> program_id_to_call_count;
+    std::map<uint64_t, std::vector<DispatchData>> program_id_to_dispatch_data;
+    std::map<uint64_t, std::map<CoreType, std::vector<std::pair<kernel_id_array_t, CoreRangeSet>>>> program_id_to_kernel_groups;
+    std::map<uint64_t, int> program_id_to_call_count;
 };
 
 void DataCollector::RecordData(Program &program, data_collector_t type, uint32_t transaction_size, RISCV riscv) {
     uint64_t program_id = program.get_id();
     if (program_id_to_dispatch_data.count(program_id) == 0) {
         // If no existing data for this program, initialize starting values.
-        program_id_to_dispatch_data[program_id] = vector<DispatchData>();
+        program_id_to_dispatch_data[program_id] = std::vector<DispatchData>();
         for (int idx = 0; idx < DISPATCH_DATA_COUNT; idx++) {
             data_collector_t curr_type = static_cast<data_collector_t>(idx);
             DispatchData data(curr_type);
@@ -142,7 +142,7 @@ void DataCollector::RecordData(Program &program, data_collector_t type, uint32_t
     program_id_to_dispatch_data[program_id].at(type).Update(transaction_size, riscv);
 }
 
-void DataCollector::RecordKernelGroups(Program &program, CoreType core_type, vector<KernelGroup> &kernel_groups) {
+void DataCollector::RecordKernelGroups(Program &program, CoreType core_type, std::vector<KernelGroup> &kernel_groups) {
     uint64_t program_id = program.get_id();
     // Make a copy of relevant info, since user may destroy program before we dump.
     for (KernelGroup &kernel_group : kernel_groups) {
@@ -189,7 +189,7 @@ void DataCollector::DumpData() {
     std::ofstream outfile = std::ofstream("dispatch_data.txt");
 
     // Extra DispatchData objects to collect data across programs
-    vector<DispatchData *> cross_program_data;
+    std::vector<DispatchData *> cross_program_data;
     for (int idx = 0; idx < DISPATCH_DATA_COUNT; idx++) {
         cross_program_data.push_back(new DispatchData(idx));
     }
@@ -202,7 +202,7 @@ void DataCollector::DumpData() {
         // Dump kernel ids for each kernel group in this program
         for (auto &core_type_and_kernel_groups : program_id_to_kernel_groups[program_id]) {
             CoreType core_type = core_type_and_kernel_groups.first;
-            vector<std::pair<kernel_id_array_t, CoreRangeSet>> &kernel_groups = core_type_and_kernel_groups.second;
+            std::vector<std::pair<kernel_id_array_t, CoreRangeSet>> &kernel_groups = core_type_and_kernel_groups.second;
             outfile << fmt::format("\t{} Kernel Groups: {}\n", core_type, kernel_groups.size());
             for (auto &ids_and_ranges : kernel_groups) {
                 // Dump kernel ids in this group
@@ -266,7 +266,7 @@ void RecordDispatchData(Program &program, data_collector_t type, uint32_t transa
     DataCollector::inst->RecordData(program, type, transaction_size, riscv);
 }
 
-void RecordKernelGroups(Program &program, CoreType core_type, vector<KernelGroup> &kernel_groups) {
+void RecordKernelGroups(Program &program, CoreType core_type, std::vector<KernelGroup> &kernel_groups) {
     // Do nothing if we're not enabling data collection.
     if (!tt::llrt::OptionsG.get_dispatch_data_collection_enabled())
         return;
diff --git a/tt_metal/impl/dispatch/debug_tools.cpp b/tt_metal/impl/dispatch/debug_tools.cpp
index f8c54fa3573..ea5141443b6 100644
--- a/tt_metal/impl/dispatch/debug_tools.cpp
+++ b/tt_metal/impl/dispatch/debug_tools.cpp
@@ -23,7 +23,7 @@ void match_device_program_data_with_host_program_data(const char* host_file, con
     host_dispatch_dump_file.open(host_file);
     device_dispatch_dump_file.open(device_file);
 
-    vector<std::pair<string, vector<string>>> host_map;
+    std::vector<std::pair<string, std::vector<string>>> host_map;
 
 
     string line;
@@ -36,7 +36,7 @@ void match_device_program_data_with_host_program_data(const char* host_file, con
         } else if (line.find("BINARY SPAN") != string::npos or line.find("SEM") != string::npos or line.find("CB") != string::npos) {
             type = line;
         } else {
-            vector<string> host_data = {line};
+            std::vector<string> host_data = {line};
             while (std::getline(host_dispatch_dump_file, line) and (line.find("*") == string::npos)) {
                 host_data.push_back(line);
             }
@@ -44,8 +44,8 @@ void match_device_program_data_with_host_program_data(const char* host_file, con
         }
     }
 
-    vector<vector<string>> device_map;
-    vector<string> device_data;
+    std::vector<std::vector<string>> device_map;
+    std::vector<string> device_data;
     while (std::getline(device_dispatch_dump_file, line) and line != "EXIT_CONDITION") {
         if (line == "CHUNK") {
             if (not device_data.empty()) {
@@ -63,7 +63,7 @@ void match_device_program_data_with_host_program_data(const char* host_file, con
     for (const auto& [type, host_data] : host_map) {
         bool match = false;
 
-        for (const vector<string>& device_data : device_map) {
+        for (const std::vector<string>& device_data : device_map) {
             if (host_data == device_data) {
                 tt::log_info("Matched on {}", type);
                 match = true;
@@ -292,7 +292,7 @@ void dump_completion_queue_entries(
     uint32_t base_addr = (cq_interface.issue_fifo_limit << 4);
 
     // Read out in pages, this is fine since all completion Q entries are page aligned.
-    vector<uint8_t> read_data;
+    std::vector<uint8_t> read_data;
     read_data.resize(dispatch_constants::TRANSFER_PAGE_SIZE);
     tt::log_info("Reading Device {} CQ {}, Completion Queue...", sysmem_manager.get_device_id(), cq_interface.id);
     cq_file << fmt::format(
@@ -382,7 +382,7 @@ void dump_issue_queue_entries(
     uint32_t issue_q_base_addr = cq_interface.offset + cq_interface.cq_start;
 
     // Read out in 4K pages, could do ISSUE_Q_ALIGNMENT chunks to match the entries but this is ~2x faster.
-    vector<uint8_t> read_data;
+    std::vector<uint8_t> read_data;
     read_data.resize(dispatch_constants::TRANSFER_PAGE_SIZE);
     tt::log_info("Reading Device {} CQ {}, Issue Queue...", sysmem_manager.get_device_id(), cq_interface.id);
     iq_file << fmt::format(
@@ -542,7 +542,7 @@ void dump_command_queue_raw_data(
     }
 
     // Read out in pages
-    vector<uint8_t> read_data;
+    std::vector<uint8_t> read_data;
     read_data.resize(dispatch_constants::TRANSFER_PAGE_SIZE);
     out_file << std::endl;
     out_file << fmt::format(
diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp
index 25004bbab29..69fcdb4af45 100644
--- a/tt_metal/impl/program/program.cpp
+++ b/tt_metal/impl/program/program.cpp
@@ -866,7 +866,7 @@ void detail::Program_::populate_dispatch_data(Device *device) {
     auto extract_dst_noc_unicast_info =
         [&device](const auto &ranges, const CoreType core_type) -> std::vector<std::pair<transfer_info_cores, uint32_t>> {
         // This API extracts all the pairs of noc multicast encodings given a set of core ranges
-        vector<std::pair<transfer_info_cores, uint32_t>> dst_noc_unicast_info;
+        std::vector<std::pair<transfer_info_cores, uint32_t>> dst_noc_unicast_info;
         for (const CoreRange &core_range : ranges) {
             for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) {
                 for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) {
@@ -880,13 +880,13 @@ void detail::Program_::populate_dispatch_data(Device *device) {
 
     // Unicast/Multicast Semaphores
     for (const Semaphore &semaphore : this->semaphores()) {
-        vector<uint32_t> semaphore_data(1);
+        std::vector<uint32_t> semaphore_data(1);
         semaphore_data[0] = semaphore.initial_value();
 
         // TODO: use semaphore.core_type from main
         if (semaphore.core_type() == CoreType::WORKER) {
             uint32_t index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX);
-            vector<std::pair<transfer_info_cores, uint32_t>> dst_noc_multicast_info =
+            std::vector<std::pair<transfer_info_cores, uint32_t>> dst_noc_multicast_info =
                 device->extract_dst_noc_multicast_info<std::vector<CoreRange>>(
                     semaphore.core_range_set().ranges(), CoreType::WORKER);
             transfer_info transfer_info = {
@@ -898,7 +898,7 @@ void detail::Program_::populate_dispatch_data(Device *device) {
         } else if (semaphore.core_type() == CoreType::ETH) {
             // TODO: we only fast dispatch to active eth...
             uint32_t index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH);
-            vector<std::pair<transfer_info_cores, uint32_t>> dst_noc_unicast_info =
+            std::vector<std::pair<transfer_info_cores, uint32_t>> dst_noc_unicast_info =
                 extract_dst_noc_unicast_info(semaphore.core_range_set().ranges(), CoreType::ETH);
             transfer_info transfer_info = {
                 .dst_base_addr = semaphore.offset(),
@@ -950,7 +950,7 @@ void detail::Program_::populate_dispatch_data(Device *device) {
 
                 uint32_t max_kernel_bin_size = processor_to_firmware_size[sub_kernels[sub_kernel_index]];
 
-                kernel_bin.process_spans([&](vector<uint32_t>::const_iterator mem_ptr, uint64_t dst, uint32_t len) {
+                kernel_bin.process_spans([&](std::vector<uint32_t>::const_iterator mem_ptr, uint64_t dst, uint32_t len) {
 
                     max_kernel_bin_size -= dst - processor_to_firmware_base[sub_kernels[sub_kernel_index]];
 
@@ -1000,7 +1000,7 @@ void detail::Program_::populate_dispatch_data(Device *device) {
                     device->extract_dst_noc_multicast_info<std::vector<CoreRange>>(
                         kernel_group.core_ranges.ranges(), core_type);
 
-                vector<KernelHandle> kernel_ids;
+                std::vector<KernelHandle> kernel_ids;
                 for (auto &optional_id : kernel_group.kernel_ids) {
                     if (optional_id) {
                         kernel_ids.push_back(optional_id.value());
@@ -1015,10 +1015,10 @@ void detail::Program_::populate_dispatch_data(Device *device) {
                 }
             } else {
                 TT_ASSERT(core_type == CoreType::ETH);
-                vector<std::pair<transfer_info_cores, uint32_t>> dst_noc_unicast_info =
+                std::vector<std::pair<transfer_info_cores, uint32_t>> dst_noc_unicast_info =
                     extract_dst_noc_unicast_info(kernel_group.core_ranges.ranges(), core_type);
 
-                vector<KernelHandle> kernel_ids;
+                std::vector<KernelHandle> kernel_ids;
                 if (kernel_group.kernel_ids[DISPATCH_CLASS_ETH_DM0]) {
                     kernel_ids.push_back(kernel_group.kernel_ids[DISPATCH_CLASS_ETH_DM0].value());
                 }
@@ -1046,8 +1046,8 @@ uint32_t detail::Program_::finalize_rt_args(uint32_t programmable_core_type_inde
     // Common RTAs come after unique RTAs
     uint32_t processor_classes = hal.get_processor_classes_count(programmable_core_type_index);
 
-    vector<uint32_t> max_rtas(processor_classes);
-    vector<uint32_t> max_crtas(processor_classes);
+    std::vector<uint32_t> max_rtas(processor_classes);
+    std::vector<uint32_t> max_crtas(processor_classes);
     uint32_t max_unique_rta_size = 0;
     uint32_t total_crta_size = 0;
 
diff --git a/tt_metal/impl/trace/trace.cpp b/tt_metal/impl/trace/trace.cpp
index aaeea3d05b7..59d16af6b8c 100644
--- a/tt_metal/impl/trace/trace.cpp
+++ b/tt_metal/impl/trace/trace.cpp
@@ -30,7 +30,7 @@ size_t interleaved_page_size(
     const uint32_t buf_size, const uint32_t num_banks, const uint32_t min_size, const uint32_t max_size) {
     // Populate power of 2 numbers within min and max as candidates
     TT_FATAL(min_size > 0 and min_size <= max_size, "min_size {} not positive and less than or equal to max_size {}.", min_size, max_size);
-    vector<uint32_t> candidates;
+    std::vector<uint32_t> candidates;
     candidates.reserve(__builtin_clz(min_size) - __builtin_clz(max_size) + 1);
     for (uint32_t size = 1; size <= max_size; size <<= 1) {
         if (size >= min_size) {
@@ -71,7 +71,7 @@ std::shared_ptr<TraceBuffer> Trace::create_empty_trace_buffer() {
 }
 
 void Trace::initialize_buffer(CommandQueue& cq, std::shared_ptr<TraceBuffer> trace_buffer) {
-    vector<uint32_t>& trace_data = trace_buffer->desc->data;
+    std::vector<uint32_t>& trace_data = trace_buffer->desc->data;
     uint64_t unpadded_size = trace_data.size() * sizeof(uint32_t);
     size_t page_size = interleaved_page_size(
         unpadded_size, cq.device()->num_banks(BufferType::DRAM), kExecBufPageMin, kExecBufPageMax);
@@ -98,7 +98,7 @@ void Trace::initialize_buffer(CommandQueue& cq, std::shared_ptr<TraceBuffer> tra
 
 // there is a cost to validation, please use it judiciously
 void Trace::validate_instance(const TraceBuffer& trace_buffer) {
-    vector<uint32_t> backdoor_data;
+    std::vector<uint32_t> backdoor_data;
     detail::ReadFromBuffer(trace_buffer.buffer, backdoor_data);
     if (backdoor_data != trace_buffer.desc->data) {
         log_info(LogMetalTrace, "Trace buffer expected: {}", trace_buffer.desc->data);
diff --git a/tt_metal/jit_build/build.hpp b/tt_metal/jit_build/build.hpp
index 2962cccc8af..fdb3751d435 100644
--- a/tt_metal/jit_build/build.hpp
+++ b/tt_metal/jit_build/build.hpp
@@ -121,7 +121,7 @@ class alignas(CACHE_LINE_ALIGNMENT) JitBuildState {
 
 // Set of build states
 // Used for parallel builds, builds all members in one call
-typedef vector<std::shared_ptr<JitBuildState>> JitBuildStateSet;
+typedef std::vector<std::shared_ptr<JitBuildState>> JitBuildStateSet;
 
 // Exracts a slice of builds from a JitBuildState
 // Used for parallel building a subset of the builds in a JitBuildStateSet
diff --git a/tt_metal/jit_build/data_format.cpp b/tt_metal/jit_build/data_format.cpp
index f5e9e7e8e7f..6e7d67276ad 100644
--- a/tt_metal/jit_build/data_format.cpp
+++ b/tt_metal/jit_build/data_format.cpp
@@ -1,10 +1,20 @@
 // SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 //
 // SPDX-License-Identifier: Apache-2.0
+
 #include "data_format.hpp"
-#include "hostdevcommon/common_runtime_address_map.h"
-#include <unordered_map>
-#include <set>
+
+#include <iostream>                                    // for basic_ostream
+#include <map>                                         // for operator!=
+#include <set>                                         // for set
+#include <string>                                      // for char_traits
+#include <unordered_map>                               // for unordered_map
+
+#include "fmt/base.h"                                  // for format_string
+#include "tt_metal/common/assert.hpp"                  // for tt_throw, TT_FATAL
+#include "tt_metal/common/base_types.hpp"              // for UnpackToDestMode
+#include "hostdevcommon/common_runtime_address_map.h"  // for NUM_CIRCULAR_B...
+
 namespace tt {
 
 static const std::set<DataFormat> ALL_VALID_FORMATS = {
diff --git a/tt_metal/jit_build/data_format.hpp b/tt_metal/jit_build/data_format.hpp
index a92d6dbd2fc..c4ab84f2679 100644
--- a/tt_metal/jit_build/data_format.hpp
+++ b/tt_metal/jit_build/data_format.hpp
@@ -4,17 +4,16 @@
 
 #pragma once
 #include <cstdint>
-#include <iostream>
-#include <map>
 #include <vector>
-#include <string>
-#include "common/base.hpp"
+#include "common/tt_backend_api_types.hpp"  // for DataFormat
+#include "device/tt_arch_types.h"           // for ARCH
+enum class UnpackToDestMode : std::uint8_t;
 
 namespace tt {
 
 static constexpr uint NUM_OPERANDS = 8;
 
-enum class ExpPrecision : uint8_t
+enum class ExpPrecision : std::uint8_t
 {
   A = 0,
   B = 1,
diff --git a/tt_metal/llrt/llrt.cpp b/tt_metal/llrt/llrt.cpp
index 28521cabc72..dc22b623c3a 100644
--- a/tt_metal/llrt/llrt.cpp
+++ b/tt_metal/llrt/llrt.cpp
@@ -158,7 +158,7 @@ uint32_t generate_risc_startup_addr(bool is_eth_core) {
 }
 
 void program_risc_startup_addr(chip_id_t chip_id, const CoreCoord &core) {
-    vector<uint32_t> jump_to_fw;
+    std::vector<uint32_t> jump_to_fw;
     jump_to_fw.push_back(generate_risc_startup_addr(is_ethernet_core(core, chip_id)));
     write_hex_vec_to_core(chip_id, core, jump_to_fw, 0);
 }
diff --git a/tt_metal/llrt/rtoptions.cpp b/tt_metal/llrt/rtoptions.cpp
index 00494a636b7..baaa49dfdc4 100644
--- a/tt_metal/llrt/rtoptions.cpp
+++ b/tt_metal/llrt/rtoptions.cpp
@@ -220,7 +220,7 @@ void RunTimeOptions::ParseFeatureEnv(RunTimeDebugFeatures feature) {
 void RunTimeOptions::ParseFeatureCoreRange(
     RunTimeDebugFeatures feature, const std::string &env_var, CoreType core_type) {
     char *str = std::getenv(env_var.c_str());
-    vector<CoreCoord> cores;
+    std::vector<CoreCoord> cores;
 
     // Check if "all" is specified, rather than a range of cores.
     feature_targets[feature].all_cores[core_type] = RunTimeDebugClassNoneSpecified;
@@ -280,7 +280,7 @@ void RunTimeOptions::ParseFeatureCoreRange(
 }
 
 void RunTimeOptions::ParseFeatureChipIds(RunTimeDebugFeatures feature, const std::string &env_var) {
-    vector<int> chips;
+    std::vector<int> chips;
     char *env_var_str = std::getenv(env_var.c_str());
 
     // If the environment variable is not empty, parse it.
diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp
index 597fb3b2901..d79b282a415 100644
--- a/tt_metal/llrt/tt_cluster.cpp
+++ b/tt_metal/llrt/tt_cluster.cpp
@@ -9,9 +9,9 @@
 #include <cstdlib>
 #include <filesystem>
 #include <iostream>
-#include <map>                                                       // for map
+#include <map>
 #include <memory>
-#include <set>                                                       // for set
+#include <set>
 #include <stdexcept>
 #include <string>
 #include <tuple>                                                     // for get
@@ -21,7 +21,7 @@
 #include <vector>
 
 #include "fmt/base.h"
-#include "tt_metal/common/base.hpp" // TODO: Eliminate this file, catchall include and is ARCH_NAME dependent
+#include "tt_metal/common/base.hpp"
 #include "tt_metal/common/logger.hpp"
 #include "tt_metal/common/metal_soc_descriptor.h"
 #include "tt_metal/common/test_common.hpp"
@@ -438,7 +438,7 @@ inline uint64_t get_sys_addr(uint32_t chip_x, uint32_t chip_y, uint32_t noc_x, u
     return result;
 }
 
-void Cluster::write_dram_vec(vector<uint32_t> &vec, tt_target_dram dram, uint64_t addr, bool small_access) const {
+void Cluster::write_dram_vec(std::vector<uint32_t> &vec, tt_target_dram dram, uint64_t addr, bool small_access) const {
     int chip_id, d_chan, d_subchannel;
     std::tie(chip_id, d_chan, d_subchannel) = dram;
     const metal_SocDescriptor &desc_to_use = get_soc_desc(chip_id);
@@ -456,7 +456,7 @@ void Cluster::write_dram_vec(vector<uint32_t> &vec, tt_target_dram dram, uint64_
 }
 
 void Cluster::read_dram_vec(
-    vector<uint32_t> &vec, uint32_t sz_in_bytes, tt_target_dram dram, uint64_t addr, bool small_access) const {
+    std::vector<uint32_t> &vec, uint32_t sz_in_bytes, tt_target_dram dram, uint64_t addr, bool small_access) const {
     int chip_id, d_chan, d_subchannel;
     std::tie(chip_id, d_chan, d_subchannel) = dram;
     const metal_SocDescriptor &desc_to_use = get_soc_desc(chip_id);
@@ -502,7 +502,7 @@ void Cluster::read_core(
 }
 
 void Cluster::read_core(
-    vector<uint32_t> &data, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access) const {
+    std::vector<uint32_t> &data, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access) const {
     data.resize(size_in_bytes / sizeof(uint32_t));
     read_core(data.data(), size_in_bytes, core, addr, small_access);
 }
diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp
index 2bf49c81e8d..9d216bc7f47 100644
--- a/tt_metal/llrt/tt_cluster.hpp
+++ b/tt_metal/llrt/tt_cluster.hpp
@@ -19,6 +19,7 @@
 // clang-format off
 #include "noc/noc_parameters.h"
 #include "eth_interface.h"
+#include "eth_l1_address_map.h"
 #include "dev_msgs.h"
 // clang-format on
 
@@ -27,7 +28,6 @@
 static constexpr std::uint32_t SW_VERSION = 0x00020000;
 
 using tt_target_dram = std::tuple<int, int, int>;
-using tt::TargetDevice;
 
 enum EthRouterMode : uint32_t {
     IDLE = 0,
@@ -79,9 +79,9 @@ class Cluster {
     void deassert_risc_reset_at_core(const tt_cxy_pair &physical_chip_coord) const;
     void assert_risc_reset_at_core(const tt_cxy_pair &physical_chip_coord) const;
 
-    void write_dram_vec(vector<uint32_t> &vec, tt_target_dram dram, uint64_t addr, bool small_access = false) const;
+    void write_dram_vec(std::vector<uint32_t> &vec, tt_target_dram dram, uint64_t addr, bool small_access = false) const;
     void read_dram_vec(
-        vector<uint32_t> &vec,
+        std::vector<uint32_t> &vec,
         uint32_t size_in_bytes,
         tt_target_dram dram,
         uint64_t addr,
@@ -93,7 +93,7 @@ class Cluster {
     void read_core(
         void *mem_ptr, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const;
     void read_core(
-        vector<uint32_t> &data, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const;
+        std::vector<uint32_t> &data, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access = false) const;
 
     std::optional<std::tuple<uint32_t, uint32_t>> get_tlb_data(const tt_cxy_pair &target) const {
         chip_id_t mmio_device_id = device_to_mmio_device_.at(target.chip);
diff --git a/tt_metal/programming_examples/add_2_integers_in_compute/add_2_integers_in_compute.cpp b/tt_metal/programming_examples/add_2_integers_in_compute/add_2_integers_in_compute.cpp
index 23f21b3c5c8..80093cb45c1 100644
--- a/tt_metal/programming_examples/add_2_integers_in_compute/add_2_integers_in_compute.cpp
+++ b/tt_metal/programming_examples/add_2_integers_in_compute/add_2_integers_in_compute.cpp
@@ -69,7 +69,7 @@ int main(int argc, char **argv) {
         DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default});
 
     /* Set the parameters that the compute kernel will use */
-    vector<uint32_t> compute_kernel_args = {};
+    std::vector<uint32_t> compute_kernel_args = {};
 
     /* Use the add_tiles operation in the compute kernel */
     KernelHandle eltwise_binary_kernel_id = CreateKernel(
diff --git a/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp b/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp
index 73ab8f59cc1..e4a31d64676 100644
--- a/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp
+++ b/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp
@@ -127,7 +127,7 @@ int main(int argc, char **argv) {
         /*
          * Set the parameters that the compute kernel will use.
          */
-        vector<uint32_t> compute_kernel_args = {
+        std::vector<uint32_t> compute_kernel_args = {
         };
 
         constexpr bool fp32_dest_acc_en = false;
diff --git a/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp b/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp
index 7744018cc4c..bc3e6593501 100644
--- a/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp
+++ b/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp
@@ -90,7 +90,7 @@ int main(int argc, char **argv) {
         /*
          * Set the parameters that the compute kernel will use.
          */
-        vector<uint32_t> compute_kernel_args = {
+        std::vector<uint32_t> compute_kernel_args = {
             num_tiles,
             1
         };
diff --git a/tt_metal/programming_examples/hello_world_compute_kernel/hello_world_compute_kernel.cpp b/tt_metal/programming_examples/hello_world_compute_kernel/hello_world_compute_kernel.cpp
index 2ab6c43d454..90f663adbe6 100644
--- a/tt_metal/programming_examples/hello_world_compute_kernel/hello_world_compute_kernel.cpp
+++ b/tt_metal/programming_examples/hello_world_compute_kernel/hello_world_compute_kernel.cpp
@@ -20,7 +20,7 @@ int main(int argc, char **argv) {
 
     // Configure and Create Void Kernel
 
-    vector<uint32_t> compute_kernel_args = {};
+    std::vector<uint32_t> compute_kernel_args = {};
     KernelHandle void_compute_kernel_id = CreateKernel(
         program,
         "tt_metal/programming_examples/hello_world_compute_kernel/kernels/compute/void_compute_kernel.cpp",
diff --git a/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp b/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp
index 409b2c6f009..f655bc93dea 100644
--- a/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp
+++ b/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp
@@ -27,7 +27,7 @@ void golden_matmul(std::vector<bfloat16>& a, std::vector<bfloat16>& b, std::vect
 
     float c_f;
     float float_tmp;
-    vector<bfloat16> c_bf(M * N, 0);
+    std::vector<bfloat16> c_bf(M * N, 0);
 
     for (int i = 0; i < M; i++) {
         for (int j = 0; j < N; j++) {
@@ -47,7 +47,7 @@ void golden_matmul(std::vector<bfloat16>& a, std::vector<bfloat16>& b, std::vect
 }
 
 
-void matmul_multi_core(vector<bfloat16>& a, vector<bfloat16>& b, vector<bfloat16>& output, bool bcast_batch,
+void matmul_multi_core(std::vector<bfloat16>& a, std::vector<bfloat16>& b, std::vector<bfloat16>& output, bool bcast_batch,
                         uint32_t M, uint32_t N, uint32_t K, uint32_t B, Device* device) {
 
     /*
@@ -169,7 +169,7 @@ void matmul_multi_core(vector<bfloat16>& a, vector<bfloat16>& b, vector<bfloat16
         all_cores,
         tt_metal::DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = writer_compile_time_args});
 
-    vector<uint32_t> compute_args_group_1 = {
+    std::vector<uint32_t> compute_args_group_1 = {
         1, // B
         1, // Mt
         Kt, // Kt
@@ -184,7 +184,7 @@ void matmul_multi_core(vector<bfloat16>& a, vector<bfloat16>& b, vector<bfloat16
     );
 
     if (!core_group_2.ranges().empty()) {
-        vector<uint32_t> compute_args_group_2 = {
+        std::vector<uint32_t> compute_args_group_2 = {
             1, // B
             1, // Mt
             Kt, // Kt
@@ -285,7 +285,7 @@ int main(int argc, char **argv) {
         std::vector<bfloat16> src1_vec = create_random_vector_of_bfloat16_native(dram_buffer_B_size, 1, 12522, -0.2);
 
         /* Golden Matmul running on CPU (Float)*/
-        vector<bfloat16> golden_vec(M * N, 0);
+        std::vector<bfloat16> golden_vec(M * N, 0);
         golden_matmul(src0_vec, src1_vec, golden_vec, M, N, K, B);
 
         /* Input vector tilizing */
@@ -293,7 +293,7 @@ int main(int argc, char **argv) {
         tilize(src1_vec, K, N);
 
         /* Calling the MatMul host program. Read in result into a host vector */
-        vector<bfloat16> result_vec(dram_buffer_C_size/sizeof(bfloat16));
+        std::vector<bfloat16> result_vec(dram_buffer_C_size/sizeof(bfloat16));
         matmul_multi_core(src0_vec, src1_vec, result_vec, false, M, N, K, B, device);
         untilize(result_vec, M, N);
 
diff --git a/tt_metal/programming_examples/matmul_multicore_reuse/matmul_multicore_reuse.cpp b/tt_metal/programming_examples/matmul_multicore_reuse/matmul_multicore_reuse.cpp
index 50226e14d58..5c5f12eedcf 100644
--- a/tt_metal/programming_examples/matmul_multicore_reuse/matmul_multicore_reuse.cpp
+++ b/tt_metal/programming_examples/matmul_multicore_reuse/matmul_multicore_reuse.cpp
@@ -19,7 +19,7 @@ using namespace tt;
 using namespace tt::tt_metal;
 
 
-void golden_matmul(vector<bfloat16>& a, vector<bfloat16>& b, vector<bfloat16>& output,
+void golden_matmul(std::vector<bfloat16>& a, std::vector<bfloat16>& b, std::vector<bfloat16>& output,
                         uint32_t M, uint32_t N, uint32_t K, uint32_t B) {
     std::uint32_t idx_c = 0;
     std::uint32_t idx_a = 0;
@@ -27,7 +27,7 @@ void golden_matmul(vector<bfloat16>& a, vector<bfloat16>& b, vector<bfloat16>& o
 
     float c_f;
     float float_tmp;
-    vector<bfloat16> c_bf(M * N, 0);
+    std::vector<bfloat16> c_bf(M * N, 0);
 
     for (int i = 0; i < M; i++) {
         for (int j = 0; j < N; j++) {
@@ -46,7 +46,7 @@ void golden_matmul(vector<bfloat16>& a, vector<bfloat16>& b, vector<bfloat16>& o
     }
 }
 
-void matmul_multicore_reuse(vector<bfloat16>& a, vector<bfloat16>& b, vector<bfloat16>& output, bool bcast_batch,
+void matmul_multicore_reuse(std::vector<bfloat16>& a, std::vector<bfloat16>& b, std::vector<bfloat16>& output, bool bcast_batch,
                         uint32_t M, uint32_t N, uint32_t K, uint32_t B, Device* device) {
 
     /*
@@ -122,7 +122,7 @@ void matmul_multicore_reuse(vector<bfloat16>& a, vector<bfloat16>& b, vector<bfl
 
     uint32_t out_subblock_num_tiles = out_subblock_h*out_subblock_w;
 
-    vector<uint32_t> compute_kernel_args = {
+    std::vector<uint32_t> compute_kernel_args = {
         in0_block_w, // in0_block_w
         in0_num_subblocks, // in0_num_subblocks
         in0_block_num_tiles, // in0_block_num_tiles
@@ -372,7 +372,7 @@ int main(int argc, char **argv) {
         std::vector<bfloat16> src1_vec = create_random_vector_of_bfloat16_native(dram_buffer_B_size, 1, 12522, -0.3);
 
         /* Golden Matmul running on CPU (Float)*/
-        vector<bfloat16> golden_vec(M * N, 0);
+        std::vector<bfloat16> golden_vec(M * N, 0);
         golden_matmul(src0_vec, src1_vec, golden_vec, M, N, K, B);
 
         /* Input vector tilizing */
@@ -380,7 +380,7 @@ int main(int argc, char **argv) {
         tilize(src1_vec, K, N);
 
         /* Calling the MatMul host program. Read in result into a host vector */
-        vector<bfloat16> result_vec(dram_buffer_C_size/sizeof(bfloat16));
+        std::vector<bfloat16> result_vec(dram_buffer_C_size/sizeof(bfloat16));
         matmul_multicore_reuse(src0_vec, src1_vec, result_vec, false, M, N, K, B, device);
         untilize(result_vec, M, N);
 
diff --git a/tt_metal/programming_examples/matmul_multicore_reuse_mcast/matmul_multicore_reuse_mcast.cpp b/tt_metal/programming_examples/matmul_multicore_reuse_mcast/matmul_multicore_reuse_mcast.cpp
index 97c7b7f06d7..b6beb079bea 100644
--- a/tt_metal/programming_examples/matmul_multicore_reuse_mcast/matmul_multicore_reuse_mcast.cpp
+++ b/tt_metal/programming_examples/matmul_multicore_reuse_mcast/matmul_multicore_reuse_mcast.cpp
@@ -123,7 +123,7 @@ void matmul_multicore_reuse_mcast(std::vector<bfloat16>& a, std::vector<bfloat16
 
     uint32_t out_subblock_num_tiles = out_subblock_h*out_subblock_w;
 
-    vector<uint32_t> compute_kernel_args = {
+    std::vector<uint32_t> compute_kernel_args = {
         in0_block_w, // in0_block_w
         in0_num_subblocks, // in0_num_subblocks
         in0_block_num_tiles, // in0_block_num_tiles
@@ -479,7 +479,7 @@ int main(int argc, char **argv) {
         std::vector<bfloat16> src1_vec = create_random_vector_of_bfloat16_native(dram_buffer_B_size, 1, 12522, -0.3);
 
         /* Golden Matmul running on CPU (Float)*/
-        vector<bfloat16> golden_vec(M * N, 0);
+        std::vector<bfloat16> golden_vec(M * N, 0);
         golden_matmul(src0_vec, src1_vec, golden_vec, M, N, K, B);
 
         /* Input vector tilizing */
@@ -487,7 +487,7 @@ int main(int argc, char **argv) {
         tilize(src1_vec, K, N);
 
         /* Calling the MatMul host program. Read in result into a host vector */
-        vector<bfloat16> result_vec(dram_buffer_C_size/sizeof(bfloat16));
+        std::vector<bfloat16> result_vec(dram_buffer_C_size/sizeof(bfloat16));
         matmul_multicore_reuse_mcast(src0_vec, src1_vec, result_vec, false, M, N, K, B, device);
         untilize(result_vec, M, N);
 
diff --git a/tt_metal/programming_examples/matmul_single_core/matmul_single_core.cpp b/tt_metal/programming_examples/matmul_single_core/matmul_single_core.cpp
index 598e559b967..6e95757ba1e 100644
--- a/tt_metal/programming_examples/matmul_single_core/matmul_single_core.cpp
+++ b/tt_metal/programming_examples/matmul_single_core/matmul_single_core.cpp
@@ -26,7 +26,7 @@ void golden_matmul(std::vector<bfloat16>& a, std::vector<bfloat16>& b, std::vect
 
     float c_f;
     float float_tmp;
-    vector<bfloat16> c_bf(M * N, 0);
+    std::vector<bfloat16> c_bf(M * N, 0);
 
     for (int i = 0; i < M; i++) {
         for (int j = 0; j < N; j++) {
@@ -154,7 +154,7 @@ void matmul_single_core(std::vector<bfloat16>& a, std::vector<bfloat16>& b, std:
         core,
         tt_metal::DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = writer_compile_time_args});
 
-    vector<uint32_t> compute_args = {
+    std::vector<uint32_t> compute_args = {
         B, // B
         Mt, // Mt
         Kt, // Kt
@@ -224,7 +224,7 @@ int main(int argc, char **argv) {
         std::vector<bfloat16> src1_vec = create_random_vector_of_bfloat16_native(dram_buffer_B_size, 1, 12522);
 
          /* Golden Matmul running on CPU (Float)*/
-        vector<bfloat16> golden_vec(M * N, 0);
+        std::vector<bfloat16> golden_vec(M * N, 0);
         golden_matmul(src0_vec, src1_vec, golden_vec, M, N, K, B);
 
         /* Input vector tilizing */
@@ -232,7 +232,7 @@ int main(int argc, char **argv) {
         tilize(src1_vec, K, N);
 
         /* Calling the MatMul host program. Read in result into a host vector */
-        vector<bfloat16> result_vec(dram_buffer_C_size/sizeof(bfloat16));
+        std::vector<bfloat16> result_vec(dram_buffer_C_size/sizeof(bfloat16));
         matmul_single_core(src0_vec, src1_vec, result_vec, false, M, N, K, B, device);
         untilize(result_vec, M, N);
 
diff --git a/tt_metal/programming_examples/profiler/test_custom_cycle_count/test_custom_cycle_count.cpp b/tt_metal/programming_examples/profiler/test_custom_cycle_count/test_custom_cycle_count.cpp
index adfa251352a..19bf2d8d43e 100644
--- a/tt_metal/programming_examples/profiler/test_custom_cycle_count/test_custom_cycle_count.cpp
+++ b/tt_metal/programming_examples/profiler/test_custom_cycle_count/test_custom_cycle_count.cpp
@@ -36,7 +36,7 @@ bool RunCustomCycle(tt_metal::Device *device, int loop_count)
         all_cores,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .defines = kernel_defines});
 
-    vector<uint32_t> trisc_kernel_args = {};
+    std::vector<uint32_t> trisc_kernel_args = {};
     tt_metal::KernelHandle trisc_kernel = tt_metal::CreateKernel(
         program, "tt_metal/programming_examples/profiler/test_custom_cycle_count/kernels/custom_cycle_count_compute.cpp",
         all_cores,
diff --git a/tt_metal/programming_examples/profiler/test_custom_cycle_count_slow_dispatch/test_custom_cycle_count_slow_dispatch.cpp b/tt_metal/programming_examples/profiler/test_custom_cycle_count_slow_dispatch/test_custom_cycle_count_slow_dispatch.cpp
index 9856c83d007..377c0016c26 100644
--- a/tt_metal/programming_examples/profiler/test_custom_cycle_count_slow_dispatch/test_custom_cycle_count_slow_dispatch.cpp
+++ b/tt_metal/programming_examples/profiler/test_custom_cycle_count_slow_dispatch/test_custom_cycle_count_slow_dispatch.cpp
@@ -36,7 +36,7 @@ bool RunCustomCycle(tt_metal::Device *device, int loop_count)
         all_cores,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .defines = kernel_defines});
 
-    vector<uint32_t> trisc_kernel_args = {};
+    std::vector<uint32_t> trisc_kernel_args = {};
     tt_metal::KernelHandle trisc_kernel = tt_metal::CreateKernel(
         program, "tt_metal/programming_examples/profiler/test_custom_cycle_count_slow_dispatch/kernels/custom_cycle_count_compute_slow_dispatch.cpp",
         all_cores,
diff --git a/tt_metal/programming_examples/profiler/test_dispatch_cores/test_dispatch_cores.cpp b/tt_metal/programming_examples/profiler/test_dispatch_cores/test_dispatch_cores.cpp
index 46cfce4c214..dc16098bf9b 100644
--- a/tt_metal/programming_examples/profiler/test_dispatch_cores/test_dispatch_cores.cpp
+++ b/tt_metal/programming_examples/profiler/test_dispatch_cores/test_dispatch_cores.cpp
@@ -34,7 +34,7 @@ void RunCustomCycle(tt_metal::Device *device, int loop_count)
         all_cores,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .defines = kernel_defines});
 
-    vector<uint32_t> trisc_kernel_args = {};
+    std::vector<uint32_t> trisc_kernel_args = {};
     tt_metal::KernelHandle trisc_kernel = tt_metal::CreateKernel(
         program, "tt_metal/programming_examples/profiler/test_custom_cycle_count/kernels/custom_cycle_count_compute.cpp",
         all_cores,
diff --git a/tt_metal/programming_examples/profiler/test_full_buffer/test_full_buffer.cpp b/tt_metal/programming_examples/profiler/test_full_buffer/test_full_buffer.cpp
index b3e7a629d6d..34a47b1d5c8 100644
--- a/tt_metal/programming_examples/profiler/test_full_buffer/test_full_buffer.cpp
+++ b/tt_metal/programming_examples/profiler/test_full_buffer/test_full_buffer.cpp
@@ -33,7 +33,7 @@ void RunFillUpAllBuffers(tt_metal::Device *device, int loop_count, bool fast_dis
         program, "tt_metal/programming_examples/profiler/test_full_buffer/kernels/full_buffer.cpp",
         all_cores,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .defines = kernel_defines});
-    vector<uint32_t> trisc_kernel_args = {};
+    std::vector<uint32_t> trisc_kernel_args = {};
     tt_metal::KernelHandle trisc_kernel = tt_metal::CreateKernel(
         program, "tt_metal/programming_examples/profiler/test_full_buffer/kernels/full_buffer_compute.cpp",
         all_cores,
diff --git a/tt_metal/programming_examples/profiler/test_multi_op/test_multi_op.cpp b/tt_metal/programming_examples/profiler/test_multi_op/test_multi_op.cpp
index 844d4dc3bdb..4f92c4c0b34 100644
--- a/tt_metal/programming_examples/profiler/test_multi_op/test_multi_op.cpp
+++ b/tt_metal/programming_examples/profiler/test_multi_op/test_multi_op.cpp
@@ -28,7 +28,7 @@ void RunCustomCycle(tt_metal::Device *device, int fastDispatch)
         all_cores,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
 
-    vector<uint32_t> trisc_kernel_args = {};
+    std::vector<uint32_t> trisc_kernel_args = {};
     tt_metal::KernelHandle trisc_kernel = tt_metal::CreateKernel(
         program, "tt_metal/programming_examples/profiler/test_multi_op/kernels/multi_op_compute.cpp",
         all_cores,
diff --git a/tt_metal/tools/memset.cpp b/tt_metal/tools/memset.cpp
index 73e9a181bc6..9713f839e5c 100644
--- a/tt_metal/tools/memset.cpp
+++ b/tt_metal/tools/memset.cpp
@@ -7,7 +7,7 @@
 
 #include <unistd.h>
 
-void memset_l1(vector<uint32_t> mem_vec, uint32_t chip_id, uint32_t start_addr) {
+void memset_l1(std::vector<uint32_t> mem_vec, uint32_t chip_id, uint32_t start_addr) {
     // Utility function that writes a memory vector to L1 for all cores at a specific start address.
     const metal_SocDescriptor &sdesc = tt::Cluster::instance().get_soc_desc(chip_id);
     for (auto &worker_core : sdesc.physical_workers) {
@@ -15,7 +15,7 @@ void memset_l1(vector<uint32_t> mem_vec, uint32_t chip_id, uint32_t start_addr)
     }
 }
 
-void memset_dram(vector<uint32_t> mem_vec, uint32_t chip_id, uint32_t start_addr) {
+void memset_dram(std::vector<uint32_t> mem_vec, uint32_t chip_id, uint32_t start_addr) {
     // Utility function that writes a memory to all channels and subchannels at a specific start address.
     const metal_SocDescriptor &sdesc = tt::Cluster::instance().get_soc_desc(chip_id);
     for (uint32_t dram_src_channel_id = 0; dram_src_channel_id < sdesc.dram_cores.size(); dram_src_channel_id++) {
diff --git a/tt_metal/tools/profiler/op_profiler.hpp b/tt_metal/tools/profiler/op_profiler.hpp
index 04aeb667af1..c7c35c1bde2 100644
--- a/tt_metal/tools/profiler/op_profiler.hpp
+++ b/tt_metal/tools/profiler/op_profiler.hpp
@@ -195,7 +195,7 @@ static inline json get_tensor_json(const Tensor& tensor) {
 
 static inline std::vector<json> get_tensors_json(const std::vector<Tensor>& tensors) {
     ZoneScoped;
-    vector<json> ret;
+    std::vector<json> ret;
     for (auto& tensor : tensors) {
         ret.push_back(get_tensor_json(tensor));
     }
@@ -204,7 +204,7 @@ static inline std::vector<json> get_tensors_json(const std::vector<Tensor>& tens
 
 static inline std::vector<json> get_tensors_json(const std::vector<std::optional<const Tensor>>& tensors) {
     ZoneScoped;
-    vector<json> ret;
+    std::vector<json> ret;
     for (auto& tensor : tensors) {
         if (tensor.has_value()) {
             ret.push_back(get_tensor_json(tensor.value()));
@@ -215,7 +215,7 @@ static inline std::vector<json> get_tensors_json(const std::vector<std::optional
 
 static inline std::vector<json> get_tensors_json(const std::vector<std::optional<Tensor>>& tensors) {
     ZoneScoped;
-    vector<json> ret;
+    std::vector<json> ret;
     for (auto& tensor : tensors) {
         if (tensor.has_value()) {
             ret.push_back(get_tensor_json(tensor.value()));
diff --git a/tt_metal/tools/profiler/profiler.cpp b/tt_metal/tools/profiler/profiler.cpp
index ff4c7078723..c761343eaa0 100644
--- a/tt_metal/tools/profiler/profiler.cpp
+++ b/tt_metal/tools/profiler/profiler.cpp
@@ -24,7 +24,7 @@ namespace tt_metal {
 
 void DeviceProfiler::readRiscProfilerResults(
         int device_id,
-        const vector<std::uint32_t> &profile_buffer,
+        const std::vector<std::uint32_t> &profile_buffer,
         const CoreCoord &worker_core
         ){
 
@@ -52,7 +52,7 @@ void DeviceProfiler::readRiscProfilerResults(
     uint32_t coreFlatID = soc_d.physical_routing_to_profiler_flat_id.at(worker_core);
     uint32_t startIndex = coreFlatID * MAX_RISCV_PER_CORE * PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC;
 
-    vector<std::uint32_t> control_buffer = tt::llrt::read_hex_vec_from_core(
+    std::vector<std::uint32_t> control_buffer = tt::llrt::read_hex_vec_from_core(
         device_id,
         worker_core,
         reinterpret_cast<uint64_t>(profiler_msg->control_vector),
@@ -372,7 +372,7 @@ void DeviceProfiler::generateZoneSourceLocationsHashes()
 
 void DeviceProfiler::dumpResults (
         Device *device,
-        const vector<CoreCoord> &worker_cores,
+        const std::vector<CoreCoord> &worker_cores,
         bool lastDump){
 #if defined(TRACY_ENABLE)
     ZoneScoped;
diff --git a/tt_metal/tools/profiler/profiler.hpp b/tt_metal/tools/profiler/profiler.hpp
index dfcf6986572..483546fd280 100644
--- a/tt_metal/tools/profiler/profiler.hpp
+++ b/tt_metal/tools/profiler/profiler.hpp
@@ -81,7 +81,7 @@ class DeviceProfiler {
         // Helper function for reading risc profile results
         void readRiscProfilerResults(
                 int device_id,
-                const vector<std::uint32_t> &profile_buffer,
+                const std::vector<std::uint32_t> &profile_buffer,
                 const CoreCoord &worker_core);
 
         //Push device results to tracy
@@ -114,7 +114,7 @@ class DeviceProfiler {
         void setOutputDir(const std::string& new_output_dir);
 
         //Traverse all cores on the device and dump the device profile results
-        void dumpResults(Device *device, const vector<CoreCoord> &worker_cores, bool lastDump);
+        void dumpResults(Device *device, const std::vector<CoreCoord> &worker_cores, bool lastDump);
 };
 
 }  // namespace tt_metal
diff --git a/tt_metal/tools/profiler/tt_metal_profiler.cpp b/tt_metal/tools/profiler/tt_metal_profiler.cpp
index 0832677dc7f..b7a8eaae932 100644
--- a/tt_metal/tools/profiler/tt_metal_profiler.cpp
+++ b/tt_metal/tools/profiler/tt_metal_profiler.cpp
@@ -166,7 +166,7 @@ void syncDeviceHost(Device *device, CoreCoord logical_core, std::shared_ptr<tt_m
     constexpr uint32_t briscIndex = 0;
     uint64_t addr = reinterpret_cast<uint64_t>(&profiler_msg->buffer[briscIndex][kernel_profiler::CUSTOM_MARKERS]);
 
-    vector<std::uint32_t> sync_times = tt::llrt::read_hex_vec_from_core(
+    std::vector<std::uint32_t> sync_times = tt::llrt::read_hex_vec_from_core(
             device_id,
             core,
             addr,
@@ -416,7 +416,7 @@ void DumpDeviceProfileResults(Device *device, std::vector<CoreCoord> &worker_cor
                          tt::get_logical_dispatch_cores(device_id, device_num_hw_cqs, dispatch_core_type)) {
                         const auto curr_core = device->physical_core_from_logical_core(core, dispatch_core_type);
                         profiler_msg_t *profiler_msg = device->get_dev_addr<profiler_msg_t *>(curr_core, HalL1MemAddrType::PROFILER);
-                        vector<std::uint32_t> control_buffer = tt::llrt::read_hex_vec_from_core(
+                        std::vector<std::uint32_t> control_buffer = tt::llrt::read_hex_vec_from_core(
                                 device_id,
                                 curr_core,
                                 reinterpret_cast<uint64_t>(profiler_msg->control_vector),
@@ -436,7 +436,7 @@ void DumpDeviceProfileResults(Device *device, std::vector<CoreCoord> &worker_cor
                     {
                         const auto curr_core = device->physical_core_from_logical_core(core, CoreType::ETH);
                         profiler_msg_t *profiler_msg = device->get_dev_addr<profiler_msg_t *>(curr_core, HalL1MemAddrType::PROFILER);
-                        vector<std::uint32_t> control_buffer = tt::llrt::read_hex_vec_from_core(
+                        std::vector<std::uint32_t> control_buffer = tt::llrt::read_hex_vec_from_core(
                                 device_id,
                                 core,
                                 reinterpret_cast<uint64_t>(profiler_msg->control_vector),
diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp
index 5db23aa52b8..8a2b8ed7815 100644
--- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp
+++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp
@@ -210,7 +210,7 @@ static std::tuple<KernelHandle, KernelHandle, KernelHandle, std::optional<Kernel
         worker_core_range,
         tt::tt_metal::WriterDataMovementConfig(worker_arg_builder.generate_sender_kernel_ct_args(), worker_defines));
 
-    vector<uint32_t> compute_kernel_args = {};
+    std::vector<uint32_t> compute_kernel_args = {};
     constexpr bool fp32_dest_acc_en = false;
     constexpr bool math_approx_mode = false;
     std::map<string, string> eltwise_defines = ttnn::operations::binary::utils::get_defines(binary_math_op);
diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/host/reduce_scatter_common.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/host/reduce_scatter_common.cpp
index 1fedbae5584..e36e49ed2ec 100644
--- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/host/reduce_scatter_common.cpp
+++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/host/reduce_scatter_common.cpp
@@ -4,7 +4,6 @@
 
 
 #include "ttnn/cpp/ttnn/operations/ccl/reduce_scatter/host/reduce_scatter_common.hpp"
-// #include "tt_metal/common/base.hpp"
 #include "ttnn/cpp/ttnn/operations/ccl/ccl_common.hpp"
 
 #include <cstdint>
diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp
index d06ef84d459..fa67ff3b582 100644
--- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp
+++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp
@@ -24,7 +24,7 @@ namespace optimized_conv_op_utils {
 using namespace tt;
 using namespace tt::tt_metal;
 
-std::pair<vector<uint32_t>, vector<uint32_t>> compute_opt_conv_activation_as_mm_shape(const tt::tt_metal::LegacyShape& conv_activation_shape, ttnn::operations::sliding_window::SlidingWindowConfig sliding_window_config, uint32_t act_block_h_ntiles) {
+std::pair<std::vector<uint32_t>, std::vector<uint32_t>> compute_opt_conv_activation_as_mm_shape(const tt::tt_metal::LegacyShape& conv_activation_shape, ttnn::operations::sliding_window::SlidingWindowConfig sliding_window_config, uint32_t act_block_h_ntiles) {
 
     uint32_t filter_h = (uint32_t)sliding_window_config.window_hw.first;  // filter_h
     uint32_t filter_w = (uint32_t)sliding_window_config.window_hw.second;  // filter_W
diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp
index 3d6eb25c939..038144993ab 100644
--- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp
+++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp
@@ -178,6 +178,6 @@ using namespace tt;
 using namespace tt::tt_metal;
 
 
-std::pair<vector<uint32_t>, vector<uint32_t>> compute_opt_conv_activation_as_mm_shape(const tt::tt_metal::LegacyShape& conv_activation_shape, ttnn::operations::sliding_window::SlidingWindowConfig sliding_window_config, uint32_t act_block_h_ntiles);
+std::pair<std::vector<uint32_t>, std::vector<uint32_t>> compute_opt_conv_activation_as_mm_shape(const tt::tt_metal::LegacyShape& conv_activation_shape, ttnn::operations::sliding_window::SlidingWindowConfig sliding_window_config, uint32_t act_block_h_ntiles);
 
 } // optimized_conv_op_utils
diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp
index 8bd6bd51a0d..30197ecc6a5 100644
--- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp
@@ -1272,7 +1272,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
             writer_compile_time_args.end(), split_reader_args.begin(), split_reader_args.end());
     }
 
-    vector<uint32_t> compute_kernel_args = {
+    std::vector<uint32_t> compute_kernel_args = {
         in0_block_w,
         act_num_subblocks,
         in0_block_num_tiles,
diff --git a/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_program_factory.cpp
index 8ff30a226ba..1e36a88de03 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_program_factory.cpp
@@ -56,7 +56,7 @@ CloneOperation::ProgramFactory::cached_program_t CloneOperation::ProgramFactory:
     bool input_is_dram = input_buffer->buffer_type() == BufferType::DRAM ? 1 : 0;
     bool output_is_dram = output_buffer->buffer_type() == BufferType::DRAM ? 1 : 0;
 
-    vector<uint32_t> reader_compile_time_args, writer_compile_time_args;
+    std::vector<uint32_t> reader_compile_time_args, writer_compile_time_args;
     if (tilized) {
         reader_compile_time_args = {
             (uint32_t)src_cb_id,
@@ -102,7 +102,7 @@ CloneOperation::ProgramFactory::cached_program_t CloneOperation::ProgramFactory:
             get_compute_kernel_config_args(input.device()->arch(), operation_attributes.compute_kernel_config);
         auto create_compute_kernel = [&](const auto& core_group, uint32_t num_units_per_core) {
             if (!core_group.ranges().empty()) {
-                vector<uint32_t> compute_kernel_args = {
+                std::vector<uint32_t> compute_kernel_args = {
                     (uint32_t)src_cb_id,
                     (uint32_t)dst_cb_id,
                     (uint32_t)num_units_per_core,
diff --git a/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp
index af79fddf7d9..a638a6d46e8 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp
@@ -46,14 +46,14 @@ operation::ProgramWithCallbacks s2s_rm_concat_two_tensors_multi_core(
     uint32_t num_output_rows = output.get_legacy_shape()[-2];
     uint32_t num_input_tensors = input_tensors.size();
 
-    vector<CBHandle> cb_input(num_input_tensors);
-    vector<uint32_t> input_num_units_per_shard_height(num_input_tensors);
-    vector<uint32_t> input_num_units_per_shard_width(num_input_tensors);
+    std::vector<CBHandle> cb_input(num_input_tensors);
+    std::vector<uint32_t> input_num_units_per_shard_height(num_input_tensors);
+    std::vector<uint32_t> input_num_units_per_shard_width(num_input_tensors);
 
     tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(output.get_dtype());
     auto all_cores = input_tensors[0].shard_spec().value().grid;
 
-    vector<uint32_t> cb_ids(num_input_tensors);
+    std::vector<uint32_t> cb_ids(num_input_tensors);
     uint32_t input_unit_size = input_tensors[0].shard_spec().value().shape[1] * input_tensors[0].element_size();
     // input CBs
     for (uint32_t input_id = 0; input_id < num_input_tensors; input_id++) {
@@ -201,10 +201,10 @@ operation::ProgramWithCallbacks s2s_concat_multi_core(
         elements_per_page_height = TILE_HEIGHT;
     }
 
-    vector<CBHandle> cb_inputs(num_input_tensors);
-    vector<uint32_t> input_num_pages_per_stick(num_input_tensors);
-    vector<uint32_t> input_num_sticks(num_input_tensors);
-    vector<uint32_t> input_write_offsets(num_input_tensors);
+    std::vector<CBHandle> cb_inputs(num_input_tensors);
+    std::vector<uint32_t> input_num_pages_per_stick(num_input_tensors);
+    std::vector<uint32_t> input_num_sticks(num_input_tensors);
+    std::vector<uint32_t> input_write_offsets(num_input_tensors);
 
     // Assume inputs and output have the same sharding grid.
     const auto all_cores = input_tensors[0].shard_spec().value().grid;
@@ -299,14 +299,14 @@ operation::ProgramWithCallbacks s2i_rm_concat_multi_core(
     uint32_t num_output_rows = output.get_legacy_shape()[-1];
     uint32_t num_input_tensors = input_tensors.size();
 
-    vector<CBHandle> cb_input(num_input_tensors);
-    vector<uint32_t> input_num_units_per_shard_height(num_input_tensors);
-    vector<uint32_t> input_num_units_per_shard_width(num_input_tensors);
+    std::vector<CBHandle> cb_input(num_input_tensors);
+    std::vector<uint32_t> input_num_units_per_shard_height(num_input_tensors);
+    std::vector<uint32_t> input_num_units_per_shard_width(num_input_tensors);
 
     tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(output.get_dtype());
     auto all_cores = input_tensors[0].shard_spec().value().grid;
 
-    vector<uint32_t> cb_ids(num_input_tensors);
+    std::vector<uint32_t> cb_ids(num_input_tensors);
     uint32_t input_unit_size = input_tensors[0].shard_spec().value().shape[1] * input_tensors[0].element_size();
     // input CBs
     for (uint32_t input_id = 0; input_id < num_input_tensors; input_id++) {
@@ -357,8 +357,8 @@ operation::ProgramWithCallbacks s2i_rm_concat_multi_core(
             curr_num_output_rows = 0;
         }
 
-        vector<uint32_t> reader_runtime_args = {};
-        vector<uint32_t> writer_runtime_args = {
+        std::vector<uint32_t> reader_runtime_args = {};
+        std::vector<uint32_t> writer_runtime_args = {
             output.buffer()->address(),
             core_id,
             curr_num_output_rows,
@@ -400,8 +400,8 @@ operation::ProgramWithCallbacks s2i_rm_concat_multi_core(
                     curr_num_output_rows = 0;
                 }
 
-                vector<uint32_t> reader_runtime_args = {curr_num_input_tensors};
-                vector<uint32_t> writer_runtime_args = {
+                std::vector<uint32_t> reader_runtime_args = {curr_num_input_tensors};
+                std::vector<uint32_t> writer_runtime_args = {
                     dst_buffer->address(), curr_num_input_tensors, curr_num_output_rows};
                 for (uint32_t input_id = 0; input_id < num_input_tensors; input_id++) {
                     UpdateDynamicCircularBufferAddress(program, input_id, *dst_buffer);
@@ -540,7 +540,7 @@ operation::ProgramWithCallbacks concat_multi_core(
             num_output_pages_per_block += num_accum_pages * dim_pages;
         }
     }
-    vector<uint32_t> common_reader_kernel_args = {0, 0, 0};
+    std::vector<uint32_t> common_reader_kernel_args = {0, 0, 0};
     common_reader_kernel_args.insert(common_reader_kernel_args.end(), src_addr.begin(), src_addr.end());
     common_reader_kernel_args.insert(common_reader_kernel_args.end(), is_dram.begin(), is_dram.end());
     common_reader_kernel_args.insert(
@@ -615,13 +615,13 @@ operation::ProgramWithCallbacks concat_multi_core(
             }
         }
 
-        vector<uint32_t> reader_kernel_args = common_reader_kernel_args;
+        std::vector<uint32_t> reader_kernel_args = common_reader_kernel_args;
         reader_kernel_args[0] = num_pages_per_core;
         reader_kernel_args[1] = curr_tensor;
         reader_kernel_args[2] = curr_tensor_id;
         reader_kernel_args.insert(reader_kernel_args.end(), page_id_per_tensor.begin(), page_id_per_tensor.end());
 
-        vector<uint32_t> writer_kernel_args;
+        std::vector<uint32_t> writer_kernel_args;
         if (rm_layout) {
             writer_kernel_args = {
                 dst_buffer->address(), output.buffer()->page_size(), num_pages_per_core, num_pages_written};
diff --git a/ttnn/cpp/ttnn/operations/data_movement/copy/device/copy_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/copy/device/copy_program_factory.cpp
index 7556845fa77..e1f52f4b6be 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/copy/device/copy_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/copy/device/copy_program_factory.cpp
@@ -102,7 +102,7 @@ operation::ProgramWithCallbacks copy_multi_core(const Tensor &input, const Tenso
         tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args, kernel_defines));
 
     if (convert_dtype) {
-        vector<uint32_t> compute_kernel_args_group_1 = {
+        std::vector<uint32_t> compute_kernel_args_group_1 = {
             num_units_per_core_group_1
         };
         auto eltwise_unary_kernel_group_1 = tt::tt_metal::CreateKernel(
@@ -113,7 +113,7 @@ operation::ProgramWithCallbacks copy_multi_core(const Tensor &input, const Tenso
         );
 
         if (!core_group_2.ranges().empty()) {
-             vector<uint32_t> compute_kernel_args_group_2 = {
+             std::vector<uint32_t> compute_kernel_args_group_2 = {
                 num_units_per_core_group_2
             };
             auto eltwise_unary_kernel_group_2 = tt::tt_metal::CreateKernel(
diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
index b8d2756443d..8951df02fee 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
@@ -531,7 +531,7 @@ operation::ProgramWithCallbacks pad_tile(const Tensor &a, Tensor& output, const
 }
 
 
-inline void log_rt_args(const CoreCoord& core,  vector<uint32_t>& args) {
+inline void log_rt_args(const CoreCoord& core,  std::vector<uint32_t>& args) {
     for (auto v : args) {
         tt::log_debug(tt::LogOp, "{},{} :: {}", core.x, core.y, v);
     }
@@ -1276,7 +1276,7 @@ inline std::vector<std::pair<std::vector<uint32_t>, std::vector<uint32_t>>> get_
         }
 
         // reader rt args
-        vector<uint32_t> reader_kernel_args;
+        std::vector<uint32_t> reader_kernel_args;
         reader_kernel_args.push_back(core_stick_map.size()); // num_cores
 
         tt::log_debug("num_cores: {}", core_stick_map.size());
@@ -1296,7 +1296,7 @@ inline std::vector<std::pair<std::vector<uint32_t>, std::vector<uint32_t>>> get_
         }
 
         // coalesce the sticks into chunks
-        vector<std::vector<std::vector<uint32_t>>> stick_chunks_per_core;
+        std::vector<std::vector<std::vector<uint32_t>>> stick_chunks_per_core;
         for (auto core_stick_pair : core_stick_map) {
             auto stick_chunks = group_contiguous_and_repeated_values(core_stick_pair.second);
             stick_chunks_per_core.push_back(stick_chunks);
diff --git a/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_program_factory.cpp
index 80aa09138dc..96d173d1712 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_program_factory.cpp
@@ -97,7 +97,7 @@ operation::ProgramWithCallbacks repeat_multi_core(
         num_pages_per_block = num_accum_pages * dim_pages;
     }
 
-    vector<uint32_t> reader_kernel_args = {src_addr, 0, num_pages_per_block, 0, 0, 0, 0};
+    std::vector<uint32_t> reader_kernel_args = {src_addr, 0, num_pages_per_block, 0, 0, 0, 0};
     if (rm_layout) {
         reader_kernel_args.push_back(src_page_size);
     }
@@ -164,7 +164,7 @@ operation::ProgramWithCallbacks repeat_multi_core(
         reader_kernel_args[5] = curr_block_start_id;
         reader_kernel_args[6] = curr_id;
 
-        vector<uint32_t> writer_kernel_args;
+        std::vector<uint32_t> writer_kernel_args;
         if (rm_layout) {
             writer_kernel_args = {
                 dst_buffer->address(), output.buffer()->page_size(), num_pages_per_core, num_pages_written};
diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_program_factory.cpp
index e7da8344843..6d585e65a13 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_program_factory.cpp
@@ -125,7 +125,7 @@ operation::ProgramWithCallbacks sharded_to_interleaved_multi_core(
             tt_metal::WriterDataMovementConfig(writer_compile_time_args));
     }
     if (convert_df) {
-        vector<uint32_t> compute_kernel_args = {num_units_per_shard};
+        std::vector<uint32_t> compute_kernel_args = {num_units_per_shard};
 
         auto eltwise_unary_kernel_group_1 = tt_metal::CreateKernel(
             program,
diff --git a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.cpp
index e7215850ea1..e159c7c02c4 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.cpp
@@ -61,7 +61,7 @@ inline std::vector<std::pair<std::vector<uint32_t>, std::vector<uint32_t>>> get_
 
     uint32_t unpadded_row_size_bytes_offset = output_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? tt::round_up(unpadded_row_size_bytes, TILE_WIDTH) : tt::round_up(unpadded_row_size_bytes, TILE_WIDTH / 2);
 
-    vector<uint32_t> common_reader_kernel_args = {
+    std::vector<uint32_t> common_reader_kernel_args = {
         input_tensor.buffer()->address() + output_tensor_start[-1] * output_tensor.element_size(),
         padded_row_size_bytes,
         unpadded_row_size_bytes,
@@ -108,7 +108,7 @@ inline std::vector<std::pair<std::vector<uint32_t>, std::vector<uint32_t>>> get_
             unpadded_written = unpadded_written / num_unpadded_sticks_per_dim[j];
             start_id += id_per_dim[j] * accumulated_total_per_dim[j - 1];
         }
-        vector<uint32_t> reader_kernel_args = common_reader_kernel_args;
+        std::vector<uint32_t> reader_kernel_args = common_reader_kernel_args;
         //
         uint32_t addr_offset = 5;  // input buffer addr, padded_row_size_bytes, unpadded_row_size_bytes, num_dims
         reader_kernel_args[addr_offset++] = start_id;
@@ -117,7 +117,7 @@ inline std::vector<std::pair<std::vector<uint32_t>, std::vector<uint32_t>>> get_
         reader_kernel_args[addr_offset] = num_read_per_barrier;
         reader_kernel_args.insert(reader_kernel_args.end(), id_per_dim.begin(), id_per_dim.end());
 
-        vector<uint32_t> writer_kernel_args = {
+        std::vector<uint32_t> writer_kernel_args = {
             output_buffer->address(), unpadded_row_size_bytes, unpadded_row_size_bytes_offset, num_sticks_per_core, num_sticks_per_core_read, num_read_per_barrier, num_sticks_written, 0};
         num_sticks_written += num_sticks_per_core;
         ret_val[i] = {reader_kernel_args, writer_kernel_args};
@@ -493,7 +493,7 @@ inline std::vector<std::pair<std::vector<uint32_t>, std::vector<uint32_t>>> get_
         }
 
         // reader rt args
-        vector<uint32_t> reader_kernel_args;
+        std::vector<uint32_t> reader_kernel_args;
         reader_kernel_args.push_back(core_stick_map.size()); // num_cores
 
         tt::log_debug("num_cores: {}", core_stick_map.size());
@@ -513,7 +513,7 @@ inline std::vector<std::pair<std::vector<uint32_t>, std::vector<uint32_t>>> get_
         }
 
         // coalesce the sticks into chunks
-        vector<std::vector<std::vector<uint32_t>>> stick_chunks_per_core;
+        std::vector<std::vector<std::vector<uint32_t>>> stick_chunks_per_core;
         for (auto core_stick_pair : core_stick_map) {
             auto stick_chunks = group_contiguous_values(core_stick_pair.second);
             stick_chunks_per_core.push_back(stick_chunks);
@@ -531,7 +531,7 @@ inline std::vector<std::pair<std::vector<uint32_t>, std::vector<uint32_t>>> get_
             }
         }
 
-        vector<uint32_t> writer_kernel_args;
+        std::vector<uint32_t> writer_kernel_args;
         ret_val[i] = {reader_kernel_args, writer_kernel_args};
     }
 
diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.cpp
index 26152476af7..1f9acdd8e3f 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.cpp
@@ -115,7 +115,7 @@ operation::ProgramWithCallbacks tilize_single_core(const Tensor& a, Tensor& outp
         core,
         tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args));
 
-    vector<uint32_t> compute_args = {
+    std::vector<uint32_t> compute_args = {
         num_tiles / num_tiles_per_block,  // per_core_block_cnt
         num_tiles_per_block               // per_core_block_tile_cnt
     };
@@ -206,8 +206,8 @@ operation::ProgramWithCallbacks tilize_multi_core_interleaved(const Tensor& a, T
 
     /** compute
      */
-    vector<uint32_t> compute_args = {nblocks_per_core, ntiles_per_block};
-    vector<uint32_t> compute_args_cliff = {nblocks_per_core_cliff, ntiles_per_block};
+    std::vector<uint32_t> compute_args = {nblocks_per_core, ntiles_per_block};
+    std::vector<uint32_t> compute_args_cliff = {nblocks_per_core_cliff, ntiles_per_block};
 
     if (core_range.ranges().size() > 0) {
         auto tilize_kernel_id = CreateKernel(
@@ -370,7 +370,7 @@ operation::ProgramWithCallbacks tilize_multi_core_sharded(const Tensor& input, T
         all_cores,
         tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args));
 
-    vector<uint32_t> compute_args = {uint32_t(num_tiles_per_shard / num_tiles_per_row), uint32_t(num_tiles_per_row)};
+    std::vector<uint32_t> compute_args = {uint32_t(num_tiles_per_shard / num_tiles_per_row), uint32_t(num_tiles_per_row)};
 
     auto untilize_kernel_id = tt::tt_metal::CreateKernel(
         program,
diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.cpp
index 716264458d8..2baf47aa6b7 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.cpp
@@ -157,7 +157,7 @@ operation::ProgramWithCallbacks tilize_with_val_padding_single_core(
         core,
         tt::tt_metal::WriterDataMovementConfig({output_cb_index, out_is_dram}));
 
-    vector<uint32_t> compute_kernel_args = {uint32_t(num_tiles / num_tiles_per_block), uint32_t(num_tiles_per_block)};
+    std::vector<uint32_t> compute_kernel_args = {uint32_t(num_tiles / num_tiles_per_block), uint32_t(num_tiles_per_block)};
 
     auto tilize_kernel_id = tt::tt_metal::CreateKernel(
         program,
@@ -292,7 +292,7 @@ operation::ProgramWithCallbacks tilize_with_val_padding_multi_core_interleaved(
         const std::vector<BlockRep>& assignment = core_assignments.at(i);
 
         // reader runtime args
-        vector<uint32_t> reader_rt_args = {
+        std::vector<uint32_t> reader_rt_args = {
             src0_buffer->address(),
             unpadded_row_size_bytes,
             padded_row_size_bytes,
@@ -425,7 +425,7 @@ operation::ProgramWithCallbacks tilize_with_val_padding_multi_core_sharded(
      */
     KernelHandle unary_writer_kernel_id;
     bool out_is_dram = dst_buffer->buffer_type() == BufferType::DRAM ? 1 : 0;
-    vector<uint32_t> writer_ct_args = {
+    std::vector<uint32_t> writer_ct_args = {
         output_cb_index,
     };
     unary_writer_kernel_id = CreateKernel(
@@ -436,7 +436,7 @@ operation::ProgramWithCallbacks tilize_with_val_padding_multi_core_sharded(
 
     /** compute
      */
-    vector<uint32_t> compute_args = {
+    std::vector<uint32_t> compute_args = {
         (uint32_t)nblocks_per_core,  // per_core_block_cnt
         (uint32_t)ntiles_per_block,  // per_block_ntiles
     };
diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp
index 85c84790518..9cbbaf6cb2b 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp
@@ -1637,7 +1637,7 @@ operation::ProgramWithCallbacks transpose_wh_multi_core_sharded(const Tensor &a,
     uint32_t NHtWt = N * HtWt;
 
     auto bbox = all_cores.bounding_box();
-    vector<CoreCoord> cores = grid_to_cores_with_noop(bbox.end_coord.x, bbox.end_coord.y, num_cores_x, num_cores_y, row_major);
+    std::vector<CoreCoord> cores = grid_to_cores_with_noop(bbox.end_coord.x, bbox.end_coord.y, num_cores_x, num_cores_y, row_major);
 
     std::vector< std::vector<uint32_t> > unary_reader_args = { cores.size(), std::vector<uint32_t>(1) };
     std::vector< std::vector<uint32_t> > unary_compute_args = { cores.size(), std::vector<uint32_t>(5) };
@@ -1703,7 +1703,7 @@ operation::ProgramWithCallbacks transpose_wh_multi_core_sharded(const Tensor &a,
         bool row_major = shard_spec.orientation == ShardOrientation::ROW_MAJOR;
 
         auto bbox = all_cores.bounding_box();
-        vector<CoreCoord> cores = grid_to_cores_with_noop(bbox.end_coord.x, bbox.end_coord.y, num_cores_x, num_cores_y, row_major);
+        std::vector<CoreCoord> cores = grid_to_cores_with_noop(bbox.end_coord.x, bbox.end_coord.y, num_cores_x, num_cores_y, row_major);
         std::vector< std::vector<uint32_t> > unary_reader_args = { cores.size(), std::vector<uint32_t>(1) };
         std::vector< std::vector<uint32_t> > unary_compute_args = { cores.size(), std::vector<uint32_t>(5) };
         std::vector< std::vector<uint32_t> > unary_writer_args = { cores.size(), std::vector<uint32_t>(1) };
diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp
index 2d9c90c2eb8..22826b7cd4d 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp
@@ -135,11 +135,11 @@ operation::ProgramWithCallbacks untilize_multi_core_parallelize_column(
 
     /** compute
      */
-    vector<uint32_t> compute_args = {
+    std::vector<uint32_t> compute_args = {
         (uint32_t)nblocks_per_core,  // per_core_block_cnt
         (uint32_t)ntiles_per_block,  // per_block_ntiles
     };
-    vector<uint32_t> compute_args_cliff = {
+    std::vector<uint32_t> compute_args_cliff = {
         (uint32_t)nblocks_per_core_cliff ,
         (uint32_t)ntiles_per_block,  // per_block_ntiles
     };
@@ -384,7 +384,7 @@ operation::ProgramWithCallbacks untilize_multi_core(
             tt::tt_metal::ReaderDataMovementConfig(reader_ct_args));
     } else {
         bool src0_is_dram = src0_buffer->buffer_type() == BufferType::DRAM ? 1 : 0;
-        vector<uint32_t> reader_ct_args = {(uint32_t)src0_is_dram};
+        std::vector<uint32_t> reader_ct_args = {(uint32_t)src0_is_dram};
 
         unary_reader_kernel_id = CreateKernel(
             program,
@@ -406,7 +406,7 @@ operation::ProgramWithCallbacks untilize_multi_core(
     } else {
         bool out_is_dram = dst_buffer->buffer_type() == BufferType::DRAM ? 1 : 0;
         if (src_block_sharded) {
-            vector<uint32_t> writer_ct_args = {
+            std::vector<uint32_t> writer_ct_args = {
                 (uint32_t)out_is_dram, (uint32_t)(input_cb_data_format == tt::DataFormat::Float32)};
             unary_writer_kernel_id = CreateKernel(
                 program,
@@ -416,7 +416,7 @@ operation::ProgramWithCallbacks untilize_multi_core(
         } else {
             bool stick_size_is_power_of_two = is_power_of_two_at_least_32(block_size_nbytes);
             uint32_t log2_stick_size = stick_size_is_power_of_two ? (std::uint32_t)std::log2(block_size_nbytes) : 0;
-            vector<uint32_t> writer_ct_args = {
+            std::vector<uint32_t> writer_ct_args = {
                 (uint32_t)out_is_dram,
                 (uint32_t)stick_size_is_power_of_two,
                 (uint32_t)log2_stick_size,
@@ -433,11 +433,11 @@ operation::ProgramWithCallbacks untilize_multi_core(
 
     /** compute
      */
-    vector<uint32_t> compute_args = {
+    std::vector<uint32_t> compute_args = {
         (uint32_t)nblocks_per_core,  // per_core_block_cnt
         (uint32_t)ntiles_per_block,  // per_block_ntiles
     };
-    vector<uint32_t> compute_args_cliff = {
+    std::vector<uint32_t> compute_args_cliff = {
         (uint32_t)nblocks_per_core_cliff,
         (uint32_t)ntiles_per_block,  // per_block_ntiles
     };
@@ -482,7 +482,7 @@ operation::ProgramWithCallbacks untilize_multi_core(
             continue;
         }
         // reader runtime args
-        vector<uint32_t> reader_rt_args;
+        std::vector<uint32_t> reader_rt_args;
 
         if (src_sharded) {
             reader_rt_args = {
@@ -499,7 +499,7 @@ operation::ProgramWithCallbacks untilize_multi_core(
         // ntiles_per_block * nblocks_per_core);
 
         // writer runtime args
-        vector<uint32_t> writer_rt_args;
+        std::vector<uint32_t> writer_rt_args;
         if (out_sharded) {
             writer_rt_args = {
                 ntiles_per_block * nblocks_per_core  // ntiles
@@ -570,7 +570,7 @@ operation::ProgramWithCallbacks untilize_multi_core(
         CoreCoord core = row_major ? CoreCoord{ncores_full % ncores_x, ncores_full / ncores_x}
                                    : CoreCoord{ncores_full / ncores_y, ncores_full % ncores_y};
         // reader runtime args
-        vector<uint32_t> reader_rt_args;
+        std::vector<uint32_t> reader_rt_args;
 
         if (src_sharded) {
             reader_rt_args = {
@@ -587,7 +587,7 @@ operation::ProgramWithCallbacks untilize_multi_core(
         // nblocks_per_core_cliff);
 
         // writer runtime args
-        vector<uint32_t> writer_rt_args;
+        std::vector<uint32_t> writer_rt_args;
         if (out_sharded) {
             writer_rt_args = {
                 ntiles_per_block * nblocks_per_core_cliff  // ntiles
@@ -787,7 +787,7 @@ operation::ProgramWithCallbacks untilize_single_core(
         core,
         tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args));
 
-    vector<uint32_t> compute_args = {
+    std::vector<uint32_t> compute_args = {
         uint32_t(num_tiles / num_tiles_per_block),  // per_core_block_cnt
         uint32_t(num_tiles_per_block)               // per_core_block_tile_cnt
     };
diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_program_factory.cpp
index 432aa4f43d9..10c8ce9dc22 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_program_factory.cpp
@@ -157,7 +157,7 @@ operation::ProgramWithCallbacks untilize_with_unpadding_single_core(
         core,
         tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args));
 
-    vector<uint32_t> compute_args = {uint32_t(num_tiles / num_tiles_per_block), uint32_t(num_tiles_per_block)};
+    std::vector<uint32_t> compute_args = {uint32_t(num_tiles / num_tiles_per_block), uint32_t(num_tiles_per_block)};
 
     std::string compute_kernel(
         "ttnn/cpp/ttnn/operations/data_movement/untilize/device/kernels/compute/pack_untilize.cpp");
@@ -314,7 +314,7 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_interleaved(
         const std::vector<BlockRep>& assignment = core_assignments.at(i);
 
         // writer runtime args
-        vector<uint32_t> writer_rt_args = {
+        std::vector<uint32_t> writer_rt_args = {
             dst_buffer->address(),
             unpadded_row_size_bytes,
             padded_row_size_bytes,
@@ -475,7 +475,7 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_sharded(
      */
     KernelHandle unary_writer_kernel_id;
     if (out_sharded) {
-        vector<uint32_t> writer_ct_args = {(uint32_t)output_cb_index, (uint32_t)sharded_output_cb_index};
+        std::vector<uint32_t> writer_ct_args = {(uint32_t)output_cb_index, (uint32_t)sharded_output_cb_index};
         unary_writer_kernel_id = CreateKernel(
             program,
             unpad_tensor_w_16 ? "ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/kernels/dataflow/"
@@ -486,7 +486,7 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_sharded(
             WriterDataMovementConfig(writer_ct_args));
     } else {
         bool out_is_dram = dst_buffer->buffer_type() == BufferType::DRAM ? 1 : 0;
-        vector<uint32_t> writer_ct_args = {
+        std::vector<uint32_t> writer_ct_args = {
             (uint32_t)out_is_dram, (uint32_t)(input_cb_data_format == tt::DataFormat::Float32)};
         unary_writer_kernel_id = CreateKernel(
             program,
@@ -497,7 +497,7 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_sharded(
 
     /** compute
      */
-    vector<uint32_t> compute_args = {
+    std::vector<uint32_t> compute_args = {
         (uint32_t)nblocks_per_core,  // per_core_block_cnt
         (uint32_t)ntiles_per_block,  // per_block_ntiles
     };
@@ -529,7 +529,7 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_sharded(
     std::vector<CoreCoord> cores;
 
     if (out_sharded) {
-        vector<uint32_t> writer_rt_args;
+        std::vector<uint32_t> writer_rt_args;
         if (unpad_tensor_w_16) {
             writer_rt_args = {num_output_rows_unpadded, num_input_tiles};
         } else {
diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp
index eb57eb345b9..a6f6f3f8650 100644
--- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp
@@ -77,7 +77,7 @@ inline __attribute__((always_inline)) void set_eltwise_binary_runtime_args(
     uint32_t block_height = 0, block_width = 0, block_size = 0, output_width = 0, last_unpadded_block_height = 0,
              last_unpadded_block_width = 0;
     CoreCoord end_core;
-    vector<CoreCoord> cores;
+    std::vector<CoreCoord> cores;
 
     if (shard_spec.has_value()) {
         all_cores = shard_spec.value().grid;
diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_program_factory.cpp
index bca50f4d410..228bef05915 100644
--- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_program_factory.cpp
@@ -80,12 +80,12 @@ UnaryProgramFactory::cached_program_t UnaryProgramFactory::create(
         all_cores,
         tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args));
 
-    vector<uint32_t> compute_kernel_args_group_1 = {
+    std::vector<uint32_t> compute_kernel_args_group_1 = {
         num_tiles_per_core_group_1,  // per_core_block_cnt
         1                            // per_core_block_size
     };
 
-    vector<UnpackToDestMode> unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default);
+    std::vector<UnpackToDestMode> unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default);
     if (args.preserve_fp32_precision) {
         unpack_to_dest_mode[src0_cb_index] = UnpackToDestMode::UnpackToDestFp32;
     }
@@ -106,7 +106,7 @@ UnaryProgramFactory::cached_program_t UnaryProgramFactory::create(
             .defines = unary_defines});
 
     if (!core_group_2.ranges().empty()) {
-        vector<uint32_t> compute_kernel_args_group_2 = {
+        std::vector<uint32_t> compute_kernel_args_group_2 = {
             num_tiles_per_core_group_2,  // per_core_block_cnt
             1                            // per_core_block_size
         };
diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.cpp
index 2920fb011c2..e9d4c2d84e9 100644
--- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.cpp
@@ -109,12 +109,12 @@ UnaryShardedProgramFactory::cached_program_t UnaryShardedProgramFactory::create(
         all_cores,
         tt::tt_metal::ReaderDataMovementConfig(reader_compile_time_args, kernel_defines));
 
-    vector<uint32_t> compute_kernel_args_group_1 = {
+    std::vector<uint32_t> compute_kernel_args_group_1 = {
         1,                 // per_core_block_cnt
         num_tile_per_core  // per_core_block_size
     };
 
-    vector<UnpackToDestMode> unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default);
+    std::vector<UnpackToDestMode> unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default);
     if (args.preserve_fp32_precision) {
         unpack_to_dest_mode[in_cb_id] = UnpackToDestMode::UnpackToDestFp32;
     }
diff --git a/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp b/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp
index 38b8a222c48..f35d2793889 100644
--- a/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp
+++ b/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp
@@ -162,7 +162,7 @@ operation::ProgramWithCallbacks embeddings_tilized(
              embedding_defines));
 
     if (num_blocks_per_core_group_1 > 0) {
-        vector<uint32_t> compute_args_1 = {
+        std::vector<uint32_t> compute_args_1 = {
             uint32_t(num_blocks_per_core_group_1),  // per_core_block_cnt
             uint32_t(num_tiles_per_block)           // per_core_block_tile_cnt
         };
@@ -174,7 +174,7 @@ operation::ProgramWithCallbacks embeddings_tilized(
     }
 
     if (num_blocks_per_core_group_2 > 0) {
-        vector<uint32_t> compute_args_2 = {
+        std::vector<uint32_t> compute_args_2 = {
             uint32_t(num_blocks_per_core_group_2),  // per_core_block_cnt
             uint32_t(num_tiles_per_block)           // per_core_block_tile_cnt
         };
diff --git a/ttnn/cpp/ttnn/operations/examples/example/device/multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/examples/example/device/multi_core_program_factory.cpp
index 56f1d815fc4..451d6db66d9 100644
--- a/ttnn/cpp/ttnn/operations/examples/example/device/multi_core_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/examples/example/device/multi_core_program_factory.cpp
@@ -68,7 +68,7 @@ ExampleDeviceOperation::MultiCore::cached_program_t ExampleDeviceOperation::Mult
         all_cores,
         tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args));
 
-    vector<uint32_t> compute_kernel_args_group_1 = {
+    std::vector<uint32_t> compute_kernel_args_group_1 = {
         num_tiles_per_core_group_1,  // per_core_block_cnt
         1                            // per_core_block_size
     };
@@ -84,7 +84,7 @@ ExampleDeviceOperation::MultiCore::cached_program_t ExampleDeviceOperation::Mult
             .compile_args = compute_kernel_args_group_1});
 
     if (!core_group_2.ranges().empty()) {
-        vector<uint32_t> compute_kernel_args_group_2 = {
+        std::vector<uint32_t> compute_kernel_args_group_2 = {
             num_tiles_per_core_group_2,  // per_core_block_cnt
             1                            // per_core_block_size
         };
diff --git a/ttnn/cpp/ttnn/operations/examples/example/device/single_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/examples/example/device/single_core_program_factory.cpp
index 71e3acfaa72..34dea89e96e 100644
--- a/ttnn/cpp/ttnn/operations/examples/example/device/single_core_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/examples/example/device/single_core_program_factory.cpp
@@ -68,7 +68,7 @@ ExampleDeviceOperation::SingleCore::cached_program_t ExampleDeviceOperation::Sin
         all_cores,
         tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args));
 
-    vector<uint32_t> compute_kernel_args_group_1 = {
+    std::vector<uint32_t> compute_kernel_args_group_1 = {
         num_tiles_per_core_group_1,  // per_core_block_cnt
         1                            // per_core_block_size
     };
@@ -84,7 +84,7 @@ ExampleDeviceOperation::SingleCore::cached_program_t ExampleDeviceOperation::Sin
             .compile_args = compute_kernel_args_group_1});
 
     if (!core_group_2.ranges().empty()) {
-        vector<uint32_t> compute_kernel_args_group_2 = {
+        std::vector<uint32_t> compute_kernel_args_group_2 = {
             num_tiles_per_core_group_2,  // per_core_block_cnt
             1                            // per_core_block_size
         };
diff --git a/ttnn/cpp/ttnn/operations/examples/example_multiple_return/device/single_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/examples/example_multiple_return/device/single_core_program_factory.cpp
index 204e66cb70b..b16a181b43a 100644
--- a/ttnn/cpp/ttnn/operations/examples/example_multiple_return/device/single_core_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/examples/example_multiple_return/device/single_core_program_factory.cpp
@@ -73,7 +73,7 @@ ExampleMultipleReturnDeviceOperation::SingleCore::cached_program_t ExampleMultip
         all_cores,
         tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args));
 
-    vector<uint32_t> compute_kernel_args_group_1 = {
+    std::vector<uint32_t> compute_kernel_args_group_1 = {
         num_tiles_per_core_group_1,  // per_core_block_cnt
         1                            // per_core_block_size
     };
@@ -89,7 +89,7 @@ ExampleMultipleReturnDeviceOperation::SingleCore::cached_program_t ExampleMultip
             .compile_args = compute_kernel_args_group_1});
 
     if (!core_group_2.ranges().empty()) {
-        vector<uint32_t> compute_kernel_args_group_2 = {
+        std::vector<uint32_t> compute_kernel_args_group_2 = {
             num_tiles_per_core_group_2,  // per_core_block_cnt
             1                            // per_core_block_size
         };
diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_program_factory.cpp
index a24a8f45fed..2088ec7bb5c 100644
--- a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_program_factory.cpp
@@ -149,7 +149,7 @@ operation::ProgramWithCallbacks multi_core_attn_matmul(const Tensor &a, const Te
         all_device_cores,
         tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args));
 
-    vector<uint32_t> compute_args = {
+    std::vector<uint32_t> compute_args = {
         (uint32_t) transpose_hw_bool, // transpose_hw for matmul_init
     }; // bmm compute kernel the B, Mt, Nt are just 3 for loops that technically act as 1 large loop, so only set Nt for simplicity
 
diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_program_factory.cpp
index 011e9c11e97..8849d331c83 100644
--- a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_program_factory.cpp
@@ -224,7 +224,7 @@ operation::ProgramWithCallbacks multi_core_group_attn_matmul(const Tensor &a, co
         }
     );
 
-    vector<uint32_t> compute_args = {
+    std::vector<uint32_t> compute_args = {
         (uint32_t) transpose_hw_bool, // transpose_hw for matmul_init
         out_subblock_w,
         out_subblock_num_tiles,
diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_program_factory.cpp
index 7db3b1a9e78..c4a346f3e61 100644
--- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_program_factory.cpp
@@ -24,7 +24,7 @@ std::vector<std::pair<std::vector<uint32_t>, std::vector<uint32_t>>> get_unpad_r
     auto input_buffer = input_tensor.buffer();
     auto input_shape = input_tensor.get_legacy_shape();
 
-    vector<uint32_t> common_reader_kernel_args = {input_buffer->address(), 0};
+    std::vector<uint32_t> common_reader_kernel_args = {input_buffer->address(), 0};
 
     std::vector<std::pair<std::vector<uint32_t>, std::vector<uint32_t>>> ret_val(num_cores_total);
 
@@ -35,9 +35,9 @@ std::vector<std::pair<std::vector<uint32_t>, std::vector<uint32_t>>> get_unpad_r
         CoreCoord core = {i % num_cores_x, i / num_cores_x};
 
         // reader and writer kernel args
-        vector<uint32_t> reader_kernel_args = common_reader_kernel_args;
+        std::vector<uint32_t> reader_kernel_args = common_reader_kernel_args;
         reader_kernel_args[1] = start_id;
-        vector<uint32_t> writer_kernel_args = {
+        std::vector<uint32_t> writer_kernel_args = {
             num_tiles_per_core,
         };
         ret_val[i] = {reader_kernel_args, writer_kernel_args};
diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp
index f9ed0492f56..e1b2660ef51 100644
--- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp
@@ -286,7 +286,7 @@ operation::ProgramWithCallbacks rotary_embedding_multi_core(
         all_cores,
         tt_metal::WriterDataMovementConfig(writer_compile_time_args, writer_kernel_defines));
 
-    vector<uint32_t> compute_kernel_args = {
+    std::vector<uint32_t> compute_kernel_args = {
         (std::uint32_t)input_cb_index,
         (std::uint32_t)rotated_input_cb_index,
         (std::uint32_t)cos_cb_index,
diff --git a/ttnn/cpp/ttnn/operations/kv_cache/device/update_cache_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/kv_cache/device/update_cache_op_multi_core.cpp
index 4e7719e53ef..11a8a1b9bb6 100644
--- a/ttnn/cpp/ttnn/operations/kv_cache/device/update_cache_op_multi_core.cpp
+++ b/ttnn/cpp/ttnn/operations/kv_cache/device/update_cache_op_multi_core.cpp
@@ -169,7 +169,7 @@ operation::ProgramWithCallbacks update_cache_multi_core(const Tensor& cache_tens
         all_cores,
         tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args));
 
-    vector<uint32_t> compute_kernel_args = {
+    std::vector<uint32_t> compute_kernel_args = {
         src0_cb_index,
         src1_cb_index,
         interm0_cb_index,
diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_program_factory.cpp
index 09b633211ea..fca73b736cd 100644
--- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_program_factory.cpp
@@ -109,7 +109,7 @@ operation::ProgramWithCallbacks matmul_multi_core(const Tensor &a, const Tensor
         all_cores,
         tt_metal::WriterDataMovementConfig(writer_compile_time_args));
 
-    vector<uint32_t> compute_args_group_1 = {
+    std::vector<uint32_t> compute_args_group_1 = {
         1,                                 // B
         1,                                 // Mt
         Kt,                                // Kt
@@ -127,7 +127,7 @@ operation::ProgramWithCallbacks matmul_multi_core(const Tensor &a, const Tensor
             .compile_args = compute_args_group_1});
 
     if (!core_group_2.ranges().empty()) {
-        vector<uint32_t> compute_args_group_2 = {
+        std::vector<uint32_t> compute_args_group_2 = {
             1,                                 // B
             1,                                 // Mt
             Kt,                                // Kt
diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp
index a292a31be18..af4003a2b24 100644
--- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp
@@ -469,7 +469,7 @@ operation::ProgramWithCallbacks create_program_mcast_in0(
 
     uint32_t out_subblock_num_tiles = out_subblock_h * out_subblock_w;
 
-    vector<uint32_t> compute_kernel_args = {
+    std::vector<uint32_t> compute_kernel_args = {
         in0_block_w,             // in0_block_w
         in0_num_subblocks,       // in0_num_subblocks
         in0_block_num_tiles,     // in0_block_num_tiles
@@ -1213,7 +1213,7 @@ operation::ProgramWithCallbacks create_program_mcast_in1(
 
     uint32_t out_subblock_num_tiles = out_subblock_h * out_subblock_w;
 
-    vector<uint32_t> compute_kernel_args = {
+    std::vector<uint32_t> compute_kernel_args = {
         in0_block_w,             // in0_block_w
         in0_num_subblocks,       // in0_num_subblocks
         in0_block_num_tiles,     // in0_block_num_tiles
diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_2d_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_2d_program_factory.cpp
index 330dce8c720..8fece73c387 100644
--- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_2d_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_2d_program_factory.cpp
@@ -626,7 +626,7 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1(
 
     uint32_t out_subblock_num_tiles = out_subblock_h * out_subblock_w;
 
-    vector<uint32_t> compute_kernel_args = {
+    std::vector<uint32_t> compute_kernel_args = {
         in0_block_w,             // in0_block_w
         in0_num_subblocks,       // in0_num_subblocks
         in0_block_num_tiles,     // in0_block_num_tiles
diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp
index 96d914fbff2..a79f70be4bc 100644
--- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp
@@ -765,7 +765,7 @@ operation::ProgramWithCallbacks create_program_dram_sharded(
     uint32_t in1_per_core_w = per_core_N_unpad;
     uint32_t out_subblock_num_tiles = out_subblock_h * out_subblock_w;
 
-    vector<uint32_t> compute_kernel_args = {
+    std::vector<uint32_t> compute_kernel_args = {
         in0_block_w,             // in0_block_w
         in0_num_subblocks,       // in0_num_subblocks
         in0_block_num_tiles,     // in0_block_num_tiles
diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_optimized_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_optimized_program_factory.cpp
index 4a6fd50be09..f7424c5e55a 100644
--- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_optimized_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_optimized_program_factory.cpp
@@ -192,7 +192,7 @@ operation::ProgramWithCallbacks create_program(
             reader_writer_compile_time_args,
             mm_kernel_in1_reader_writer_defines));
 
-    vector<uint32_t> compute_kernel_args_group_1 = {
+    std::vector<uint32_t> compute_kernel_args_group_1 = {
         in0_block_w,             // in0_block_w
         in0_num_subblocks,       // in0_num_subblocks
         in0_block_num_tiles,     // in0_block_num_tiles
@@ -234,7 +234,7 @@ operation::ProgramWithCallbacks create_program(
             .compile_args = compute_kernel_args_group_1,
             .defines = mm_kernel_defines});
     if (!core_group_2.ranges().empty()) {
-        vector<uint32_t> compute_kernel_args_group_2 = {
+        std::vector<uint32_t> compute_kernel_args_group_2 = {
             in0_block_w,             // in0_block_w
             in0_num_subblocks,       // in0_num_subblocks
             in0_block_num_tiles,     // in0_block_num_tiles
diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_program_factory.cpp
index 482e6c8bca9..2fe8f327b72 100644
--- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_program_factory.cpp
@@ -62,7 +62,7 @@ tt_metal::operation::ProgramWithCallbacks create_program(
 
     uint32_t out_subblock_num_tiles = out_subblock_h * out_subblock_w;
 
-    vector<uint32_t> compute_kernel_args = {
+    std::vector<uint32_t> compute_kernel_args = {
         in0_block_w,             // in0_block_w
         in0_num_subblocks,       // in0_num_subblocks
         in0_block_num_tiles,     // in0_block_num_tiles
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_program_factory.cpp
index 447de9add9c..cad1e01e515 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_program_factory.cpp
@@ -81,7 +81,7 @@ MorehDotOperation::SingleCore::cached_program_t MorehDotOperation::SingleCore::c
     const auto writer_kernel_id =
         tt::operations::primary::CreateWriteKernel(program, writer_kernel_file, core, writer_compile_time_args);
 
-    vector<uint32_t> compute_kernel_args = {};
+    std::vector<uint32_t> compute_kernel_args = {};
     std::map<string, string> compute_defines;
     compute_defines["REDUCE_OP"] = "PoolType::SUM";
     compute_defines["REDUCE_DIM"] = "ReduceDim::REDUCE_ROW";
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_program_factory.cpp
index bdbeead8284..7bcd39d21bf 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_program_factory.cpp
@@ -105,7 +105,7 @@ MorehDotBackwardOperation::SingleCore::cached_program_t MorehDotBackwardOperatio
     const auto writer_kernel_id =
         tt::operations::primary::CreateWriteKernel(program, writer_kernel_file, core, writer_compile_time_args);
 
-    vector<uint32_t> compute_kernel_args = {};
+    std::vector<uint32_t> compute_kernel_args = {};
     std::map<string, string> compute_defines;
 
     const auto compute_kernel_file =
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_rm_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_rm_factory.cpp
index a6c5bddad33..3cd4ccff68d 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_rm_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_rm_factory.cpp
@@ -159,7 +159,7 @@ MorehGetItemOperation::MorehGetItemRmFactory::cached_program_t MorehGetItemOpera
         CoreCoord core = {i / core_h + core_x_offset, i % core_h + core_y_offset};
         uint32_t num_units_per_core = i < g1_numcores ? num_units_per_core_group_1 : num_units_per_core_group_2;
 
-        vector<uint32_t> reader_args = {
+        std::vector<uint32_t> reader_args = {
             // buffers
             input_5d.buffer()->address(),
             index_info[0].address,
@@ -208,7 +208,7 @@ MorehGetItemOperation::MorehGetItemRmFactory::cached_program_t MorehGetItemOpera
             input_unit_size,
         };
 
-        vector<uint32_t> writer_args = {
+        std::vector<uint32_t> writer_args = {
             // buffer
             output.buffer()->address(),
 
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_factory.cpp
index 90b4e864bfe..480c6010841 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_factory.cpp
@@ -225,7 +225,7 @@ MorehGetItemOperation::MorehGetItemTilizedFactory::create(
             CoreCoord core = {i / core_h + core_x_offset, i % core_h + core_y_offset};
             uint32_t num_units_per_core = i < g1_numcores ? num_units_per_core_group_1 : num_units_per_core_group_2;
 
-            vector<uint32_t> reader_args = {
+            std::vector<uint32_t> reader_args = {
                 // buffers
                 input.buffer()->address(),
                 index_info[0].address,
@@ -284,7 +284,7 @@ MorehGetItemOperation::MorehGetItemTilizedFactory::create(
                 num_alignment_width,
             };
 
-            vector<uint32_t> writer_args = {
+            std::vector<uint32_t> writer_args = {
                 // buffers
                 output.buffer()->address(),
 
@@ -452,7 +452,7 @@ MorehGetItemOperation::MorehGetItemTilizedFactory::create(
             CoreCoord core = {i / core_h + core_x_offset, i % core_h + core_y_offset};
             uint32_t num_units_per_core = i < g1_numcores ? num_units_per_core_group_1 : num_units_per_core_group_2;
 
-            vector<uint32_t> reader_args = {
+            std::vector<uint32_t> reader_args = {
                 // buffers
                 input.buffer()->address(),
                 index_info[0].address,
@@ -509,7 +509,7 @@ MorehGetItemOperation::MorehGetItemTilizedFactory::create(
                 input_unit_size,
                 input.element_size(),
             };
-            vector<uint32_t> writer_args = {
+            std::vector<uint32_t> writer_args = {
                 // buffers
                 output.buffer()->address(),
 
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp
index b331eca682b..68964ac8820 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp
@@ -129,7 +129,7 @@ std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_
     MathFidelity math_fidelity,
     bool fp32_dest_acc_en,
     bool math_approx_mode,
-    vector<UnpackToDestMode> unpack_to_dest_mode) {
+    std::vector<UnpackToDestMode> unpack_to_dest_mode) {
     std::vector<KernelHandle> compute_kernel_ids{};
     KernelHandle compute_kernel_id{};
     for (auto arg : args) {
@@ -155,7 +155,7 @@ std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_
     MathFidelity math_fidelity,
     bool fp32_dest_acc_en,
     bool math_approx_mode,
-    vector<UnpackToDestMode> unpack_to_dest_mode) {
+    std::vector<UnpackToDestMode> unpack_to_dest_mode) {
     KernelHandle compute_kernel_id{0};
     if (arg.num_tile_per_core_group > 0) {
         compute_kernel_id = CreateKernel(
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.hpp
index 384b9097f4b..7cc17ae86af 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_helper_functions.hpp
@@ -86,7 +86,7 @@ struct ComputeKernelArg {
 struct ComputeKernelConfig {
     MathFidelity math_fidelity = MathFidelity::HiFi4;
     bool fp32_dest_acc_en = false;
-    vector<UnpackToDestMode> unpack_to_dest_mode;
+    std::vector<UnpackToDestMode> unpack_to_dest_mode;
     bool math_approx_mode = false;
     std::map<std::string, std::string> defines;
 };
@@ -99,7 +99,7 @@ struct ComputeKernelConfig {
     MathFidelity math_fidelity = MathFidelity::HiFi4,
     bool fp32_dest_acc_en = false,
     bool math_approx_mode = false,
-    vector<UnpackToDestMode> unpack_to_dest_mode = {});
+    std::vector<UnpackToDestMode> unpack_to_dest_mode = {});
 
 [[maybe_unused]] KernelHandle CreateComputeKernel(
     Program &program,
@@ -109,7 +109,7 @@ struct ComputeKernelConfig {
     MathFidelity math_fidelity = MathFidelity::HiFi4,
     bool fp32_dest_acc_en = false,
     bool math_approx_mode = false,
-    vector<UnpackToDestMode> unpack_to_dest_mode = {});
+    std::vector<UnpackToDestMode> unpack_to_dest_mode = {});
 
 [[maybe_unused]] std::vector<KernelHandle> CreateComputeKernel(
     Program &program, const std::string &file_name, std::vector<ComputeKernelArg> args, ComputeKernelConfig config);
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/moreh_linear_backward_single_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/moreh_linear_backward_single_core_program_factory.cpp
index a1f82e41ec7..d7ec2a28105 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/moreh_linear_backward_single_core_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/moreh_linear_backward_single_core_program_factory.cpp
@@ -99,7 +99,7 @@ MorehBiasAddBackwardOperation::SingleCoreProgramFactory::create(
     ////////////////////////////////////////////////////////////////////////////
     //                      ComputeKernel SetUp
     ////////////////////////////////////////////////////////////////////////////
-    vector<uint32_t> compute_kernel_args = {};
+    std::vector<uint32_t> compute_kernel_args = {};
     std::map<string, string> compute_defines;
     compute_defines["REDUCE_OP"] = "PoolType::SUM";
     compute_defines["REDUCE_DIM"] = "ReduceDim::REDUCE_SCALAR";
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_program_factory.cpp
index a1f27385d23..2be95305f90 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_program_factory.cpp
@@ -372,7 +372,7 @@ MorehMatmulOperation::MultiCoreProgramFactory::cached_program_t MorehMatmulOpera
         compute_args_group_1.push_back(static_cast<uint32_t>(is_scalar_bias));
     }
 
-    vector<UnpackToDestMode> unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default);
+    std::vector<UnpackToDestMode> unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default);
     if (fp32_dest_acc_en) {
         compute_defines["FP32_DEST_ACC_EN"] = "1";
         unpack_to_dest_mode[tt::CB::c_intermed0] = UnpackToDestMode::UnpackToDestFp32;
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_nc_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_nc_program_factory.cpp
index 6efb0963197..0bd21b787a5 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_nc_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_nc_program_factory.cpp
@@ -112,7 +112,7 @@ MorehMeanOperation::MorehMeanNCFactory::cached_program_t MorehMeanOperation::Mor
     if (fp32_dest_acc_en) {
         compute_defines["FP32_DEST_ACC_EN"] = 1;
     }
-    vector<UnpackToDestMode> unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default);
+    std::vector<UnpackToDestMode> unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default);
     auto compute_kernel_ids = CreateComputeKernel(
         program,
         compute_kernel_file,
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_w_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_w_program_factory.cpp
index 541fa26b0a8..5aed254b1c9 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_w_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_w_program_factory.cpp
@@ -113,19 +113,19 @@ MorehMeanOperation::MorehMeanWFactory::cached_program_t MorehMeanOperation::More
     if (fp32_dest_acc_en) {
         compute_defines["FP32_DEST_ACC_EN"] = 1;
     }
-    vector<uint32_t> compute_kernel_args_group_1 = {
+    std::vector<uint32_t> compute_kernel_args_group_1 = {
         units_per_core_group_1,  // Ht
         Wt,                      // Wt
         1,                       // NC
         origin_W,
     };
-    vector<uint32_t> compute_kernel_args_group_2 = {
+    std::vector<uint32_t> compute_kernel_args_group_2 = {
         units_per_core_group_2,  // Ht
         Wt,                      // Wt
         1,                       // NC
         origin_W,
     };
-    vector<UnpackToDestMode> unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default);
+    std::vector<UnpackToDestMode> unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default);
 
     auto compute_kernel_ids = CreateComputeKernel(
         program,
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_program_factory.cpp
index d6e26dd5b27..c088b4ae548 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_program_factory.cpp
@@ -159,7 +159,7 @@ MorehMeanBackwardOperation::MorehMeanBackwardFactory::create(
         "ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/kernels/moreh_mean_backward.cpp";
     const std::vector<uint32_t> compute_args_group_1{num_cols_per_core_group_1, need_bcast_dim[0], need_bcast_dim[1]};
     const std::vector<uint32_t> compute_args_group_2{num_cols_per_core_group_2, need_bcast_dim[0], need_bcast_dim[1]};
-    vector<UnpackToDestMode> unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default);
+    std::vector<UnpackToDestMode> unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default);
     auto compute_kernel_ids = tt::operations::primary::CreateComputeKernel(
         program,
         compute_kernel_file,
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_program_factory.cpp
index 5301951bcc6..a552aa2eb05 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_program_factory.cpp
@@ -148,7 +148,7 @@ MorehNllLossStep1DeviceOperation::Factory::cached_program_t MorehNllLossStep1Dev
         }
 
         uint32_t element_size = weight_has_value ? weight.value().element_size() : 0;
-        vector<uint32_t> reader_args = {
+        std::vector<uint32_t> reader_args = {
             target_addr,
             weight_addr,
             static_cast<uint32_t>(ignore_index),
@@ -161,7 +161,7 @@ MorehNllLossStep1DeviceOperation::Factory::cached_program_t MorehNllLossStep1Dev
             target.element_size(),
         };
 
-        vector<uint32_t> writer_args = {output_addr, num_units_per_core, tile_offset};
+        std::vector<uint32_t> writer_args = {output_addr, num_units_per_core, tile_offset};
 
         SetRuntimeArgs(program, reader_kernel_id, core, reader_args);
         SetRuntimeArgs(program, writer_kernel_id, core, writer_args);
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/moreh_nll_loss_step2_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/moreh_nll_loss_step2_program_factory.cpp
index 92a00767a12..608e82b1b57 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/moreh_nll_loss_step2_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/moreh_nll_loss_step2_program_factory.cpp
@@ -148,7 +148,7 @@ MorehNllLossStep2DeviceOperation::Factory::cached_program_t moreh_nll_loss_step2
             TT_THROW("Core not in specified core ranges");
         }
 
-        vector<uint32_t> reader_args = {
+        std::vector<uint32_t> reader_args = {
             input_addr,
             target_addr,
             weight_addr,
@@ -161,7 +161,7 @@ MorehNllLossStep2DeviceOperation::Factory::cached_program_t moreh_nll_loss_step2
             input.element_size(),
         };
 
-        vector<uint32_t> writer_args = {
+        std::vector<uint32_t> writer_args = {
             output_addr,
             units_per_core,
             tile_offset,
@@ -327,7 +327,7 @@ MorehNllLossStep2DeviceOperation::Factory::cached_program_t moreh_nll_loss_step2
             TT_THROW("Core not in specified core ranges");
         }
 
-        vector<uint32_t> reader_args = {
+        std::vector<uint32_t> reader_args = {
             input_addr,
             target_addr,
             weight_addr,
@@ -341,7 +341,7 @@ MorehNllLossStep2DeviceOperation::Factory::cached_program_t moreh_nll_loss_step2
             input.element_size(),
         };
 
-        vector<uint32_t> writer_args = {
+        std::vector<uint32_t> writer_args = {
             output_addr,
             units_per_core,
             tile_offset,
@@ -516,7 +516,7 @@ MorehNllLossStep2DeviceOperation::Factory::cached_program_t moreh_nll_loss_step2
             TT_THROW("Core not in specified core ranges");
         }
 
-        vector<uint32_t> reader_args = {
+        std::vector<uint32_t> reader_args = {
             input_addr,
             target_addr,
             weight_addr,
@@ -532,7 +532,7 @@ MorehNllLossStep2DeviceOperation::Factory::cached_program_t moreh_nll_loss_step2
             input.element_size(),
         };
 
-        vector<uint32_t> writer_args = {
+        std::vector<uint32_t> writer_args = {
             output_addr,
             units_per_core,
             tile_offset,
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_program_factory.cpp
index 796aa0e121b..e8be177e76f 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_program_factory.cpp
@@ -200,7 +200,7 @@ MorehSgdOperation::ProgramFactory::cached_program_t MorehSgdOperation::ProgramFa
         u_weight_decay.f = weight_decay;
         u_one.f = 1.0f;
 
-        vector<uint32_t> reader_args = {
+        std::vector<uint32_t> reader_args = {
             param_in.buffer()->address(),
             grad.buffer()->address(),
             momentum_buffer_in.has_value() ? momentum_buffer_in.value().buffer()->address() : 0,
@@ -213,7 +213,7 @@ MorehSgdOperation::ProgramFactory::cached_program_t MorehSgdOperation::ProgramFa
             u_one.u,
         };
 
-        vector<uint32_t> writer_args = {
+        std::vector<uint32_t> writer_args = {
             param_out.buffer()->address(),
             momentum_buffer_out.has_value() ? momentum_buffer_out.value().buffer()->address() : 0,
             num_tiles_per_core,
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_c_large/softmax_c_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_c_large/softmax_c_large.cpp
index f70bb6b5813..73a87fcf49c 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_c_large/softmax_c_large.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_c_large/softmax_c_large.cpp
@@ -128,10 +128,10 @@ MorehSoftmaxOperation::MorehSoftmaxCLargeFactory::create(
             TT_THROW("Core not in specified core ranges");
         }
 
-        vector<uint32_t> reader_args = {
+        std::vector<uint32_t> reader_args = {
             input.buffer()->address(), num_tiles_per_core, tile_offset, outer_stride, inner_size, dim_size};
 
-        vector<uint32_t> writer_args = {
+        std::vector<uint32_t> writer_args = {
             output.buffer()->address(), num_tiles_per_core, tile_offset, outer_stride, inner_size, dim_size};
 
         SetRuntimeArgs(program, reader_kernel_id, core, reader_args);
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_large/softmax_h_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_large/softmax_h_large.cpp
index 85532b0bcfc..4b38c7c02e3 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_large/softmax_h_large.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_large/softmax_h_large.cpp
@@ -127,7 +127,7 @@ MorehSoftmaxOperation::MorehSoftmaxHLargeFactory::create(
         uint32_t mask_h = input.get_logical_shape()[-2] % tt::constants::TILE_HEIGHT;
         if (mask_h == 0)
             mask_h = tt::constants::TILE_HEIGHT;
-        vector<uint32_t> reader_args = {
+        std::vector<uint32_t> reader_args = {
             input.buffer()->address(),
             num_tiles_per_core,
             tile_offset,
@@ -136,7 +136,7 @@ MorehSoftmaxOperation::MorehSoftmaxHLargeFactory::create(
             *reinterpret_cast<uint32_t*>(&scaler),
             mask_h};
 
-        vector<uint32_t> writer_args = {output.buffer()->address(), num_tiles_per_core, tile_offset, Ht, Wt};
+        std::vector<uint32_t> writer_args = {output.buffer()->address(), num_tiles_per_core, tile_offset, Ht, Wt};
 
         SetRuntimeArgs(program, reader_kernel_id, core, reader_args);
         SetRuntimeArgs(program, writer_kernel_id, core, writer_args);
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_small/softmax_h_small.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_small/softmax_h_small.cpp
index 7dc08209882..b182ec8d63c 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_small/softmax_h_small.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_small/softmax_h_small.cpp
@@ -128,7 +128,7 @@ MorehSoftmaxOperation::MorehSoftmaxHSmallFactory::create(
         uint32_t mask_h = shape.without_padding()[-2] % tt::constants::TILE_HEIGHT;
         if (mask_h == 0)
             mask_h = tt::constants::TILE_HEIGHT;
-        vector<uint32_t> reader_args = {
+        std::vector<uint32_t> reader_args = {
             input.buffer()->address(),
             num_tiles_per_core,
             tile_offset,
@@ -137,7 +137,7 @@ MorehSoftmaxOperation::MorehSoftmaxHSmallFactory::create(
             *reinterpret_cast<uint32_t*>(&scaler),
             mask_h};
 
-        vector<uint32_t> writer_args = {output.buffer()->address(), num_tiles_per_core, tile_offset, Ht, Wt};
+        std::vector<uint32_t> writer_args = {output.buffer()->address(), num_tiles_per_core, tile_offset, Ht, Wt};
 
         SetRuntimeArgs(program, reader_kernel_id, core, reader_args);
         SetRuntimeArgs(program, writer_kernel_id, core, writer_args);
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_large/softmax_w_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_large/softmax_w_large.cpp
index 612677427e2..2622708e47f 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_large/softmax_w_large.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_large/softmax_w_large.cpp
@@ -128,7 +128,7 @@ MorehSoftmaxOperation::MorehSoftmaxWLargeFactory::create(
         uint32_t mask_w = shape.without_padding()[-1] % tt::constants::TILE_WIDTH;
         if (mask_w == 0)
             mask_w = tt::constants::TILE_WIDTH;
-        vector<uint32_t> reader_args = {
+        std::vector<uint32_t> reader_args = {
             input.buffer()->address(),
             num_tiles_per_core,
             tile_offset,
@@ -136,7 +136,7 @@ MorehSoftmaxOperation::MorehSoftmaxWLargeFactory::create(
             *reinterpret_cast<uint32_t*>(&scaler),
             mask_w};
 
-        vector<uint32_t> writer_args = {output.buffer()->address(), num_tiles_per_core, tile_offset, Wt};
+        std::vector<uint32_t> writer_args = {output.buffer()->address(), num_tiles_per_core, tile_offset, Wt};
 
         SetRuntimeArgs(program, reader_kernel_id, core, reader_args);
         SetRuntimeArgs(program, writer_kernel_id, core, writer_args);
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_small/softmax_w_small.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_small/softmax_w_small.cpp
index 3bb8ba52f3d..a43840e1949 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_small/softmax_w_small.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_small/softmax_w_small.cpp
@@ -127,7 +127,7 @@ MorehSoftmaxOperation::MorehSoftmaxWSmallFactory::create(
         uint32_t mask_w = shape.without_padding()[-1] % tt::constants::TILE_WIDTH;
         if (mask_w == 0)
             mask_w = tt::constants::TILE_WIDTH;
-        vector<uint32_t> reader_args = {
+        std::vector<uint32_t> reader_args = {
             input.buffer()->address(),
             num_tiles_per_core,
             tile_offset,
@@ -135,7 +135,7 @@ MorehSoftmaxOperation::MorehSoftmaxWSmallFactory::create(
             *reinterpret_cast<uint32_t*>(&scaler),
             mask_w};
 
-        vector<uint32_t> writer_args = {output.buffer()->address(), num_tiles_per_core, tile_offset, Wt};
+        std::vector<uint32_t> writer_args = {output.buffer()->address(), num_tiles_per_core, tile_offset, Wt};
 
         SetRuntimeArgs(program, reader_kernel_id, core, reader_args);
         SetRuntimeArgs(program, writer_kernel_id, core, writer_args);
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_c_large/softmax_backward_c_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_c_large/softmax_backward_c_large.cpp
index 6031a007f22..9445e917f7b 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_c_large/softmax_backward_c_large.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_c_large/softmax_backward_c_large.cpp
@@ -130,7 +130,7 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardCLargeFactory::create(
             TT_THROW("Core not in specified core ranges");
         }
 
-        vector<uint32_t> reader_args = {
+        std::vector<uint32_t> reader_args = {
             output.buffer()->address(),
             output_grad.buffer()->address(),
             num_tiles_per_core,
@@ -139,7 +139,7 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardCLargeFactory::create(
             inner_size,
             dim_size};
 
-        vector<uint32_t> writer_args = {
+        std::vector<uint32_t> writer_args = {
             input_grad.buffer()->address(), num_tiles_per_core, tile_offset, outer_stride, inner_size, dim_size};
 
         SetRuntimeArgs(program, reader_kernel_id, core, reader_args);
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_large/softmax_backward_h_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_large/softmax_backward_h_large.cpp
index 0174df56159..7d8f06884dc 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_large/softmax_backward_h_large.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_large/softmax_backward_h_large.cpp
@@ -132,7 +132,7 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardHLargeFactory::create(
         uint32_t mask_h = shape.without_padding()[-2] % tt::constants::TILE_HEIGHT;
         if (mask_h == 0)
             mask_h = tt::constants::TILE_HEIGHT;
-        vector<uint32_t> reader_args = {
+        std::vector<uint32_t> reader_args = {
             output.buffer()->address(),
             output_grad.buffer()->address(),
             num_tiles_per_core,
@@ -142,7 +142,7 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardHLargeFactory::create(
             *reinterpret_cast<uint32_t*>(&scaler),
             mask_h};
 
-        vector<uint32_t> writer_args = {input_grad.buffer()->address(), num_tiles_per_core, tile_offset, Ht, Wt};
+        std::vector<uint32_t> writer_args = {input_grad.buffer()->address(), num_tiles_per_core, tile_offset, Ht, Wt};
 
         SetRuntimeArgs(program, reader_kernel_id, core, reader_args);
         SetRuntimeArgs(program, writer_kernel_id, core, writer_args);
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_small/softmax_backward_h_small.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_small/softmax_backward_h_small.cpp
index fe72331b129..997d1b56259 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_small/softmax_backward_h_small.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_small/softmax_backward_h_small.cpp
@@ -130,7 +130,7 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardHSmallFactory::create(
         uint32_t mask_h = shape.without_padding()[-2] % tt::constants::TILE_HEIGHT;
         if (mask_h == 0)
             mask_h = tt::constants::TILE_HEIGHT;
-        vector<uint32_t> reader_args = {
+        std::vector<uint32_t> reader_args = {
             output.buffer()->address(),
             output_grad.buffer()->address(),
             num_tiles_per_core,
@@ -140,7 +140,7 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardHSmallFactory::create(
             *reinterpret_cast<uint32_t*>(&scaler),
             mask_h};
 
-        vector<uint32_t> writer_args = {input_grad.buffer()->address(), num_tiles_per_core, tile_offset, Ht, Wt};
+        std::vector<uint32_t> writer_args = {input_grad.buffer()->address(), num_tiles_per_core, tile_offset, Ht, Wt};
 
         SetRuntimeArgs(program, reader_kernel_id, core, reader_args);
         SetRuntimeArgs(program, writer_kernel_id, core, writer_args);
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_large/softmax_backward_w_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_large/softmax_backward_w_large.cpp
index fd720537431..8090c3c232f 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_large/softmax_backward_w_large.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_large/softmax_backward_w_large.cpp
@@ -132,7 +132,7 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardWLargeFactory::create(
         uint32_t mask_w = shape.without_padding()[-1] % tt::constants::TILE_WIDTH;
         if (mask_w == 0)
             mask_w = tt::constants::TILE_WIDTH;
-        vector<uint32_t> reader_args = {
+        std::vector<uint32_t> reader_args = {
             output.buffer()->address(),
             output_grad.buffer()->address(),
             num_tiles_per_core,
@@ -141,7 +141,7 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardWLargeFactory::create(
             *reinterpret_cast<uint32_t*>(&scaler),
             mask_w};
 
-        vector<uint32_t> writer_args = {input_grad.buffer()->address(), num_tiles_per_core, tile_offset, Wt};
+        std::vector<uint32_t> writer_args = {input_grad.buffer()->address(), num_tiles_per_core, tile_offset, Wt};
 
         SetRuntimeArgs(program, reader_kernel_id, core, reader_args);
         SetRuntimeArgs(program, writer_kernel_id, core, writer_args);
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_small/softmax_backward_w_small.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_small/softmax_backward_w_small.cpp
index 5e04e023140..213741f30de 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_small/softmax_backward_w_small.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_small/softmax_backward_w_small.cpp
@@ -127,7 +127,7 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardWSmallFactory::create(
         uint32_t mask_w = shape.without_padding()[-1] % tt::constants::TILE_WIDTH;
         if (mask_w == 0)
             mask_w = tt::constants::TILE_WIDTH;
-        vector<uint32_t> reader_args = {
+        std::vector<uint32_t> reader_args = {
             output.buffer()->address(),
             output_grad.buffer()->address(),
             num_tiles_per_core,
@@ -136,7 +136,7 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardWSmallFactory::create(
             *reinterpret_cast<uint32_t*>(&scaler),
             mask_w};
 
-        vector<uint32_t> writer_args = {input_grad.buffer()->address(), num_tiles_per_core, tile_offset, Wt};
+        std::vector<uint32_t> writer_args = {input_grad.buffer()->address(), num_tiles_per_core, tile_offset, Wt};
 
         SetRuntimeArgs(program, reader_kernel_id, core, reader_args);
         SetRuntimeArgs(program, writer_kernel_id, core, writer_args);
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_program_factory.cpp
index 285e20a337a..594b27f1ff0 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_program_factory.cpp
@@ -154,13 +154,13 @@ MorehSumOperation::MorehSumHFactory::cached_program_t MorehSumOperation::MorehSu
         reduce_defines["FP32_DEST_ACC_EN"] = "1";
     }
 
-    vector<uint32_t> compute_kernel_args_group_1 = {
+    std::vector<uint32_t> compute_kernel_args_group_1 = {
         Ht,                         // Ht
         num_cols_per_core_group_1,  // Wt
         1,                          // NC
         origin_H};
 
-    vector<UnpackToDestMode> unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default);
+    std::vector<UnpackToDestMode> unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default);
     if (fp32_dest_acc_en) {
         unpack_to_dest_mode[tt::CB::c_intermed0] = UnpackToDestMode::UnpackToDestFp32;
     }
@@ -177,7 +177,7 @@ MorehSumOperation::MorehSumHFactory::cached_program_t MorehSumOperation::MorehSu
             .defines = reduce_defines});
 
     if (!core_group_2.ranges().empty()) {
-        vector<uint32_t> compute_kernel_args_group_2 = {
+        std::vector<uint32_t> compute_kernel_args_group_2 = {
             Ht,                         // Ht
             num_cols_per_core_group_2,  // Wt
             1,                          // NC
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_program_factory.cpp
index d123bc890da..ebd8d432981 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_program_factory.cpp
@@ -104,7 +104,7 @@ MorehSumOperation::MorehSumNCFactory::cached_program_t MorehSumOperation::MorehS
         compute_defines["FP32_DEST_ACC_EN"] = "1";
     }
     // set unpack_to_dest_mode to the same value as fp32_dest_acc_en
-    vector<UnpackToDestMode> unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default);
+    std::vector<UnpackToDestMode> unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default);
     auto compute_kernel_file =
         "ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_impl_kernels/moreh_sum_nc.cpp";
     if (device->arch() == tt::ARCH::GRAYSKULL) {
diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp
index 392542fdaad..7973e5174e0 100644
--- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp
+++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp
@@ -252,7 +252,7 @@ operation::ProgramWithCallbacks layernorm_multi_core(
         tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)
     );
 
-    vector<uint32_t> compute_args = { Wt, block_size, gamma.has_value(), beta.has_value(), fp32_dest_acc_en };
+    std::vector<uint32_t> compute_args = { Wt, block_size, gamma.has_value(), beta.has_value(), fp32_dest_acc_en };
 
     auto compute_kernels_id = CreateKernel(
         program,
diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp
index 339f881f6fa..3daf51392fc 100644
--- a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp
+++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp
@@ -270,7 +270,7 @@ operation::ProgramWithCallbacks layernorm_post_allgather_multi_core(
         tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)
     );
 
-    vector<uint32_t> compute_args = { Wt, block_size, stats_tiles_cols, gamma.has_value(), beta.has_value(), fp32_dest_acc_en };
+    std::vector<uint32_t> compute_args = { Wt, block_size, stats_tiles_cols, gamma.has_value(), beta.has_value(), fp32_dest_acc_en };
 
     auto compute_kernels_id = CreateKernel(
         program,
diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_pre_all_gather_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_pre_all_gather_op_multi_core.cpp
index a0f86c3f1e8..aedca4b1c2a 100644
--- a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_pre_all_gather_op_multi_core.cpp
+++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_pre_all_gather_op_multi_core.cpp
@@ -183,7 +183,7 @@ operation::ProgramWithCallbacks layernorm_pre_allgather_multi_core(
         tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)
     );
 
-    vector<uint32_t> compute_args = { Wt, block_size };
+    std::vector<uint32_t> compute_args = { Wt, block_size };
 
     auto compute_kernels_id = CreateKernel(
         program,
diff --git a/ttnn/cpp/ttnn/operations/numpy/functions.hpp b/ttnn/cpp/ttnn/operations/numpy/functions.hpp
index 62c2dc058cc..f6540f6e168 100644
--- a/ttnn/cpp/ttnn/operations/numpy/functions.hpp
+++ b/ttnn/cpp/ttnn/operations/numpy/functions.hpp
@@ -444,7 +444,7 @@ static Tensor fill_first_val_into_tensor(
     auto owned_buffer = tt::tt_metal::owned_buffer::create<T>(physical_volume);  // ouput
     auto device_buffer = input_tensor.device_buffer();
     uint32_t size_in_bytes = device_buffer->size();
-    vector<T> data_vec;
+    std::vector<T> data_vec;
     const char* TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE");
     if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) {
         data_vec.resize(size_in_bytes / sizeof(T));
@@ -478,7 +478,7 @@ static Tensor prod_result_computation_GS(
     auto owned_buffer = tt::tt_metal::owned_buffer::create<T>(input_tensor.volume());  // ouput
     auto device_buffer = input_tensor.device_buffer();
     uint32_t size_in_bytes = device_buffer->size();
-    vector<T> data_vec;
+    std::vector<T> data_vec;
     const char* TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE");
     if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) {
         data_vec.resize(size_in_bytes / sizeof(T));
@@ -527,7 +527,7 @@ static Tensor prod_result_computation_WH_B0(
     auto owned_buffer = tt::tt_metal::owned_buffer::create<T>(tt::tt_metal::compute_volume(s_a));  // ouput
     auto device_buffer = input_tensor.device_buffer();
     uint32_t size_in_bytes = device_buffer->size();
-    vector<T> data_vec;
+    std::vector<T> data_vec;
     const char* TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE");
     if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) {
         data_vec.resize(size_in_bytes / sizeof(T));
@@ -654,7 +654,7 @@ static Tensor manual_insertion(
         "Required shape volume must match old shape volume");
     auto device_buffer = input_tensor.device_buffer();
     uint32_t size_in_bytes = device_buffer->size();
-    vector<T> data_vec;
+    std::vector<T> data_vec;
     const char* TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE");
     if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) {
         data_vec.resize(size_in_bytes / sizeof(T));
diff --git a/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_program_factory.cpp b/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_program_factory.cpp
index 29e75f98df8..c0b6399660a 100644
--- a/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_program_factory.cpp
@@ -570,7 +570,7 @@ operation::ProgramWithCallbacks downsample_single_core(
         core_range,
         tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args));
 
-    vector<uint32_t> compute_args = {
+    std::vector<uint32_t> compute_args = {
         input_cb_index,
         halo_prev_input_cb_index,
         halo_next_input_cb_index,
@@ -761,7 +761,7 @@ operation::ProgramWithCallbacks downsample_single_core(
         TT_ASSERT(v.output_flat_h == 0);
 
         // Compile runtime args
-        vector<uint32_t> compile_rt_kernel_args = {
+        std::vector<uint32_t> compile_rt_kernel_args = {
             local_input_num_rows_of_tiles,
             local_input_offset_rows_of_tiles,
             halo_prev_read_enabled,
@@ -773,7 +773,7 @@ operation::ProgramWithCallbacks downsample_single_core(
         tt::tt_metal::SetRuntimeArgs(program, downsample_compute_kernel_id, core, compile_rt_kernel_args);
 
         // Writer runtime args
-        vector<uint32_t> writer_kernel_args = {
+        std::vector<uint32_t> writer_kernel_args = {
             (uint32_t)img_height,
             (uint32_t)img_width,
             (uint32_t)img_stride_h,
diff --git a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp
index c29bb743c73..cb69c9ec164 100644
--- a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp
+++ b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp
@@ -235,7 +235,7 @@ operation::ProgramWithCallbacks bilinear_multi_core(const Tensor &input, Tensor&
 
     // runtime args
     uint32_t reader_nargs = 10;
-    vector<uint32_t> reader_rt_args(reader_nargs);
+    std::vector<uint32_t> reader_rt_args(reader_nargs);
     reader_rt_args[0] = input_stick_nbytes;
     reader_rt_args[1] = input_nsticks_per_core / in_w;
     reader_rt_args[2] = scale_factor_h;
diff --git a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_op.cpp b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_op.cpp
index d43270ca96a..da671cb659a 100644
--- a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_op.cpp
+++ b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_op.cpp
@@ -59,7 +59,7 @@ std::vector<Tensor> UpSample::create_output_tensors(const std::vector<Tensor> &i
             auto output_shape = compute_output_shapes(inputs).at(0);
             if (input.memory_config().memory_layout == TensorMemoryLayout::HEIGHT_SHARDED) {
                 auto ncores = input_shard_spec.num_cores();
-                array<uint32_t, 2> output_shard_shape = {div_up(output_shape[0] * output_shape[1] * output_shape[2], ncores), output_shape[-1]};
+                std::array<uint32_t, 2> output_shard_shape = {div_up(output_shape[0] * output_shape[1] * output_shape[2], ncores), output_shape[-1]};
                 auto output_shard_spec = input_shard_spec;
                 output_shard_spec.shape = output_shard_shape;
                 mem_config.shard_spec = output_shard_spec;
@@ -72,7 +72,7 @@ std::vector<Tensor> UpSample::create_output_tensors(const std::vector<Tensor> &i
                 auto core_range = *shard_grid.begin();
                 uint32_t ncores_w = core_range.end_coord.x + 1;
                 uint32_t ncores_h = core_range.end_coord.y + 1;
-                // array<uint32_t, 2> output_shard_shape = {output_shape[0] * output_shape[1] * output_shape[2] / ncores_h, output_shape[-1] / ncores_w};
+                // std::array<uint32_t, 2> output_shard_shape = {output_shape[0] * output_shape[1] * output_shape[2] / ncores_h, output_shape[-1] / ncores_w};
                 // auto output_shard_spec = input_shard_spec;
                 // output_shard_spec.shape = output_shard_shape;
                 // mem_config.shard_spec = output_shard_spec;
diff --git a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_program_factory_multicore.cpp b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_program_factory_multicore.cpp
index e20cf9bfd19..b2deccc8f2f 100644
--- a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_program_factory_multicore.cpp
+++ b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_program_factory_multicore.cpp
@@ -130,7 +130,7 @@ operation::ProgramWithCallbacks upsample_multi_core(const Tensor &input, Tensor&
     // runtime args
 
     uint32_t writer_nargs = 7;
-    vector<uint32_t> writer_rt_args(writer_nargs);
+    std::vector<uint32_t> writer_rt_args(writer_nargs);
     writer_rt_args[0] = input_stick_nbytes;
     writer_rt_args[1] = input_nsticks_per_core / in_w;
     writer_rt_args[2] = scale_factor_h;
diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_h/reduce_op_multi_core_h.cpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_h/reduce_op_multi_core_h.cpp
index 1a5fd46bce4..fd6424cb0bf 100644
--- a/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_h/reduce_op_multi_core_h.cpp
+++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_h/reduce_op_multi_core_h.cpp
@@ -149,7 +149,7 @@ operation::ProgramWithCallbacks reduce_multi_core_h(
     tt_metal::KernelHandle writer_kernel_id;
 
     if (out_sharded) {
-        vector<uint32_t> writer_ct_args = {
+        std::vector<uint32_t> writer_ct_args = {
             output_cb_index,
         };
         writer_kernel_id = CreateKernel(
@@ -168,7 +168,7 @@ operation::ProgramWithCallbacks reduce_multi_core_h(
             tt_metal::WriterDataMovementConfig(writer_compile_time_args));
     }
     std::map<string, string> reduce_defines = reduce_op_utils::get_defines(reduce_op, ReduceOpDim::H);
-    vector<uint32_t> compute_kernel_args_group_1 = {
+    std::vector<uint32_t> compute_kernel_args_group_1 = {
         Ht,                         // Ht
         num_cols_per_core_group_1,  // Wt
         1,                          // NC
@@ -185,7 +185,7 @@ operation::ProgramWithCallbacks reduce_multi_core_h(
             .defines = reduce_defines});
 
     if (!core_group_2.ranges().empty()) {
-        vector<uint32_t> compute_kernel_args_group_2 = {
+        std::vector<uint32_t> compute_kernel_args_group_2 = {
             Ht,                         // Ht
             num_cols_per_core_group_2,  // Wt
             1,                          // NC
@@ -208,11 +208,11 @@ operation::ProgramWithCallbacks reduce_multi_core_h(
         uint32_t shard_Wt = num_cols_per_core_group_1 / NC;
         uint32_t shard_row_size = shard_Wt * src0_single_tile_size;
         uint32_t shard_batch_size = shard_row_size * Ht;
-        vector<uint32_t> reader_rt_args = {
+        std::vector<uint32_t> reader_rt_args = {
             num_cols_per_core_group_1 * Ht, shard_Wt, Ht, NC, shard_row_size, shard_batch_size, packed_scaler_value};
         tt_metal::SetRuntimeArgs(program, reader_kernel_id, all_cores, reader_rt_args);
 
-        vector<uint32_t> writer_rt_args = {num_cols_per_core_group_1};
+        std::vector<uint32_t> writer_rt_args = {num_cols_per_core_group_1};
         tt_metal::SetRuntimeArgs(program, writer_kernel_id, all_cores, writer_rt_args);
     } else {
         for (uint32_t i = 0, num_cols_read = 0; i < num_cores; i++) {
diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_w/reduce_op_multi_core_w.cpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_w/reduce_op_multi_core_w.cpp
index 9205f800f79..1756cf29345 100644
--- a/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_w/reduce_op_multi_core_w.cpp
+++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_w/reduce_op_multi_core_w.cpp
@@ -94,7 +94,7 @@ operation::ProgramWithCallbacks reduce_multi_core_w(
         all_cores,
         tt_metal::WriterDataMovementConfig(writer_compile_time_args, reduce_defines));
 
-    vector<uint32_t> compute_kernel_args_group_1 = {
+    std::vector<uint32_t> compute_kernel_args_group_1 = {
         num_rows_per_core_group_1,  // Ht
         Wt,                         // Wt
         1,                          // NC
@@ -111,7 +111,7 @@ operation::ProgramWithCallbacks reduce_multi_core_w(
             .defines = reduce_defines});
 
     if (!core_group_2.ranges().empty()) {
-        vector<uint32_t> compute_kernel_args_group_2 = {
+        std::vector<uint32_t> compute_kernel_args_group_2 = {
             num_rows_per_core_group_2,  // Ht
             Wt,                         // Wt
             1,                          // NC
diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/single_core_hw/reduce_op_single_core_hw.cpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/single_core_hw/reduce_op_single_core_hw.cpp
index f3cfe56730c..66eaccf2e20 100644
--- a/ttnn/cpp/ttnn/operations/reduction/generic/device/single_core_hw/reduce_op_single_core_hw.cpp
+++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/single_core_hw/reduce_op_single_core_hw.cpp
@@ -96,7 +96,7 @@ operation::ProgramWithCallbacks reduce_single_core_hw(
         core,
         tt_metal::WriterDataMovementConfig(writer_compile_time_args));
 
-    vector<uint32_t> compute_kernel_args = {
+    std::vector<uint32_t> compute_kernel_args = {
         Ht,  // Ht
         Wt,  // Wt
         NC,  // NC
diff --git a/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_all_program_factory.cpp b/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_all_program_factory.cpp
index 4c7a83cd11c..390dec0034a 100644
--- a/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_all_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_all_program_factory.cpp
@@ -67,7 +67,7 @@ namespace primary {
             core,
             tt_metal::WriterDataMovementConfig{writer_compile_time_args});
 
-        vector<uint32_t> compute_kernel_args = {
+        std::vector<uint32_t> compute_kernel_args = {
             num_tiles, // per_core_block_cnt
             1 // per_core_block_size
         };
diff --git a/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.cpp b/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.cpp
index e01bb7997cd..06e8a52e7b5 100644
--- a/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.cpp
+++ b/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.cpp
@@ -71,7 +71,7 @@ owned_buffer::Buffer<bfloat16> conv_using_op_trace_metadata(
     uint32_t padded_input_w,
     uint32_t out_tensor_size) {
     auto conv_tensor_buf = owned_buffer::create<bfloat16>(out_tensor_size);
-    vector<float> input_window;
+    std::vector<float> input_window;
     uint32_t out_idx = 0;
     for (auto anchor : op_trace_metadata) {
         for (uint32_t h = 0; h < filter_h; h++) {
@@ -135,10 +135,10 @@ owned_buffer::Buffer<bfloat16> conv_using_shard_boundaries(
 
 owned_buffer::Buffer<bfloat16> conv_using_sliding_window_op_config(
     const owned_buffer::Buffer<bfloat16> &input_padded_tensor_buf,
-    const vector<float> &filter_vector,
+    const std::vector<float> &filter_vector,
     const std::vector<uint32_t> &op_trace_metadata,
-    const vector<std::pair<uint32_pair_t, uint32_pair_t>> &shard_boundaries,
-    const vector<std::vector<uint16_t>> &sharded_input_top_left_indices,
+    const std::vector<std::pair<uint32_pair_t, uint32_pair_t>> &shard_boundaries,
+    const std::vector<std::vector<uint16_t>> &sharded_input_top_left_indices,
     uint32_t input_h,
     uint32_t input_w,
     uint32_t stride_h,
@@ -149,7 +149,7 @@ owned_buffer::Buffer<bfloat16> conv_using_sliding_window_op_config(
     uint32_t out_tensor_size) {
     auto conv_tensor_buf = owned_buffer::create<bfloat16>(out_tensor_size);
 
-    vector<float> input_window;
+    std::vector<float> input_window;
     uint32_t out_idx = 0;
 
     for (auto j = 0; j < sharded_input_top_left_indices.size(); j++) {
@@ -176,7 +176,7 @@ owned_buffer::Buffer<bfloat16> conv_using_sliding_window_op_config(
 }
 
 std::vector<bool> pad_metadata_from_tensor_metadata(const std::vector<std::pair<bool, uint32_pair_t>> &tensor_metadata) {
-    vector<bool> ref_pad_metadata;
+    std::vector<bool> ref_pad_metadata;
     for (auto i = 0; i < tensor_metadata.size(); i++) {
         auto is_pad_stick = tensor_metadata[i].first;
         if (is_pad_stick) {
diff --git a/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.hpp b/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.hpp
index 90a899ccb86..c6de3eeec0d 100644
--- a/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.hpp
+++ b/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.hpp
@@ -25,14 +25,14 @@ owned_buffer::Buffer<bfloat16> ref_conv_op(
     const Shape &input_nchw_shape,
     uint32_t stride_h,
     uint32_t stride_w,
-    const vector<float> &filter_vector,
+    const std::vector<float> &filter_vector,
     const Shape &filter_pyt_tensor_shape,
     const Shape &out_golden_pyt_tensor_shape);
 
 // Calculate convolution using op_trace_metadata on padded input buffer.
 owned_buffer::Buffer<bfloat16> conv_using_op_trace_metadata(
     const owned_buffer::Buffer<bfloat16> &input_padded_tensor_buf,
-    const vector<float> &filter_vector,
+    const std::vector<float> &filter_vector,
     const std::vector<uint32_t> &op_trace_metadata,
     uint32_t stride_h,
     uint32_t stride_w,
@@ -44,8 +44,8 @@ owned_buffer::Buffer<bfloat16> conv_using_op_trace_metadata(
 // Calculate convolution using shards on padded input buffer.
 owned_buffer::Buffer<bfloat16> conv_using_shard_boundaries(
     const owned_buffer::Buffer<bfloat16> &input_padded_tensor_buf,
-    const vector<float> &filter_vector,
-    const vector<std::pair<uint32_pair_t, uint32_pair_t>> &shard_boundaries,
+    const std::vector<float> &filter_vector,
+    const std::vector<std::pair<uint32_pair_t, uint32_pair_t>> &shard_boundaries,
     uint32_t stride_h,
     uint32_t stride_w,
     uint32_t padded_input_h,
@@ -59,10 +59,10 @@ owned_buffer::Buffer<bfloat16> conv_using_shard_boundaries(
 // Calculate convolution using sliding window op configs on padded input buffer.
 owned_buffer::Buffer<bfloat16> conv_using_sliding_window_op_config(
     const owned_buffer::Buffer<bfloat16> &input_padded_tensor_buf,
-    const vector<float> &filter_vector,
+    const std::vector<float> &filter_vector,
     const std::vector<uint32_t> &op_trace_metadata,
-    const vector<std::pair<uint32_pair_t, uint32_pair_t>> &shard_boundaries,
-    const vector<std::vector<uint16_t>> &sharded_input_top_left_indices,
+    const std::vector<std::pair<uint32_pair_t, uint32_pair_t>> &shard_boundaries,
+    const std::vector<std::vector<uint16_t>> &sharded_input_top_left_indices,
     uint32_t input_h,
     uint32_t input_w,
     uint32_t stride_h,
@@ -73,23 +73,23 @@ owned_buffer::Buffer<bfloat16> conv_using_sliding_window_op_config(
     uint32_t out_tensor_size);
 
 // Calculate Padding using tensor metadata.
-vector<bool> pad_metadata_from_tensor_metadata(const vector<std::pair<bool, uint32_pair_t>> &tensor_metadata);
+std::vector<bool> pad_metadata_from_tensor_metadata(const std::vector<std::pair<bool, uint32_pair_t>> &tensor_metadata);
 
 // Calculate Indices of pads in padded input buffer using halo kernel config's flattened pad config.
-vector<uint32_t> pad_indices_from_flattened_pad_config(
-    const vector<vector<uint16_t>> &flattened_pad_config,
-    const vector<std::pair<uint32_pair_t, uint32_pair_t>> &shard_boundaries);
+std::vector<uint32_t> pad_indices_from_flattened_pad_config(
+    const std::vector<std::vector<uint16_t>> &flattened_pad_config,
+    const std::vector<std::pair<uint32_pair_t, uint32_pair_t>> &shard_boundaries);
 
 // Calculate Indices of valid inputs in padded input buffer using halo kernel config's flattened local configs.
-vector<uint32_t> input_indices_from_flattened_local_config(
-    const vector<vector<uint16_t>> &flattened_local_config,
-    const vector<std::pair<uint32_pair_t, uint32_pair_t>> &shard_boundaries);
+std::vector<uint32_t> input_indices_from_flattened_local_config(
+    const std::vector<std::vector<uint16_t>> &flattened_local_config,
+    const std::vector<std::pair<uint32_pair_t, uint32_pair_t>> &shard_boundaries);
 
 // Calculate Indices of valid inputs in padded input buffer using halo kernel config's flattened remote configs.
-vector<uint32_t> input_indices_from_flattened_remote_config(
+std::vector<uint32_t> input_indices_from_flattened_remote_config(
     tt::tt_metal::Device *device,
-    const vector<vector<uint16_t>> &flattened_remote_config,
-    const vector<std::pair<uint32_pair_t, uint32_pair_t>> &shard_boundaries,
+    const std::vector<std::vector<uint16_t>> &flattened_remote_config,
+    const std::vector<std::pair<uint32_pair_t, uint32_pair_t>> &shard_boundaries,
     bool remote_read = false,
     bool is_block_sharded = false,
     bool transpose_mcast = false);
diff --git a/ttnn/cpp/ttnn/tensor/tensor_impl.cpp b/ttnn/cpp/ttnn/tensor/tensor_impl.cpp
index bea72879c2d..d72df0a512e 100644
--- a/ttnn/cpp/ttnn/tensor/tensor_impl.cpp
+++ b/ttnn/cpp/ttnn/tensor/tensor_impl.cpp
@@ -664,7 +664,7 @@ Tensor to_host_helper(const Tensor& tensor, bool blocking = true, uint8_t cq_id
     auto device = tensor.device();
     TT_ASSERT(device != nullptr && "Need device to be set copy data from device to host!");
     uint32_t size_in_bytes = device_buffer->size();
-    vector<T> data_vec;
+    std::vector<T> data_vec;
     const char* TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE");
     if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) {
         data_vec.resize(size_in_bytes / sizeof(T));
diff --git a/ttnn/cpp/ttnn/tensor/tensor_impl.hpp b/ttnn/cpp/ttnn/tensor/tensor_impl.hpp
index a27b2e0bfdb..c9aaaf31b24 100644
--- a/ttnn/cpp/ttnn/tensor/tensor_impl.hpp
+++ b/ttnn/cpp/ttnn/tensor/tensor_impl.hpp
@@ -271,7 +271,7 @@ inline void read_data_from_device_buffer(
 }
 
 template <typename T>
-inline void read_data_from_device_buffer(DeviceBuffer device_buffer, vector<T>& host_buffer) {
+inline void read_data_from_device_buffer(DeviceBuffer device_buffer, std::vector<T>& host_buffer) {
     std::vector<uint32_t> host_buffer_uint32;
     ::detail::ReadFromBuffer(device_buffer, host_buffer_uint32);
     host_buffer = unpack_uint32_vec<T>(host_buffer_uint32);
diff --git a/ttnn/cpp/ttnn/tensor/tensor_utils.hpp b/ttnn/cpp/ttnn/tensor/tensor_utils.hpp
index 692e5b361fa..09f436acafd 100644
--- a/ttnn/cpp/ttnn/tensor/tensor_utils.hpp
+++ b/ttnn/cpp/ttnn/tensor/tensor_utils.hpp
@@ -64,7 +64,7 @@ static std::vector<uint32_t> compute_strides(const ttnn::SimpleShape& shape) {
     return strides;
 }
 
-static int compute_flat_indices(const vector<int>& indices, const vector<std::uint32_t> strides) {
+static int compute_flat_indices(const std::vector<int>& indices, const std::vector<std::uint32_t> strides) {
     int flat_index = 0;
     for (auto i = 0; i < indices.size(); i++) {
         flat_index += indices[i] * strides[i];

From 2ca85fee86b6577948756fd05536595facd3c4b1 Mon Sep 17 00:00:00 2001
From: Nguyen Truong Thanh <wally.nguyensub@gmail.com>
Date: Sat, 26 Oct 2024 23:58:31 +0700
Subject: [PATCH 20/30] #13931: Implement index fill (#13933)

* #13931: Implement index fill

* #13931: Move index_size to compile time
---
 .../unit_tests/operations/test_index_fill.py  | 138 ++++++++++++
 ttnn/CMakeLists.txt                           |   5 +
 ttnn/cpp/pybind11/operations/__init__.hpp     |   4 +
 .../device/index_fill_device_operation.cpp    |  67 ++++++
 .../device/index_fill_device_operation.hpp    |  59 ++++++
 .../device/index_fill_multi_core_factory.cpp  | 196 ++++++++++++++++++
 .../device/kernels/reader_index_fill.cpp      | 106 ++++++++++
 .../device/kernels/writer_index_fill.cpp      |  33 +++
 .../ttnn/operations/index_fill/index_fill.cpp |  21 ++
 .../ttnn/operations/index_fill/index_fill.hpp |  23 ++
 .../index_fill/index_fill_pybind.cpp          |  44 ++++
 .../index_fill/index_fill_pybind.hpp          |  13 ++
 12 files changed, 709 insertions(+)
 create mode 100644 tests/ttnn/unit_tests/operations/test_index_fill.py
 create mode 100644 ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.cpp
 create mode 100644 ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.hpp
 create mode 100644 ttnn/cpp/ttnn/operations/index_fill/device/index_fill_multi_core_factory.cpp
 create mode 100644 ttnn/cpp/ttnn/operations/index_fill/device/kernels/reader_index_fill.cpp
 create mode 100644 ttnn/cpp/ttnn/operations/index_fill/device/kernels/writer_index_fill.cpp
 create mode 100644 ttnn/cpp/ttnn/operations/index_fill/index_fill.cpp
 create mode 100644 ttnn/cpp/ttnn/operations/index_fill/index_fill.hpp
 create mode 100644 ttnn/cpp/ttnn/operations/index_fill/index_fill_pybind.cpp
 create mode 100644 ttnn/cpp/ttnn/operations/index_fill/index_fill_pybind.hpp

diff --git a/tests/ttnn/unit_tests/operations/test_index_fill.py b/tests/ttnn/unit_tests/operations/test_index_fill.py
new file mode 100644
index 00000000000..8935f5c5bab
--- /dev/null
+++ b/tests/ttnn/unit_tests/operations/test_index_fill.py
@@ -0,0 +1,138 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+
+import ttnn
+import torch
+from tests.ttnn.utils_for_testing import assert_equal
+
+
+def run_index_fill_test(shape, dim, value, dtype, device):
+    """Compare ttnn.index_fill with torch.index_fill for indices [0, 2] along `dim`."""
+    if dim >= len(shape):
+        pytest.skip("Given dim is higher than tensor rank")
+
+    # int32 inputs are drawn from [0, 100); every other dtype uses torch.rand.
+    if dtype != torch.int32:
+        reference_input = torch.rand(shape, dtype=dtype)
+    else:
+        reference_input = torch.randint(0, 100, shape, dtype=torch.int32)
+    indices = torch.tensor([0, 2])
+    expected = torch.index_fill(reference_input, dim, indices, value)
+
+    device_input = ttnn.from_torch(reference_input, device=device)
+    device_index = ttnn.from_torch(indices, device=device)
+    actual = ttnn.to_torch(ttnn.index_fill(device_input, dim, device_index, value))
+
+    assert assert_equal(actual, expected)
+
+
+@pytest.mark.parametrize(
+    "shape",
+    [
+        [32, 32],  # last dim is a multiple of 32
+        [12, 24],  # last dim is not a multiple of 32
+        [23, 41, 32],  # last dim is a multiple of 32
+        [9, 5, 38],  # last dim is not a multiple of 32
+        [3, 4, 5, 32],  # last dim is a multiple of 32
+        [41, 21, 33, 34],  # last dim is not a multiple of 32
+    ],
+)
+@pytest.mark.parametrize(
+    "dim",
+    [
+        0,
+        1,
+        2,
+        3,
+    ],
+)
+@pytest.mark.parametrize(
+    "value",
+    [
+        2.5,
+        1.72,
+    ],
+)
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.float32,
+        torch.bfloat16,
+    ],
+)
+def test_index_fill_float(shape, dim, value, dtype, device):
+    """Run the index_fill comparison on float32 and bfloat16 inputs with a fixed seed."""
+    torch.manual_seed(2024)
+    run_index_fill_test(shape, dim, value, dtype, device)
+
+
+@pytest.mark.parametrize(
+    "shape",
+    [
+        [32, 32],  # last dim is a multiple of 32
+        [12, 23],  # last dim is not a multiple of 32
+        [27, 12, 32],  # last dim is a multiple of 32
+        [61, 3, 6],  # last dim is not a multiple of 32
+        [6, 3, 7, 32],  # last dim is a multiple of 32
+        [13, 15, 22, 13],  # last dim is not a multiple of 32
+    ],
+)
+@pytest.mark.parametrize(
+    "dim",
+    [
+        0,
+        1,
+        2,
+        3,
+    ],
+)
+@pytest.mark.parametrize(
+    "value",
+    [
+        15,
+        12,
+    ],
+)
+def test_index_fill_int(shape, dim, value, device):
+    """Run the index_fill comparison on int32 inputs with a fixed seed."""
+    torch.manual_seed(2024)
+    run_index_fill_test(shape, dim, value, torch.int32, device)
+
+
+@pytest.mark.parametrize(
+    "shape",
+    [
+        [32, 32],  # last dim is a multiple of 32
+        [12, 23],  # last dim is not a multiple of 32
+        [27, 12, 32],  # last dim is a multiple of 32
+        [61, 3, 6],  # last dim is not a multiple of 32
+        [4, 3, 7, 32],  # last dim is a multiple of 32
+        [13, 15, 22, 13],  # last dim is not a multiple of 32
+    ],
+)
+@pytest.mark.parametrize(
+    "dim",
+    [
+        0,
+    ],
+)
+@pytest.mark.parametrize(
+    "value",
+    [
+        2002,
+    ],
+)
+def test_index_fill_callback(shape, dim, value, device, use_program_cache):
+    torch.manual_seed(2024)
+    for i in range(2):  # run twice: the second run must not add program cache entries
+        run_index_fill_test(shape, dim, value, torch.int32, device)
+        if i == 0:
+            num_program_cache_entries = device.num_program_cache_entries()  # baseline after the first run
+            assert num_program_cache_entries > 0
+        else:
+            assert device.num_program_cache_entries() == num_program_cache_entries
+        torch_dummy = torch.randn([32, 32])  # never read back; only placed on the device below
+        tt_dummy = ttnn.from_torch(torch_dummy, device=device)  # presumably moves later allocations - TODO confirm
diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt
index 873296beece..d03fbfbdc64 100644
--- a/ttnn/CMakeLists.txt
+++ b/ttnn/CMakeLists.txt
@@ -519,6 +519,11 @@ set(ALL_TTNN_SRCS
     ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_sum/moreh_sum_pybind.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_sum/moreh_sum.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp
+
+    ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/index_fill/index_fill.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/index_fill/index_fill_pybind.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/index_fill/device/index_fill_multi_core_factory.cpp
 )
 
 #Split src and python bindings
diff --git a/ttnn/cpp/pybind11/operations/__init__.hpp b/ttnn/cpp/pybind11/operations/__init__.hpp
index 0083e077e7f..20d7494c87b 100644
--- a/ttnn/cpp/pybind11/operations/__init__.hpp
+++ b/ttnn/cpp/pybind11/operations/__init__.hpp
@@ -29,6 +29,7 @@
 #include "ttnn/operations/experimental/experimental_pybind.hpp"
 #include "ttnn/operations/full/full_pybind.hpp"
 #include "ttnn/operations/full_like/full_like_pybind.hpp"
+#include "ttnn/operations/index_fill/index_fill_pybind.hpp"
 #include "ttnn/operations/kv_cache/kv_cache_pybind.hpp"
 #include "ttnn/operations/loss/loss_pybind.hpp"
 #include "ttnn/operations/matmul/matmul_pybind.hpp"
@@ -148,6 +149,9 @@ void py_module(py::module& module) {
 
     auto m_uniform = module.def_submodule("uniform", "uniform operations");
     uniform::bind_uniform_operation(m_uniform);
+
+    auto m_index_fill = module.def_submodule("index_fill", "index_fill operation");
+    index_fill::bind_index_fill_operation(m_index_fill);
 }
 }  // namespace operations
 
diff --git a/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.cpp b/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.cpp
new file mode 100644
index 00000000000..6b8ff0ba570
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.cpp
@@ -0,0 +1,67 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+#include "index_fill_device_operation.hpp"
+
+#include "ttnn/tensor/tensor.hpp"
+
+namespace ttnn::operations::index_fill {
+IndexFillOperation::program_factory_t IndexFillOperation::select_program_factory(
+    const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
+    return MultiCore{};  // always MultiCore; neither argument is consulted
+}
+
+// Checks that the tensors and attributes are usable by the multi-core program factory.
+void IndexFillOperation::validate(
+    const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
+    const auto& input = tensor_args.input;
+    const auto& index = tensor_args.index;
+    TT_FATAL(input.storage_type() == StorageType::DEVICE, "Index fill: Input must be on device");
+    TT_FATAL(input.buffer() != nullptr, "Index fill: Input must be allocated in buffer on device");
+    // The program factory dereferences index.buffer() too, so hold the index tensor to the same requirements.
+    TT_FATAL(index.storage_type() == StorageType::DEVICE, "Index fill: Index must be on device");
+    TT_FATAL(index.buffer() != nullptr, "Index fill: Index must be allocated in buffer on device");
+    TT_FATAL(
+        input.memory_config().memory_layout == TensorMemoryLayout::INTERLEAVED,
+        "Index fill: Not currently supporting sharding");
+    TT_FATAL(
+        operation_attributes.memory_config.memory_layout == TensorMemoryLayout::INTERLEAVED,
+        "Index fill: Not currently supporting sharding");
+    TT_FATAL(index.get_logical_shape().rank() == 1, "Index fill: Index tensor must be 1D!");
+    // dim is unsigned, so only the upper bound needs checking.
+    TT_FATAL(
+        operation_attributes.dim < input.get_logical_shape().rank(), "Index fill: Invalid dimension");
+}
+void IndexFillOperation::validate_on_program_cache_miss(
+    const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
+    validate(operation_attributes, tensor_args);  // cache miss: run the shared validate() checks
+}
+void IndexFillOperation::validate_on_program_cache_hit(
+    const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
+    validate(operation_attributes, tensor_args);  // cache hit: the same validate() checks as on a miss
+}
+IndexFillOperation::shape_return_value_t IndexFillOperation::compute_output_shapes(
+    const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
+    return tensor_args.input.get_logical_shape();  // the output takes the input's logical shape
+}
+// Allocates the output on the input's device with the input's dtype and layout and the requested memory config.
+IndexFillOperation::tensor_return_value_t IndexFillOperation::create_output_tensors(
+    const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
+    const Tensor& source = tensor_args.input;
+    return create_device_tensor(
+        compute_output_shapes(operation_attributes, tensor_args),
+        source.tensor_attributes->dtype,
+        source.tensor_attributes->layout,
+        source.device(),
+        operation_attributes.memory_config);
+}
+std::tuple<IndexFillOperation::operation_attributes_t, IndexFillOperation::tensor_args_t> IndexFillOperation::invoke(
+    const Tensor& input,
+    const uint32_t dim,
+    const Tensor& index,
+    const std::variant<float, int> value,
+    const std::optional<MemoryConfig>& memory_config) {
+    return {  // the memory config falls back to the input's when the caller passes none
+        operation_attributes_t{dim, value, memory_config.value_or(input.memory_config())}, tensor_args_t{input, index}};
+}
+}  // namespace ttnn::operations::index_fill
diff --git a/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.hpp b/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.hpp
new file mode 100644
index 00000000000..7ed3cb413c3
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.hpp
@@ -0,0 +1,59 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+#include <optional>
+#include <variant>
+
+#include "ttnn/decorators.hpp"
+#include "ttnn/tensor/tensor.hpp"
+#include "ttnn/types.hpp"
+namespace ttnn::operations::index_fill {
+struct IndexFillOperation {  // device operation registered as ttnn::prim::index_fill
+    struct operation_attributes_t {
+        const uint32_t dim;  // dimension of the input along which the indices apply
+        const std::variant<float, int> value;  // fill value, carried as either float or int
+        const MemoryConfig memory_config;  // memory config used when creating the output tensor
+    };
+    struct tensor_args_t {
+        const Tensor& input;  // reference member: the referenced tensor must outlive this struct
+        const Tensor& index;  // validate() requires this tensor to be 1D
+    };
+    using shape_return_value_t = SimpleShape;
+    using tensor_return_value_t = Tensor;
+    struct MultiCore {  // the only alternative of program_factory_t
+        struct shared_variables_t {  // stored with the cached program; read by override_runtime_arguments
+            KernelHandle reader_kernel_id;
+            KernelHandle writer_kernel_id;
+            std::size_t num_cores;  // number of cores whose runtime args are refreshed on a cache hit
+            std::size_t num_cores_y;  // divisor used to turn a core number into a CoreCoord
+        };
+        using cached_program_t = ttnn::device_operation::CachedProgram<shared_variables_t>;
+        static cached_program_t create(  // builds the program and sets every runtime argument
+            const operation_attributes_t& operation_attributes,
+            const tensor_args_t& tensor_args,
+            tensor_return_value_t& output);
+        static void override_runtime_arguments(  // rewrites only the buffer addresses
+            cached_program_t& cached_program,
+            const operation_attributes_t& operation_attributes,
+            const tensor_args_t& tensor_args,
+            tensor_return_value_t& output);
+    };
+    using program_factory_t = std::variant<MultiCore>;
+    static program_factory_t select_program_factory(const operation_attributes_t&, const tensor_args_t&);
+    static void validate_on_program_cache_miss(const operation_attributes_t&, const tensor_args_t&);
+    static void validate_on_program_cache_hit(const operation_attributes_t&, const tensor_args_t&);
+    static void validate(const operation_attributes_t&, const tensor_args_t&);  // shared by both cache paths
+    static shape_return_value_t compute_output_shapes(const operation_attributes_t&, const tensor_args_t&);
+    static tensor_return_value_t create_output_tensors(const operation_attributes_t&, const tensor_args_t&);
+    static std::tuple<operation_attributes_t, tensor_args_t> invoke(  // packs the call arguments into the two structs
+        const Tensor& input,
+        const uint32_t dim,
+        const Tensor& index,
+        const std::variant<float, int> value,
+        const std::optional<MemoryConfig>& memory_config);
+};
+}  // namespace ttnn::operations::index_fill
+namespace ttnn::prim {  // registers IndexFillOperation under the name "ttnn::prim::index_fill"
+constexpr auto index_fill =
+    ttnn::register_operation<"ttnn::prim::index_fill", ttnn::operations::index_fill::IndexFillOperation>();
+}  // namespace ttnn::prim
diff --git a/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_multi_core_factory.cpp b/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_multi_core_factory.cpp
new file mode 100644
index 00000000000..7327d13178f
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_multi_core_factory.cpp
@@ -0,0 +1,196 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "host_api.hpp"
+#include "impl/buffers/circular_buffer_types.hpp"
+#include "index_fill_device_operation.hpp"
+#include "tt_metal/common/work_split.hpp"
+#include "tt_metal/host_api.hpp"
+#include "ttnn/tensor/types.hpp"
+
+using namespace tt;
+using namespace tt::tt_metal;
+using namespace tt::constants;
+using namespace tt::tt_metal::detail;
+
+union datatype {  // views one 32-bit word as either uint32_t or float
+    uint32_t u32;
+    float f32;
+} u_fill_value;  // NOTE(review): mutable file-scope instance, visible to every function in this file
+
+namespace ttnn::operations::index_fill {
+// Builds the index_fill program: circular buffers, reader/writer kernels and per-core runtime arguments.
+IndexFillOperation::MultiCore::cached_program_t IndexFillOperation::MultiCore::create(
+    const operation_attributes_t& operation_attributes,
+    const tensor_args_t& tensor_args,
+    tensor_return_value_t& output) {
+    const Tensor& index = tensor_args.index;
+    const Tensor& input = tensor_args.input;
+    const uint32_t dim = operation_attributes.dim;
+    const auto dtype = input.get_dtype();
+    const auto input_shape = input.get_logical_shape();
+    const auto n = input_shape.rank();
+
+    // Product of the dims strictly between dim and the last one; counting up in unsigned is safe for rank 1.
+    uint32_t num_rows_to_fill_per_index = 1;
+    for (uint32_t i = dim + 1; i + 1 < n; i++) {
+        num_rows_to_fill_per_index *= input_shape[i];
+    }
+
+    // Raw 32-bit pattern of the fill value; the scratch union is local so create() calls share no state.
+    union {
+        uint32_t u32;
+        float f32;
+    } fill_bits{};
+    const auto& fill_value = operation_attributes.value;
+    if (std::holds_alternative<int>(fill_value)) {
+        fill_bits.u32 = std::get<int>(fill_value);
+    } else {
+        fill_bits.f32 = std::get<float>(fill_value);
+    }
+
+    auto num_rows = input.volume() / input.get_logical_shape()[-1];
+    Program program{};
+    Device* device = input.device();
+
+    auto compute_with_storage_grid_size = device->compute_with_storage_grid_size();
+    uint32_t num_cores_x = compute_with_storage_grid_size.x;
+    uint32_t num_cores_y = compute_with_storage_grid_size.y;
+
+    auto [num_cores, all_cores, core_group_1, core_group_2, num_rows_per_core_group_1, num_rows_per_core_group_2] =
+        tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_rows);
+
+    auto input_data_format = datatype_to_dataformat_converter(dtype);
+    auto index_data_format = datatype_to_dataformat_converter(index.get_dtype());
+    auto output_data_format = datatype_to_dataformat_converter(output.get_dtype());
+
+    uint32_t input_unit_size = input.get_logical_shape()[-1] * input.element_size();
+    uint32_t rounded_input_unit_size = round_up_to_mul32(input_unit_size);
+    uint32_t index_unit_size = index.volume() * index.element_size();
+    uint32_t rounded_index_unit_size = round_up_to_mul32(index_unit_size);
+    uint32_t output_unit_size = output.get_logical_shape()[-1] * output.element_size();
+    uint32_t rounded_output_unit_size = round_up_to_mul32(output_unit_size);
+
+    auto src_cb_index = CB::c_in0;
+    CircularBufferConfig cb_src_config =
+        CircularBufferConfig(rounded_input_unit_size, {{src_cb_index, input_data_format}})
+            .set_page_size(src_cb_index, rounded_input_unit_size);
+    auto cb_src = CreateCircularBuffer(program, all_cores, cb_src_config);
+
+    // NOTE(review): reader_defines is filled in but never passed to the reader kernel config below, so this
+    // switch currently only rejects unsupported dtypes.
+    std::map<string, string> reader_defines;
+    switch (dtype) {
+        case DataType::BFLOAT16: reader_defines["OUTPUT_DTYPE_BFLOAT16"] = "1"; break;
+        case DataType::INT32: reader_defines["OUTPUT_DTYPE_INT32"] = "1"; break;
+        case DataType::FLOAT32: reader_defines["OUTPUT_DTYPE_FLOAT32"] = "1"; break;
+        default:
+            TT_FATAL(false, "Unsupported datatype");
+            break;
+    }
+
+    auto index_cb_index = CB::c_in1;
+    CircularBufferConfig cb_index_config =
+        CircularBufferConfig(rounded_index_unit_size, {{index_cb_index, index_data_format}})
+            .set_page_size(index_cb_index, rounded_index_unit_size);
+    auto cb_index = CreateCircularBuffer(program, all_cores, cb_index_config);
+
+    auto dst_cb_index = CB::c_out0;
+    CircularBufferConfig dst_cb_config =
+        CircularBufferConfig(rounded_output_unit_size, {{dst_cb_index, output_data_format}})
+            .set_page_size(dst_cb_index, rounded_output_unit_size);
+    auto cb_dst = CreateCircularBuffer(program, all_cores, dst_cb_config);
+
+    bool in_is_dram = input.buffer()->is_dram();
+    bool index_is_dram = index.buffer()->is_dram();
+    bool out_is_dram = output.buffer()->is_dram();
+
+    std::vector<uint32_t> reader_compile_time_args = {
+        (std::uint32_t)in_is_dram,
+        (std::uint32_t)index_is_dram,
+        (std::uint32_t)src_cb_index,
+        (std::uint32_t)index_cb_index,
+        (std::uint32_t)(dim == n - 1),
+        (std::uint32_t)index.volume()};
+
+    auto reader_kernel_id = CreateKernel(
+        program,
+        "ttnn/cpp/ttnn/operations/index_fill/device/kernels/reader_index_fill.cpp",
+        all_cores,
+        ReaderDataMovementConfig(reader_compile_time_args));
+
+    std::vector<uint32_t> writer_compile_time_args = {(std::uint32_t)out_is_dram};
+    auto writer_kernel_id = CreateKernel(
+        program,
+        "ttnn/cpp/ttnn/operations/index_fill/device/kernels/writer_index_fill.cpp",
+        all_cores,
+        WriterDataMovementConfig(writer_compile_time_args));
+
+    // Each core gets a contiguous run of rows; unit_offset is the first row of the current core.
+    uint32_t unit_offset = 0;
+    auto cores = grid_to_cores(num_cores, num_cores_x, num_cores_y);
+    for (const auto& core : cores) {
+        uint32_t num_rows_per_core = 0;
+        if (core_group_1.core_coord_in_core_ranges(core)) {
+            num_rows_per_core = num_rows_per_core_group_1;
+        } else if (core_group_2.core_coord_in_core_ranges(core)) {
+            num_rows_per_core = num_rows_per_core_group_2;
+        } else {
+            TT_FATAL(false, "Core not in specified core ranges");
+        }
+        SetRuntimeArgs(
+            program,
+            reader_kernel_id,
+            core,
+            {input.buffer()->address(),
+             index.buffer()->address(),
+             fill_bits.u32,
+             input_unit_size,
+             index_unit_size,
+             unit_offset,
+             num_rows_per_core,
+             num_rows_to_fill_per_index,
+             input_shape[dim]});
+        SetRuntimeArgs(
+            program,
+            writer_kernel_id,
+            core,
+            {output.buffer()->address(), num_rows_per_core, unit_offset, output_unit_size});
+        unit_offset += num_rows_per_core;
+    }
+
+    return {std::move(program), {reader_kernel_id, writer_kernel_id, num_cores, num_cores_y}};
+}
+
+// Cache-hit path: only the three buffer addresses are refreshed; every other runtime argument keeps the
+// value it was given when the program was built.
+void IndexFillOperation::MultiCore::override_runtime_arguments(
+    cached_program_t& cached_program,
+    const operation_attributes_t& operation_attributes,
+    const tensor_args_t& tensor_args,
+    tensor_return_value_t& output) {
+    auto& program = cached_program.program;
+    const auto& shared = cached_program.shared_variables;
+
+    // Addresses of the tensors supplied for this invocation.
+    const auto input_address = tensor_args.input.buffer()->address();
+    const auto index_address = tensor_args.index.buffer()->address();
+    const auto output_address = output.buffer()->address();
+
+    // Core number core_idx maps to the coordinate {core_idx / num_cores_y, core_idx % num_cores_y}.
+    for (uint32_t core_idx = 0; core_idx < shared.num_cores; ++core_idx) {
+        const CoreCoord core = {core_idx / shared.num_cores_y, core_idx % shared.num_cores_y};
+
+        // Reader arguments 0 and 1: input and index buffer addresses.
+        auto& reader_args = GetRuntimeArgs(program, shared.reader_kernel_id, core);
+        reader_args[0] = input_address;
+        reader_args[1] = index_address;
+
+        // Writer argument 0: output buffer address.
+        auto& writer_args = GetRuntimeArgs(program, shared.writer_kernel_id, core);
+        writer_args[0] = output_address;
+    }
+}
+
+}  // namespace ttnn::operations::index_fill
diff --git a/ttnn/cpp/ttnn/operations/index_fill/device/kernels/reader_index_fill.cpp b/ttnn/cpp/ttnn/operations/index_fill/device/kernels/reader_index_fill.cpp
new file mode 100644
index 00000000000..1da1cba100a
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/index_fill/device/kernels/reader_index_fill.cpp
@@ -0,0 +1,106 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <stdint.h>
+
+#include "dataflow_api.h"
+#include "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/dataflow/moreh_common.hpp"
+typedef union {  // views one 32-bit word as either float or uint32_t
+    float f;
+    uint32_t u;
+} value;
+
+// Linear search: true when row_id equals one of the first `size` entries at index_ptr.
+bool is_in_indices(uint32_t *index_ptr, uint32_t size, uint32_t row_id) {
+    bool found = false;
+    for (uint32_t pos = 0; pos < size && !found; ++pos) {
+        found = index_ptr[pos] == row_id;
+    }
+    return found;
+}
+
+// Reader kernel for index_fill.
+// Stages the index tensor once, then streams this core's rows through src_cb_id. Rows are patched in place:
+// with is_last_dim the indexed elements of every row are overwritten, otherwise whole rows are overwritten
+// when their coordinate along the filled dimension appears in the index tensor.
+void kernel_main() {
+    // Runtime arguments.
+    uint32_t input_addr = get_arg_val<uint32_t>(0);
+    uint32_t index_addr = get_arg_val<uint32_t>(1);
+    uint32_t fill_value = get_arg_val<uint32_t>(2);  // raw 32-bit pattern of the fill value
+    uint32_t input_page_size = get_arg_val<uint32_t>(3);
+    uint32_t index_page_size = get_arg_val<uint32_t>(4);
+    uint32_t start_row_id = get_arg_val<uint32_t>(5);
+    uint32_t num_rows_per_core = get_arg_val<uint32_t>(6);
+    uint32_t num_rows_to_fill_per_index = get_arg_val<uint32_t>(7);
+    uint32_t dim_size = get_arg_val<uint32_t>(8);  // used as the modulus when locating a row along the filled dim
+
+    // Compile-time arguments.
+    constexpr bool input_is_dram = get_compile_time_arg_val(0) == 1;
+    constexpr bool index_is_dram = get_compile_time_arg_val(1) == 1;
+    constexpr uint32_t src_cb_id = get_compile_time_arg_val(2);
+    constexpr uint32_t index_cb_id = get_compile_time_arg_val(3);
+    constexpr bool is_last_dim = get_compile_time_arg_val(4) == 1;
+    constexpr uint32_t index_size = get_compile_time_arg_val(5);
+
+    constexpr uint32_t onetile = 1;
+
+    // Address generators for the interleaved input and index buffers.
+    const InterleavedAddrGen<input_is_dram> s0 = {.bank_base_address = input_addr, .page_size = input_page_size};
+
+    const InterleavedAddrGen<index_is_dram> s1 = {.bank_base_address = index_addr, .page_size = index_page_size};
+
+    // Element type used when writing into a row.
+#ifdef OUTPUT_DTYPE_BFLOAT16
+    // bfloat16 elements are 16 bits wide and take the upper half of the float32 fill pattern.
+    using element_t = uint16_t;
+    const element_t fill_element = static_cast<element_t>(fill_value >> 16);
+#else
+    // INT32, FLOAT32 and builds without an OUTPUT_DTYPE_* define store the 32-bit pattern unchanged.
+    using element_t = uint32_t;
+    const element_t fill_element = fill_value;
+#endif
+    // A whole-row fill must cover every element of the row, not just index_size of them.
+    const uint32_t elements_per_row = input_page_size / sizeof(element_t);
+
+    // Read the index tensor into its circular buffer once; it is consulted for every row.
+    cb_reserve_back(index_cb_id, onetile);
+
+    uint32_t index_cb_reader = get_write_ptr(index_cb_id);
+    uint64_t index_noc_addr = get_noc_addr(0, s1);
+    noc_async_read(index_noc_addr, index_cb_reader, index_page_size);
+    // The indices are dereferenced below, so wait until the read has completed.
+    noc_async_read_barrier();
+    uint32_t *index_ptr = reinterpret_cast<uint32_t *>(index_cb_reader);
+
+    for (uint32_t row_id = start_row_id; row_id < start_row_id + num_rows_per_core; row_id++) {
+        // Bring the row into the source circular buffer before patching it in place.
+        cb_reserve_back(src_cb_id, onetile);
+        uint32_t src_cb_reader = get_write_ptr(src_cb_id);
+        uint64_t input_noc_addr = get_noc_addr(row_id, s0);
+        noc_async_read(input_noc_addr, src_cb_reader, input_page_size);
+        // The row is modified below, so wait until the read has completed.
+        noc_async_read_barrier();
+
+        // NOTE(review): index values are used as element offsets without a bounds check against the row length.
+        element_t *row_ptr = reinterpret_cast<element_t *>(src_cb_reader);
+        if (is_last_dim) {
+            // Every row is affected: the indices select elements inside the row.
+            for (uint32_t i = 0; i < index_size; i++) {
+                row_ptr[index_ptr[i]] = fill_element;
+            }
+        } else if (is_in_indices(index_ptr, index_size, row_id / num_rows_to_fill_per_index % dim_size)) {
+            // This row's coordinate along the filled dimension is listed, so the whole row is overwritten.
+            for (uint32_t i = 0; i < elements_per_row; i++) {
+                row_ptr[i] = fill_element;
+            }
+        }
+
+        // Hand the (possibly modified) row to the consumer of src_cb_id.
+        cb_push_back(src_cb_id, onetile);
+    }
+
+    // Publish the index page, pairing with the reserve above.
+    cb_push_back(index_cb_id, onetile);
+}
diff --git a/ttnn/cpp/ttnn/operations/index_fill/device/kernels/writer_index_fill.cpp b/ttnn/cpp/ttnn/operations/index_fill/device/kernels/writer_index_fill.cpp
new file mode 100644
index 00000000000..3ecfca0c0a7
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/index_fill/device/kernels/writer_index_fill.cpp
@@ -0,0 +1,33 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "dataflow_api.h"
+
+void kernel_main() {
+    uint32_t output_buffer_address = get_arg_val<uint32_t>(0);
+    uint32_t num_rows_per_core = get_arg_val<uint32_t>(1);
+    uint32_t start_id = get_arg_val<uint32_t>(2);
+    uint32_t output_unit_size = get_arg_val<uint32_t>(3);
+
+    constexpr uint32_t dst_cb_id = tt::CB::c_out0;
+    constexpr uint32_t src_cb_id = tt::CB::c_in0;
+    constexpr bool output_is_dram = get_compile_time_arg_val(0) == 1;
+
+    constexpr uint32_t onetile = 1;
+
+    const InterleavedAddrGen<output_is_dram> s = {
+        .bank_base_address = output_buffer_address,
+        .page_size = output_unit_size,
+    };
+    for (uint32_t i = start_id; i < start_id + num_rows_per_core; i++) {
+        cb_wait_front(src_cb_id, onetile);
+
+        uint32_t writer_ptr = get_read_ptr(src_cb_id);
+        uint64_t output_noc_addr = get_noc_addr(i, s);
+        noc_async_write(writer_ptr, output_noc_addr, output_unit_size);
+        noc_async_write_barrier();
+
+        cb_pop_front(src_cb_id, onetile);
+    }
+}
diff --git a/ttnn/cpp/ttnn/operations/index_fill/index_fill.cpp b/ttnn/cpp/ttnn/operations/index_fill/index_fill.cpp
new file mode 100644
index 00000000000..513d7f42190
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/index_fill/index_fill.cpp
@@ -0,0 +1,21 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "index_fill.hpp"
+
+#include "ttnn/decorators.hpp"
+#include "ttnn/operations/index_fill/device/index_fill_device_operation.hpp"
+
+namespace ttnn::operations::index_fill {
+
+Tensor IndexFill::invoke(
+    const Tensor &input,
+    const uint32_t dim,
+    const Tensor &index,
+    const std::variant<float, int> value,
+    const std::optional<MemoryConfig> &memory_config) {
+    return ttnn::prim::index_fill(input, dim, index, value, memory_config);
+}
+
+}  // namespace ttnn::operations::index_fill
diff --git a/ttnn/cpp/ttnn/operations/index_fill/index_fill.hpp b/ttnn/cpp/ttnn/operations/index_fill/index_fill.hpp
new file mode 100644
index 00000000000..9f0393873b5
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/index_fill/index_fill.hpp
@@ -0,0 +1,23 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "ttnn/decorators.hpp"
+
+namespace ttnn::operations::index_fill {
+
+struct IndexFill {
+    static Tensor invoke(
+        const Tensor &input,
+        const uint32_t dim,
+        const Tensor &index,
+        const std::variant<float, int> value,
+        const std::optional<MemoryConfig> &memory_config);
+};
+}  // namespace ttnn::operations::index_fill
+
+namespace ttnn {
+constexpr auto index_fill =
+    ttnn::register_operation_with_auto_launch_op<"ttnn::index_fill", ttnn::operations::index_fill::IndexFill>();
+}  // namespace ttnn
diff --git a/ttnn/cpp/ttnn/operations/index_fill/index_fill_pybind.cpp b/ttnn/cpp/ttnn/operations/index_fill/index_fill_pybind.cpp
new file mode 100644
index 00000000000..64275bbf3a5
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/index_fill/index_fill_pybind.cpp
@@ -0,0 +1,44 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "index_fill_pybind.hpp"
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "index_fill.hpp"
+#include "ttnn/cpp/pybind11/decorators.hpp"
+#include "ttnn/operations/index_fill/device/index_fill_device_operation.hpp"
+
+namespace py = pybind11;
+
+namespace ttnn::operations::index_fill {
+
+void bind_index_fill_operation(py::module& module) {
+    auto doc =
+        R"doc(index_fill(input: Tensor, dim: uint32, index: Tensor, value: int or float, memory_config: MemoryConfig) -> Tensor
+    Create or fill a tensor with the given value, with the specified `memory_config`.
+    This operation only supports ROW_MAJOR_LAYOUT for now.
+    Args:
+        * :attr:`input`: The tensor that we will operate on
+        * :attr:`dim`: The dimension that we need to fill the value along.
+        * :attr:`index`: The index that we need to fill the value in.
+        * :attr:`value`: The value which will be used to fill the output tensor
+        * :attr:`memory_config`: The memory configuration for the output tensor.
+    )doc";
+
+    bind_registered_operation(
+        module,
+        ttnn::index_fill,
+        doc,
+        ttnn::pybind_arguments_t{
+            py::arg("input"),
+            py::arg("dim"),
+            py::arg("index"),
+            py::arg("value"),
+            py::kw_only(),
+            py::arg("memory_config") = std::nullopt});
+}
+
+}  // namespace ttnn::operations::index_fill
diff --git a/ttnn/cpp/ttnn/operations/index_fill/index_fill_pybind.hpp b/ttnn/cpp/ttnn/operations/index_fill/index_fill_pybind.hpp
new file mode 100644
index 00000000000..49e664decdc
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/index_fill/index_fill_pybind.hpp
@@ -0,0 +1,13 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "pybind11/pybind_fwd.hpp"
+
+namespace py = pybind11;
+
+namespace ttnn::operations::index_fill {
+void bind_index_fill_operation(py::module& module);
+}

From 82b97e5c37f7dbb614a3e7070e1f8e53b0f31d87 Mon Sep 17 00:00:00 2001
From: Andrew Fuller <afuller@tenstorrent.com>
Date: Sat, 26 Oct 2024 15:59:58 -0400
Subject: [PATCH 21/30] #0: Put shell strict mode in place (#14237)

---
 .github/workflows/build-artifact.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/build-artifact.yaml b/.github/workflows/build-artifact.yaml
index a6d5e1f0bea..bcfbd7bf2f4 100644
--- a/.github/workflows/build-artifact.yaml
+++ b/.github/workflows/build-artifact.yaml
@@ -127,6 +127,8 @@ jobs:
             -e ARCH_NAME=${{ matrix.arch }}
             -w ${{ github.workspace }}
           run: |
+            set -eu # basic shell hygiene
+
             # /tmp is a tmpfs; more efficient than persisted storage
             mkdir -p /tmp/ccache
             export CCACHE_TEMPDIR=/tmp/ccache

From ada4ad8a44d37d57833edfe80eea4a3cb9f03525 Mon Sep 17 00:00:00 2001
From: Andrew Fuller <afuller@tenstorrent.com>
Date: Sat, 26 Oct 2024 16:05:54 -0400
Subject: [PATCH 22/30] #14001: Run pre-commit on PRs so CMake formatting (and
 others) are checked (#14157)

Black is stubbed out because that's a required check on the branch and GitHub is erroring out when I try to remove it from Required.  To be removed + Pre-commit marked as Required in its place.
---
 .github/workflows/all-static-checks.yaml | 37 ++++++++++++++++++++----
 1 file changed, 32 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/all-static-checks.yaml b/.github/workflows/all-static-checks.yaml
index 914ba2fa1ee..df96cc19546 100644
--- a/.github/workflows/all-static-checks.yaml
+++ b/.github/workflows/all-static-checks.yaml
@@ -8,6 +8,38 @@ on:
       - "main"
 
 jobs:
+  pre-commit:
+    name: Run Pre-commit Hooks
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      pull-requests: write
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # Fetch all history so 'origin/main' is available
+          fetch-refs: true  # Ensure all refs are fetched
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.11
+
+      - name: Run Pre-commit
+        uses: pre-commit/action@v3.0.1
+        with:
+          extra_args: |
+            --from-ref ${{ github.event_name == 'pull_request' && format('refs/remotes/origin/{0}', github.event.pull_request.base.ref) || 'HEAD^' }} \
+            --to-ref HEAD
+        continue-on-error: false
+  check-black:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Do Nothing
+        run: echo "Black is covered by pre-commit. This is a placeholder to be removed after updating branch restrictions."
+
+
   check-spdx-licenses:
     runs-on: ubuntu-latest
     steps:
@@ -27,11 +59,6 @@ jobs:
       - uses: actions/checkout@v4
       - name: Check kernel count in base metal is less than maximum
         run: if (( $(find tt_metal/kernels/ -type f | wc -l) > 7 )); then exit 1; fi
-  check-black:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - uses: psf/black@23.10.1
   check-doc:
     runs-on: ubuntu-latest
     steps:

From 4ae4ac3c30cd24ea27cbac8cc5811c90d077e9c0 Mon Sep 17 00:00:00 2001
From: o2buzzle <76864037+o2buzzle@users.noreply.github.com>
Date: Thu, 24 Oct 2024 04:43:21 +0000
Subject: [PATCH 23/30] #14200: add bfp8_b tests for moreh_dot_backward

---
 .../operations/test_moreh_dot_backward.py     | 124 ++++++++++++------
 .../ttnn/unit_tests/operations/test_utils.py  |  20 +++
 2 files changed, 105 insertions(+), 39 deletions(-)

diff --git a/tests/ttnn/unit_tests/operations/test_moreh_dot_backward.py b/tests/ttnn/unit_tests/operations/test_moreh_dot_backward.py
index 3e51cbd1c93..51b443396bf 100644
--- a/tests/ttnn/unit_tests/operations/test_moreh_dot_backward.py
+++ b/tests/ttnn/unit_tests/operations/test_moreh_dot_backward.py
@@ -12,14 +12,26 @@
     get_compute_kernel_options,
     compute_kernel_options,
     compute_kernel_ids,
+    get_ttnn_torch_dtype,
 )
 
 
 def get_tensors(
-    input_shape, other_shape, output_shape, require_input_grad, require_other_grad, is_1d, device, use_randint=True
+    input_shape,
+    other_shape,
+    output_shape,
+    require_input_grad,
+    require_other_grad,
+    is_1d,
+    device,
+    npu_dtype=ttnn.bfloat16,
+    use_randint=True,
 ):
-    npu_dtype = ttnn.bfloat16
-    cpu_dtype = torch.bfloat16
+    cpu_dtype = get_ttnn_torch_dtype(npu_dtype)
+    if cpu_dtype is None:
+        # panic
+        assert False
+
     npu_layout = ttnn.TILE_LAYOUT
     cpu_layout = ttnn.ROW_MAJOR_LAYOUT
 
@@ -33,9 +45,9 @@ def get_tensors(
         other = torch.rand(other_shape, dtype=cpu_dtype)
         output = torch.rand(output_shape, dtype=cpu_dtype)
 
-    tt_input = ttnn.Tensor(input, npu_dtype).pad_to_tile(float(1)).to(npu_layout).to(device)
-    tt_other = ttnn.Tensor(other, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device)
-    tt_output = ttnn.Tensor(output, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device)
+    tt_input = ttnn.from_torch(input, npu_dtype, layout=npu_layout, device=device)
+    tt_other = ttnn.from_torch(other, npu_dtype, layout=npu_layout, device=device)
+    tt_output = ttnn.from_torch(output, npu_dtype, layout=npu_layout, device=device)
 
     torch_input = input.reshape(-1) if is_1d else input
     torch_other = other.reshape(-1) if is_1d else other
@@ -44,25 +56,16 @@ def get_tensors(
     output_grad = tt_output_grad = torch_output_grad = tt_input_grad = tt_other_grad = None
     if require_input_grad or require_other_grad:
         output_grad = torch.randint(-2, 3, output_shape, dtype=cpu_dtype)
-        # tt_output_grad = ttnn.Tensor(output_grad, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device)
-        tt_output_grad = ttnn.Tensor(output_grad, npu_dtype).pad_to_tile(float(-1)).to(npu_layout).to(device)
+        tt_output_grad = ttnn.from_torch(output_grad, npu_dtype, layout=npu_layout, device=device)
         torch_output_grad = output_grad[0][0][0][0] if is_1d else output_grad
 
         if require_input_grad:
             input_grad = torch.full(input_shape, float("nan"), dtype=cpu_dtype)
-            tt_input_grad = ttnn.Tensor(input_grad, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device)
+            tt_input_grad = ttnn.from_torch(input_grad, npu_dtype, layout=npu_layout, device=device)
 
         if require_other_grad:
             other_grad = torch.full(other_shape, float("nan"), dtype=cpu_dtype)
-            tt_other_grad = (
-                ttnn.Tensor(
-                    other_grad,
-                    npu_dtype,
-                )
-                .pad_to_tile(float("nan"))
-                .to(npu_layout)
-                .to(device)
-            )
+            tt_other_grad = ttnn.from_torch(other_grad, npu_dtype, layout=npu_layout, device=device)
 
     return (
         tt_input,
@@ -77,24 +80,7 @@ def get_tensors(
     )
 
 
-@pytest.mark.parametrize(
-    "input_shape",
-    (
-        [1, 1, 1, 10],  # test not mutiple of 32 case
-        [1, 1, 1, 32],  # test single tile
-        [1, 1, 1, 352],  # test multiple tiles
-        [1, 1, 1, 323],  # test multiple tiles, not a multiple of 32
-    ),
-)
-@pytest.mark.parametrize(
-    "requires_grad",
-    (
-        (True, False),
-        (False, True),
-        (True, True),
-    ),
-)
-def test_moreh_matmul_1d_backward(input_shape, requires_grad, device):
+def run_moreh_dot_backward(input_shape, requires_grad, device, dtype=ttnn.bfloat16, use_randint=True):
     torch.manual_seed(3072)
     require_input_grad, require_other_grad = requires_grad
     output_shape = [1, 1, 1, 1]
@@ -109,7 +95,9 @@ def test_moreh_matmul_1d_backward(input_shape, requires_grad, device):
         torch_input,
         torch_other,
         torch_output_grad,
-    ) = get_tensors(input_shape, input_shape, output_shape, require_input_grad, require_other_grad, True, device)
+    ) = get_tensors(
+        input_shape, input_shape, output_shape, require_input_grad, require_other_grad, True, device, dtype, use_randint
+    )
     # torch matmul
     torch_out = torch.matmul(
         torch_input.requires_grad_(require_input_grad), torch_other.requires_grad_(require_other_grad)
@@ -125,7 +113,7 @@ def test_moreh_matmul_1d_backward(input_shape, requires_grad, device):
     rtol = atol = 0.1
     cpu_layout = ttnn.ROW_MAJOR_LAYOUT
     if require_input_grad:
-        ttcpu_input_grad = tt_input_grad.cpu().to(cpu_layout).unpad_from_tile(input_shape).to_torch()
+        ttcpu_input_grad = ttnn.to_torch(tt_input_grad)
 
         passing, output_pcc = comp_allclose_and_pcc(
             torch_input.grad, ttcpu_input_grad.reshape(-1), pcc=0.999, rtol=rtol, atol=atol
@@ -135,7 +123,7 @@ def test_moreh_matmul_1d_backward(input_shape, requires_grad, device):
         assert passing
 
     if require_other_grad:
-        ttcpu_other_grad = tt_other_grad.cpu().to(cpu_layout).unpad_from_tile(input_shape).to_torch()
+        ttcpu_other_grad = ttnn.to_torch(tt_other_grad)
 
         passing, output_pcc = comp_allclose_and_pcc(
             torch_other.grad, ttcpu_other_grad.reshape(-1), pcc=0.999, rtol=rtol, atol=atol
@@ -143,3 +131,61 @@ def test_moreh_matmul_1d_backward(input_shape, requires_grad, device):
         logger.debug(f"other_grad passing={passing}")
         logger.debug(f"other_grad pcc={output_pcc}")
         assert passing
+
+
+@pytest.mark.parametrize(
+    "input_shape",
+    (
+        [1, 1, 1, 10],  # test not multiple of 32 case
+        [1, 1, 1, 32],  # test single tile
+        [1, 1, 1, 352],  # test multiple tiles
+        [1, 1, 1, 323],  # test multiple tiles, not a multiple of 32
+    ),
+)
+@pytest.mark.parametrize(
+    "requires_grad",
+    (
+        [True, False],
+        [False, True],
+        [True, True],
+    ),
+)
+@pytest.mark.parametrize("use_randint", (True, False))
+@pytest.mark.parametrize("dtype", ([ttnn.bfloat16, ttnn.bfloat8_b]))
+def test_moreh_dot_backward(input_shape, requires_grad, dtype, use_randint, device):
+    run_moreh_dot_backward(input_shape, requires_grad, device, dtype, use_randint)
+
+
+@pytest.mark.parametrize(
+    "input_shape",
+    (
+        [1, 1, 1, 10],  # test not multiple of 32 case
+        [1, 1, 1, 32],  # test single tile
+        [1, 1, 1, 352],  # test multiple tiles
+        [1, 1, 1, 323],  # test multiple tiles, not a multiple of 32
+    ),
+)
+@pytest.mark.parametrize(
+    "requires_grad",
+    (
+        [True, False],
+        [False, True],
+        [True, True],
+    ),
+)
+def test_moreh_dot_backward_callback(
+    input_shape,
+    requires_grad,
+    device,
+    use_program_cache,
+):
+    num_program_in_cache = []
+    for i in range(2):
+        run_moreh_dot_backward(input_shape, requires_grad, device)
+        num_program_in_cache.append(device.num_program_cache_entries())
+        dummy = torch.randn([32, 32])
+        tt_dummy = ttnn.from_torch(dummy, device=device)
+
+    logger.info(f"num_program_in_cache={num_program_in_cache}")
+    assert num_program_in_cache[0] > 0
+    assert num_program_in_cache[0] == num_program_in_cache[1]
diff --git a/tests/ttnn/unit_tests/operations/test_utils.py b/tests/ttnn/unit_tests/operations/test_utils.py
index 82c1b0f6bd9..9c304c02d27 100644
--- a/tests/ttnn/unit_tests/operations/test_utils.py
+++ b/tests/ttnn/unit_tests/operations/test_utils.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import ttnn
+import torch
 from models.utility_functions import is_wormhole_b0
 import copy
 import pytest
@@ -177,3 +178,22 @@ def get_lib_dtype(lib, dtype):
             "int32": lib.int32,
         }
         return dtype_map.get(dtype, None)
+
+
+def get_ttnn_torch_dtype(ttnn_dtype: ttnn.DataType) -> torch.dtype:
+    """
+    Maps a ttnn.DataType to the corresponding torch dtype that can handle them.
+    Parameters:
+    ttnn_dtype: ttnn.DataType
+        The ttnn data type to be mapped.
+    Returns:
+    torch.dtype or None
+        The corresponding torch dtype if the mapping exists, otherwise None.
+    """
+    dtype_map = {
+        ttnn.bfloat16: torch.bfloat16,
+        ttnn.float32: torch.float32,
+        ttnn.bfloat8_b: torch.bfloat16,
+        ttnn.int32: torch.int32,
+    }
+    return dtype_map.get(ttnn_dtype, None)

From 47cca2da8cc705af533011fa09f297a1a687b19c Mon Sep 17 00:00:00 2001
From: Mohamed Bahnas <116673264+mbahnasTT@users.noreply.github.com>
Date: Mon, 28 Oct 2024 08:11:18 +0300
Subject: [PATCH 24/30] [skip ci] Update README.md (#14315)

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index d917d520b0d..6863a9a9a9b 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,6 @@
 | [ViT](./models/demos/grayskull/vit)                                         | 9     | [e150](https://tenstorrent.com/hardware/grayskull)       | 1,360   | 2,000      |             |
 | [ViT](./models/demos/wormhole/vit)                                          | 8     | [n150](https://tenstorrent.com/hardware/wormhole)        | 912     | 1,600      |             |
 | [Stable Diffusion 1.4 (512x512)](./models/demos/wormhole/stable_diffusion)  | 1     | [n150](https://tenstorrent.com/hardware/wormhole)        | 0.167   | 0.3        |             |
-| [U-Net](./models/experimental/functional_unet)                              | 2     | [n150](https://tenstorrent.com/hardware/wormhole)        | 530     | 1000       | [v0.53.0-rc22](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc22) |
 
 
 ## NLPs

From c5d56a8639884e319297e3ed81600e0416b49604 Mon Sep 17 00:00:00 2001
From: Artem Yerofieiev <169092593+ayerofieiev-tt@users.noreply.github.com>
Date: Sun, 27 Oct 2024 22:26:52 -0700
Subject: [PATCH 25/30] [skip ci] Add a tech report for TTNN Graph Tracing
 (#14333)

* [skip ci] Create graph-tracing.md

* Remove trailing spaces
---
 tech_reports/ttnn/graph-tracing.md | 708 +++++++++++++++++++++++++++++
 1 file changed, 708 insertions(+)
 create mode 100644 tech_reports/ttnn/graph-tracing.md

diff --git a/tech_reports/ttnn/graph-tracing.md b/tech_reports/ttnn/graph-tracing.md
new file mode 100644
index 00000000000..e1e07c1d731
--- /dev/null
+++ b/tech_reports/ttnn/graph-tracing.md
@@ -0,0 +1,708 @@
+# TT-NN Graph Trace
+TT-NN provides a mechanism for tracing operations and memory activities in a neural network's execution.
+
+Using this trace it is possible to analyze the operation even without executing it on the accelerator.
+The output trace can then be processed to get a single number like Peak Memory Load or to print tabular data or visualize a call graph.
+
+## 🪄 How to Use
+Wrap any number of TT-NN calls with `GraphProcessor::begin_graph_capture` and `GraphProcessor::end_graph_capture` or use with any callable.
+In the example below `ttnn::zeros` is not included in a trace, but `ttnn::add` is
+https://github.com/tenstorrent/tt-metal/blob/4ae4ac3c30cd24ea27cbac8cc5811c90d077e9c0/tests/ttnn/unit_tests/gtests/test_graph_add.cpp#L50-L58
+
+You can then analyze the trace with some of the provided utility functions
+https://github.com/tenstorrent/tt-metal/blob/4ae4ac3c30cd24ea27cbac8cc5811c90d077e9c0/tests/ttnn/unit_tests/gtests/test_graph_add.cpp#L64-L66
+or process it manually to extract whatever data in whatever format, like this table
+```
+           current_op                           event  total_cb  total_buffer                                                                                 info
+0            ttnn::add                        begin_op         0       9011200                                                 {'inputs': '8', 'name': 'ttnn::add'}
+1         ttnn::repeat                        begin_op         0       9011200                                              {'inputs': '2', 'name': 'ttnn::repeat'}
+2         ttnn::repeat                 buffer_allocate         0      17203200    {'address': '753696', 'layout': 'INTERLEAVED', 'size': '8192000', 'type': 'DRAM'}
+3         ttnn::repeat                 buffer_allocate         0      17209344   {'address': '1073735680', 'layout': 'INTERLEAVED', 'size': '6144', 'type': 'DRAM'}
+4         ttnn::repeat        circular_buffer_allocate      4096      17209344    {'addr': '107360', 'core_range_set': '{[(x=0,y=0) - (x=7,y=7)]}', 'size': '4096'}
+5         ttnn::repeat               buffer_deallocate      4096      17203200                               {'layout': 'INTERLEAVED', 'size': '0', 'type': 'DRAM'}
+6         ttnn::repeat  circular_buffer_deallocate_all         0      17203200                                                                                   {}
+7   ttnn::prim::binary                        begin_op         0      17203200                                       {'inputs': '10', 'name': 'ttnn::prim::binary'}
+8   ttnn::prim::binary                 buffer_allocate         0      25395200   {'address': '1437728', 'layout': 'INTERLEAVED', 'size': '8192000', 'type': 'DRAM'}
+9   ttnn::prim::binary                 buffer_allocate         0      25409536  {'address': '1073735680', 'layout': 'INTERLEAVED', 'size': '14336', 'type': 'DRAM'}
+10  ttnn::prim::binary        circular_buffer_allocate      4096      25409536    {'addr': '107360', 'core_range_set': '{[(x=0,y=0) - (x=7,y=7)]}', 'size': '4096'}
+11  ttnn::prim::binary        circular_buffer_allocate      8192      25409536    {'addr': '111456', 'core_range_set': '{[(x=0,y=0) - (x=7,y=7)]}', 'size': '4096'}
+12  ttnn::prim::binary        circular_buffer_allocate     12288      25409536    {'addr': '115552', 'core_range_set': '{[(x=0,y=0) - (x=7,y=7)]}', 'size': '4096'}
+13  ttnn::prim::binary               buffer_deallocate     12288      25395200                               {'layout': 'INTERLEAVED', 'size': '0', 'type': 'DRAM'}
+14  ttnn::prim::binary  circular_buffer_deallocate_all         0      25395200                                                                                   {}
+15           ttnn::add               buffer_deallocate         0      17203200                               {'layout': 'INTERLEAVED', 'size': '0', 'type': 'DRAM'}
+16           ttnn::add  circular_buffer_deallocate_all         0      17203200                                                                                   {}
+```
+or a graph
+![trace](https://github.com/user-attachments/assets/42501a1f-8354-4b3b-a5d9-707f30b23f4f)
+
+## Trace Format
+Trace is captured as a JSON and you can find the code producing it [here](https://github.com/tenstorrent/tt-metal/blob/main/ttnn/cpp/ttnn/graph/graph_processor.cpp).
+Below you can find a detailed description of the schema.
+
+### Node Types
+The trace is represented as a directed graph, where each node corresponds to a specific operation or memory event. Below is an overview of the various types of nodes that can be present in the trace:
+
+First, each node has these parameters
+
+* `counter`: The unique identifier of the node within the graph.
+* `node_type`: node type, available types are listed below
+* `params`: the map of parameters \[mapping a string property name to a string value\]
+* `connections`: An array of connections to subsequent nodes.
+
+### Node Connections
+Each node in the graph maintains a list of connections to other nodes. These connections represent the flow of data and control through the various operations and memory events during the execution of the network.
+
+### 1\. capture\_start
+Marks the beginning of the graph capture process. This node is the root of the graph and does not have any parent nodes.
+
+#### Parameters
+* Empty, as this is a marker node.
+
+#### Connections
+* First element is next operation call
+* Last element is the corresponding `capture_end`
+
+### 2\. capture\_end
+Marks the end of the graph capture process.
+
+#### Parameters
+* Empty, as this is a marker node.
+
+### 3\. function\_start
+Represents the beginning of a function or operation within the graph. This node captures details about the function name and the number of input parameters it received.
+
+#### Parameters
+* `inputs`: Number of input parameters.
+* `name`: The name of the function or operation.
+
+#### Connections
+* Another op call, primitive op call, or a corresponding `function_end`
+
+#### Function types
+* TT-NN operation: for example ttnn::add, ttnn::repeat
+* TT-NN primitive operation: for example ttnn::prim::binary (if it uses TMP infra) or ttnn::prim::old\_infra\_device\_operation (if not)
+* TT-NN device operation: name “Device Operation” (should change in future)
+* Unregistered, but manually tracked functions: (will change)
+  create\_device\_tensor, Tensor::to, Tensor::cpu, Tensor::cpu\_sharded, Tensor::print, Tensor::pad, Tensor::unpad, Tensor::reshape
+  tt::tt\_metal::detail::convert\_python\_tensor\_to\_tt\_tensor, tt::tt\_metal::detail::convert\_tt\_tensor\_to\_torch\_tensor
+
+### 4\. function\_end
+Marks the end of a function or operation. This node is paired with the corresponding `function_start` node and records the outputs or final state of the function.
+
+#### Parameters
+* `name`: The name of the function or operation.
+
+#### Connections
+* Output tensor/s
+* Next control flow node (potentially `capture_end` node)
+
+### 5\. buffer
+Represents the allocation of a memory buffer. This node records details about the buffer's size, type (e.g., DRAM or L1 cache), and layout.
+
+#### Parameters
+* `size`: The size of the buffer in bytes.
+* `type`: The type of memory (e.g., "DRAM", "L1").
+* `layout`: The memory layout (e.g., "INTERLEAVED", "SINGLE\_BANK").
+
+#### Connections
+* Single element in the connection list specifies the associated Tensor ID
+
+### 6\. buffer\_allocate
+Denotes the allocation of a buffer in memory. This node captures the specifics of the memory allocation event, including the buffer's address and type.
+
+#### Parameters
+* `size`: The size of the buffer in bytes.
+* `address`: The memory address of the buffer.
+* `type`: The type of memory (e.g., "DRAM", "L1").
+* `layout`: The memory layout.
+
+#### Connections
+* Single element in the connections list specifies the allocated buffer ID
+
+### 7\. buffer\_deallocate
+Represents the deallocation of a buffer from memory. This node records the details of the buffer being deallocated.
+
+#### Parameters
+* `size`: The size of the buffer in bytes.
+* `type`: The type of memory (e.g., "DRAM", "L1").
+* `layout`: The memory layout.
+
+#### Connections
+* Single element in the connections list specifies the deallocated buffer ID
+
+### 8\. circular\_buffer\_allocate
+Represents the allocation of a circular buffer, typically used in handling streaming data or multi-buffering strategies. This node captures details like the core range set involved and the buffer size.
+
+#### Parameters
+* `size`: The size of the circular buffer in bytes.
+* `address`: The memory address associated with the buffer.
+* `core_range_set`: The range of cores involved in the circular buffer.
+
+#### Connections
+Usually empty
+
+### 9\. circular\_buffer\_deallocate\_all
+Marks the deallocation of all circular buffers. This is a bulk operation and is connected to all circular buffer nodes that are being deallocated.
+
+#### Parameters
+Empty, as this operation deallocates all circular buffers.
+
+#### Connections
+Usually empty
+
+### 10\. tensor
+Represents a tensor in the graph. This node captures the tensor's shape and is connected to the memory buffer it uses, if applicable.
+`[#]` means that each tensor is indexed, and instead of \# in real trace you will see an id
+
+#### Parameters
+* `tensor_id`: The identifier of the tensor.
+* `shape`: The shape of the tensor.
+
+#### Connections
+Usually specifies function\_start of a function where given tensor is passed as an argument/used
+
+## Operation Dispatching
+When run in `NO_DISPATCH` run mode, real allocations do not happen, so trace collection does not have side effects on the allocator state.
+You can pass unrealistically big tensors in this mode and unless an operation does upper limit validation, you still can collect the trace.
+In this mode trace collection is faster because ops are not dispatched to the device.
+
+When run in the `NORMAL` mode, memory can be fragmented, which can lead to a different trace and you see real addresses where everything is allocated.
+
+## Python
+Tracing is available through Python too
+https://github.com/tenstorrent/tt-metal/blob/4ae4ac3c30cd24ea27cbac8cc5811c90d077e9c0/tests/ttnn/unit_tests/test_graph_capture.py#L21-L25
+
+Here is sample code that prints the table shown at the beginning of this document
+
+```py
+def process_allocations(graph):
+   df = pd.DataFrame(columns=['current_op', 'event', 'total_cb', 'total_buffer', 'info'])
+
+   cur_op = []
+   total_cb = 0
+   total_buffer = 0
+   tensors = set()
+   i = 1 # lets skip initial node
+   while i < len(graph):
+       params = ''
+       v = graph[i]
+       params = v.params
+       print(v, len(df))
+       i += 1
+       if v.node_type == 'function_start':
+           if len(cur_op) == 0:
+               # entering the first op, let's get all input tensors
+               while i < len(graph):
+                   print(graph[i], len(df))
+                   if graph[i].node_type == 'buffer':
+                       total_buffer += int(graph[i].params['size'])
+                       i += 1
+                   elif graph[i].node_type == 'tensor':
+                       i += 1
+                   else:
+                       break
+           name = v.params['name']
+           if name == "ttnn::prim::old_infra_device_operation":
+               name = "ttnn::prim::old_infra_op"
+           cur_op.append(name)
+       if v.node_type == 'circular_buffer_allocate':
+           total_cb += int(v.params['size'])
+       if v.node_type == 'circular_buffer_deallocate_all':
+           total_cb = 0
+       if v.node_type == 'buffer_allocate':
+           total_buffer += int(v.params['size'])
+       if v.node_type == 'function_end':
+           cur_op.pop()
+           #continue
+       if v.node_type == 'tensor':
+           continue
+       if v.node_type == 'buffer_deallocate':
+           total_buffer -= int(graph[v.connections[0]].params['size'])
+       if v.node_type == 'buffer':
+           continue
+       if len(cur_op) > 0:
+           data =  {'current_op': cur_op[-1], 'event' : v.node_type, 'total_cb': total_cb, 'total_buffer': total_buffer, 'info' : params}
+           df.loc[len(df)] = data
+   return df
+```
+
+## Sample Trace
+
+This is a sample trace of running `ttnn::add(Shape[1, 1, 32, 32], Shape[4, 1, 32, 32])`.
+This setup requires broadcasting the first tensor, so the trace contains a call to `ttnn::repeat`.
+High level call stack here is:
+
+```
+ttnn::add
+ttnn::repeat
+ttnn::prim::old_infra_device_operation (calling ttnn primitive operation)
+Device Operation (dispatching device operation)
+create_device_tensor (creates intermediate output for ttnn::repeat)
+ttnn::prim::binary (calling ttnn primitive operation)
+Device Operation (dispatching device operation)
+create_device_tensor (creates final output)
+```
+
+And you can see when each Buffer and CB is allocated / deallocated.
+
+### PrettyPrint
+
+```
+Capture Start
+Begin: tt::tt_metal::detail::convert_python_tensor_to_tt_tensor
+End:   tt::tt_metal::detail::convert_python_tensor_to_tt_tensor
+Add Tensor: 0
+Begin: ttnn::to_layout
+    Begin: Tensor::reshape
+    End:   Tensor::reshape
+    Add Tensor: 1
+    Begin: Tensor::pad
+    End:   Tensor::pad
+    Add Tensor: 2
+    Begin: Tensor::to
+    End:   Tensor::to
+    Add Tensor: 3
+End:   ttnn::to_layout
+Begin: Tensor::to
+    Add Device Buffer
+    Allocate Device Buffer
+End:   Tensor::to
+Add Tensor: 4
+Begin: ttnn::add
+    Begin: Tensor::to
+        Add Tensor: -1
+        Add Device Buffer
+        Allocate Device Buffer
+    End:   Tensor::to
+    Add Tensor: 5
+    Begin: ttnn::prim::binary
+        Begin: BinaryDeviceOperation
+            Begin: tt::tt_metal::create_device_tensor
+                Add Device Buffer
+                Allocate Device Buffer
+            End:   tt::tt_metal::create_device_tensor
+            Add Tensor: 6
+            Add Tensor: 6
+            Add Device Buffer
+            Allocate Device Buffer
+            Allocate Device Buffer
+            Allocate Device Buffer
+            Allocate Device Buffer
+            Deallocate Device Buffer
+        End:   BinaryDeviceOperation
+        Add Tensor: 7
+    End:   ttnn::prim::binary
+    Deallocate Device Buffer
+End:   ttnn::add
+Begin: Tensor::cpu
+End:   Tensor::cpu
+Add Tensor: 8
+Begin: Tensor::to
+End:   Tensor::to
+Add Tensor: 9
+Begin: tt::tt_metal::detail::convert_tt_tensor_to_torch_tensor
+End:   tt::tt_metal::detail::convert_tt_tensor_to_torch_tensor
+Deallocate Device Buffer
+Deallocate Device Buffer
+```
+
+### Visualizer
+
+![visualizer](https://github.com/user-attachments/assets/03df00c6-4902-416d-a26a-6ffe874537a5)
+
+
+## Raw JSON
+```
+[
+    {
+        "connections": [
+            1,
+            32
+        ],
+        "counter": 0,
+        "node_type": "capture_start",
+        "params": {}
+    },
+    {
+        "connections": [
+            3,
+            5,
+            6,
+            18,
+            30,
+            31
+        ],
+        "counter": 1,
+        "node_type": "function_start",
+        "params": {
+            "inputs": "2",
+            "name": "ttnn::add"
+        }
+    },
+    {
+        "connections": [
+            1,
+            18
+        ],
+        "counter": 2,
+        "node_type": "tensor",
+        "params": {
+            "shape": "ttnn.Shape([4, 3, 32, 32])"
+        }
+    },
+    {
+        "connections": [
+            2,
+            2
+        ],
+        "counter": 3,
+        "name": "buffer",
+        "params": {
+            "layout": "INTERLEAVED",
+            "size": "24576",
+            "type": "L1"
+        }
+    },
+    {
+        "connections": [
+            1,
+            6
+        ],
+        "counter": 4,
+        "name": "tensor[1]",
+        "params": {
+            "shape": "ttnn.Shape([1, 3, 32, 32])"
+        }
+    },
+    {
+        "connections": [
+            4,
+            4
+        ],
+        "counter": 5,
+        "name": "buffer",
+        "params": {
+            "layout": "INTERLEAVED",
+            "size": "6144",
+            "type": "L1"
+        }
+    },
+    {
+        "connections": [
+            7,
+            17
+        ],
+        "counter": 6,
+        "name": "function_start",
+        "params": {
+            "inputs": "2",
+            "name": "ttnn::repeat"
+        }
+    },
+    {
+        "connections": [
+            8,
+            16
+        ],
+        "counter": 7,
+        "name": "function_start",
+        "params": {
+            "inputs": "5",
+            "name": "ttnn::prim::old_infra_device_operation"
+        }
+    },
+    {
+        "connections": [
+            9,
+            14,
+            15
+        ],
+        "counter": 8,
+        "name": "function_start",
+        "params": {
+            "inputs": "2",
+            "name": "Device Operation"
+        }
+    },
+    {
+        "connections": [
+            10,
+            11,
+            12
+        ],
+        "counter": 9,
+        "name": "function_start",
+        "params": {
+            "inputs": "5",
+            "name": "create_device_tensor"
+        }
+    },
+    {
+        "connections": [
+            13,
+            13,
+            13,
+            13,
+            13
+        ],
+        "counter": 10,
+        "name": "buffer",
+        "params": {
+            "layout": "INTERLEAVED",
+            "size": "24576",
+            "type": "L1"
+        }
+    },
+    {
+        "connections": [
+            10
+        ],
+        "counter": 11,
+        "name": "buffer_allocate",
+        "params": {
+            "address": "1953396066",
+            "layout": "INTERLEAVED",
+            "size": "24576",
+            "type": "L1"
+        }
+    },
+    {
+        "connections": [
+            13
+        ],
+        "counter": 12,
+        "name": "function_end",
+        "params": {
+            "name": "create_device_tensor"
+        }
+    },
+    {
+        "connections": [
+            18
+        ],
+        "counter": 13,
+        "name": "tensor[2]",
+        "params": {
+            "shape": "ttnn.Shape([4, 3, 32, 32])"
+        }
+    },
+    {
+        "connections": [],
+        "counter": 14,
+        "name": "circular_buffer_allocate",
+        "params": {
+            "address": "0",
+            "core_range_set": "{[(x=0,y=0) - (x=0,y=7)], [(x=1,y=0) - (x=1,y=3)]}",
+            "size": "4096"
+        }
+    },
+    {
+        "connections": [
+            13
+        ],
+        "counter": 15,
+        "name": "function_end",
+        "params": {
+            "name": "Device Operation"
+        }
+    },
+    {
+        "connections": [
+            13
+        ],
+        "counter": 16,
+        "name": "function_end",
+        "params": {
+            "name": "ttnn::prim::old_infra_device_operation"
+        }
+    },
+    {
+        "connections": [
+            13,
+            18
+        ],
+        "counter": 17,
+        "name": "function_end",
+        "params": {
+            "name": "ttnn::repeat"
+        }
+    },
+    {
+        "connections": [
+            19,
+            29
+        ],
+        "counter": 18,
+        "name": "function_start",
+        "params": {
+            "inputs": "10",
+            "name": "ttnn::prim::binary"
+        }
+    },
+    {
+        "connections": [
+            20,
+            25,
+            26,
+            27,
+            28
+        ],
+        "counter": 19,
+        "name": "function_start",
+        "params": {
+            "inputs": "2",
+            "name": "Device Operation"
+        }
+    },
+    {
+        "connections": [
+            21,
+            22,
+            23
+        ],
+        "counter": 20,
+        "name": "function_start",
+        "params": {
+            "inputs": "5",
+            "name": "create_device_tensor"
+        }
+    },
+    {
+        "connections": [
+            24,
+            24,
+            24,
+            24
+        ],
+        "counter": 21,
+        "name": "buffer",
+        "params": {
+            "layout": "INTERLEAVED",
+            "size": "24576",
+            "type": "L1"
+        }
+    },
+    {
+        "connections": [
+            21
+        ],
+        "counter": 22,
+        "name": "buffer_allocate",
+        "params": {
+            "address": "0",
+            "layout": "INTERLEAVED",
+            "size": "24576",
+            "type": "L1"
+        }
+    },
+    {
+        "connections": [
+            24
+        ],
+        "counter": 23,
+        "name": "function_end",
+        "params": {
+            "name": "create_device_tensor"
+        }
+    },
+    {
+        "connections": [],
+        "counter": 24,
+        "name": "tensor[3]",
+        "params": {
+            "shape": "ttnn.Shape([4, 3, 32, 32])"
+        }
+    },
+    {
+        "connections": [],
+        "counter": 25,
+        "name": "circular_buffer_allocate",
+        "params": {
+            "address": "0",
+            "core_range_set": "{[(x=0,y=0) - (x=7,y=7)]}",
+            "size": "4096"
+        }
+    },
+    {
+        "connections": [],
+        "counter": 26,
+        "name": "circular_buffer_allocate",
+        "params": {
+            "address": "0",
+            "core_range_set": "{[(x=0,y=0) - (x=7,y=7)]}",
+            "size": "4096"
+        }
+    },
+    {
+        "connections": [],
+        "counter": 27,
+        "name": "circular_buffer_allocate",
+        "params": {
+            "address": "0",
+            "core_range_set": "{[(x=0,y=0) - (x=7,y=7)]}",
+            "size": "4096"
+        }
+    },
+    {
+        "connections": [
+            24
+        ],
+        "counter": 28,
+        "name": "function_end",
+        "params": {
+            "name": "Device Operation"
+        }
+    },
+    {
+        "connections": [
+            24
+        ],
+        "counter": 29,
+        "name": "function_end",
+        "params": {
+            "name": "ttnn::prim::binary"
+        }
+    },
+    {
+        "connections": [
+            10
+        ],
+        "counter": 30,
+        "name": "buffer_deallocate",
+        "params": {
+            "layout": "INTERLEAVED",
+            "size": "0",
+            "type": "L1"
+        }
+    },
+    {
+        "connections": [
+            24,
+            33
+        ],
+        "counter": 31,
+        "name": "function_end",
+        "params": {
+            "name": "ttnn::add"
+        }
+    },
+    {
+        "connections": [
+            21
+        ],
+        "counter": 32,
+        "name": "buffer_deallocate",
+        "params": {
+            "layout": "INTERLEAVED",
+            "size": "0",
+            "type": "L1"
+        }
+    },
+    {
+        "connections": [],
+        "counter": 33,
+        "name": "capture_end",
+        "params": {}
+    }
+]
+```

From 9b21ae31d9601948337ce4fee0304f3387b93895 Mon Sep 17 00:00:00 2001
From: Bui Chi Trung <52347285+BuiChiTrung@users.noreply.github.com>
Date: Mon, 28 Oct 2024 15:35:28 +0700
Subject: [PATCH 26/30] #13878: Add bernoulli operation to TTNN (#13921)

* #13320: add draft code to test dropout op

#13320: rollback add_2_integers_in_compute changes

#13320: add draft code for uniform in programming_examples

#13320: check random numbers frequency using map

#13320: add draft code

#13320: try remove reader kernel and cb1

#13320: add skeleton code for uniform

#13220: support from, to param

#13320: remove reader kernel

#13320: add cb_intermed0

#13320: add unit-test

#13320: refactor writer kernel by exposing an sfpu API

#13320: remove native dir

#13320: refactor

#13320: update unit test

#13320: change compute_kernel_config param to const optional and add license

#13320: add cb id as compile args

#13320: raise exception when from = to

#13320: skip test for grayskull

#13320: revise uniform operation

#13320: update program factory

#13320: update unit-test for bfloat16 and refactor cb var name in kernels

#13320: add feature to benchmark ttnn with torch uniform

#13320: refactor

#13320: update compute_output_shapes func

#13320: update unit-test

#13320: add draft code to test dropout op

#13320: rollback add_2_integers_in_compute changes

#13320: add draft code for uniform in programming_examples

#13320: check random numbers frequency using map

#13320: add draft code

#13320: try remove reader kernel and cb1

#13320: add skeleton code for uniform

#13220: support from, to param

#13320: remove reader kernel

#13320: add cb_intermed0

#13320: add unit-test

#13320: refactor writer kernel by exposing an sfpu API

#13320: remove native dir

#13320: refactor

#13320: update unit test

#13320: change compute_kernel_config param to const optional and add license

#13320: add cb id as compile args

#13320: raise exception when from = to

#13320: skip test for grayskull

#13320: revise uniform operation

#13320: update program factory

#13320: update unit-test for bfloat16 and refactor cb var name in kernels

#13320: add feature to benchmark ttnn with torch uniform

#13320: refactor

#13320: update compute_output_shapes func

#13878: draft version

#13878: backup error code when use cb_out0

#13878: replace cb_out0 with cb_intermed1

#13878: refactor

#13878: add comment to writer kernel

#13878: update unit tests

#13878: rollback submodule

#13878: remove dprint

#13878: change param name to output and dtype

#13320: add draft code to test dropout op

#13320: rollback add_2_integers_in_compute changes

#13320: add draft code for uniform in programming_examples

#13320: check random numbers frequency using map

#13320: add draft code

#13320: try remove reader kernel and cb1

#13320: add skeleton code for uniform

#13220: support from, to param

#13320: remove reader kernel

#13320: add cb_intermed0

#13320: add unit-test

#13320: refactor writer kernel by exposing an sfpu API

#13320: remove native dir

#13320: refactor

#13320: update unit test

#13320: change compute_kernel_config param to const optional and add license

#13320: add cb id as compile args

#13320: raise exception when from = to

#13320: skip test for grayskull

#13320: revise uniform operation

#13320: update program factory

#13320: update unit-test for bfloat16 and refactor cb var name in kernels

#13320: add feature to benchmark ttnn with torch uniform

#13320: refactor

#13320: update compute_output_shapes func

#13320: update unit-test

#13320: update sfpu api

#13320: remove un-used header in kernel and counter in sfpu API

#13320: remove TTI_SFPSETSGN instr

#13320: allow set fp32_dest_acc to false

#13320: update callback test

#13878: update unit-tests

* #13878: update writer kernel

* #13878: update writer kernel

* #13878: remove unused sfpu api

* #13878: remove unused header

* #13878: update docs
---
 .../unit_tests/operations/test_bernoulli.py   | 120 +++++++++++++
 .../unit_tests/operations/test_uniform.py     |  20 +--
 ttnn/CMakeLists.txt                           |   6 +-
 ttnn/cpp/pybind11/operations/__init__.hpp     |   4 +
 .../ttnn/operations/bernoulli/bernoulli.cpp   |  19 ++
 .../ttnn/operations/bernoulli/bernoulli.hpp   |  23 +++
 .../operations/bernoulli/bernoulli_pybind.cpp |  46 +++++
 .../operations/bernoulli/bernoulli_pybind.hpp |  13 ++
 .../device/bernoulli_device_operation.cpp     |  85 +++++++++
 .../device/bernoulli_device_operation.hpp     |  69 ++++++++
 .../device/bernoulli_program_factory.cpp      | 166 ++++++++++++++++++
 .../device/kernels/compute_bernoulli.cpp      |  43 +++++
 .../device/kernels/reader_bernoulli.cpp       |  26 +++
 .../device/kernels/writer_bernoulli.cpp       |  78 ++++++++
 .../operations/uniform/uniform_pybind.cpp     |  36 ++--
 15 files changed, 721 insertions(+), 33 deletions(-)
 create mode 100644 tests/ttnn/unit_tests/operations/test_bernoulli.py
 create mode 100644 ttnn/cpp/ttnn/operations/bernoulli/bernoulli.cpp
 create mode 100644 ttnn/cpp/ttnn/operations/bernoulli/bernoulli.hpp
 create mode 100644 ttnn/cpp/ttnn/operations/bernoulli/bernoulli_pybind.cpp
 create mode 100644 ttnn/cpp/ttnn/operations/bernoulli/bernoulli_pybind.hpp
 create mode 100644 ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.cpp
 create mode 100644 ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp
 create mode 100644 ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_program_factory.cpp
 create mode 100644 ttnn/cpp/ttnn/operations/bernoulli/device/kernels/compute_bernoulli.cpp
 create mode 100644 ttnn/cpp/ttnn/operations/bernoulli/device/kernels/reader_bernoulli.cpp
 create mode 100644 ttnn/cpp/ttnn/operations/bernoulli/device/kernels/writer_bernoulli.cpp

diff --git a/tests/ttnn/unit_tests/operations/test_bernoulli.py b/tests/ttnn/unit_tests/operations/test_bernoulli.py
new file mode 100644
index 00000000000..5c3c68ab5c4
--- /dev/null
+++ b/tests/ttnn/unit_tests/operations/test_bernoulli.py
@@ -0,0 +1,120 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import pytest
+import ttnn
+import numpy as np
+from tests.ttnn.unit_tests.operations.test_utils import (
+    get_compute_kernel_options,
+    compute_kernel_options,
+    compute_kernel_ids,
+    get_lib_dtype,
+)
+from collections import Counter
+from loguru import logger
+
+
+def run_bernoulli(shape, in_dtype, out_dtype, device, is_out_alloc=False, compute_kernel_options=None):
+    compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
+    cpu_input = torch.rand(shape, dtype=get_lib_dtype(torch, in_dtype))
+    npu_input = ttnn.from_torch(cpu_input, device=device, dtype=get_lib_dtype(ttnn, in_dtype), layout=ttnn.TILE_LAYOUT)
+
+    npu_output = None
+    if is_out_alloc:
+        cpu_output = torch.rand(shape, dtype=get_lib_dtype(torch, out_dtype))
+        npu_output = ttnn.from_torch(
+            cpu_output, device=device, dtype=get_lib_dtype(ttnn, out_dtype), layout=ttnn.TILE_LAYOUT
+        )
+
+    one_probs = []
+    for _ in range(10):
+        if is_out_alloc:
+            ttnn.bernoulli(
+                npu_input,
+                output=npu_output,
+                dtype=get_lib_dtype(ttnn, out_dtype),
+                compute_kernel_config=compute_kernel_config,
+            )
+        else:
+            npu_output = ttnn.bernoulli(
+                npu_input,
+                dtype=get_lib_dtype(ttnn, out_dtype),
+                compute_kernel_config=compute_kernel_config,
+            )
+
+        tt_output = ttnn.to_torch(npu_output).reshape(shape)
+        tt_output_list = tt_output.flatten().tolist()
+
+        c = Counter(tt_output_list)
+        one_probs.append(c[1] / len(tt_output_list))
+
+    expected_one_prob = 0.5
+    assert np.allclose(expected_one_prob, np.mean(one_probs), rtol=0.05)
+
+
+# fmt: off
+@pytest.mark.parametrize("shape",  # 1-D, 2-D and 4-D inputs
+    [
+        [2003],
+        [500, 500],
+        [1, 512, 2, 256],
+    ],
+)
+@pytest.mark.parametrize("in_dtype",  # dtype of the probability (input) tensor
+    [
+        "bfloat16",
+        "float32"
+    ]
+)
+@pytest.mark.parametrize("out_dtype",  # dtype requested for the result and used for the preallocated output
+    [
+        "bfloat16",
+        "float32"
+    ]
+)
+@pytest.mark.parametrize("is_out_alloc",  # with / without a preallocated output tensor
+    [
+        True,
+        False
+    ]
+)
+# fmt: on
+def test_bernoulli(shape, in_dtype, out_dtype, device, is_out_alloc):  # runs every combination of the parameters above
+    torch.manual_seed(0)  # fixed seed for the torch.rand calls inside run_bernoulli
+    run_bernoulli(shape, in_dtype, out_dtype, device, is_out_alloc)
+
+
+@pytest.mark.parametrize(
+    "shape",
+    [
+        [1, 21, 123, 24],
+    ],
+)
+@pytest.mark.parametrize("in_dtype", ["float32"])
+@pytest.mark.parametrize("out_dtype", ["float32"])
+@pytest.mark.parametrize("is_out_alloc", [True, False])
+def test_bernoulli_callback(shape, in_dtype, out_dtype, device, is_out_alloc, use_program_cache):
+    torch.manual_seed(0)
+    num_program_cache_entries_list = []
+    for i in range(2):
+        run_bernoulli(shape, in_dtype, out_dtype, device, is_out_alloc)
+        # Add dummy tensor to make sure that created tensor in 2 iteration don't share the same addr
+        tt_dummy_tensor = ttnn.empty([1, 1, 32, 32], ttnn.bfloat16, ttnn.TILE_LAYOUT, device)
+        num_program_cache_entries_list.append(device.num_program_cache_entries())
+    logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}")
+    assert num_program_cache_entries_list[0] > 0
+    assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]
+
+
+@pytest.mark.parametrize(
+    "shape",
+    [[512, 512], [5, 4, 70, 40]],
+)
+@pytest.mark.parametrize("in_dtype", ["float32"])
+@pytest.mark.parametrize("out_dtype", ["float32"])
+@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
+def test_uniform_with_compute_kernel_options(shape, in_dtype, out_dtype, device, compute_kernel_options):
+    torch.manual_seed(0)
+    run_bernoulli(shape, in_dtype, out_dtype, device, compute_kernel_options)
diff --git a/tests/ttnn/unit_tests/operations/test_uniform.py b/tests/ttnn/unit_tests/operations/test_uniform.py
index 0ee59766878..9c3f05a6a6a 100644
--- a/tests/ttnn/unit_tests/operations/test_uniform.py
+++ b/tests/ttnn/unit_tests/operations/test_uniform.py
@@ -94,9 +94,9 @@ def run_uniform(shape, rand_range, dtype, device, compute_kernel_options=None, m
         )
 
 
-# fmt: off
 @skip_for_grayskull("Requires wormhole_b0 to run")
-@pytest.mark.parametrize("shape",
+@pytest.mark.parametrize(
+    "shape",
     [
         [32, 32],
         [64, 64],
@@ -105,20 +105,8 @@ def run_uniform(shape, rand_range, dtype, device, compute_kernel_options=None, m
         [1024, 1024],
     ],
 )
-@pytest.mark.parametrize("rand_range",
-    [
-        [0, 1],
-        [2.1, 9],
-        [-5.1, 1.2]
-    ]
-)
-@pytest.mark.parametrize("dtype",
-    [
-        "bfloat16",
-        "float32"
-    ]
-)
-# fmt: on
+@pytest.mark.parametrize("rand_range", [[0, 1], [2.1, 9], [-5.1, 1.2]])
+@pytest.mark.parametrize("dtype", ["bfloat16", "float32"])
 def test_uniform(shape, rand_range, dtype, device):
     torch.manual_seed(0)
     run_uniform(shape, rand_range, dtype, device)
diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt
index d03fbfbdc64..5042aedc602 100644
--- a/ttnn/CMakeLists.txt
+++ b/ttnn/CMakeLists.txt
@@ -371,7 +371,10 @@ set(ALL_TTNN_SRCS
     ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/uniform/uniform_pybind.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/uniform/device/uniform_device_operation.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/uniform/device/uniform_program_factory.cpp
-
+    ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/bernoulli/bernoulli.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/bernoulli/bernoulli_pybind.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/bernoulli/device/bernoulli_program_factory.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_program_factory.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_adam/moreh_adam_pybind.cpp
@@ -519,7 +522,6 @@ set(ALL_TTNN_SRCS
     ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_sum/moreh_sum_pybind.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_sum/moreh_sum.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_helper_functions.cpp
-
     ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/index_fill/index_fill.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/index_fill/index_fill_pybind.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.cpp
diff --git a/ttnn/cpp/pybind11/operations/__init__.hpp b/ttnn/cpp/pybind11/operations/__init__.hpp
index 20d7494c87b..6c165efb027 100644
--- a/ttnn/cpp/pybind11/operations/__init__.hpp
+++ b/ttnn/cpp/pybind11/operations/__init__.hpp
@@ -10,6 +10,7 @@
 #include "pybind11/operations/copy.hpp"
 #include "pybind11/operations/core.hpp"
 #include "pybind11/operations/creation.hpp"
+#include "ttnn/operations/bernoulli/bernoulli_pybind.hpp"
 #include "ttnn/operations/ccl/all_gather/all_gather_pybind.hpp"
 #include "ttnn/operations/ccl/reduce_scatter/reduce_scatter_pybind.hpp"
 #include "ttnn/operations/conv/conv2d/conv2d_pybind.hpp"
@@ -152,6 +153,9 @@ void py_module(py::module& module) {
 
     auto m_index_fill = module.def_submodule("index_fill", "index_fill operation");
     index_fill::bind_index_fill_operation(m_index_fill);
+
+    auto m_bernoulli = module.def_submodule("bernoulli", "bernoulli operations");
+    bernoulli::bind_bernoulli_operation(m_bernoulli);
 }
 }  // namespace operations
 
diff --git a/ttnn/cpp/ttnn/operations/bernoulli/bernoulli.cpp b/ttnn/cpp/ttnn/operations/bernoulli/bernoulli.cpp
new file mode 100644
index 00000000000..61c4dbe5622
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/bernoulli/bernoulli.cpp
@@ -0,0 +1,19 @@
+
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bernoulli.hpp"
+
+#include "device/bernoulli_device_operation.hpp"
+
+namespace ttnn::operations::bernoulli {
+Tensor Bernoulli::invoke(  // thin wrapper: forwards every argument unchanged to ttnn::prim::bernoulli
+    const Tensor& input,
+    const std::optional<Tensor>& output,
+    const std::optional<DataType>& dtype,
+    const std::optional<MemoryConfig>& memory_config,
+    const std::optional<DeviceComputeKernelConfig>& compute_kernel_config) {
+    return ttnn::prim::bernoulli(input, output, dtype, memory_config, compute_kernel_config);
+}
+}  // namespace ttnn::operations::bernoulli
diff --git a/ttnn/cpp/ttnn/operations/bernoulli/bernoulli.hpp b/ttnn/cpp/ttnn/operations/bernoulli/bernoulli.hpp
new file mode 100644
index 00000000000..0562def8df9
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/bernoulli/bernoulli.hpp
@@ -0,0 +1,23 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include "ttnn/decorators.hpp"
+#include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
+
+namespace ttnn::operations::bernoulli {
+struct Bernoulli {  // operation type registered in this header as "ttnn::bernoulli"
+    static Tensor invoke(
+        const Tensor& input,                               // documented in the Python binding as the probability tensor
+        const std::optional<Tensor>& output,               // optional caller-provided result tensor
+        const std::optional<DataType>& dtype,              // optional output dtype
+        const std::optional<MemoryConfig>& memory_config,  // optional memory config for the output
+        const std::optional<DeviceComputeKernelConfig>& compute_kernel_config);
+};
+}  // namespace ttnn::operations::bernoulli
+
+namespace ttnn {
+constexpr auto bernoulli =  // public entry point, registered under the name "ttnn::bernoulli"
+    ttnn::register_operation_with_auto_launch_op<"ttnn::bernoulli", ttnn::operations::bernoulli::Bernoulli>();
+}  // namespace ttnn
diff --git a/ttnn/cpp/ttnn/operations/bernoulli/bernoulli_pybind.cpp b/ttnn/cpp/ttnn/operations/bernoulli/bernoulli_pybind.cpp
new file mode 100644
index 00000000000..405aefc834c
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/bernoulli/bernoulli_pybind.cpp
@@ -0,0 +1,46 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bernoulli_pybind.hpp"
+
+#include "bernoulli.hpp"
+#include "pybind11/decorators.hpp"
+
+namespace ttnn::operations::bernoulli {
+void bind_bernoulli_operation(py::module &module) {  // binds ttnn::bernoulli into `module` with the docstring below
+    std::string doc =
+        R"doc(
+        Draws binary random numbers (0 or 1) from a Bernoulli distribution.
+
+        Args:
+            input (ttnn.Tensor): The input tensor of probability values for the Bernoulli distribution.
+
+        Keyword args:
+            output (ttnn.Tensor, optional): The output tensor.
+            dtype (ttnn.DataType, optional): Output tensor dtype, default float32.
+            memory_config (ttnn.MemoryConfig, optional): Memory configuration for the operation. Defaults to `None`.
+            compute_kernel_config (ttnn.DeviceComputeKernelConfig, optional): Configuration for the compute kernel. Defaults to `None`.
+
+        Returns:
+            ttnn.Tensor: the output tensor.
+
+        Example:
+            >>> input = ttnn.from_torch(torch.empty(3, 3).uniform_(0, 1), dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
+            >>> output = ttnn.bernoulli(input)
+
+        )doc";
+
+    bind_registered_operation(
+        module,
+        ttnn::bernoulli,
+        doc,
+        ttnn::pybind_arguments_t{
+            py::arg("input"),
+            py::kw_only(),  // everything after `input` must be passed by keyword
+            py::arg("output") = std::nullopt,
+            py::arg("dtype") = std::nullopt,
+            py::arg("memory_config") = std::nullopt,
+            py::arg("compute_kernel_config") = std::nullopt});
+}
+}  // namespace ttnn::operations::bernoulli
diff --git a/ttnn/cpp/ttnn/operations/bernoulli/bernoulli_pybind.hpp b/ttnn/cpp/ttnn/operations/bernoulli/bernoulli_pybind.hpp
new file mode 100644
index 00000000000..5c321318bba
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/bernoulli/bernoulli_pybind.hpp
@@ -0,0 +1,13 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "pybind11/pybind_fwd.hpp"
+
+namespace py = pybind11;
+
+namespace ttnn::operations::bernoulli {
+void bind_bernoulli_operation(py::module &module);  // adds the bernoulli Python binding to `module`
+}  // namespace ttnn::operations::bernoulli
diff --git a/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.cpp b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.cpp
new file mode 100644
index 00000000000..da542429466
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.cpp
@@ -0,0 +1,85 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bernoulli_device_operation.hpp"
+
+namespace ttnn::operations::bernoulli {
+
+BernoulliDeviceOperation::program_factory_t BernoulliDeviceOperation::select_program_factory(
+    const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
+    return ProgramFactory{};  // always the same factory; neither argument is consulted
+}
+
+void BernoulliDeviceOperation::validate_inputs(  // common checks used on both program-cache hit and miss
+    const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
+    const auto& input = tensor_args.input;
+    const auto& output = tensor_args.output;
+    // Input: on device, backed by a buffer, tile layout, BFLOAT16 or FLOAT32.
+    TT_FATAL(input.storage_type() == StorageType::DEVICE, "Bernoulli: Input tensor need to be on device");
+    TT_FATAL(input.buffer() != nullptr, "Bernoulli: Input tensor need to be allocated in buffers on device");
+    TT_FATAL((input.get_layout() == Layout::TILE), "Bernoulli: Input tensor must be tilized");
+    TT_FATAL(
+        input.get_dtype() == DataType::BFLOAT16 || input.get_dtype() == DataType::FLOAT32,
+        "Bernoulli: Input tensor must be Float32 or Bfloat16");
+    // A caller-supplied output gets the same checks. NOTE(review): only the element count is compared, not the shape.
+    if (output.has_value()) {
+        TT_FATAL(output.value().storage_type() == StorageType::DEVICE, "Bernoulli: Output tensor need to be on device");
+        TT_FATAL(
+            output.value().buffer() != nullptr, "Bernoulli: Output tensor need to be allocated in buffers on device");
+        TT_FATAL((output.value().get_layout() == Layout::TILE), "Bernoulli: Output tensor must be tilized");
+        TT_FATAL(
+            output.value().get_dtype() == DataType::BFLOAT16 || output.value().get_dtype() == DataType::FLOAT32,
+            "Bernoulli: Output tensor must be Float32 or Bfloat16");
+        TT_FATAL(
+            input.get_logical_volume() == output.value().get_logical_volume(),
+            "Bernoulli: Output and input tensor must have the same logical volume");
+    }
+}
+
+void BernoulliDeviceOperation::validate_on_program_cache_miss(
+    const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
+    validate_inputs(operation_attributes, tensor_args);  // same checks as on a cache hit
+}
+
+void BernoulliDeviceOperation::validate_on_program_cache_hit(
+    const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
+    validate_inputs(operation_attributes, tensor_args);  // same checks as on a cache miss
+}
+
+BernoulliDeviceOperation::shape_return_value_t BernoulliDeviceOperation::compute_output_shapes(
+    const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
+    return tensor_args.input.get_logical_shape();  // the output takes the input's logical shape
+}
+
+BernoulliDeviceOperation::tensor_return_value_t BernoulliDeviceOperation::create_output_tensors(
+    const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
+    if (tensor_args.output.has_value()) {
+        return tensor_args.output.value();  // a caller-supplied output is returned as-is; nothing is allocated here
+    }
+
+    auto output_shapes = compute_output_shapes(operation_attributes, tensor_args);  // a single shape despite the plural name
+    return create_device_tensor(  // otherwise create a tile-layout tensor on the input's device
+        output_shapes,
+        operation_attributes.dtype,
+        Layout::TILE,
+        tensor_args.input.device(),
+        operation_attributes.memory_config);
+}
+
+std::tuple<BernoulliDeviceOperation::operation_attributes_t, BernoulliDeviceOperation::tensor_args_t>
+BernoulliDeviceOperation::invoke(  // packs the call arguments into (attributes, tensor args), filling in defaults
+    const Tensor& input,
+    const std::optional<Tensor>& output,
+    const std::optional<DataType>& dtype,
+    const std::optional<MemoryConfig>& memory_config,
+    const std::optional<DeviceComputeKernelConfig>& compute_kernel_config) {
+    return {
+        operation_attributes_t{
+            dtype.value_or(DataType::FLOAT32),              // default output dtype is FLOAT32
+            memory_config.value_or(input.memory_config()),  // default: the input's memory config
+            init_device_compute_kernel_config(input.device()->arch(), compute_kernel_config, MathFidelity::HiFi4)},
+        tensor_args_t{input, output}};
+}
+
+}  // namespace ttnn::operations::bernoulli
diff --git a/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp
new file mode 100644
index 00000000000..d15841d1442
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp
@@ -0,0 +1,69 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "ttnn/decorators.hpp"
+#include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
+
+namespace ttnn::operations::bernoulli {
+
+struct BernoulliDeviceOperation {
+    struct operation_attributes_t {  // non-tensor parameters, filled in by invoke()
+        const DataType dtype;  // dtype used when create_output_tensors() builds the output
+        const MemoryConfig memory_config;
+        const DeviceComputeKernelConfig compute_kernel_config;
+    };
+
+    struct tensor_args_t {  // tensor operands; both members are references, not copies
+        const Tensor& input;
+        const std::optional<Tensor>& output;  // optional caller-provided output tensor
+    };
+
+    using shape_return_value_t = SimpleShape;
+    using tensor_return_value_t = Tensor;
+
+    struct ProgramFactory {
+        struct shared_variables_t {  // written by create(), read by override_runtime_arguments()
+            KernelHandle reader_kernel_id;
+            KernelHandle compute_kernel_id;
+            KernelHandle writer_kernel_id;
+            std::vector<CoreCoord> cores;  // cores that were given runtime args in create()
+        };
+
+        using cached_program_t = ttnn::device_operation::CachedProgram<shared_variables_t>;
+
+        static cached_program_t create(
+            const operation_attributes_t& operation_attributes,
+            const tensor_args_t& tensor_args,
+            tensor_return_value_t& output);
+
+        static void override_runtime_arguments(
+            cached_program_t& cached_program,
+            const operation_attributes_t& operation_attributes,
+            const tensor_args_t& tensor_args,
+            tensor_return_value_t& output);
+    };
+
+    using program_factory_t = std::variant<ProgramFactory>;  // ProgramFactory is the only alternative
+
+    static program_factory_t select_program_factory(const operation_attributes_t&, const tensor_args_t&);
+    static void validate_inputs(const operation_attributes_t& attributes, const tensor_args_t& tensor_args);
+    static void validate_on_program_cache_miss(const operation_attributes_t&, const tensor_args_t&);
+    static void validate_on_program_cache_hit(const operation_attributes_t&, const tensor_args_t&);
+    static shape_return_value_t compute_output_shapes(const operation_attributes_t&, const tensor_args_t&);
+    static tensor_return_value_t create_output_tensors(const operation_attributes_t&, const tensor_args_t&);
+
+    static std::tuple<operation_attributes_t, tensor_args_t> invoke(  // maps user arguments to the two structs above
+        const Tensor& input,
+        const std::optional<Tensor>& output,
+        const std::optional<DataType>& dtype,
+        const std::optional<MemoryConfig>& memory_config,
+        const std::optional<DeviceComputeKernelConfig>& compute_kernel_config);
+};
+
+}  // namespace ttnn::operations::bernoulli
+
+namespace ttnn::prim {
+constexpr auto bernoulli =  // BernoulliDeviceOperation registered under the name "ttnn::prim::bernoulli"
+    ttnn::register_operation<"ttnn::prim::bernoulli", ttnn::operations::bernoulli::BernoulliDeviceOperation>();
+}  // namespace ttnn::prim
diff --git a/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_program_factory.cpp b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_program_factory.cpp
new file mode 100644
index 00000000000..b9a5067b1a4
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_program_factory.cpp
@@ -0,0 +1,166 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+#include "bernoulli_device_operation.hpp"
+#include "common/constants.hpp"
+#include "impl/kernels/kernel_types.hpp"
+#include "tt_metal/common/work_split.hpp"
+#include "ttnn/tensor/types.hpp"
+
+namespace ttnn::operations::bernoulli {
+
+using namespace tt;
+using namespace tt::tt_metal;
+
+std::mt19937 rng(std::time(0));  // namespace-scope engine, seeded from std::time(0)
+std::uniform_int_distribution d(1, 1 << 20);  // seeds are drawn from the closed range [1, 2^20]
+
+uint32_t get_random_seed() { return d(rng); }  // next draw from d; advances the shared engine rng
+
+BernoulliDeviceOperation::ProgramFactory::cached_program_t BernoulliDeviceOperation::ProgramFactory::create(
+    const operation_attributes_t& operation_attributes,
+    const tensor_args_t& tensor_args,
+    tensor_return_value_t& output) {
+    const Tensor& input = tensor_args.input;
+
+    Device* device = output.device();
+    auto grid = device->compute_with_storage_grid_size();
+    auto core_h = grid.y;  // NOTE(review): not referenced again in this function
+
+    uint32_t units_to_divide = output.volume() / constants::TILE_HW;  // number of tiles to produce
+    auto [num_cores, all_cores, core_group_1, core_group_2, units_per_core_group_1, units_per_core_group_2] =
+        split_work_to_cores(grid, units_to_divide);
+
+    uint32_t num_cores_x = grid.x;
+    uint32_t num_cores_y = grid.y;
+    auto cores = grid_to_cores(num_cores, num_cores_x, num_cores_y);
+
+    Program program = Program();
+
+    constexpr uint32_t num_tiles = 2;  // capacity, in tiles, of the input and random-number circular buffers
+    auto in_data_format = datatype_to_dataformat_converter(input.dtype());
+    const uint32_t in_dtype_tile_size = tile_size(in_data_format);
+    constexpr uint32_t in_cb_id = CB::c_in0;
+    CircularBufferConfig cb_in_config =
+        CircularBufferConfig(num_tiles * in_dtype_tile_size, {{in_cb_id, in_data_format}})
+            .set_page_size(in_cb_id, in_dtype_tile_size);
+    CBHandle cb_input = tt_metal::CreateCircularBuffer(program, all_cores, cb_in_config);
+
+    const uint32_t float32_tile_size = tile_size(tt::DataFormat::Float32);  // random tiles are always Float32
+    constexpr uint32_t intermed_cb_id = CB::c_intermed0;
+    CircularBufferConfig cb_intermed_config =
+        CircularBufferConfig(num_tiles * float32_tile_size, {{intermed_cb_id, tt::DataFormat::Float32}})
+            .set_page_size(intermed_cb_id, float32_tile_size);
+    CBHandle cb_intermed = tt_metal::CreateCircularBuffer(program, all_cores, cb_intermed_config);
+
+    auto out_data_format = datatype_to_dataformat_converter(output.dtype());
+    const uint32_t out_dtype_tile_size = tile_size(out_data_format);
+    constexpr uint32_t intermed1_cb_id = CB::c_intermed1;  // one output-format tile; the writer stages results here
+    CircularBufferConfig cb_intermed1_config =
+        CircularBufferConfig(1 * out_dtype_tile_size, {{intermed1_cb_id, out_data_format}})
+            .set_page_size(intermed1_cb_id, out_dtype_tile_size);
+    CBHandle cb_intermed1 = tt_metal::CreateCircularBuffer(program, all_cores, cb_intermed1_config);
+
+    const std::string kernels_dir_path = "ttnn/cpp/ttnn/operations/bernoulli/device/kernels/";
+    const uint32_t input_is_dram = input.buffer()->buffer_type() == BufferType::DRAM ? 1 : 0;
+    const std::vector<uint32_t> reader_compile_time_args{in_cb_id, input_is_dram};
+    const std::string reader_file_path = kernels_dir_path + "reader_bernoulli.cpp";
+    const std::vector<uint32_t> compute_compile_time_args{intermed_cb_id};
+    const std::string compute_file_path = kernels_dir_path + "compute_bernoulli.cpp";
+    const uint32_t output_is_dram = output.buffer()->buffer_type() == BufferType::DRAM ? 1 : 0;
+    const std::vector<uint32_t> writer_compile_time_args{in_cb_id, intermed_cb_id, intermed1_cb_id, output_is_dram};
+    const std::string writer_file_path = kernels_dir_path + "writer_bernoulli.cpp";
+
+    std::map<string, string> writer_defines;  // select the writer kernel's #ifdef branches by dtype
+    switch (input.dtype()) {
+        case DataType::BFLOAT16: writer_defines["INPUT_DTYPE_BFLOAT16"] = "1"; break;
+        case DataType::FLOAT32: writer_defines["INPUT_DTYPE_FLOAT32"] = "1"; break;
+        default: break;
+    }
+    switch (output.dtype()) {
+        case DataType::BFLOAT16: writer_defines["OUTPUT_DTYPE_BFLOAT16"] = "1"; break;
+        case DataType::FLOAT32: writer_defines["OUTPUT_DTYPE_FLOAT32"] = "1"; break;
+        default: break;
+    }
+
+    KernelHandle reader_kernel_id = tt_metal::CreateKernel(
+        program, reader_file_path, all_cores, ReaderDataMovementConfig(reader_compile_time_args));
+    KernelHandle writer_kernel_id = tt_metal::CreateKernel(
+        program, writer_file_path, all_cores, WriterDataMovementConfig(writer_compile_time_args, writer_defines));
+    auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc, dst_full_sync_en] =
+        get_compute_kernel_config_args(device->arch(), operation_attributes.compute_kernel_config);
+    KernelHandle compute_kernel_id = CreateKernel(
+        program,
+        compute_file_path,
+        all_cores,
+        ComputeConfig{
+            .math_fidelity = math_fidelity,
+            .fp32_dest_acc_en =
+                true,  // must always be true; otherwise the generated floats always fall in the range [0.4, 0.5]
+            .dst_full_sync_en = dst_full_sync_en,
+            .math_approx_mode = math_approx_mode,
+            .compile_args = compute_compile_time_args,
+        });
+
+    uint32_t tile_offset = 0;  // index of the first tile handled by the current core
+    for (const auto& core : cores) {
+        uint32_t units_per_core;
+        if (core_group_1.core_coord_in_core_ranges(core)) {
+            units_per_core = units_per_core_group_1;
+        } else if (core_group_2.core_coord_in_core_ranges(core)) {
+            units_per_core = units_per_core_group_2;
+        } else {
+            TT_THROW("Core not in specified core ranges");
+        }
+
+        std::vector<uint32_t> reader_runtime_args = {input.buffer()->address(), tile_offset, units_per_core};
+        SetRuntimeArgs(program, reader_kernel_id, core, reader_runtime_args);
+
+        std::vector<uint32_t> compute_runtime_args = {get_random_seed(), tile_offset, units_per_core};  // one seed draw per core
+        SetRuntimeArgs(program, compute_kernel_id, core, compute_runtime_args);
+
+        std::vector<uint32_t> writer_runtime_args = {output.buffer()->address(), tile_offset, units_per_core};
+        SetRuntimeArgs(program, writer_kernel_id, core, writer_runtime_args);
+
+        tile_offset += units_per_core;
+    }
+
+    return {
+        std::move(program),
+        {.reader_kernel_id = reader_kernel_id,
+         .compute_kernel_id = compute_kernel_id,
+         .writer_kernel_id = writer_kernel_id,
+         .cores = cores}};
+}
+
+void BernoulliDeviceOperation::ProgramFactory::override_runtime_arguments(
+    cached_program_t& cached_program,
+    const operation_attributes_t& operation_attributes,
+    const tensor_args_t& tensor_args,
+    tensor_return_value_t& output) {
+    auto& program = cached_program.program;
+    auto& reader_kernel_id = cached_program.shared_variables.reader_kernel_id;
+    auto& writer_kernel_id = cached_program.shared_variables.writer_kernel_id;
+    auto& compute_kernel_id = cached_program.shared_variables.compute_kernel_id;
+    auto& cores = cached_program.shared_variables.cores;
+
+    const uint32_t input_addr = tensor_args.input.buffer()->address();
+    const uint32_t output_addr = output.buffer()->address();
+
+    for (const auto& core : cores) {  // only runtime arg 0 of each kernel is rewritten; args 1 and 2 are kept
+        {
+            auto& runtime_args = GetRuntimeArgs(program, reader_kernel_id, core);
+            runtime_args[0] = input_addr;
+        }
+        {
+            auto& runtime_args = GetRuntimeArgs(program, compute_kernel_id, core);
+            runtime_args[0] = get_random_seed();  // each core gets a newly drawn seed
+        }
+        {
+            auto& runtime_args = GetRuntimeArgs(program, writer_kernel_id, core);
+            runtime_args[0] = output_addr;
+        }
+    }
+}
+
+}  // namespace ttnn::operations::bernoulli
diff --git a/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/compute_bernoulli.cpp b/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/compute_bernoulli.cpp
new file mode 100644
index 00000000000..816ad043d7e
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/compute_bernoulli.cpp
@@ -0,0 +1,43 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "compute_kernel_api.h"
+#include "compute_kernel_api/eltwise_unary/eltwise_unary.h"
+#include "compute_kernel_api/eltwise_unary/rand.h"
+
+namespace NAMESPACE {
+
+void MAIN {
+    constexpr uint32_t intermed_cb_id = get_compile_time_arg_val(0);
+
+    const uint32_t seed = get_arg_val<uint32_t>(0);
+    const uint32_t start_id = get_arg_val<uint32_t>(1);
+    const uint32_t num_tiles = get_arg_val<uint32_t>(2);
+    const uint32_t end_id = start_id + num_tiles;
+
+    init_sfpu(intermed_cb_id);
+
+    union f2u {
+        float f;
+        uint32_t u;
+    } rand_scale;  // lets the float scale be handed to rand_tile as its raw 32-bit pattern
+    rand_scale.f = 1;
+    uint32_t rand_from = 0;
+
+    rand_tile_init(seed);
+    for (uint32_t i = start_id; i < end_id; ++i) {  // one tile pushed per iteration; i is only a counter
+        cb_reserve_back(intermed_cb_id, 1);
+
+        tile_regs_acquire();
+        rand_tile(0, rand_from, rand_scale.u);  // presumably uniform values in [from, from + scale) - TODO confirm in rand.h
+        tile_regs_commit();
+
+        tile_regs_wait();
+        pack_tile(0, intermed_cb_id, 0);
+        tile_regs_release();
+
+        cb_push_back(intermed_cb_id, 1);
+    }
+}
+}  // namespace NAMESPACE
diff --git a/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/reader_bernoulli.cpp b/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/reader_bernoulli.cpp
new file mode 100644
index 00000000000..ad3af7f5d26
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/reader_bernoulli.cpp
@@ -0,0 +1,26 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "dataflow_api.h"
+
+void kernel_main() {
+    constexpr uint32_t in_cb_id = get_compile_time_arg_val(0);
+    constexpr bool input_is_dram = get_compile_time_arg_val(1) == 1;
+
+    uint32_t input_addr = get_arg_val<uint32_t>(0);
+    uint32_t start_id = get_arg_val<uint32_t>(1);
+    uint32_t num_tiles = get_arg_val<uint32_t>(2);
+    uint32_t end_id = start_id + num_tiles;  // this core reads tiles [start_id, end_id)
+
+    const InterleavedAddrGenFast<input_is_dram> input_addrg = {
+        .bank_base_address = input_addr, .page_size = get_tile_size(in_cb_id), .data_format = get_dataformat(in_cb_id)};
+
+    for (uint32_t i = start_id; i < end_id; ++i) {  // one input tile per iteration
+        cb_reserve_back(in_cb_id, 1);
+        uint32_t in_cb_write_ptr = get_write_ptr(in_cb_id);
+        noc_async_read_tile(i, input_addrg, in_cb_write_ptr);
+        noc_async_read_barrier();  // barrier is issued before the tile is pushed to in_cb_id
+        cb_push_back(in_cb_id, 1);
+    }
+}
diff --git a/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/writer_bernoulli.cpp b/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/writer_bernoulli.cpp
new file mode 100644
index 00000000000..f2bdbc8c9d8
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/writer_bernoulli.cpp
@@ -0,0 +1,78 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+#include "common/constants.hpp"
+#include "dataflow_api.h"
+
+using namespace tt;
+
+void kernel_main() {
+    constexpr uint32_t in_cb_id = get_compile_time_arg_val(0);
+    constexpr uint32_t intermed_cb_id = get_compile_time_arg_val(1);
+    constexpr uint32_t intermed1_cb_id = get_compile_time_arg_val(2);
+    constexpr bool output_is_dram = get_compile_time_arg_val(3) == 1;
+
+    auto out_addr = get_arg_val<uint32_t>(0);
+    auto start_id = get_arg_val<uint32_t>(1);
+    auto num_tiles = get_arg_val<uint32_t>(2);
+    uint32_t end_id = start_id + num_tiles;  // this core writes tiles [start_id, end_id)
+
+    const InterleavedAddrGenFast<output_is_dram> output_addrg = {
+        .bank_base_address = out_addr,
+        .page_size = get_tile_size(intermed1_cb_id),
+        .data_format = get_dataformat(intermed1_cb_id)};
+
+    cb_reserve_back(intermed1_cb_id, 1);  // reserved once and never pushed: used as a staging tile
+    uint32_t intermed1_cb_write_ptr = get_write_ptr(intermed1_cb_id);
+
+    for (uint32_t i = start_id; i < end_id; ++i) {
+        cb_wait_front(in_cb_id, 1);
+        cb_wait_front(intermed_cb_id, 1);
+
+        uint32_t intermed_cb_read_ptr = get_read_ptr(intermed_cb_id);
+        uint32_t in_cb_read_ptr = get_read_ptr(in_cb_id);
+
+        auto in_cb_addr = reinterpret_cast<uint8_t *>(in_cb_read_ptr);
+        auto intermed_cb_addr = reinterpret_cast<float *>(intermed_cb_read_ptr);
+        auto intermed1_cb_addr = reinterpret_cast<uint8_t *>(intermed1_cb_write_ptr);  // restart at the staging tile's base
+
+        for (uint32_t k = 0; k < constants::TILE_WIDTH; k++) {  // k and j only count; elements are visited in memory order
+            for (uint32_t j = 0; j < constants::TILE_HEIGHT; j++) {
+                float rand_float = *intermed_cb_addr;
+
+                float input = 0;
+#ifdef INPUT_DTYPE_FLOAT32
+                input = *reinterpret_cast<float *>(in_cb_addr);
+                in_cb_addr += 4;
+#endif
+#ifdef INPUT_DTYPE_BFLOAT16  // cast: uint16 => uint32 => float and write to input variable.
+                uint16_t *in_u16_ptr = reinterpret_cast<uint16_t *>(in_cb_addr);
+                uint32_t u32 = static_cast<uint32_t>(*in_u16_ptr) << 16;
+                float *f_ptr = reinterpret_cast<float *>(&u32);
+                input = *f_ptr;
+                in_cb_addr += 2;
+#endif
+                float output = 0;
+                if (rand_float <= input) {  // emit 1 when the random draw does not exceed the input value, else 0
+                    output = 1;
+                }
+
+#ifdef OUTPUT_DTYPE_FLOAT32
+                *(float *)intermed1_cb_addr = output;
+                intermed1_cb_addr += 4;
+#endif
+#ifdef OUTPUT_DTYPE_BFLOAT16
+                uint16_t *out_u16_ptr = reinterpret_cast<uint16_t *>(&output) + 1;  // second 16-bit half of the float (high half if little-endian - TODO confirm)
+                *(uint16_t *)intermed1_cb_addr = *out_u16_ptr;
+                intermed1_cb_addr += 2;
+#endif
+                intermed_cb_addr += 1;
+            }
+        }
+        cb_pop_front(in_cb_id, 1);
+        cb_pop_front(intermed_cb_id, 1);
+
+        noc_async_write_tile(i, output_addrg, intermed1_cb_write_ptr);
+        noc_async_write_barrier();  // barrier is issued before the next iteration overwrites the staging tile
+    }
+}
diff --git a/ttnn/cpp/ttnn/operations/uniform/uniform_pybind.cpp b/ttnn/cpp/ttnn/operations/uniform/uniform_pybind.cpp
index dc31fff1b31..39fd1275da4 100644
--- a/ttnn/cpp/ttnn/operations/uniform/uniform_pybind.cpp
+++ b/ttnn/cpp/ttnn/operations/uniform/uniform_pybind.cpp
@@ -9,21 +9,27 @@
 
 namespace ttnn::operations::uniform {
 void bind_uniform_operation(py::module &module) {
-    auto doc =
-        R"doc(uniform(input: Tensor, from: float = 0, to: float = 1, memory_config: Optional[MemoryConfig] = None, compute_kernel_config: Optional[ComputeKernelConfig] = None) -> Tensor
-    Generates a tensor with values drawn from a uniform distribution [`from`, `to`). The input tensor provides the shape for the output tensor, while the data type remains unchanged.
-    This operation allows configuration of memory allocation using `memory_config` and computation settings via `compute_kernel_config`.
-
-    Args:
-        * :attr:`input`: The tensor that provides the shape for the generated uniform tensor.
-        * :attr:`from`: The lower bound of the uniform distribution. Defaults to 0.
-        * :attr:`to`: The upper bound of the uniform distribution. Defaults to 1.
-        * :attr:`memory_config`: The memory configuration for the generated tensor.
-        * :attr:`compute_kernel_config`: Optional configuration for the compute kernel used during generation.
-
-    Returns:
-        Tensor: A new tensor with the same shape as `input` and values drawn from the specified uniform distribution.
-    )doc";
+    std::string doc =
+        R"doc(
+        Update in-place the input tensor with values drawn from the continuous uniform distribution 1 / (`to` - `from`).
+
+        Args:
+            input (ttnn.Tensor): The tensor that provides the shape for the generated uniform tensor.
+            from (float32): The lower bound of the uniform distribution. Defaults to 0.
+            to (float32): The upper bound of the uniform distribution. Defaults to 1.
+
+        Keyword args:
+            memory_config (ttnn.MemoryConfig, optional): Memory configuration for the operation. Defaults to `None`.
+            compute_kernel_config (ttnn.DeviceComputeKernelConfig, optional): Configuration for the compute kernel. Defaults to `None`.
+
+        Returns:
+            ttnn.Tensor: The `input` tensor with updated values drawn from the specified uniform distribution.
+
+        Example:
+            >>> input = ttnn.to_device(ttnn.from_torch(torch.ones(3, 3), dtype=torch.bfloat16), device=device)
+            >>> ttnn.uniform(input)
+
+        )doc";
 
     bind_registered_operation(
         module,

From 091199015a2dc98c4ca97439b3bc9f1455545fdd Mon Sep 17 00:00:00 2001
From: Mark O'Connor <moconnor@tenstorrent.com>
Date: Mon, 28 Oct 2024 11:46:07 +0100
Subject: [PATCH 27/30] Make layernorm assert instead of silently giving
 garbage output when subblock_w too large (#14223)

* #14222: Add assert for subblock_w when fp32 not used as well

* #14223: Use TT_FATAL and handle dst_full_sync_en
---
 .../multi_core/layernorm_op_multi_core.cpp     | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp
index 7973e5174e0..d9e387165e4 100644
--- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp
+++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp
@@ -412,7 +412,7 @@ operation::ProgramWithCallbacks layernorm_multi_core_sharded(
     bool is_post_all_gather = distributed_norm_stage == DistributedLayerNormStage::POST_ALL_GATHER;
 
     ////////////////////////////////////////////////////////////////////////////
-    //                      Grayskull Device Setup
+    //                            Device Setup
     ////////////////////////////////////////////////////////////////////////////
     Device *device = a.device();
 
@@ -422,8 +422,20 @@ operation::ProgramWithCallbacks layernorm_multi_core_sharded(
     auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc, dst_full_sync_en] =
         get_compute_kernel_config_args(device->arch(), compute_kernel_config);
 
-    if (fp32_dest_acc_en) {
-        TT_ASSERT(subblock_wt <= 4, "subblock width must less than 4 in fp32 mode");
+    if (dst_full_sync_en == false) {
+        if (fp32_dest_acc_en) {
+            TT_FATAL(subblock_wt <= 4, "subblock_wt={}, but subblock width must less than 4 tiles in fp32 mode when dst_full_sync_en is false", subblock_wt);
+        }
+        else {
+            TT_FATAL(subblock_wt <= 8, "subblock_wt={}, but subblock width must less than 8 tiles when dst_full_sync_en is false", subblock_wt);
+        }
+    } else {
+        if (fp32_dest_acc_en) {
+            TT_FATAL(subblock_wt <= 8, "subblock_wt={}, but subblock width must less than 8 tiles in fp32 mode when dst_full_sync_en is true", subblock_wt);
+        }
+        else {
+            TT_FATAL(subblock_wt <= 16, "subblock_wt={}, but subblock width must less than 16 tiles when dst_full_sync_en is true", subblock_wt);
+        }
     }
 
     tt::DataFormat out_data_format = tt::tt_metal::datatype_to_dataformat_converter(output.get_dtype());

From c5e11acd46e5d4a1cc7a6e385d7e473aea5acd68 Mon Sep 17 00:00:00 2001
From: Bui Chi Trung <52347285+BuiChiTrung@users.noreply.github.com>
Date: Mon, 28 Oct 2024 18:26:52 +0700
Subject: [PATCH 28/30] #14342: Hotfix skip grayskull test on bernoulli op
 (#14343)

#14342: skip grayskull test on bernoulli op
---
 tests/ttnn/unit_tests/operations/test_bernoulli.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/ttnn/unit_tests/operations/test_bernoulli.py b/tests/ttnn/unit_tests/operations/test_bernoulli.py
index 5c3c68ab5c4..104259801b9 100644
--- a/tests/ttnn/unit_tests/operations/test_bernoulli.py
+++ b/tests/ttnn/unit_tests/operations/test_bernoulli.py
@@ -12,6 +12,7 @@
     compute_kernel_ids,
     get_lib_dtype,
 )
+from models.utility_functions import skip_for_grayskull
 from collections import Counter
 from loguru import logger
 
@@ -55,6 +56,7 @@ def run_bernoulli(shape, in_dtype, out_dtype, device, is_out_alloc=False, comput
 
 
 # fmt: off
+@skip_for_grayskull("Requires wormhole_b0 to run")
 @pytest.mark.parametrize("shape",
     [
         [2003],
@@ -86,6 +88,7 @@ def test_bernoulli(shape, in_dtype, out_dtype, device, is_out_alloc):
     run_bernoulli(shape, in_dtype, out_dtype, device, is_out_alloc)
 
 
+@skip_for_grayskull("Requires wormhole_b0 to run")
 @pytest.mark.parametrize(
     "shape",
     [
@@ -108,6 +111,7 @@ def test_bernoulli_callback(shape, in_dtype, out_dtype, device, is_out_alloc, us
     assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]
 
 
+@skip_for_grayskull("Requires wormhole_b0 to run")
 @pytest.mark.parametrize(
     "shape",
     [[512, 512], [5, 4, 70, 40]],

From 76087bddd6b4df92c3c6bca58796e3566babf7db Mon Sep 17 00:00:00 2001
From: Shwetank Singh <ssingh@tenstorrent.com>
Date: Fri, 25 Oct 2024 07:29:06 +0000
Subject: [PATCH 29/30] #13725: yolo on CI

---
 models/experimental/yolov4/demo/demo.py       | 64 +++++++++--------
 .../experimental/yolov4/reference/yolov4.py   | 13 ++++
 .../yolov4/ttnn/weight_parameter_update.py    | 71 +++++++++++++++++++
 models/experimental/yolov4/ttnn/yolov4.py     |  5 +-
 tests/scripts/run_python_model_tests.sh       |  1 +
 5 files changed, 125 insertions(+), 29 deletions(-)
 create mode 100644 models/experimental/yolov4/ttnn/weight_parameter_update.py

diff --git a/models/experimental/yolov4/demo/demo.py b/models/experimental/yolov4/demo/demo.py
index 3c340808b25..ea4aa375530 100644
--- a/models/experimental/yolov4/demo/demo.py
+++ b/models/experimental/yolov4/demo/demo.py
@@ -13,7 +13,8 @@
 
 from models.experimental.yolov4.reference.yolov4 import Yolov4
 from models.experimental.yolov4.ttnn.yolov4 import TtYOLOv4
-
+from models.experimental.yolov4.ttnn.weight_parameter_update import update_weight_parameters
+from collections import OrderedDict
 import ttnn
 from models.utility_functions import skip_for_grayskull
 
@@ -418,11 +419,7 @@ def do_detect(model, img, conf_thresh, nms_thresh, n_classes, device=None, class
         if not is_torch_model:
             input_shape = img.shape
             input_tensor = torch.permute(img, (0, 2, 3, 1))
-
-            input_tensor = input_tensor.reshape(
-                input_tensor.shape[0], 1, input_tensor.shape[1] * input_tensor.shape[2], input_tensor.shape[3]
-            )
-            input_tensor = ttnn.from_torch(input_tensor, device=device)
+            input_tensor = ttnn.from_torch(input_tensor, ttnn.bfloat16)
             img = input_tensor
             t1 = time.time()
 
@@ -534,33 +531,44 @@ def do_detect(model, img, conf_thresh, nms_thresh, n_classes, device=None, class
 
 @skip_for_grayskull()
 @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
-def test_yolov4_model(device, model_location_generator, reset_seeds, input_path):
+@pytest.mark.parametrize(
+    "use_pretrained_weight",
+    [True, False],
+    ids=[
+        "pretrained_weight_true",
+        "pretrained_weight_false",
+    ],
+)
+def test_yolov4_model(device, model_location_generator, reset_seeds, input_path, use_pretrained_weight):
     model_path = model_location_generator("models", model_subdir="Yolo")
-    if model_path == "models":
-        if not os.path.exists("tests/ttnn/integration_tests/yolov4/yolov4.pth"):  # check if yolov4.th is availble
-            os.system(
-                "tests/ttnn/integration_tests/yolov4/yolov4_weights_download.sh"
-            )  # execute the yolov4_weights_download.sh file
-
-        weights_pth = "tests/ttnn/integration_tests/yolov4/yolov4.pth"
-    else:
-        weights_pth = str(model_path / "yolov4.pth")
-
-    ttnn_model = TtYOLOv4(weights_pth)
-
-    torch_model = Yolov4()
+    if use_pretrained_weight:
+        if model_path == "models":
+            if not os.path.exists("tests/ttnn/integration_tests/yolov4/yolov4.pth"):  # check if yolov4.pth is available
+                os.system(
+                    "tests/ttnn/integration_tests/yolov4/yolov4_weights_download.sh"
+                )  # execute the yolov4_weights_download.sh file
+
+            weights_pth = "tests/ttnn/integration_tests/yolov4/yolov4.pth"
+        else:
+            weights_pth = str(model_path / "yolov4.pth")
 
-    new_state_dict = {}
-    ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items()}
+        ttnn_model = TtYOLOv4(weights_pth)
+        torch_model = Yolov4()
+        new_state_dict = {}
+        ds_state_dict = {k: v for k, v in ttnn_model.torch_model.items()}
 
-    keys = [name for name, parameter in torch_model.state_dict().items()]
-    values = [parameter for name, parameter in ds_state_dict.items()]
+        keys = [name for name, parameter in torch_model.state_dict().items()]
+        values = [parameter for name, parameter in ds_state_dict.items()]
 
-    for i in range(len(keys)):
-        new_state_dict[keys[i]] = values[i]
+        for i in range(len(keys)):
+            new_state_dict[keys[i]] = values[i]
 
-    torch_model.load_state_dict(new_state_dict)
-    torch_model.eval()
+        torch_model.load_state_dict(new_state_dict)
+        torch_model.eval()
+    else:
+        torch_model = Yolov4.from_random_weights()
+        ttnn_weights = update_weight_parameters(OrderedDict(torch_model.state_dict()))
+        ttnn_model = TtYOLOv4(ttnn_weights)
 
     n_classes = 80
     namesfile = "models/experimental/yolov4/demo/coco.names"
diff --git a/models/experimental/yolov4/reference/yolov4.py b/models/experimental/yolov4/reference/yolov4.py
index d65869585c5..691c808a29c 100644
--- a/models/experimental/yolov4/reference/yolov4.py
+++ b/models/experimental/yolov4/reference/yolov4.py
@@ -36,3 +36,16 @@ def forward(self, input: torch.Tensor):
         x4, x5, x6 = self.head(x20, x13, x6)
 
         return x4, x5, x6
+
+    @staticmethod
+    def from_random_weights():
+        """Build a Yolov4 in eval mode and reload the FloatTensor entries of its own state dict."""
+        net = Yolov4()
+        net.eval()
+        # Keep only the state-dict entries that are torch.FloatTensor instances.
+        float_entries = {
+            key: value for key, value in net.state_dict().items() if isinstance(value, torch.FloatTensor)
+        }
+
+        net.load_state_dict(float_entries)
+        return net
diff --git a/models/experimental/yolov4/ttnn/weight_parameter_update.py b/models/experimental/yolov4/ttnn/weight_parameter_update.py
new file mode 100644
index 00000000000..cfe3d864713
--- /dev/null
+++ b/models/experimental/yolov4/ttnn/weight_parameter_update.py
@@ -0,0 +1,71 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+import re
+from collections import OrderedDict
+
+
+def update_weigth_keys(key):
+    """Map a reference Yolov4 state-dict key onto the key layout used by the ttnn model."""
+    key = key.replace("downsample", "down").replace("neck", "neek")
+    if ".res" in key:
+
+        def res_name_update(match):
+            # Indices 0/1 map into sub-block 0 and 3/4 into sub-block 1; other indices return None.
+            block = match.group(1)
+            index = int(match.group(2))
+            if index in (0, 1):
+                return f".{block}.0.conv.{index}."
+            if index in (3, 4):
+                return f".{block}.1.conv.{index-3}."
+
+        renamed = re.sub(r"\.res\.", r".resblock.", key)
+        return re.sub(r"\.(\d+)\.(\d+)\.", res_name_update, renamed)
+    if "neek" in key:
+        # Conv numbers for neck entries written as "<letter><a>_<b>".
+        paired_conv = {
+            (7, 2): 8,
+            (7, 3): 9,
+            (7, 4): 11,
+            (8, 2): 12,
+            (7, 5): 13,
+            (9, 2): 15,
+            (9, 3): 16,
+            (9, 4): 18,
+            (10, 2): 19,
+            (9, 5): 20,
+        }
+        # Conv numbers for single-index neck entries above 7.
+        shifted_conv = {8: 10, 9: 14, 10: 17}
+
+        def neek_underscore_update_rule(match):
+            # "b" entries get the ".conv.1." suffix, every other letter ".conv.0.".
+            letter = match.group(1)
+            target = paired_conv[(int(match.group(2)), int(match.group(3)))]
+            return f".conv{target}.conv.1." if letter == "b" else f".conv{target}.conv.0."
+
+        def neck_rename_update(match):
+            # Indices up to 7 keep their number; larger ones are looked up in shifted_conv.
+            letter = match.group(1)
+            index = int(match.group(2))
+            target = index if index <= 7 else shifted_conv[index]
+            return f".conv{target}.conv.1." if letter == "b" else f".conv{target}.conv.0."
+
+        updated_name = re.sub(r"\.([a-z])(\d+)_(\d+)\.", neek_underscore_update_rule, key)
+        if updated_name != key:  # the underscore form matched and was rewritten
+            return updated_name
+        updated_name = re.sub(r"\.([a-z])(\d+)\.", neck_rename_update, key)
+        if updated_name != key:
+            return updated_name
+    key = re.sub(r"\.c(\d+)\.", r".conv\1.conv.0.", key)
+    key = re.sub(r"\.b(\d+)\.", r".conv\1.conv.1.", key)
+    return key
+
+
+def update_weight_parameters(model_weight):
+    """Return a new OrderedDict with every key of model_weight translated by update_weigth_keys."""
+    translated_pairs = (
+        (update_weigth_keys(name), weight) for name, weight in model_weight.items()
+    )
+    return OrderedDict(translated_pairs)
diff --git a/models/experimental/yolov4/ttnn/yolov4.py b/models/experimental/yolov4/ttnn/yolov4.py
index fd951678893..015e490d24f 100644
--- a/models/experimental/yolov4/ttnn/yolov4.py
+++ b/models/experimental/yolov4/ttnn/yolov4.py
@@ -25,7 +25,10 @@
 
 class TtYOLOv4:
     def __init__(self, path) -> None:
-        self.torch_model = torch.load(path)
+        if type(path) is str:
+            self.torch_model = torch.load(path)
+        else:
+            self.torch_model = path
         self.torch_keys = self.torch_model.keys()
         self.down1 = Down1(self)
         self.down2 = Down2(self)
diff --git a/tests/scripts/run_python_model_tests.sh b/tests/scripts/run_python_model_tests.sh
index 9ec73c2e37b..0643861ec26 100755
--- a/tests/scripts/run_python_model_tests.sh
+++ b/tests/scripts/run_python_model_tests.sh
@@ -35,6 +35,7 @@ run_python_model_tests_wormhole_b0() {
     # higher sequence lengths and different formats trigger memory issues
     pytest models/demos/falcon7b_common/tests/unit_tests/test_falcon_matmuls_and_bmms_with_mixed_precision.py -k "seq_len_128 and in0_BFLOAT16-in1_BFLOAT8_B-out_BFLOAT16-weights_DRAM"
     pytest tests/ttnn/integration_tests/resnet/test_ttnn_functional_resnet50_new.py -k "pretrained_weight_false"
+    pytest models/experimental/yolov4/demo/demo.py -k "pretrained_weight_false"
 
     # Unet Shallow
     WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -svv models/experimental/functional_unet/tests/test_unet_model.py

From 4a11a11a1d435fecac30308bccd0ff5e38cbd8cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bojan=20Ro=C5=A1ko?=
 <156314064+broskoTT@users.noreply.github.com>
Date: Mon, 28 Oct 2024 13:55:41 +0100
Subject: [PATCH 30/30] #14328: UMD fix l1 membar log (#14328)

umd fix membar
Force-merging to ensure we don't spit out crazy amounts of logs on CI.
---
 tt_metal/third_party/umd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tt_metal/third_party/umd b/tt_metal/third_party/umd
index 6deb8d7d2c6..357154e0782 160000
--- a/tt_metal/third_party/umd
+++ b/tt_metal/third_party/umd
@@ -1 +1 @@
-Subproject commit 6deb8d7d2c6513af090d91c58e3ace53b4564b4e
+Subproject commit 357154e078258810da1e84d74556cb4d0c0cde64