From 2b73ca590f4470938c324353a6b97fa03e2e3735 Mon Sep 17 00:00:00 2001
From: Joseph Chu <jchu@tenstorrent.com>
Date: Sat, 21 Sep 2024 00:53:38 +0000
Subject: [PATCH] #11403: Support 2x4-submeshes across 8x4 mesh

1. Submeshing to support creating submesh on galaxy mesh
2. Key change to start enabling more T3000 Tests onto galaxy:
- Currently ttnn.all_gather(..) in a ring relies on MeshDevice being
initialized in ring-order. Now we decouple this so we don't require that
MeshDevice is initialized with devices in a ring-order. Instead, we now
explicitly request for a ring-order in the operation that requires it.
---
 conftest.py                                   |   9 +-
 .../test_falcon_create_qkv_heads.py           |   2 +-
 .../tests/unit_tests/test_falcon_softmax.py   |   2 +-
 .../t3000/llama2_70b/tests/test_llama_perf.py |   1 -
 .../llama2_70b/tt/llama_mlp_optimized.py      |   3 +-
 tests/scripts/tg/run_tg_model_perf_tests.sh   |   6 +-
 .../test_multidevice_TG.py                    |  16 ++
 .../unit_tests/gtests/test_ccl_on_galaxy.cpp  |  12 +-
 .../unit_tests/operations/test_all_gather.py  | 178 +++++++++++++-----
 .../test_all_gather_llama_perf_sweep.py       |   2 +-
 .../operations/test_all_gather_nightly.py     |   2 +-
 .../test_distributed_layernorm_sharded.py     |   2 +-
 .../test_reduce_scatter_llama_perf_sweep.py   |   2 +-
 .../operations/test_reduce_scatter_nightly.py |   2 +-
 .../test_reduce_scatter_post_commit.py        |   2 +-
 tests/ttnn/unit_tests/test_multi_device.py    |   4 +
 .../device/mesh_configurations/T3000.json     |   2 +-
 tt_metal/impl/device/mesh_device.cpp          | 142 +++++++++++---
 tt_metal/impl/device/mesh_device.hpp          |  19 +-
 tt_metal/impl/device/mesh_device_view.cpp     | 111 ++++++++++-
 tt_metal/impl/device/mesh_device_view.hpp     |  19 +-
 ttnn/cpp/pybind11/multi_device.hpp            |  22 +--
 .../ccl/all_gather/device/all_gather_op.cpp   |   8 +-
 .../device/all_gather_matmul_op.cpp           |   7 +-
 ttnn/cpp/ttnn/tensor/tensor.cpp               |   8 +-
 25 files changed, 444 insertions(+), 139 deletions(-)

diff --git a/conftest.py b/conftest.py
index 36a8d35b6b5..f212baf1963 100644
--- a/conftest.py
+++ b/conftest.py
@@ -229,13 +229,13 @@ def pcie_mesh_device(request, silicon_arch_name, silicon_arch_wormhole_b0, devic
     except (ValueError, AttributeError):
         num_pcie_devices_requested = len(device_ids)
 
+    if num_pcie_devices_requested != 4:
+        pytest.skip("Only 4 PCIe devices are supported for testing")
+
     request.node.pci_ids = device_ids[:num_pcie_devices_requested]
 
     mesh_device = ttnn.open_mesh_device(
-        ttnn.MeshShape(1, num_pcie_devices_requested),
-        dispatch_core_type=get_dispatch_core_type(),
-        **device_params,
-        physical_device_ids=device_ids[:num_pcie_devices_requested],
+        ttnn.MeshShape(2, 2), dispatch_core_type=get_dispatch_core_type(), **device_params, offset=(0, 1)
     )
 
     logger.debug(f"multidevice with {mesh_device.get_num_devices()} devices is created")
@@ -255,6 +255,7 @@ def t3k_mesh_device(request, silicon_arch_name, silicon_arch_wormhole_b0, device
     if ttnn.get_num_devices() < 8:
         pytest.skip()
 
+    request.node.pci_ids = ttnn.get_pcie_device_ids()
     mesh_device = ttnn.open_mesh_device(
         ttnn.MeshShape(2, 4),
         dispatch_core_type=get_dispatch_core_type(),
diff --git a/models/demos/t3000/falcon40b/tests/unit_tests/test_falcon_create_qkv_heads.py b/models/demos/t3000/falcon40b/tests/unit_tests/test_falcon_create_qkv_heads.py
index 8dc05c8cbc7..c1c758e6604 100644
--- a/models/demos/t3000/falcon40b/tests/unit_tests/test_falcon_create_qkv_heads.py
+++ b/models/demos/t3000/falcon40b/tests/unit_tests/test_falcon_create_qkv_heads.py
@@ -10,7 +10,7 @@
 from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import (
     comp_pcc,
 )
-from models.utility_functions import torch2tt_tensor, tt2torch_tensor, skip_for_grayskull, get_devices_for_t3000
+from models.utility_functions import torch2tt_tensor, tt2torch_tensor, skip_for_grayskull
 from models.demos.t3000.falcon40b.tt.model_config import (
     get_model_config,
 )
diff --git a/models/demos/t3000/falcon40b/tests/unit_tests/test_falcon_softmax.py b/models/demos/t3000/falcon40b/tests/unit_tests/test_falcon_softmax.py
index 6a14cac85b5..ccbed44e454 100644
--- a/models/demos/t3000/falcon40b/tests/unit_tests/test_falcon_softmax.py
+++ b/models/demos/t3000/falcon40b/tests/unit_tests/test_falcon_softmax.py
@@ -11,7 +11,7 @@
 from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import (
     comp_pcc,
 )
-from models.utility_functions import torch2tt_tensor, tt2torch_tensor, skip_for_grayskull, get_devices_for_t3000
+from models.utility_functions import torch2tt_tensor, tt2torch_tensor, skip_for_grayskull
 from models.demos.t3000.falcon40b.tt.model_config import (
     get_model_config,
 )
diff --git a/models/demos/t3000/llama2_70b/tests/test_llama_perf.py b/models/demos/t3000/llama2_70b/tests/test_llama_perf.py
index b4e81524ec6..8ea224c848a 100644
--- a/models/demos/t3000/llama2_70b/tests/test_llama_perf.py
+++ b/models/demos/t3000/llama2_70b/tests/test_llama_perf.py
@@ -25,7 +25,6 @@
     disable_compilation_reports,
     nearest_32,
     skip_for_grayskull,
-    get_devices_for_t3000,
 )
 from models.perf.perf_utils import prep_perf_report
 from models.perf.device_perf_utils import run_device_perf, check_device_perf, prep_device_perf_report
diff --git a/models/demos/t3000/llama2_70b/tt/llama_mlp_optimized.py b/models/demos/t3000/llama2_70b/tt/llama_mlp_optimized.py
index 1ac4f56a41a..4d30605e63c 100644
--- a/models/demos/t3000/llama2_70b/tt/llama_mlp_optimized.py
+++ b/models/demos/t3000/llama2_70b/tt/llama_mlp_optimized.py
@@ -72,12 +72,11 @@ def load_weights(self):
             padded_w3[:, :, :, :H4] = self.state_dict[w3_str].transpose(-2, -1)
 
         # w1: 8k x 4k. width-sharded on 12 banks, 4224 over 12 banks.
-        device = self.mesh_device.get_device(0)
         weight_grid = ttnn.CoreRangeSet(
             {
                 ttnn.CoreRange(
                     ttnn.CoreCoord(0, 0),
-                    ttnn.CoreCoord(device.dram_grid_size().x - 1, device.dram_grid_size().y - 1),
+                    ttnn.CoreCoord(self.mesh_device.dram_grid_size().x - 1, self.mesh_device.dram_grid_size().y - 1),
                 )
             }
         )
diff --git a/tests/scripts/tg/run_tg_model_perf_tests.sh b/tests/scripts/tg/run_tg_model_perf_tests.sh
index 7cd43da8c89..76d85050dcc 100755
--- a/tests/scripts/tg/run_tg_model_perf_tests.sh
+++ b/tests/scripts/tg/run_tg_model_perf_tests.sh
@@ -1,6 +1,10 @@
 #!/bin/bash
 
-run_tg_llm_tests() {
+run_t3k_tests_on_tg_tests() {
+
+  echo "LOG_METAL: Running T3000 tests on TG"
+  env pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_perf_decode.py -m "model_perf_t3000" --timeout=600 ; fail+=$?
+
   # Merge all the generated reports
   env python models/perf/merge_perf_results.py; fail+=$?
 
diff --git a/tests/ttnn/multichip_unit_tests/test_multidevice_TG.py b/tests/ttnn/multichip_unit_tests/test_multidevice_TG.py
index 3d927636f91..4ca13900f54 100644
--- a/tests/ttnn/multichip_unit_tests/test_multidevice_TG.py
+++ b/tests/ttnn/multichip_unit_tests/test_multidevice_TG.py
@@ -1573,3 +1573,19 @@ def test_sharded_distributed_layernorm(mesh_device, input_width, input_height, c
     is_pass, output_pcc = comp_pcc(torch_output_tensor, tt_output_tensor, pcc=0.999)
 
     assert is_pass, f"PCC value: {output_pcc}"
+
+
+def test_ttnn_multi_device_all_gather_all_devices(t3k_mesh_device):
+    """Example test for running a 2x4-Ring All-Gather on galaxy"""
+    full_tensor = torch.ones((1, 1, 32, 32 * t3k_mesh_device.get_num_devices()), dtype=torch.bfloat16)
+    for i in range(t3k_mesh_device.get_num_devices()):
+        full_tensor[..., i * 32 : (i + 1) * 32] = i
+
+    ttnn_tensor = ttnn.from_torch(full_tensor, mesh_mapper=ShardTensorToMesh(t3k_mesh_device, dim=3))
+    ttnn_tensor = ttnn.to_device(ttnn_tensor, t3k_mesh_device)
+    ttnn_tensor = ttnn.all_gather(ttnn_tensor, dim=3, num_links=1)
+
+    device_tensors: typing.List[ttnn.Tensor] = ttnn.get_device_tensors(ttnn_tensor)
+    for device_tensor in device_tensors:
+        device_tensor_torch = ttnn.to_torch(device_tensor)
+        assert torch.all(device_tensor_torch == full_tensor)
diff --git a/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp b/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp
index 6d7ed90ee8b..2d2e9950467 100644
--- a/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp
+++ b/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp
@@ -130,8 +130,9 @@ TEST(GalaxyTests, TestAllGatherDeadlock) {
     }
     // Iterate over each row and run line all-gather multiple times.
     // For each row, send adversarial traffic to the first chip, that can hang the network if the CCL is not tagged.
+    auto view = MeshDeviceView(*mesh);
     for (uint32_t row = 0; row < 8; row++) {
-        auto devs = mesh->get_devices_on_row(row);
+        auto devs = view.get_devices_on_row(row);
         std::vector<uint32_t> device_ids = {};
         for (auto dev : devs) {
             device_ids.push_back(dev->id());
@@ -189,13 +190,14 @@ TEST(GalaxyTests, TestReduceScatterDeadlock) {
     std::shared_ptr<MeshDevice> mesh = ttnn::multi_device::open_mesh_device(mesh_shape, 0, 0, 1, DispatchCoreType::WORKER);
     // Create the outer ring on which Reduce Scatter will be run. This allows us to verify that there are no deadlocks when we send CCLs to the
     // first tunnel (forward path).
-    std::vector<Device*> ring_devices = mesh->get_devices_on_row(0); // Tunnel 0
-    std::vector<Device*> ring_devices_1 = mesh->get_devices_on_column(mesh_shape.second - 1); // Orthogonal to tunnel .. no deadlocks
+    auto view = MeshDeviceView(*mesh);
+    std::vector<Device*> ring_devices = view.get_devices_on_row(0); // Tunnel 0
+    std::vector<Device*> ring_devices_1 = view.get_devices_on_column(mesh_shape.second - 1); // Orthogonal to tunnel .. no deadlocks
     ring_devices_1 = std::vector<Device*>(ring_devices_1.begin() + 1, ring_devices_1.end());
-    std::vector<Device*> ring_devices_2 = mesh->get_devices_on_row(7); // Tunnel 7 .. potential deadlocks with lack of buffering
+    std::vector<Device*> ring_devices_2 = view.get_devices_on_row(7); // Tunnel 7 .. potential deadlocks with lack of buffering
     std::reverse(ring_devices_2.begin(), ring_devices_2.end());
     ring_devices_2 = std::vector<Device*>(ring_devices_2.begin() + 1, ring_devices_2.end());
-    std::vector<Device*> ring_devices_3 = mesh->get_devices_on_column(0); // Orthogonal to tunnel .. no deadlocks
+    std::vector<Device*> ring_devices_3 = view.get_devices_on_column(0); // Orthogonal to tunnel .. no deadlocks
     std::reverse(ring_devices_3.begin(), ring_devices_3.end());
     ring_devices_3 = std::vector<Device*>(ring_devices_3.begin() + 1, ring_devices_3.end() - 1);
 
diff --git a/tests/ttnn/unit_tests/operations/test_all_gather.py b/tests/ttnn/unit_tests/operations/test_all_gather.py
index bc58fef10eb..501d85be176 100644
--- a/tests/ttnn/unit_tests/operations/test_all_gather.py
+++ b/tests/ttnn/unit_tests/operations/test_all_gather.py
@@ -119,7 +119,7 @@ def run_with_trace(
 
 
 def run_all_gather_impl(
-    all_devices,
+    t3k_mesh_device,
     num_devices,
     input_shape,
     dim,
@@ -130,12 +130,11 @@ def run_all_gather_impl(
     use_program_cache,
     function_level_defaults,
     all_gather_operation,
-    devices,
     num_iters=1,
     enable_async=False,
 ):
     # Use Async mode based on test input config
-    for device in all_devices:
+    for device in t3k_mesh_device.get_devices():
         device.enable_async(enable_async)
     if enable_async:
         logger.info(f"Using Async Mode for All Gather Op Dispatch")
@@ -149,17 +148,19 @@ def run_all_gather_impl(
     logger.info(f"dim: {dim}")
 
     input_tensor = torch.rand(input_shape).bfloat16()
+    input_tensor_mesh = ttnn.from_torch(
+        input_tensor,
+        dtype=input_dtype,
+        layout=layout,
+        device=t3k_mesh_device,
+        memory_config=mem_config,
+        mesh_mapper=ShardTensorToMesh(t3k_mesh_device, dim=dim),
+    )
 
-    input_tensors = torch.chunk(input_tensor, num_devices, dim)
-    tt_input_tensors = []
-    for i, t in enumerate(input_tensors):
-        tt_input_tensors.append(ttnn.Tensor(t, input_dtype).to(layout).to(devices[i], mem_config))
-
-    input_tensor_mesh = ttnn.aggregate_as_tensor(tt_input_tensors)
     for i in range(num_iters):
         tt_out_tensor = all_gather_operation(input_tensor_mesh, dim, num_links=num_links, memory_config=mem_config)
 
-        for d in devices:
+        for d in t3k_mesh_device.get_devices():
             ttnn.synchronize_device(d)
         logger.info(f"Done iteration {i}")
 
@@ -199,7 +200,7 @@ def run_all_gather_on_n300_impl(
         pytest.skip(f"Skipping unsupported case {message}.")
 
     return run_all_gather_impl(
-        all_devices,
+        t3k_mesh_device,
         num_devices,
         input_shape,
         dim,
@@ -210,14 +211,13 @@ def run_all_gather_on_n300_impl(
         use_program_cache,
         function_level_defaults,
         all_gather_operation,
-        all_devices,
         num_iters,
         enable_async,
     )
 
 
 def run_all_gather_on_t3000_impl(
-    all_devices,
+    t3k_mesh_device,
     num_devices,
     input_shape,
     dim,
@@ -231,7 +231,7 @@ def run_all_gather_on_t3000_impl(
     num_iters=1,
     enable_async=False,
 ):
-    if len(all_devices) != 8:
+    if t3k_mesh_device.get_num_devices() != 8:
         pytest.skip("Not T3000!")
 
     (is_known_failure, message) = is_unsupported_case_t3k(
@@ -240,10 +240,8 @@ def run_all_gather_on_t3000_impl(
     if is_known_failure:
         pytest.skip(f"Skipping unsupported case {message}.")
 
-    devices = get_devices_for_t3000(all_devices, num_devices)
-
     return run_all_gather_impl(
-        all_devices,
+        t3k_mesh_device,
         num_devices,
         input_shape,
         dim,
@@ -254,14 +252,13 @@ def run_all_gather_on_t3000_impl(
         use_program_cache,
         function_level_defaults,
         all_gather_operation,
-        devices,
         num_iters,
         enable_async,
     )
 
 
 def run_all_gather_on_t3000_impl_tight_loop(
-    all_devices,
+    t3k_mesh_device,
     num_devices,
     input_shape,
     dim,
@@ -276,7 +273,7 @@ def run_all_gather_on_t3000_impl_tight_loop(
     enable_async=False,
 ):
     run_all_gather_on_t3000_impl(
-        all_devices,
+        t3k_mesh_device,
         num_devices,
         input_shape,
         dim,
@@ -324,7 +321,7 @@ def run_all_gather_on_t3000_impl_tight_loop(
 @pytest.mark.parametrize("num_iters", [1])  # restore to 500: https://github.com/tenstorrent/tt-metal/issues/9686
 @pytest.mark.parametrize("enable_async", [True, False])
 def test_all_gather_on_t3000_post_commit_looping(
-    all_devices,
+    t3k_mesh_device,
     num_devices,
     input_shape,
     dim,
@@ -338,7 +335,7 @@ def test_all_gather_on_t3000_post_commit_looping(
     enable_async,
 ):
     run_all_gather_on_t3000_impl_tight_loop(
-        all_devices,
+        t3k_mesh_device,
         num_devices,
         input_shape,
         dim,
@@ -386,7 +383,7 @@ def test_all_gather_on_t3000_post_commit_looping(
 @pytest.mark.parametrize("num_iters", [1000])  # TODO: restore to 500
 @pytest.mark.parametrize("enable_async", [True, False])
 def test_all_gather_on_t3000_nightly_commit_looping(
-    all_devices,
+    t3k_mesh_device,
     num_devices,
     input_shape,
     dim,
@@ -400,7 +397,7 @@ def test_all_gather_on_t3000_nightly_commit_looping(
     enable_async,
 ):
     run_all_gather_on_t3000_impl_tight_loop(
-        all_devices,
+        t3k_mesh_device,
         num_devices,
         input_shape,
         dim,
@@ -422,6 +419,90 @@ def test_all_gather_on_t3000_nightly_commit_looping(
     "num_devices, num_links, input_shape, dim, layout",
     [
         (4, 2, [4, 1, 33, 256], 0, ttnn.ROW_MAJOR_LAYOUT),  # https://github.com/tenstorrent/tt-metal/issues/9686
+        # (8, 1, [8, 8, 256, 384], 1, ttnn.ROW_MAJOR_LAYOUT),           # https://github.com/tenstorrent/tt-metal/issues/9686
+        # (4, 2, [8, 8, 256, 384], 1, ttnn.ROW_MAJOR_LAYOUT),           # https://github.com/tenstorrent/tt-metal/issues/9686
+        # (4, 2, [8, 8, 256, 384], 1, ttnn.TILE_LAYOUT),           # https://github.com/tenstorrent/tt-metal/issues/9686
+        # (8, 1, [8, 8, 256, 384], 1, ttnn.TILE_LAYOUT),           # https://github.com/tenstorrent/tt-metal/issues/9686
+        # (4, 2, [8, 5, 13, 384], 3, ttnn.ROW_MAJOR_LAYOUT),           # https://github.com/tenstorrent/tt-metal/issues/9686
+        # (8, 1, [8, 5, 13, 512], 3, ttnn.ROW_MAJOR_LAYOUT),           # https://github.com/tenstorrent/tt-metal/issues/9686
+        # (4, 2, [8, 5, 32, 384], 3, ttnn.TILE_LAYOUT),           # https://github.com/tenstorrent/tt-metal/issues/9686
+        # (8, 1, [8, 5, 32, 512], 3, ttnn.TILE_LAYOUT),
+        # Only for BFP8B
+        # # ([1, 1, 640, 32768], 3, ttnn.TILE_LAYOUT),        # https://github.com/tenstorrent/tt-metal/issues/9686
+        # # MLP AllGather,  Llama 2 decode attn, mlp. Llama2, Falcon 40B decode mlp attn
+        # (8, 1, [1, 1, 32, 32768], 3, ttnn.TILE_LAYOUT),        # https://github.com/tenstorrent/tt-metal/issues/9686
+        # (4, 2, [1, 1, 32, 16384], 3, ttnn.TILE_LAYOUT),        # https://github.com/tenstorrent/tt-metal/issues/9686
+        # # (4, 2, [1, 1, 32, 32768], 3, ttnn.TILE_LAYOUT),        # https://github.com/tenstorrent/tt-metal/issues/9686
+        # # (8, 1, [1, 1, 32, 32768], 3, ttnn.ROW_MAJOR_LAYOUT),        # https://github.com/tenstorrent/tt-metal/issues/9686
+        # # Input, Selfout, Final AllGather,  Llama2, Falcon 40B decode mlp attn
+        # (8, 1, [1, 1, 32, 8192], 3, ttnn.TILE_LAYOUT),        # https://github.com/tenstorrent/tt-metal/issues/9686
+        # (4, 2, [1, 1, 32, 8192], 3, ttnn.TILE_LAYOUT),        # https://github.com/tenstorrent/tt-metal/issues/9686
+        # (8, 1, [1, 1, 32, 8192], 3, ttnn.ROW_MAJOR_LAYOUT),        # https://github.com/tenstorrent/tt-metal/issues/9686
+        # # Falcon 40B prefill
+        # # 8 chips
+        # (8, 1, [1, 1, 2048, 8192], 3, ttnn.TILE_LAYOUT),          # https://github.com/tenstorrent/tt-metal/issues/9686
+        # (8, 1, [1, 1, 2048, 8192], 3, ttnn.ROW_MAJOR_LAYOUT),          # https://github.com/tenstorrent/tt-metal/issues/9686
+        # # Falcon 40B prefill, also mixtral expert reduction (w/ zero filled tensor)
+        # # 8 chips
+        # (8, 1, [1, 1, 2048, 32768], 3, ttnn.TILE_LAYOUT),  # https://github.com/tenstorrent/tt-metal/issues/9686
+        # # Llama/falcon40B galaxy mlp weights stationary -> emulation of row/col reduce
+        # (8, 1, [1, 1, 256, 1024], 2, ttnn.TILE_LAYOUT),          # https://github.com/tenstorrent/tt-metal/issues/9686
+        # (8, 1, [1, 1, 246, 4096], 2, ttnn.ROW_MAJOR_LAYOUT),          # https://github.com/tenstorrent/tt-metal/issues/9686
+        # (8, 1, [1, 1, 246, 4096], 2, ttnn.TILE_LAYOUT),          # https://github.com/tenstorrent/tt-metal/issues/9686
+        # (8, 1, [1, 1, 8192, 32], 2, ttnn.TILE_LAYOUT),          # https://github.com/tenstorrent/tt-metal/issues/9686
+        # (8, 1, [1, 1, 1024, 256], 3, ttnn.TILE_LAYOUT),          # https://github.com/tenstorrent/tt-metal/issues/9686
+        # (8, 1, [1, 1, 256, 2048], 2, ttnn.TILE_LAYOUT),          # https://github.com/tenstorrent/tt-metal/issues/9686
+        # (8, 1, [1, 1, 256, 8192], 2, ttnn.TILE_LAYOUT),  # double on reduction dim for 8 chip          # https://github.com/tenstorrent/tt-metal/issues/9686
+        # (8, 1, [8, 1, 256, 32], 0, ttnn.TILE_LAYOUT),          # https://github.com/tenstorrent/tt-metal/issues/9686
+        # (8, 1, [8, 8, 128, 4096], 1, ttnn.TILE_LAYOUT),          # https://github.com/tenstorrent/tt-metal/issues/9686
+    ],
+)
+@pytest.mark.parametrize(
+    "input_dtype",
+    [
+        ttnn.bfloat16,
+        # ttnn.bfloat8_b,          # https://github.com/tenstorrent/tt-metal/issues/9686
+    ],
+)
+@pytest.mark.parametrize(
+    "mem_config",
+    [
+        ttnn.MemoryConfig(buffer_type=ttnn.BufferType.DRAM),
+        # ttnn.MemoryConfig(buffer_type=ttnn.BufferType.L1),  # https://github.com/tenstorrent/tt-metal/issues/9686
+    ],
+)
+def test_x4_all_gather_on_t3000_post_commit(
+    pcie_mesh_device,
+    num_devices,
+    input_shape,
+    dim,
+    num_links,
+    input_dtype,
+    layout,
+    mem_config,
+    use_program_cache,
+    function_level_defaults,
+):
+    run_all_gather_on_t3000_impl(
+        pcie_mesh_device,
+        num_devices,
+        input_shape,
+        dim,
+        num_links,
+        input_dtype,
+        layout,
+        mem_config,
+        use_program_cache,
+        function_level_defaults,
+        all_gather_operation=ttnn.all_gather,
+    )
+
+
+# Enumerate the post-commit cases explicitly
+@skip_for_grayskull("Requires eth connected devices to run")
+@pytest.mark.parametrize(
+    "num_devices, num_links, input_shape, dim, layout",
+    [
         (8, 1, [8, 1, 33, 256], 0, ttnn.ROW_MAJOR_LAYOUT),  # https://github.com/tenstorrent/tt-metal/issues/9686
         # (8, 1, [8, 8, 256, 384], 1, ttnn.ROW_MAJOR_LAYOUT),           # https://github.com/tenstorrent/tt-metal/issues/9686
         # (4, 2, [8, 8, 256, 384], 1, ttnn.ROW_MAJOR_LAYOUT),           # https://github.com/tenstorrent/tt-metal/issues/9686
@@ -476,7 +557,7 @@ def test_all_gather_on_t3000_nightly_commit_looping(
     ],
 )
 def test_all_gather_on_t3000_post_commit(
-    all_devices,
+    t3k_mesh_device,
     num_devices,
     input_shape,
     dim,
@@ -488,7 +569,7 @@ def test_all_gather_on_t3000_post_commit(
     function_level_defaults,
 ):
     run_all_gather_on_t3000_impl(
-        all_devices,
+        t3k_mesh_device,
         num_devices,
         input_shape,
         dim,
@@ -557,7 +638,7 @@ def run_line_all_gather(
 
 # This run function is deprecated and is only intended to be used by 2-link tests on t3k
 def run_line_all_gather_deprecated(
-    all_devices,
+    pcie_mesh_device,
     num_devices,
     input_shape,
     dim,
@@ -570,10 +651,7 @@ def run_line_all_gather_deprecated(
     enable_async,
     num_iters=1,
 ):
-    if len(all_devices) != 8:
-        pytest.skip("Not T3000!")
-
-    for device in all_devices:
+    for device in pcie_mesh_device.get_devices():
         device.enable_async(enable_async)
 
     logger.info(f"Input shape: {input_shape}")
@@ -585,23 +663,23 @@ def run_line_all_gather_deprecated(
     if is_known_failure:
         pytest.skip(f"Skipping unsupported case {message}.")
 
-    devices = get_devices_for_t3000(all_devices, num_devices)
-
     logger.info(f"Input shape: {input_shape}")
     logger.info(f"dim: {dim}")
 
     input_tensor = torch.rand(input_shape).bfloat16()
 
-    input_tensors = torch.chunk(input_tensor, num_devices, dim)
-    tt_input_tensors = []
-    for i, t in enumerate(input_tensors):
-        tt_input_tensors.append(ttnn.Tensor(t, input_dtype).to(layout).to(devices[i], mem_config))
-
-    input_tensor_mesh = ttnn.aggregate_as_tensor(tt_input_tensors)
+    input_tensor_mesh = ttnn.from_torch(
+        input_tensor,
+        mesh_mapper=ShardTensorToMesh(pcie_mesh_device, dim=dim),
+        memory_config=mem_config,
+        device=pcie_mesh_device,
+        layout=layout,
+        dtype=input_dtype,
+    )
     for i in range(num_iters):
         tt_out_tensor = ttnn.line_all_gather(input_tensor_mesh, dim, num_links=num_links, memory_config=mem_config)
 
-        for d in devices:
+        for d in pcie_mesh_device.get_devices():
             ttnn.synchronize_device(d)
         logger.info(f"Done iteration {i}")
 
@@ -711,7 +789,7 @@ def test_line_all_gather_on_t3000_post_commit(
 )
 @pytest.mark.parametrize("enable_async", [True, False])
 def test_line_all_gather_on_t3000_post_commit_two_link(
-    all_devices,
+    pcie_mesh_device,
     num_devices,
     input_shape,
     dim,
@@ -725,7 +803,7 @@ def test_line_all_gather_on_t3000_post_commit_two_link(
     num_iters=1,
 ):
     run_line_all_gather_deprecated(
-        all_devices,
+        pcie_mesh_device,
         num_devices,
         input_shape,
         dim,
@@ -984,7 +1062,7 @@ def test_all_gather_on_t3000_nightly(
         pytest.xfail(reason="Known failure")
 
     run_all_gather_on_t3000_impl(
-        all_devices,
+        t3k_mesh_device,
         num_devices,
         input_shape,
         dim,
@@ -1094,12 +1172,14 @@ def run_all_gather_sharded(
     tt_input_tensors_dups = []
     tt_input_tensors = []
 
-    for i, t in enumerate(input_tensors):
-        tt_input_tensors_dups.append(ttnn.Tensor(t, input_dtype).to(tensor_layout).to(devices[i], input_mem_config))
-        tt_input_tensors.append(ttnn.Tensor(t, input_dtype).to(tensor_layout).to(devices[i], input_mem_config))
-
-    input_tensor_mesh = ttnn.aggregate_as_tensor(tt_input_tensors)
-
+    input_tensor_mesh = ttnn.from_torch(
+        unchunked_input_tensor,
+        layout=tensor_layout,
+        dtype=input_dtype,
+        device=t3k_mesh_device,
+        memory_config=input_mem_config,
+        mesh_mapper=ShardTensorToMesh(t3k_mesh_device, dim=dim),
+    )
     if trace_mode:
         tt_out_tensor = run_with_trace(
             t3k_mesh_device,
diff --git a/tests/ttnn/unit_tests/operations/test_all_gather_llama_perf_sweep.py b/tests/ttnn/unit_tests/operations/test_all_gather_llama_perf_sweep.py
index 18f9855f85b..d04d4a6094c 100644
--- a/tests/ttnn/unit_tests/operations/test_all_gather_llama_perf_sweep.py
+++ b/tests/ttnn/unit_tests/operations/test_all_gather_llama_perf_sweep.py
@@ -7,7 +7,7 @@
 from loguru import logger
 import ttnn
 from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_equal, comp_pcc
-from models.utility_functions import skip_for_grayskull, get_devices_for_t3000
+from models.utility_functions import skip_for_grayskull
 import itertools
 from ttnn import ShardTensorToMesh
 from tests.ttnn.unit_tests.operations.test_all_gather import run_all_gather_sharded
diff --git a/tests/ttnn/unit_tests/operations/test_all_gather_nightly.py b/tests/ttnn/unit_tests/operations/test_all_gather_nightly.py
index 43ce613608e..7c4005cda1b 100644
--- a/tests/ttnn/unit_tests/operations/test_all_gather_nightly.py
+++ b/tests/ttnn/unit_tests/operations/test_all_gather_nightly.py
@@ -7,7 +7,7 @@
 from loguru import logger
 import ttnn
 from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_equal, comp_pcc
-from models.utility_functions import skip_for_grayskull, get_devices_for_t3000
+from models.utility_functions import skip_for_grayskull
 from tests.ttnn.unit_tests.operations.test_all_gather import (
     is_unsupported_case,
     run_line_all_gather,
diff --git a/tests/ttnn/unit_tests/operations/test_distributed_layernorm_sharded.py b/tests/ttnn/unit_tests/operations/test_distributed_layernorm_sharded.py
index 9f256b465f5..d07d54d7d12 100644
--- a/tests/ttnn/unit_tests/operations/test_distributed_layernorm_sharded.py
+++ b/tests/ttnn/unit_tests/operations/test_distributed_layernorm_sharded.py
@@ -13,7 +13,7 @@
     comp_allclose,
 )
 
-from models.utility_functions import tt2torch_tensor, get_devices_for_t3000, skip_for_grayskull
+from models.utility_functions import tt2torch_tensor, skip_for_grayskull
 
 
 def rms_norm(x, gamma, eps):
diff --git a/tests/ttnn/unit_tests/operations/test_reduce_scatter_llama_perf_sweep.py b/tests/ttnn/unit_tests/operations/test_reduce_scatter_llama_perf_sweep.py
index bfee281eb3b..1f6aef79beb 100644
--- a/tests/ttnn/unit_tests/operations/test_reduce_scatter_llama_perf_sweep.py
+++ b/tests/ttnn/unit_tests/operations/test_reduce_scatter_llama_perf_sweep.py
@@ -7,7 +7,7 @@
 from loguru import logger
 import ttnn
 from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_equal, comp_pcc
-from models.utility_functions import skip_for_grayskull, get_devices_for_t3000
+from models.utility_functions import skip_for_grayskull
 from tests.ttnn.unit_tests.operations.test_reduce_scatter_post_commit import run_reduce_scatter_sharded_test
 
 
diff --git a/tests/ttnn/unit_tests/operations/test_reduce_scatter_nightly.py b/tests/ttnn/unit_tests/operations/test_reduce_scatter_nightly.py
index 9ab877af87c..d7e91e884d6 100644
--- a/tests/ttnn/unit_tests/operations/test_reduce_scatter_nightly.py
+++ b/tests/ttnn/unit_tests/operations/test_reduce_scatter_nightly.py
@@ -11,7 +11,7 @@
     is_unsupported_case,
     run_reduce_scatter_test,
 )
-from models.utility_functions import skip_for_grayskull, get_devices_for_t3000
+from models.utility_functions import skip_for_grayskull
 import itertools
 
 
diff --git a/tests/ttnn/unit_tests/operations/test_reduce_scatter_post_commit.py b/tests/ttnn/unit_tests/operations/test_reduce_scatter_post_commit.py
index 8749f14f5b6..fc3c6ce8bbc 100644
--- a/tests/ttnn/unit_tests/operations/test_reduce_scatter_post_commit.py
+++ b/tests/ttnn/unit_tests/operations/test_reduce_scatter_post_commit.py
@@ -7,7 +7,7 @@
 from loguru import logger
 import ttnn
 from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_equal, comp_pcc
-from models.utility_functions import skip_for_grayskull, get_devices_for_t3000
+from models.utility_functions import skip_for_grayskull
 
 
 def is_unsupported_case(input_shape, scatter_dim, math_op, mem_config, num_devices, num_links, input_dtype, layout):
diff --git a/tests/ttnn/unit_tests/test_multi_device.py b/tests/ttnn/unit_tests/test_multi_device.py
index f1c2728857f..982e539689d 100644
--- a/tests/ttnn/unit_tests/test_multi_device.py
+++ b/tests/ttnn/unit_tests/test_multi_device.py
@@ -587,3 +587,7 @@ def test_validate_as_tensor(tmp_path, mesh_device, height, width):
     for device in mesh_device.get_devices():
         device_tensor = ttnn.get_device_tensor(tensor, device)
         assert torch.allclose(ttnn.to_torch(device_tensor), torch_input_tensor)
+
+
+def test_ttnn_visualize_mesh_device(t3k_mesh_device):
+    ttnn.visualize_mesh_device(t3k_mesh_device)
diff --git a/tt_metal/impl/device/mesh_configurations/T3000.json b/tt_metal/impl/device/mesh_configurations/T3000.json
index 2c62209d01f..acfe3edac00 100644
--- a/tt_metal/impl/device/mesh_configurations/T3000.json
+++ b/tt_metal/impl/device/mesh_configurations/T3000.json
@@ -1,6 +1,6 @@
 {
     "logical_to_physical_coordinates": [
       [[0, 0], [0, 0, 0, 0]], [[0, 1], [0, 1, 0, 0]], [[0, 2], [0, 2, 0, 0]], [[0, 3], [0, 3, 0, 0]],
-      [[1, 0], [1, 3, 0, 0]], [[1, 1], [1, 2, 0, 0]], [[1, 2], [1, 1, 0, 0]], [[1, 3], [1, 0, 0, 0]]
+      [[1, 0], [1, 0, 0, 0]], [[1, 1], [1, 1, 0, 0]], [[1, 2], [1, 2, 0, 0]], [[1, 3], [1, 3, 0, 0]]
     ]
 }
diff --git a/tt_metal/impl/device/mesh_device.cpp b/tt_metal/impl/device/mesh_device.cpp
index e90d4a8925e..7a57ad834c2 100644
--- a/tt_metal/impl/device/mesh_device.cpp
+++ b/tt_metal/impl/device/mesh_device.cpp
@@ -127,6 +127,14 @@ std::vector<chip_id_t> SystemMesh::get_mapped_physical_device_ids(const MeshDevi
     }
     return physical_device_ids;
 }
+void SystemMesh::register_mesh_device(const std::shared_ptr<MeshDevice> &mesh_device, const std::vector<Device*>& devices) {
+    std::vector<chip_id_t> physical_device_ids;
+    for (auto device : devices) {
+        physical_device_ids.push_back(device->id());
+    }
+    this->assigned_mesh_device_devices.insert({mesh_device->get_mesh_id(), mesh_device});
+    this->assigned_devices.insert({mesh_device->get_mesh_id(), physical_device_ids});
+}
 
 std::vector<Device*> SystemMesh::map_mesh_device(
     std::shared_ptr<MeshDevice> mesh_device,
@@ -145,7 +153,6 @@ std::vector<Device*> SystemMesh::map_mesh_device(
     TT_FATAL(requested_num_rows <= max_num_rows, "Requested too many rows: {} > {}", requested_num_rows, max_num_rows);
     TT_FATAL(requested_num_rows*requested_num_cols <= max_num_rows*max_num_cols, "Requested submesh is too big: {}x{}", requested_num_rows, requested_num_cols);
 
-    this->assigned_mesh_device_devices.insert({mesh_device->get_mesh_id(), mesh_device});
 
     auto physical_device_ids = user_provided_physical_device_ids.empty() ?
         this->get_mapped_physical_device_ids(MeshDeviceConfig{mesh_device->shape(), offset}) :
@@ -158,27 +165,34 @@ std::vector<Device*> SystemMesh::map_mesh_device(
     for (auto physical_device_id : physical_device_ids) {
         auto mapped_device = this->opened_devices[mesh_device->get_mesh_id()].at(physical_device_id);
         mapped_devices.push_back(mapped_device);
-        this->assigned_devices[mesh_device->get_mesh_id()].push_back(physical_device_id);
         this->assigned_physical_id_to_device.insert({physical_device_id, mapped_device});
     }
+
+    this->register_mesh_device(mesh_device, mapped_devices); // TODO: change this
     return mapped_devices;
 }
 
 void SystemMesh::unmap_mesh_device(const std::shared_ptr<MeshDevice>& mesh_device) {
     auto mesh_id = mesh_device->get_mesh_id();
-
-    // Clean up all state related to this virtual mesh
     this->assigned_mesh_device_devices.erase(mesh_id);
 
-    // Remove the devices from assigned_physical_id_to_device
-    for (auto physical_id : this->assigned_devices.at(mesh_id)) {
-        this->assigned_physical_id_to_device.erase(physical_id);
+    // Close the devices
+    if (mesh_device->is_parent_mesh()) {
+        for (auto physical_id : this->assigned_devices.at(mesh_id)) {
+            this->assigned_physical_id_to_device.erase(physical_id);
+        }
+        tt::tt_metal::detail::CloseDevices(this->opened_devices.at(mesh_id));
+        this->opened_devices.erase(mesh_id);
     }
     this->assigned_devices.erase(mesh_id);
+}
 
-    // Close the devices
-    tt::tt_metal::detail::CloseDevices(this->opened_devices.at(mesh_id));
-    this->opened_devices.erase(mesh_id);
+Device* SystemMesh::get_device(const chip_id_t physical_device_id) {
+    auto it = this->assigned_physical_id_to_device.find(physical_device_id);
+    if (it == this->assigned_physical_id_to_device.end()) {
+        TT_THROW("Physical Device ID: {} not found in assigned devices", physical_device_id);
+    }
+    return it->second;
 }
 
 static MeshDeviceID generate_unique_mesh_id() {
@@ -186,7 +200,8 @@ static MeshDeviceID generate_unique_mesh_id() {
     return next_id++;
 }
 
-MeshDevice::MeshDevice(const MeshShape& mesh_device_shape) : mesh_device_shape(mesh_device_shape), mesh_id(generate_unique_mesh_id()) {}
+MeshDevice::MeshDevice(const MeshShape& mesh_device_shape, std::shared_ptr<MeshDevice> parent_mesh)
+    : mesh_device_shape(mesh_device_shape), mesh_id(generate_unique_mesh_id()), parent_mesh(parent_mesh) {}
 
 std::shared_ptr<MeshDevice> MeshDevice::create(
     const MeshShape& mesh_device_shape,
@@ -203,6 +218,36 @@ std::shared_ptr<MeshDevice> MeshDevice::create(
     return mesh_device;
 }
 
+std::shared_ptr<MeshDevice> MeshDevice::create_submesh(const MeshShape &submesh_shape, const MeshOffset &offset) {
+    if (submesh_shape.first <= 0 || submesh_shape.second <= 0) {
+        TT_THROW("Invalid submesh shape: ({}, {}). Both dimensions must be positive.", submesh_shape.first, submesh_shape.second);
+    }
+
+    if (offset.first < 0 || offset.second < 0) {
+        TT_THROW("Invalid offset: ({}, {}). Offset must be non-negative.", offset.first, offset.second);
+    }
+
+    if (offset.first + submesh_shape.first > this->mesh_device_shape.first ||
+        offset.second + submesh_shape.second > this->mesh_device_shape.second) {
+        TT_THROW("Submesh ({}x{}) with offset ({}, {}) does not fit within parent mesh ({}x{}).",
+                 submesh_shape.first, submesh_shape.second,
+                 offset.first, offset.second,
+                 this->mesh_device_shape.first, this->mesh_device_shape.second);
+    }
+
+    auto submesh = std::make_shared<MeshDevice>(submesh_shape, shared_from_this());
+    auto start_coordinate = Coordinate{offset.first, offset.second};
+    auto end_coordinate = Coordinate{offset.first + submesh_shape.first - 1, offset.second + submesh_shape.second - 1};
+    submesh->primary_view = std::make_unique<MeshDeviceView>(*this, start_coordinate, end_coordinate);
+    submesh->devices = submesh->primary_view->get_devices();
+    SystemMesh::instance().register_mesh_device(submesh, submesh->devices);
+    this->submeshes.push_back(submesh);
+    log_trace(LogMetal, "Instantiating submesh {}: {}x{} with offset: {} {}", submesh->get_mesh_id(), submesh_shape.first, submesh_shape.second, offset.first, offset.second);
+    log_trace(LogMetal, "Submesh {} instantiated with {} devices", submesh->get_mesh_id(), submesh->devices);
+
+    return submesh;
+}
+
 void MeshDevice::initialize(
     size_t l1_small_size,
     size_t trace_region_size,
@@ -223,16 +268,18 @@ void MeshDevice::initialize(
     this->devices = instance.map_mesh_device(
         shared_from_this(), num_command_queues, l1_small_size, trace_region_size, dispatch_core_type, offset, physical_device_ids);
     this->primary_view = std::make_unique<tt::tt_metal::MeshDeviceView>(*this);
-
-    for (int device_index = 0; device_index < this->devices.size(); device_index++) {
-        this->physical_id_to_device_index.insert({this->devices[device_index]->id(), device_index});
-    }
 }
 
 MeshDevice::~MeshDevice() {
     if (not this->devices.empty()) {
         this->close_devices();
     }
+    for (auto submesh : this->submeshes) {
+        submesh->close_devices();
+    }
+    this->primary_view.reset();
+    this->devices.clear();
+    this->parent_mesh.reset();
 }
 
 Device* MeshDevice::get_device_index(int logical_device_id) const {
@@ -241,23 +288,15 @@ Device* MeshDevice::get_device_index(int logical_device_id) const {
 }
 
 Device* MeshDevice::get_device(int physical_device_id) const {
-    return this->devices.at(this->physical_id_to_device_index.at(physical_device_id));
+    return SystemMesh::instance().get_device(physical_device_id);
 }
 
-std::vector<Device*> MeshDevice::get_devices() const { return this->devices; }
+std::vector<Device*> MeshDevice::get_devices() const { return this->primary_view->get_devices(IterationOrder::LINE); }
 
 Device* MeshDevice::get_device(int row_idx, int col_idx) const {
     return this->get_device_index(row_idx * num_cols() + col_idx);
 }
 
-std::vector<Device*> MeshDevice::get_devices_on_row(int row_idx) const {
-    return this->primary_view->get_devices_on_row(row_idx);
-}
-
-std::vector<Device*> MeshDevice::get_devices_on_column(int col_idx) const {
-    return this->primary_view->get_devices_on_column(col_idx);
-}
-
 const DeviceIds MeshDevice::get_device_ids() const {
     DeviceIds device_ids;
     for (auto device : this->get_devices()) {
@@ -283,7 +322,6 @@ MeshShape MeshDevice::shape() const { return this->mesh_device_shape; }
 void MeshDevice::close_devices() {
     SystemMesh::instance().unmap_mesh_device(shared_from_this());
     this->devices.clear();
-    this->physical_id_to_device_index.clear();
     this->primary_view.reset();
 }
 
@@ -295,8 +333,60 @@ std::shared_ptr<const MeshDeviceView> MeshDevice::get_view() const { return this
 
 std::shared_ptr<MeshDeviceView> MeshDevice::get_view() { return this->primary_view; }
 
+std::vector<std::shared_ptr<MeshDeviceView>> MeshDevice::get_submesh_views() {
+    std::vector<std::shared_ptr<MeshDeviceView>> submesh_views;
+    if (this->submeshes.empty()) {
+        submesh_views.push_back(this->get_view());
+    }
+    else {
+        for (auto submesh : this->submeshes) {
+            submesh_views.push_back(submesh->get_view());
+        }
+    }
+    return submesh_views;
+}
+
 MeshDeviceID MeshDevice::get_mesh_id() const { return this->mesh_id; }
 
+bool MeshDevice::is_parent_mesh() const { return this->parent_mesh == nullptr; }
+
+std::shared_ptr<MeshDevice> SystemMesh::get_mesh_device(const std::vector<chip_id_t>& physical_device_ids) {
+    log_trace(LogMetal, "Getting mesh device for {} physical devices: {}", physical_device_ids.size(), physical_device_ids);
+    std::unordered_set<chip_id_t> input_set(physical_device_ids.begin(), physical_device_ids.end());
+
+    for (const auto& [mesh_id, mesh_device] : this->assigned_mesh_device_devices) {
+        const auto& assigned_devices = this->assigned_devices.at(mesh_id);
+        std::unordered_set<chip_id_t> assigned_set(assigned_devices.begin(), assigned_devices.end());
+        log_trace(LogMetal, "Assigned devices: {}", assigned_devices);
+
+        if (input_set == assigned_set) {
+            return mesh_device;
+        }
+    }
+    TT_THROW("No mesh device found for the provided devices");
+}
+
+std::shared_ptr<MeshDevice> MeshDevice::fetch_mesh_device(const std::vector<Device*>& devices) {
+    TT_FATAL(devices.size() > 0, "No devices provided");
+    auto& instance = SystemMesh::instance();
+    std::vector<chip_id_t> physical_device_ids;
+    for (auto device : devices) {
+        physical_device_ids.push_back(device->id());
+    }
+    return instance.get_mesh_device(physical_device_ids);
+}
+
+std::vector<std::shared_ptr<MeshDevice>> MeshDevice::get_submeshes() const { return this->submeshes; }
+
+std::shared_ptr<MeshDeviceView> MeshDevice::get_view(const Device* device) {
+    for (auto submesh_view : this->get_submesh_views()) {
+        if (submesh_view->contains_device(device->id())) {
+            return submesh_view;
+        }
+    }
+    TT_THROW("Device {} not found in any submesh view", device->id());
+}
+
 std::ostream& operator<<(std::ostream& os, const MeshDevice& mesh_device) { return os << mesh_device.to_string(); }
 
 bool validate_worker_modes(const std::vector<Device*>& workers) {
diff --git a/tt_metal/impl/device/mesh_device.hpp b/tt_metal/impl/device/mesh_device.hpp
index 940110973cc..8a5cccc041a 100644
--- a/tt_metal/impl/device/mesh_device.hpp
+++ b/tt_metal/impl/device/mesh_device.hpp
@@ -72,6 +72,7 @@ class SystemMesh {
 
     // Get the physical device IDs mapped to a MeshDevice
     std::vector<chip_id_t> get_mapped_physical_device_ids(const MeshDeviceConfig &config) const;
+    void register_mesh_device(const std::shared_ptr<MeshDevice> &mesh_device, const std::vector<Device*>& devices);
 
     // Map MeshDevice to physical devices
     std::vector<Device *> map_mesh_device(
@@ -85,14 +86,18 @@ class SystemMesh {
 
     // Unmap MeshDevice, releasing the associated physical devices.
     void unmap_mesh_device(const std::shared_ptr<MeshDevice> &mesh_device);
+    std::shared_ptr<MeshDevice> get_mesh_device(const std::vector<chip_id_t>& physical_device_ids);
+    Device* get_device(const chip_id_t physical_device_id);
 };
 
 class MeshDevice : public std::enable_shared_from_this<MeshDevice> {
+  private:
     MeshDeviceID mesh_id;
     MeshShape mesh_device_shape;
     std::shared_ptr<MeshDeviceView> primary_view;
     std::vector<Device *> devices;
-    std::unordered_map<chip_id_t, int> physical_id_to_device_index;
+    std::shared_ptr<MeshDevice> parent_mesh;
+    std::vector<std::shared_ptr<MeshDevice>> submeshes;
 
     void initialize(
         size_t l1_small_size,
@@ -103,7 +108,7 @@ class MeshDevice : public std::enable_shared_from_this<MeshDevice> {
         const std::vector<chip_id_t> &physical_device_ids);
 
    public:
-    MeshDevice(const MeshShape &mesh_device_shape);
+    MeshDevice(const MeshShape &mesh_device_shape, std::shared_ptr<MeshDevice> parent_mesh = nullptr);
     ~MeshDevice();
 
     MeshDevice(const MeshDevice &) = delete;
@@ -116,8 +121,6 @@ class MeshDevice : public std::enable_shared_from_this<MeshDevice> {
     Device *get_device_index(int logical_device_id) const;
     Device *get_device(int physical_device_id) const;
     Device *get_device(int row_idx, int col_idx) const;
-    std::vector<Device *> get_devices_on_row(int row_idx) const;
-    std::vector<Device *> get_devices_on_column(int col_idx) const;
 
     const DeviceIds get_device_ids() const;
 
@@ -138,6 +141,7 @@ class MeshDevice : public std::enable_shared_from_this<MeshDevice> {
 
     std::string to_string() const;
     MeshDeviceID get_mesh_id() const;
+    bool is_parent_mesh() const;
 
     static std::shared_ptr<MeshDevice> create(
         const MeshShape &mesh_device_shape,
@@ -147,6 +151,13 @@ class MeshDevice : public std::enable_shared_from_this<MeshDevice> {
         DispatchCoreType dispatch_core_type,
         const std::pair<size_t, size_t> &offset = {0, 0},
         const std::vector<chip_id_t> &physical_device_ids = {});
+
+    std::vector<std::shared_ptr<MeshDevice>> get_submeshes() const;
+    std::vector<std::shared_ptr<MeshDeviceView>> get_submesh_views();
+    std::shared_ptr<MeshDeviceView> get_view(const Device* device);
+
+    std::shared_ptr<MeshDevice> create_submesh(const MeshShape &submesh_shape, const MeshOffset &offset = {0, 0});
+    static std::shared_ptr<MeshDevice> fetch_mesh_device(const std::vector<Device*>& devices);
 };
 
 std::ostream &operator<<(std::ostream &os, const MeshDevice &mesh_device);
diff --git a/tt_metal/impl/device/mesh_device_view.cpp b/tt_metal/impl/device/mesh_device_view.cpp
index cc4a227780f..8276596b9f1 100644
--- a/tt_metal/impl/device/mesh_device_view.cpp
+++ b/tt_metal/impl/device/mesh_device_view.cpp
@@ -11,6 +11,16 @@ namespace tt::tt_metal {
 
 using MeshDevice = tt::tt_metal::MeshDevice;
 
+static std::vector<MeshDeviceView::device_pointer> get_devices_from_coordinates(MeshDeviceView& mesh, const std::vector<Coordinate>& coords) {
+    std::vector<MeshDeviceView::device_pointer> devices;
+    for (const auto& coord : coords) {
+        if (auto device = mesh.get_device(coord.row, coord.col)) {
+            devices.push_back(device);
+        }
+    }
+    return devices;
+}
+
 MeshDeviceView::MeshDeviceView(const MeshDevice& mesh)
     : top_left_(0, 0), bottom_right_(mesh.num_rows() - 1, mesh.num_cols() - 1) {
     for (size_t row = 0; row < mesh.num_rows(); ++row) {
@@ -24,12 +34,12 @@ MeshDeviceView::MeshDeviceView(const MeshDevice& mesh)
 }
 
 MeshDeviceView::MeshDeviceView(const MeshDevice& mesh, Coordinate top_left, Coordinate bottom_right)
-    : top_left_(top_left), bottom_right_(bottom_right) {
+    : top_left_(0, 0), bottom_right_(Coordinate{bottom_right.row - top_left.row, bottom_right.col - top_left.col}) {
     for (size_t row = top_left.row; row <= bottom_right.row; ++row) {
         for (size_t col = top_left.col; col <= bottom_right.col; ++col) {
             if (auto device = mesh.get_device(row, col)) {
                 devices_.push_back(device);
-                device_coordinates_[(device)->id()] = {row, col};
+                device_coordinates_[(device)->id()] = {row - top_left.row, col - top_left.col};
             }
         }
     }
@@ -55,10 +65,6 @@ MeshDeviceView::const_device_pointer MeshDeviceView::get_device(size_t row, size
     return nullptr;
 }
 
-const std::vector<MeshDeviceView::device_pointer>& MeshDeviceView::get_devices() const {
-    return devices_;
-}
-
 MeshDeviceView::DeviceView MeshDeviceView::get_devices(const Coordinate& start, const Coordinate& end) {
     if (start.row > end.row || start.col > end.col) {
         log_fatal("Invalid coordinates: start {} must be less than or equal to end {}", start, end);
@@ -158,6 +164,11 @@ bool MeshDeviceView::operator==(const MeshDeviceView& other) const {
            bottom_right_ == other.bottom_right_;
 }
 
+
+bool MeshDeviceView::contains_device(chip_id_t device_id) const {
+    return device_coordinates_.find(device_id) != device_coordinates_.end();
+}
+
 Coordinate MeshDeviceView::find_device(chip_id_t device_id) const {
     auto it = device_coordinates_.find(device_id);
     if (it != device_coordinates_.end()) {
@@ -199,5 +210,93 @@ void MeshDeviceView::validate_coordinates() const {
         throw std::invalid_argument("Invalid coordinates: top_left must be less than or equal to bottom_right");
     }
 }
+// Get the boundary coordinates of the subgrid defined by offset and shape
+std::vector<Coordinate> MeshDeviceView::get_ring_coordinates(const MeshShape& shape, const Coordinate& offset) const {
+    std::vector<Coordinate> boundary_coords;
+
+    size_t start_row = offset.row;
+    size_t start_col = offset.col;
+    size_t end_row = offset.row + shape.first - 1;
+    size_t end_col = offset.col + shape.second - 1;
+
+    // Validate the specified subgrid
+    if (start_row >= num_rows() || start_col >= num_cols() ||
+        end_row >= num_rows() || end_col >= num_cols()) {
+        throw std::invalid_argument("Subgrid is out of mesh bounds.");
+    }
+
+    // Traverse the top row from left to right
+    for (size_t col = start_col; col <= end_col; ++col) {
+        boundary_coords.emplace_back(Coordinate{start_row, col});
+    }
+
+    // Traverse the rightmost column from top+1 to bottom
+    for (size_t row = start_row + 1; row <= end_row; ++row) {
+        boundary_coords.emplace_back(Coordinate{row, end_col});
+    }
+
+    // Traverse the bottom row from right to left, if there is more than one row
+    if (end_row > start_row and end_col > start_col) {
+        for (size_t col = end_col - 1; col + 1 > start_col; --col) {
+            boundary_coords.emplace_back(Coordinate{end_row, col});
+        }
+        for (size_t row = end_row - 1; row > start_row; --row) {
+            boundary_coords.emplace_back(Coordinate{row, start_col});
+        }
+    }
+
+    return boundary_coords;
+}
+
+std::vector<Coordinate> MeshDeviceView::get_line_coordinates(size_t length, const Coordinate& offset) const {
+    std::vector<Coordinate> line_coords;
+    auto [row, col] = offset;
+    bool left_to_right = true;
+
+    for (size_t i = 0; i < length && row < num_rows() && col < num_cols(); ++i) {
+        line_coords.emplace_back(Coordinate{row, col});
+
+        if (left_to_right) {
+            if (col < num_cols() - 1) {
+                ++col;
+            } else {
+                ++row;
+                left_to_right = false;
+            }
+        } else {
+            if (col > 0) {
+                --col;
+            } else {
+                ++row;
+                left_to_right = true;
+            }
+        }
+    }
+
+    return line_coords;
+}
+
+std::vector<MeshDeviceView::device_pointer> MeshDeviceView::get_line_devices() {
+    auto boundary_coords = get_line_coordinates(this->num_rows() * this->num_cols(), this->top_left_);
+    return get_devices_from_coordinates(*this, boundary_coords);
+}
+
+std::vector<MeshDeviceView::device_pointer> MeshDeviceView::get_ring_devices() {
+    auto boundary_coords = get_ring_coordinates(shape(), this->top_left_);
+    return get_devices_from_coordinates(*this, boundary_coords);
+}
+
+MeshDeviceView::DeviceView MeshDeviceView::get_devices(IterationOrder order) {
+    switch (order) {
+        case IterationOrder::ROW_MAJOR:
+            return this->devices_;
+        case IterationOrder::RING:
+            return this->get_ring_devices();
+        case IterationOrder::LINE:
+            return this->get_line_devices();
+        default:
+            TT_THROW("Unsupported iteration order: {}", order);
+    }
+}
 
 } // namespace tt::tt_metal
diff --git a/tt_metal/impl/device/mesh_device_view.hpp b/tt_metal/impl/device/mesh_device_view.hpp
index 73c9e2b61c2..9e6f40eb39c 100644
--- a/tt_metal/impl/device/mesh_device_view.hpp
+++ b/tt_metal/impl/device/mesh_device_view.hpp
@@ -53,6 +53,13 @@ struct Coordinate {
  *    specific sub-regions. This is particularly useful for collective communication operations
  *    (CCL-ops), such as line all-gather, which require column or row views of the device mesh.
  */
+
+enum class IterationOrder {
+    ROW_MAJOR,
+    RING,
+    LINE
+};
+
 class MeshDeviceView {
 public:
     using device_pointer = Device*;
@@ -68,12 +75,11 @@ class MeshDeviceView {
     [[nodiscard]] device_pointer get_device(size_t row, size_t col);
     [[nodiscard]] const_device_pointer get_device(size_t row, size_t col) const;
 
-    [[nodiscard]] const std::vector<device_pointer>& get_devices() const;
-
     // Get devices spanning the rectangular region defined by the top-left and bottom-right coordinates
     // devices are returned in row-major order with start/end coordinates inclusive
     [[nodiscard]] DeviceView get_devices(const Coordinate& start, const Coordinate& end);
     [[nodiscard]] DeviceView get_devices(const MeshShape& shape);
+    [[nodiscard]] DeviceView get_devices(IterationOrder order = IterationOrder::ROW_MAJOR);
 
     [[nodiscard]] DeviceView get_devices_on_row(size_t row) const;
     [[nodiscard]] DeviceView get_devices_on_column(size_t col) const;
@@ -86,7 +92,7 @@ class MeshDeviceView {
 
     [[nodiscard]] bool empty() const noexcept;
     [[nodiscard]] size_t size() const noexcept;
-    [[nodiscard]] std::pair<size_t, size_t> shape() const noexcept;
+    [[nodiscard]] MeshShape shape() const noexcept;
     [[nodiscard]] bool contains(const Coordinate& coord) const noexcept;
     [[nodiscard]] const_device_pointer at(const Coordinate& coord) const noexcept;
 
@@ -99,10 +105,17 @@ class MeshDeviceView {
     [[nodiscard]] std::size_t num_cols() const { return bottom_right_.col - top_left_.col + 1; }
     [[nodiscard]] std::size_t num_devices() const { return devices_.size(); }
 
+    [[nodiscard]] bool contains_device(chip_id_t device_id) const;
     [[nodiscard]] Coordinate find_device(chip_id_t device_id) const;
     [[nodiscard]] chip_id_t find_device_id(const Coordinate& coord) const;
 
 private:
+    [[nodiscard]] std::vector<device_pointer> get_ring_devices();
+    [[nodiscard]] std::vector<device_pointer> get_line_devices();
+
+    [[nodiscard]] std::vector<Coordinate> get_ring_coordinates(const MeshShape& shape, const Coordinate& offset) const;
+    [[nodiscard]] std::vector<Coordinate> get_line_coordinates(size_t length, const Coordinate& offset) const;
+
     std::vector<device_pointer> devices_;
     std::unordered_map<chip_id_t, Coordinate> device_coordinates_;
     Coordinate top_left_;
diff --git a/ttnn/cpp/pybind11/multi_device.hpp b/ttnn/cpp/pybind11/multi_device.hpp
index 70d9755d040..f468217ebe7 100644
--- a/ttnn/cpp/pybind11/multi_device.hpp
+++ b/ttnn/cpp/pybind11/multi_device.hpp
@@ -47,6 +47,7 @@ void py_module(py::module& module) {
             py::arg("offset"),
             py::arg("physical_device_ids"))
         .def("get_num_devices", &MeshDevice::num_devices)
+        .def("get_mesh_id", &MeshDevice::get_mesh_id)
         .def("get_device_ids", &MeshDevice::get_device_ids)
         .def(
             "get_device",
@@ -62,26 +63,7 @@ void py_module(py::module& module) {
             Returns:
                 List[Device]: The devices in the device mesh.
         )doc")
-        .def(
-            "get_devices_on_row",
-            &MeshDevice::get_devices_on_row,
-            py::return_value_policy::reference,
-            R"doc(
-            Get the devices in a row of the device mesh.
-
-            Returns:
-                List[Device]: The devices on a row in the device mesh.
-        )doc")
-        .def(
-            "get_devices_on_column",
-            &MeshDevice::get_devices_on_column,
-            py::return_value_policy::reference,
-            R"doc(
-            Get the devices in a row of the device mesh.
-
-            Returns:
-                List[Device]: The devices on a row in the device mesh.
-        )doc")
+        .def("create_submesh", &MeshDevice::create_submesh, py::arg("submesh_shape"), py::arg("offset") = std::pair<size_t, size_t>{0, 0}, py::return_value_policy::reference_internal, py::keep_alive<0, 1>())
         .def(
             "compute_with_storage_grid_size",
             &MeshDevice::compute_with_storage_grid_size,
diff --git a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp
index 66f70cb1ef1..80fe9642e64 100644
--- a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp
+++ b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp
@@ -6,6 +6,7 @@
 #include "ttnn/deprecated/tt_dnn/op_library/math.hpp"
 
 #include "tt_metal/host_api.hpp"
+#include "tt_metal/impl/device/mesh_device.hpp"
 
 #include "ttnn/tensor/tensor_utils.hpp"
 
@@ -191,17 +192,18 @@ Tensor all_gather(
     if (num_devices == 2){
         ccl_topology = ttnn::ccl::Topology::Linear;
     }
+    auto mesh_device = MeshDevice::fetch_mesh_device(devices);
     std::vector<Tensor> output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))};
     operation::launch_op(
-        [dim, num_links, memory_config, user_defined_num_workers, user_defined_num_buffers_per_channel, devices, ccl_topology](
+        [=](
             const std::vector<Tensor>& input_tensors,
             const std::vector<std::optional<const Tensor>>& optional_input_tensors,
             const std::vector<std::optional<Tensor>>& optional_output_tensors) mutable -> std::vector<Tensor> {
 
             const auto& input_tensor = input_tensors.at(0);
-
+            auto submesh_view = mesh_device->get_view(input_tensor.device());
             return operation::run(
-                create_all_gather_struct(input_tensor, dim, num_links, memory_config, user_defined_num_workers, user_defined_num_buffers_per_channel, devices, ccl_topology),
+                create_all_gather_struct(input_tensor, dim, num_links, memory_config, user_defined_num_workers, user_defined_num_buffers_per_channel, submesh_view->get_devices(IterationOrder::RING), ccl_topology),
                 {input_tensor});
         },
         {input_tensor},
diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp
index 7ec8010cf47..c7101eeac40 100644
--- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp
+++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp
@@ -138,15 +138,15 @@ std::vector <ttnn::Tensor> all_gather_matmul(
 
     TT_FATAL(std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr, "AllGatherMatmul is only supported for Fast Dispatch");
 
-    auto devices = input_tensor.get_workers();
     std::vector<Tensor> output_tensors = {ttnn::Tensor(operation::get_workers_for_op_output({input_tensor, weight_tensor})),
                                             ttnn::Tensor(operation::get_workers_for_op_output({input_tensor, weight_tensor})),
                                             ttnn::Tensor(operation::get_workers_for_op_output({input_tensor, weight_tensor}))};
     std::vector<std::optional<const ttnn::Tensor>> optional_input_tensors = {std::nullopt};
 
+    auto mesh_device = MeshDevice::fetch_mesh_device(input_tensor.get_workers());
 
     operation::launch_op(
-        [dim, all_gather_core_grid_offset, num_links, memory_config_ag, user_defined_num_workers, user_defined_num_buffers_per_channel, memory_config_mm, transpose_a, transpose_b, dtype, program_config, activation, compute_kernel_config, core_grid, devices](
+        [dim, all_gather_core_grid_offset, num_links, memory_config_ag, user_defined_num_workers, user_defined_num_buffers_per_channel, memory_config_mm, transpose_a, transpose_b, dtype, program_config, activation, compute_kernel_config, core_grid, mesh_device](
             const std::vector<Tensor>& input_tensors,
             const std::vector<std::optional<const ttnn::Tensor>>& optional_input_tensors,
             const std::vector<std::optional<Tensor>>& optional_output_tensors) mutable -> std::vector<Tensor> {
@@ -155,7 +155,8 @@ std::vector <ttnn::Tensor> all_gather_matmul(
             const auto& weight_tensor = input_tensors[1];
 
             /* AllGather setup */
-            ttnn::AllGather all_gather_struct = ttnn::create_all_gather_struct(input_tensor, dim, num_links, memory_config_ag, user_defined_num_workers, user_defined_num_buffers_per_channel, devices, ttnn::ccl::Topology::Ring);
+            auto submesh_view = mesh_device->get_view(input_tensor.device());
+            ttnn::AllGather all_gather_struct = ttnn::create_all_gather_struct(input_tensor, dim, num_links, memory_config_ag, user_defined_num_workers, user_defined_num_buffers_per_channel,  submesh_view->get_devices(IterationOrder::RING), ttnn::ccl::Topology::Ring);
 
             // Create the all gather output tensor used as input (activation) to the matmul
             ttnn::Tensor all_gather_out_tensor = all_gather_struct.create_output_tensors({input_tensor})[0];
diff --git a/ttnn/cpp/ttnn/tensor/tensor.cpp b/ttnn/cpp/ttnn/tensor/tensor.cpp
index d72b0bf50f8..33e59b484b1 100644
--- a/ttnn/cpp/ttnn/tensor/tensor.cpp
+++ b/ttnn/cpp/ttnn/tensor/tensor.cpp
@@ -698,7 +698,7 @@ Tensor allocate_tensor_on_device(
     const std::optional<Tile>& tile
     ) {
     // Top level wrapper to asynchronously create a device tensor (multi-device)
-    Tensor device_tensor = Tensor(mesh_device->get_devices());
+    Tensor device_tensor = Tensor(mesh_device->get_view()->get_devices(IterationOrder::LINE));
     uint32_t device_tensor_ref_count = device_tensor.tensor_attributes->record_main_thread_ref_count();
     const auto& workers = device_tensor.get_workers();
     uint32_t num_workers = workers.size();
@@ -791,14 +791,16 @@ std::vector<Device*> distribute_tensor_to_mesh(const Tensor& tensor, MeshDevice&
         return workers;
     };
 
-    if (mesh_device.get_view() != nullptr and std::holds_alternative<MultiDeviceHostStorage>(tensor.get_storage())) {
+    auto mesh_view = mesh_device.get_view();
+    if (mesh_view != nullptr and std::holds_alternative<MultiDeviceHostStorage>(tensor.get_storage())) {
         const auto& host_storage = std::get<tt::tt_metal::MultiDeviceHostStorage>(tensor.get_storage());
 
         return std::visit([&](const auto& strategy) {
             using StrategyType = std::decay_t<decltype(strategy)>;
             if constexpr (std::is_same_v<StrategyType, ShardTensor2D>) {
-                auto mesh_view = mesh_device.get_view();
                 return mesh_view->get_devices(strategy.shard_mesh);
+            } else if constexpr (std::is_same_v<StrategyType, ShardTensor>) {
+                return mesh_view->get_devices(IterationOrder::LINE);
             } else {
                 return get_multi_device_workers(mesh_device.get_devices());
             }