#11403: Support 2x4-submeshes across 8x4 mesh
1. Submeshing: support creating a submesh (e.g. 2x4) on the galaxy (8x4) mesh.
2. Key change to start enabling more T3000 tests on galaxy:
- Previously, ttnn.all_gather(..) in a ring relied on the MeshDevice being
initialized in ring order. This is now decoupled: the MeshDevice no longer
has to be initialized with its devices in ring order; instead, the operation
that requires a ring order explicitly requests it.
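
For illustration, a minimal sketch of what the decoupling enables, adapted from the new test added in tests/ttnn/multichip_unit_tests/test_multidevice_TG.py below (device-open options such as dispatch_core_type are omitted, and exact defaults may differ):

import torch
import ttnn

# Open a 2x4 mesh; the devices do not need to be supplied in ring order.
mesh_device = ttnn.open_mesh_device(ttnn.MeshShape(2, 4))

# Shard one 32-wide slice per device along the last dimension.
torch_tensor = torch.ones((1, 1, 32, 32 * 8), dtype=torch.bfloat16)
ttnn_tensor = ttnn.from_torch(
    torch_tensor, mesh_mapper=ttnn.ShardTensorToMesh(mesh_device, dim=3)
)
ttnn_tensor = ttnn.to_device(ttnn_tensor, mesh_device)

# all_gather requests the ring ordering it needs internally.
gathered = ttnn.all_gather(ttnn_tensor, dim=3, num_links=1)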
cfjchu committed Oct 1, 2024
1 parent d8706ff commit 2b73ca5
Showing 25 changed files with 444 additions and 139 deletions.
9 changes: 5 additions & 4 deletions conftest.py
@@ -229,13 +229,13 @@ def pcie_mesh_device(request, silicon_arch_name, silicon_arch_wormhole_b0, devic
     except (ValueError, AttributeError):
         num_pcie_devices_requested = len(device_ids)

     if num_pcie_devices_requested != 4:
         pytest.skip("Only 4 PCIe devices are supported for testing")

     request.node.pci_ids = device_ids[:num_pcie_devices_requested]

     mesh_device = ttnn.open_mesh_device(
-        ttnn.MeshShape(1, num_pcie_devices_requested),
-        dispatch_core_type=get_dispatch_core_type(),
-        **device_params,
-        physical_device_ids=device_ids[:num_pcie_devices_requested],
+        ttnn.MeshShape(2, 2), dispatch_core_type=get_dispatch_core_type(), **device_params, offset=(0, 1)
     )

     logger.debug(f"multidevice with {mesh_device.get_num_devices()} devices is created")
@@ -255,6 +255,7 @@ def t3k_mesh_device(request, silicon_arch_name, silicon_arch_wormhole_b0, device
     if ttnn.get_num_devices() < 8:
         pytest.skip()

+    request.node.pci_ids = ttnn.get_pcie_device_ids()
     mesh_device = ttnn.open_mesh_device(
         ttnn.MeshShape(2, 4),
         dispatch_core_type=get_dispatch_core_type(),
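Pulled out of the diff above as a standalone sketch: the PCIe fixture now opens its four devices as a 2x2 submesh of the system mesh rather than a 1x4 row keyed to physical device IDs. The offset argument is taken directly from the change; its interpretation as a (row, column) offset into the parent mesh is an assumption based on the commit description.

import ttnn

# Sketch of the new fixture call; dispatch_core_type and **device_params elided.
mesh_device = ttnn.open_mesh_device(
    ttnn.MeshShape(2, 2),   # 2x2 submesh instead of a 1x4 row
    offset=(0, 1),          # assumed: (row, column) offset into the parent mesh
)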
@@ -10,7 +10,7 @@
 from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import (
     comp_pcc,
 )
-from models.utility_functions import torch2tt_tensor, tt2torch_tensor, skip_for_grayskull, get_devices_for_t3000
+from models.utility_functions import torch2tt_tensor, tt2torch_tensor, skip_for_grayskull
 from models.demos.t3000.falcon40b.tt.model_config import (
     get_model_config,
 )
@@ -11,7 +11,7 @@
 from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import (
     comp_pcc,
 )
-from models.utility_functions import torch2tt_tensor, tt2torch_tensor, skip_for_grayskull, get_devices_for_t3000
+from models.utility_functions import torch2tt_tensor, tt2torch_tensor, skip_for_grayskull
 from models.demos.t3000.falcon40b.tt.model_config import (
     get_model_config,
 )
1 change: 0 additions & 1 deletion models/demos/t3000/llama2_70b/tests/test_llama_perf.py
@@ -25,7 +25,6 @@
     disable_compilation_reports,
     nearest_32,
     skip_for_grayskull,
-    get_devices_for_t3000,
 )
 from models.perf.perf_utils import prep_perf_report
 from models.perf.device_perf_utils import run_device_perf, check_device_perf, prep_device_perf_report
3 changes: 1 addition & 2 deletions models/demos/t3000/llama2_70b/tt/llama_mlp_optimized.py
@@ -72,12 +72,11 @@ def load_weights(self):
         padded_w3[:, :, :, :H4] = self.state_dict[w3_str].transpose(-2, -1)

         # w1: 8k x 4k. width-sharded on 12 banks, 4224 over 12 banks.
-        device = self.mesh_device.get_device(0)
         weight_grid = ttnn.CoreRangeSet(
             {
                 ttnn.CoreRange(
                     ttnn.CoreCoord(0, 0),
-                    ttnn.CoreCoord(device.dram_grid_size().x - 1, device.dram_grid_size().y - 1),
+                    ttnn.CoreCoord(self.mesh_device.dram_grid_size().x - 1, self.mesh_device.dram_grid_size().y - 1),
                 )
             }
         )
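The new pattern, as a small sketch: the DRAM grid extent is read once from the mesh device (rather than from device 0) and used to build the full-grid CoreRangeSet. The helper name full_dram_grid is hypothetical; the calls mirror the changed lines above.

import ttnn

def full_dram_grid(mesh_device):
    # Build a CoreRangeSet spanning the mesh's entire DRAM grid,
    # assuming mesh_device.dram_grid_size() as used in the change above.
    grid = mesh_device.dram_grid_size()
    return ttnn.CoreRangeSet(
        {
            ttnn.CoreRange(
                ttnn.CoreCoord(0, 0),
                ttnn.CoreCoord(grid.x - 1, grid.y - 1),
            )
        }
    )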
6 changes: 5 additions & 1 deletion tests/scripts/tg/run_tg_model_perf_tests.sh
@@ -1,6 +1,10 @@
 #!/bin/bash

-run_tg_llm_tests() {
+run_t3k_tests_on_tg_tests() {
+
+  echo "LOG_METAL: Running T3000 tests on TG"
+  env pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_perf_decode.py -m "model_perf_t3000" --timeout=600 ; fail+=$?

   # Merge all the generated reports
   env python models/perf/merge_perf_results.py; fail+=$?
16 changes: 16 additions & 0 deletions tests/ttnn/multichip_unit_tests/test_multidevice_TG.py
@@ -1573,3 +1573,19 @@ def test_sharded_distributed_layernorm(mesh_device, input_width, input_height, c
     is_pass, output_pcc = comp_pcc(torch_output_tensor, tt_output_tensor, pcc=0.999)

     assert is_pass, f"PCC value: {output_pcc}"
+
+
+def test_ttnn_multi_device_all_gather_all_devices(t3k_mesh_device):
+    """Example test for running a 2x4-Ring All-Gather on galaxy"""
+    full_tensor = torch.ones((1, 1, 32, 32 * t3k_mesh_device.get_num_devices()), dtype=torch.bfloat16)
+    for i in range(t3k_mesh_device.get_num_devices()):
+        full_tensor[..., i * 32 : (i + 1) * 32] = i
+
+    ttnn_tensor = ttnn.from_torch(full_tensor, mesh_mapper=ShardTensorToMesh(t3k_mesh_device, dim=3))
+    ttnn_tensor = ttnn.to_device(ttnn_tensor, t3k_mesh_device)
+    ttnn_tensor = ttnn.all_gather(ttnn_tensor, dim=3, num_links=1)
+
+    device_tensors: typing.List[ttnn.Tensor] = ttnn.get_device_tensors(ttnn_tensor)
+    for device_tensor in device_tensors:
+        device_tensor_torch = ttnn.to_torch(device_tensor)
+        assert torch.all(device_tensor_torch == full_tensor)
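In words: ShardTensorToMesh places a distinct 32-wide slice on each of the eight devices, and after the ring all-gather every device should hold the full (1, 1, 32, 256) tensor, which is what the final loop over get_device_tensors asserts.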
12 changes: 7 additions & 5 deletions tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp
@@ -130,8 +130,9 @@ TEST(GalaxyTests, TestAllGatherDeadlock) {
     }
     // Iterate over each row and run line all-gather multiple times.
     // For each row, send adversarial traffic to the first chip, that can hang the network if the CCL is not tagged.
+    auto view = MeshDeviceView(*mesh);
     for (uint32_t row = 0; row < 8; row++) {
-        auto devs = mesh->get_devices_on_row(row);
+        auto devs = view.get_devices_on_row(row);
         std::vector<uint32_t> device_ids = {};
         for (auto dev : devs) {
             device_ids.push_back(dev->id());
@@ -189,13 +190,14 @@ TEST(GalaxyTests, TestReduceScatterDeadlock) {
     std::shared_ptr<MeshDevice> mesh = ttnn::multi_device::open_mesh_device(mesh_shape, 0, 0, 1, DispatchCoreType::WORKER);
     // Create the outer ring on which Reduce Scatter will be run. This allows us to verify that there are no deadlocks when we send CCLs to the
     // first tunnel (forward path).
-    std::vector<Device*> ring_devices = mesh->get_devices_on_row(0); // Tunnel 0
-    std::vector<Device*> ring_devices_1 = mesh->get_devices_on_column(mesh_shape.second - 1); // Orthogonal to tunnel .. no deadlocks
+    auto view = MeshDeviceView(*mesh);
+    std::vector<Device*> ring_devices = view.get_devices_on_row(0); // Tunnel 0
+    std::vector<Device*> ring_devices_1 = view.get_devices_on_column(mesh_shape.second - 1); // Orthogonal to tunnel .. no deadlocks
     ring_devices_1 = std::vector<Device*>(ring_devices_1.begin() + 1, ring_devices_1.end());
-    std::vector<Device*> ring_devices_2 = mesh->get_devices_on_row(7); // Tunnel 7 .. potential deadlocks with lack of buffering
+    std::vector<Device*> ring_devices_2 = view.get_devices_on_row(7); // Tunnel 7 .. potential deadlocks with lack of buffering
     std::reverse(ring_devices_2.begin(), ring_devices_2.end());
     ring_devices_2 = std::vector<Device*>(ring_devices_2.begin() + 1, ring_devices_2.end());
-    std::vector<Device*> ring_devices_3 = mesh->get_devices_on_column(0); // Orthogonal to tunnel .. no deadlocks
+    std::vector<Device*> ring_devices_3 = view.get_devices_on_column(0); // Orthogonal to tunnel .. no deadlocks
     std::reverse(ring_devices_3.begin(), ring_devices_3.end());
     ring_devices_3 = std::vector<Device*>(ring_devices_3.begin() + 1, ring_devices_3.end() - 1);
