#13541: Conv2d enable shallow convs in auto shard
Enable shallow convs in the auto-shard codepath. Shallow convs use less L1 memory, so more of the torch-trace tests pass.

Decouple shallow convs from the split reader so that shallow convs can be enabled in a more generic way.
Pavle Josipovic authored and pavlejosipovic committed Oct 25, 2024
1 parent e966f77 commit 0194005
Showing 10 changed files with 134 additions and 277 deletions.
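Apart from the model and sweep updates, most of the hunks below make the same mechanical change at the parallel-config call sites: instead of handing the helper a device handle, the callers now pass the compute grid explicitly via `device.compute_with_storage_grid_size()`. A minimal sketch of the new call pattern follows; only the keyword arguments visible in these hunks are shown, and the helper name `determine_parallel_config` is an assumption (the first line of the call is not part of the excerpt), so treat this as illustrative rather than a drop-in signature.

```python
import ttnn

# Sketch of the call-site change repeated in the hunks below. Only the keyword
# arguments visible in this excerpt are shown; the helper name
# (determine_parallel_config) is an assumption, not confirmed by the diff.
def output_parallel_config(device, out_h, out_w, out_c):
    return ttnn._ttnn.operations.conv2d.determine_parallel_config(
        output_height=out_h,
        output_width=out_w,
        output_channels=out_c,
        # was: device=device
        compute_grid_size=device.compute_with_storage_grid_size(),
        block_shard_orientation=ttnn.ShardOrientation.ROW_MAJOR,
        is_out_tiled=True,
    )
```

Passing the grid size instead of the device handle keeps the helper's inputs explicit; every touched call site obtains it from `device.compute_with_storage_grid_size()`.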
@@ -635,13 +635,6 @@ def __init__(
width=self.conv1_output_width,
in_channels=self.conv1_input_channels,
out_channels=self.conv1_output_channels,
- kernel_size=[self.conv1_kernel_size[0], self.conv1_kernel_size[1]],
- stride=[self.conv1_stride[0], self.conv1_stride[1]],
- padding=[self.conv1_padding[0], self.conv1_padding[1]],
- dilation=[1, 1],
- groups=1,
- weights_width=self.conv1_weight_tensor.shape[3],
- input_width=self.conv1_input_width,
)

def __del__(self):
models/experimental/functional_unet/tt/unet_shallow_ttnn.py (6 changes: 3 additions & 3 deletions)
@@ -266,7 +266,7 @@ def __init__(
output_height=self.conv2.input_height,
output_width=self.conv2.input_width,
output_channels=self.conv1.out_channels,
- device=device,
+ compute_grid_size=device.compute_with_storage_grid_size(),
block_shard_orientation=ttnn.ShardOrientation.ROW_MAJOR,
is_out_tiled=True,
)
@@ -320,7 +320,7 @@ def __init__(
output_height=self.conv2.input_height,
output_width=self.conv2.input_width,
output_channels=self.conv1.out_channels,
- device=device,
+ compute_grid_size=device.compute_with_storage_grid_size(),
block_shard_orientation=ttnn.ShardOrientation.ROW_MAJOR,
is_out_tiled=True,
)
@@ -448,7 +448,7 @@ def __init__(self, parameters: ParameterDict, device, mesh_mapper=None) -> None:
output_height=self.bnc2.input_height,
output_width=self.bnc2.input_width,
output_channels=self.bnc.out_channels,
- device=device,
+ compute_grid_size=device.compute_with_storage_grid_size(),
block_shard_orientation=ttnn.ShardOrientation.ROW_MAJOR,
is_out_tiled=True,
)
tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py (10 changes: 1 addition & 9 deletions)
@@ -20,6 +20,7 @@
# Contains following params
# [batch_size, output_channels, input_channels, input_height, input_width, kernel_height, kernel_width, stride_x, stride_y, pad_x, pad_y, groups, bias, dilation]
[1, 32, 1, 28, 28, 3, 3, 1, 1, 0, 0, 1, True, 1],
[1, 100, 100, 14, 14, 3, 3, 1, 1, 1, 1, 100, False, 1],
[1, 1008, 1008, 14, 14, 3, 3, 2, 2, 1, 1, 21, False, 1],
[1, 1008, 1008, 7, 7, 3, 3, 1, 1, 1, 1, 21, False, 1],
[1, 1024, 1024, 10, 10, 3, 3, 1, 1, 1, 1, 1024, False, 1],
@@ -454,25 +455,16 @@ def test_conv2d_localrun(device, input_spec):
# [batch_size, output_channels, input_channels, input_height, input_width, kernel_height, kernel_width, stride_x, stride_y, pad_x, pad_y, groups, bias, dilation]
# Input is 32MB and maps to MM on 64 cores; we need to avoid sharding this tensor and use DRAM interleaved directly with MM
[1, 256, 1024, 128, 128, 1, 1, 1, 1, 0, 0, 1, False, 1], # 6
[1, 1024, 1024, 19, 19, 1, 1, 1, 1, 0, 0, 1, True, 1], # 9
[1, 2048, 1024, 7, 7, 1, 1, 1, 1, 0, 0, 1, True, 1], # 11
[1, 1056, 1056, 48, 48, 3, 3, 1, 1, 1, 1, 4, False, 1], # 14
[1, 1056, 1056, 96, 96, 3, 3, 2, 2, 1, 1, 4, False, 1], # 15
[1, 192, 192, 99, 99, 5, 5, 2, 2, 0, 0, 192, False, 1], # 100
[1, 2520, 2520, 14, 14, 3, 3, 2, 2, 1, 1, 15, False, 1], # 141
[1, 2904, 2904, 24, 24, 3, 3, 1, 1, 1, 1, 11, False, 1], # 170
[1, 2904, 2904, 48, 48, 3, 3, 2, 2, 1, 1, 11, False, 1], # 171
[1, 1024, 3, 224, 224, 16, 16, 16, 16, 0, 0, 1, True, 1], # 172
[1, 1024, 3, 224, 224, 32, 32, 32, 32, 0, 0, 1, True, 1], # 173
[1, 768, 3, 224, 224, 16, 16, 16, 16, 0, 0, 1, True, 1], # 181
[1, 768, 3, 224, 224, 32, 32, 32, 32, 0, 0, 1, False, 1], # 182
[1, 768, 3, 224, 224, 32, 32, 32, 32, 0, 0, 1, True, 1], # 183
[1, 32, 3, 299, 299, 3, 3, 2, 2, 0, 0, 1, False, 1], # 192
[1, 32, 3, 381, 381, 3, 3, 2, 2, 0, 0, 1, False, 1], # 197
[1, 768, 3, 384, 512, 32, 32, 32, 32, 0, 0, 1, True, 1], # 199
[1, 192, 3, 512, 672, 16, 16, 16, 16, 0, 0, 1, True, 1], # 202
[1, 1280, 3, 518, 518, 14, 14, 14, 14, 0, 0, 1, True, 1], # 203
[1, 64, 3, 720, 1280, 7, 7, 2, 2, 3, 3, 1, False, 1], # 204
[1, 64, 3, 800, 1088, 7, 7, 2, 2, 3, 3, 1, False, 1], # 205
[1, 336, 336, 112, 112, 3, 3, 2, 2, 1, 1, 2, False, 1], # 241
[1, 336, 336, 48, 48, 5, 5, 1, 1, 2, 2, 336, False, 1], # 245
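Each sweep entry above is a flat list whose field order is given only by the comment at the top of the file. As a self-contained illustration (not part of the sweep framework; the `Conv2dSpec` and `is_shallow` names and the channel threshold are assumptions), here is one way such a row could be unpacked and the shallow cases this commit targets flagged:

```python
from dataclasses import dataclass

@dataclass
class Conv2dSpec:
    # Field order matches the comment in conv2d_short_sweep.py:
    # [batch_size, output_channels, input_channels, input_height, input_width,
    #  kernel_height, kernel_width, stride_x, stride_y, pad_x, pad_y,
    #  groups, bias, dilation]
    batch_size: int
    output_channels: int
    input_channels: int
    input_height: int
    input_width: int
    kernel_height: int
    kernel_width: int
    stride_x: int
    stride_y: int
    pad_x: int
    pad_y: int
    groups: int
    bias: bool
    dilation: int

def is_shallow(spec: Conv2dSpec, max_channels: int = 4) -> bool:
    # "Shallow" means very few input channels (e.g. RGB stems with
    # input_channels == 3); the threshold here is an illustrative assumption.
    return spec.input_channels <= max_channels

spec = Conv2dSpec(*[1, 32, 3, 299, 299, 3, 3, 2, 2, 0, 0, 1, False, 1])
print(is_shallow(spec))  # True
```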
@@ -132,7 +132,7 @@ def run(
output_height=out_h,
output_width=out_w,
output_channels=in_c,
- device=device,
+ compute_grid_size=device.compute_with_storage_grid_size(),
is_out_tiled=False,
)
sharded_memory_config = ttnn._ttnn.operations.conv2d.create_sharded_memory_config_from_parallel_config(
tests/ttnn/unit_tests/operations/test_maxpool2d.py (4 changes: 2 additions & 2 deletions)
@@ -107,7 +107,7 @@ def run_max_pool(
output_height=out_h,
output_width=out_w,
output_channels=in_c,
- device=device,
+ compute_grid_size=device.compute_with_storage_grid_size(),
block_shard_orientation=ttnn.ShardOrientation.ROW_MAJOR,
is_out_tiled=False,
)
@@ -632,7 +632,7 @@ def test_pool_core_nondivis(
output_height=out_h,
output_width=out_w,
output_channels=in_c,
- device=device,
+ compute_grid_size=device.compute_with_storage_grid_size(),
block_shard_orientation=ttnn.ShardOrientation.ROW_MAJOR,
is_out_tiled=True,
)