From 65c745e9c04fb38271ca446f92e68c3d707105a0 Mon Sep 17 00:00:00 2001 From: Aleks Knezevic Date: Thu, 2 May 2024 01:58:43 +0000 Subject: [PATCH] #7560: Slow down MMs when running non-perf tests to avoid ND hang. No hang observed on BM machine where perf tests are run. --- .../tests/test_perf_stable_diffusion.py | 2 +- .../tt2/ttnn_functional_cross_attention.py | 2 +- .../tt2/ttnn_functional_feedforward.py | 5 ++++- .../tt2/ttnn_functional_geglu.py | 13 +++++++----- ...ttnn_functional_unet_2d_condition_model.py | 20 +++++++++---------- .../tt2/ttnn_functional_utility_functions.py | 7 +++++++ tests/scripts/nightly/run_wh_b0_only.sh | 2 +- tests/scripts/run_python_model_tests.sh | 2 ++ .../stable_diffusion/test_cross_attention.py | 12 ++++------- .../test_unet_2d_condition_model.py | 17 ++++++++++++---- 10 files changed, 50 insertions(+), 32 deletions(-) diff --git a/models/experimental/functional_stable_diffusion/tests/test_perf_stable_diffusion.py b/models/experimental/functional_stable_diffusion/tests/test_perf_stable_diffusion.py index a946b28225e..8315226f09c 100644 --- a/models/experimental/functional_stable_diffusion/tests/test_perf_stable_diffusion.py +++ b/models/experimental/functional_stable_diffusion/tests/test_perf_stable_diffusion.py @@ -232,7 +232,7 @@ def test_stable_diffusion_device_perf(expected_perf): margin = 0.02 batch = 1 iterations = 1 - command = f"pytest tests/ttnn/integration_tests/stable_diffusion/test_unet_2d_condition_model.py::test_unet_2d_condition_model512x512[batch_size=2-in_channels=4-input_height=64-input_width=64-device_l1_small_size=32768]" + command = f"pytest tests/ttnn/integration_tests/stable_diffusion/test_unet_2d_condition_model.py::test_unet_2d_condition_model_512x512[batch_size=2-in_channels=4-input_height=64-input_width=64-device_l1_small_size=32768]" cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"] inference_time_key = "AVG DEVICE KERNEL SAMPLES/S" diff --git a/models/experimental/functional_stable_diffusion/tt2/ttnn_functional_cross_attention.py b/models/experimental/functional_stable_diffusion/tt2/ttnn_functional_cross_attention.py index 5c64b87d7ac..b5c563b4282 100644 --- a/models/experimental/functional_stable_diffusion/tt2/ttnn_functional_cross_attention.py +++ b/models/experimental/functional_stable_diffusion/tt2/ttnn_functional_cross_attention.py @@ -304,7 +304,7 @@ def time_sharded_attention(self, query, t_key, value, head_size): per_core_M=tiles_per_shard, per_core_N=seq_len // 32, out_subblock_h=1, - out_subblock_w=8, + out_subblock_w=4, fuse_batch=True, fused_activation=None, mcast_in0=False, diff --git a/models/experimental/functional_stable_diffusion/tt2/ttnn_functional_feedforward.py b/models/experimental/functional_stable_diffusion/tt2/ttnn_functional_feedforward.py index be8de10ab9d..f9c3fda3174 100644 --- a/models/experimental/functional_stable_diffusion/tt2/ttnn_functional_feedforward.py +++ b/models/experimental/functional_stable_diffusion/tt2/ttnn_functional_feedforward.py @@ -58,7 +58,6 @@ def __init__(self, device, parameters): def __call__(self, config, hidden_states): hidden_states = self.geglu(config, hidden_states) - # TODO: Output sharded once https://github.com/tenstorrent/tt-metal/issues/6775 is fixed interleaved_output = False size = hidden_states.shape[-2] grid_size = self.grid_sizes[size] @@ -68,6 +67,10 @@ def __call__(self, config, hidden_states): out_block_h = math.ceil(M / grid_size[1] / 32) out_block_w = math.ceil(N / grid_size[0] / 32) out_subblock_h, out_subblock_w = 
determine_largest_subblock_size(out_block_h, out_block_w) + # TODO: https://github.com/tenstorrent/tt-metal/issues/7560 + if size == 512: + out_subblock_h = 1 + out_subblock_w = 1 program_config = ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCastProgramConfig( compute_with_storage_grid_size=grid_size, in0_block_w=in0_block_w, diff --git a/models/experimental/functional_stable_diffusion/tt2/ttnn_functional_geglu.py b/models/experimental/functional_stable_diffusion/tt2/ttnn_functional_geglu.py index 647ffe8aeaa..b95bde681d2 100644 --- a/models/experimental/functional_stable_diffusion/tt2/ttnn_functional_geglu.py +++ b/models/experimental/functional_stable_diffusion/tt2/ttnn_functional_geglu.py @@ -8,6 +8,7 @@ import tt_lib as ttl from models.experimental.functional_stable_diffusion.tt2.ttnn_functional_utility_functions import ( determine_largest_subblock_size, + determine_blocking, ) @@ -95,11 +96,13 @@ def __call__(self, config, hidden_states): ttnn.experimental.tensor.TensorMemoryLayout.BLOCK_SHARDED, ttnn.experimental.tensor.ShardOrientation.ROW_MAJOR, ) - in0_block_h = M // grid_size[1] // 32 - in0_block_w = K // grid_size[0] // 32 - out_block_h = math.ceil(M / grid_size[1] / 32) - out_block_w = math.ceil(N / grid_size[0] / 32) - out_subblock_h, out_subblock_w = determine_largest_subblock_size(out_block_h, out_block_w) + in0_block_h, in0_block_w, out_subblock_h, out_subblock_w, out_block_h, out_block_w = determine_blocking( + M, K, N, grid_size + ) + # TODO: https://github.com/tenstorrent/tt-metal/issues/7560 + if size == 512: + out_subblock_h = 1 + out_subblock_w = 1 program_config = ttnn.experimental.operations.primary.MatmulMultiCoreReuseMultiCastProgramConfig( compute_with_storage_grid_size=grid_size, in0_block_w=in0_block_w, diff --git a/models/experimental/functional_stable_diffusion/tt2/ttnn_functional_unet_2d_condition_model.py b/models/experimental/functional_stable_diffusion/tt2/ttnn_functional_unet_2d_condition_model.py index adcde260218..55ae3cc6096 100644 --- a/models/experimental/functional_stable_diffusion/tt2/ttnn_functional_unet_2d_condition_model.py +++ b/models/experimental/functional_stable_diffusion/tt2/ttnn_functional_unet_2d_condition_model.py @@ -114,7 +114,7 @@ def __init__( out_channels = parameters.conv_in.weight.shape[0] in_channels = parameters.conv_in.weight.shape[1] - print(f"CIN: height: {input_height}, width: {input_width}, dim: {2 * input_height * input_width}") + logger.info(f"CIN: height: {input_height}, width: {input_width}, dim: {2 * input_height * input_width}") self.conv_in = ttnn.Conv2d( in_channels=in_channels, out_channels=out_channels, @@ -140,7 +140,7 @@ def __init__( self.down_blocks = [] input_height = self.conv_in.output_height input_width = self.conv_in.output_height - print(f"D-1: height: {input_height}, width: {input_width}, dim: {2 * input_height * input_width}") + logger.info(f"D-1: height: {input_height}, width: {input_width}, dim: {2 * input_height * input_width}") self.down_block_types = down_block_types for i, down_block_type in enumerate(down_block_types): if down_block_type == "CrossAttnDownBlock2D": @@ -169,7 +169,7 @@ def __init__( self.down_blocks.append(down_block) input_height = down_block.output_height input_width = down_block.output_width - print(f"D{i}: height: {input_height}, width: {input_width}, dim: {2 * input_height * input_width}") + logger.info(f"D{i}: height: {input_height}, width: {input_width}, dim: {2 * input_height * input_width}") assert mid_block_type == "UNetMidBlock2DCrossAttn" 
self.mid_block = unet_mid_block_2d_cross_attn( @@ -183,7 +183,7 @@ def __init__( ) input_height = self.mid_block.output_height input_width = self.mid_block.output_width - print(f"MID: height: {input_height}, width: {input_width}, dim: {2 * input_height * input_width}") + logger.info(f"MID: height: {input_height}, width: {input_width}, dim: {2 * input_height * input_width}") self.up_blocks = [] self.up_block_types = up_block_types @@ -214,7 +214,7 @@ def __init__( self.up_blocks.append(up_block) input_height = up_block.output_height input_width = up_block.output_width - print(f"UP{i}: height: {input_height}, width: {input_width}, dim: {2 * input_height * input_width}") + logger.info(f"UP{i}: height: {input_height}, width: {input_width}, dim: {2 * input_height * input_width}") parameters.conv_out.weight, parameters.conv_out.bias = permute_conv_weights( parameters.conv_out.weight, parameters.conv_out.bias @@ -227,7 +227,7 @@ def __init__( out_channels = parameters.conv_out.weight.shape[0] in_channels = parameters.conv_out.weight.shape[1] - print(f"COU: height: {input_height}, width: {input_width}, dim: {2 * input_height * input_width}") + logger.info(f"COU: height: {input_height}, width: {input_width}, dim: {2 * input_height * input_width}") self.conv_out = ttnn.Conv2d( in_channels=in_channels, out_channels=out_channels, @@ -432,7 +432,7 @@ def __call__( output_channel = block_out_channels[0] for i, (down_block_type, down_block) in enumerate(zip(self.down_block_types, self.down_blocks)): ttl.device.DumpDeviceProfiler(self.device) - print(f"Down block {i}") + logger.info(f"Down block {i}") input_channel = output_channel output_channel = block_out_channels[i] is_final_block = i == len(block_out_channels) - 1 @@ -486,7 +486,7 @@ def __call__( down_block_res_samples += res_samples # 4.mid - print("Mid block") + logger.info("Mid block") sample = self.mid_block( hidden_states=sample, temb=emb, @@ -517,7 +517,7 @@ def __call__( output_channel = reversed_block_out_channels[0] for i, (up_block_type, up_block) in enumerate(zip(self.up_block_types, self.up_blocks)): ttl.device.DumpDeviceProfiler(self.device) - print(f"Up block {i}") + logger.info(f"Up block {i}") is_final_block = i == len(block_out_channels) - 1 prev_output_channel = output_channel @@ -591,8 +591,6 @@ def __call__( ), f"CrossAttnUpBlock2D, and UpBlock2D are the only up blocks implemented! 
you requested {up_block_type}"

         # 6.post-process
-        # print(sample.shape)
-        # print(sample.memory_config())
         sample = ttnn.to_layout(sample, ttnn.ROW_MAJOR_LAYOUT)
         if self.fallback_on_groupnorm:
             assert self.norm_num_groups == norm_num_groups
diff --git a/models/experimental/functional_stable_diffusion/tt2/ttnn_functional_utility_functions.py b/models/experimental/functional_stable_diffusion/tt2/ttnn_functional_utility_functions.py
index 0125736cc8e..740965498e9 100644
--- a/models/experimental/functional_stable_diffusion/tt2/ttnn_functional_utility_functions.py
+++ b/models/experimental/functional_stable_diffusion/tt2/ttnn_functional_utility_functions.py
@@ -233,4 +233,11 @@ def determine_blocking(M, K, N, grid_size, transpose_mcast=False):
     out_block_h = math.ceil(M / logical_grid_size[1] / 32)
     out_block_w = math.ceil(N / logical_grid_size[0] / 32)
     out_subblock_h, out_subblock_w = determine_largest_subblock_size(out_block_h, out_block_w)
+    # TODO: https://github.com/tenstorrent/tt-metal/issues/7560
+    # There's a bug that causes an ND hang; until it's solved, reduce the subblock sizes to 1 when we're not running perf tests (SLOW_MATMULS=1).
+    import os
+
+    if os.environ.get("SLOW_MATMULS", "0") == "1":
+        out_subblock_h = 1
+        out_subblock_w = 1
     return in0_block_h, in0_block_w, out_subblock_h, out_subblock_w, out_block_h, out_block_w
diff --git a/tests/scripts/nightly/run_wh_b0_only.sh b/tests/scripts/nightly/run_wh_b0_only.sh
index 1f3679f8539..21886b51b75 100755
--- a/tests/scripts/nightly/run_wh_b0_only.sh
+++ b/tests/scripts/nightly/run_wh_b0_only.sh
@@ -10,7 +10,7 @@ fi
 echo "Running nightly tests for WH B0 only"
 
 env pytest tests/ttnn/integration_tests/unet # -> failing: issue #7556
-# env pytest tests/ttnn/integration_tests/stable_diffusion # -> failing/hanging: issue #7560
+SLOW_MATMULS=1 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml env pytest tests/ttnn/integration_tests/stable_diffusion
 
 env pytest models/demos/mamba/tests/test_mamba_ssm.py
 env pytest models/demos/mamba/tests/test_mamba_block.py
diff --git a/tests/scripts/run_python_model_tests.sh b/tests/scripts/run_python_model_tests.sh
index 77b54919df4..3c0009cb9e6 100755
--- a/tests/scripts/run_python_model_tests.sh
+++ b/tests/scripts/run_python_model_tests.sh
@@ -53,4 +53,6 @@ if [ "$ARCH_NAME" != "wormhole_b0" ]; then
     pytest $TT_METAL_HOME/models/demos/falcon7b/tests/unit_tests/test_falcon_attn_matmul.py -k "not attn_matmul_from_cache"
     # higher sequence lengths and different formats trigger memory issues
     pytest $TT_METAL_HOME/models/demos/falcon7b/tests/unit_tests/test_falcon_matmuls_and_bmms_with_mixed_precision.py -k "seq_len_128 and in0_BFLOAT16-in1_BFLOAT8_B-out_BFLOAT16-weights_DRAM"
+
+    SLOW_MATMULS=1 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest $TT_METAL_HOME/tests/ttnn/integration_tests/stable_diffusion/test_unet_2d_condition_model.py -k 512
 fi
diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_cross_attention.py b/tests/ttnn/integration_tests/stable_diffusion/test_cross_attention.py
index c2f36a27c06..0c9ad4b5913 100644
--- a/tests/ttnn/integration_tests/stable_diffusion/test_cross_attention.py
+++ b/tests/ttnn/integration_tests/stable_diffusion/test_cross_attention.py
@@ -16,7 +16,7 @@
     cross_attention as tt2_ttnn_cross_attention,
 )
 from ttnn.model_preprocessing import preprocess_model_parameters
-from tests.ttnn.utils_for_testing import assert_with_pcc
+from tests.ttnn.utils_for_testing import assert_with_pcc, comp_pcc
 from models.utility_functions import (
     skip_for_grayskull,
 )
@@ -254,17 +254,13 @@ def 
test_cross_attention_512x512(device, model_name, N, C, H, W, index, has_enco ttnn_hidden_states = ttnn.to_device(ttnn_hidden_states, device) model = tt2_ttnn_cross_attention(device, parameters) - signpost(header="start") ttnn_output = model( ttnn_hidden_states, ttnn_encoder_hidden_states, attention_mask=None, dim_head=W // 8, ) - signpost(header="stop") - - ttnn_output = ttnn.from_device(ttnn_output) ttnn_output = ttnn.to_torch(ttnn_output) - ttnn_output = ttnn_output.reshape(N, C, H, W) - - assert_with_pcc(torch_output, ttnn_output, pcc=0.99) + passing, output = comp_pcc(torch_output, ttnn_output, pcc=0.99) + print(output) + assert passing diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_unet_2d_condition_model.py b/tests/ttnn/integration_tests/stable_diffusion/test_unet_2d_condition_model.py index 507e78f32cf..d7ac7fdc467 100644 --- a/tests/ttnn/integration_tests/stable_diffusion/test_unet_2d_condition_model.py +++ b/tests/ttnn/integration_tests/stable_diffusion/test_unet_2d_condition_model.py @@ -8,7 +8,6 @@ import pytest from tqdm.auto import tqdm import time -from tracy import signpost from tests.ttnn.utils_for_testing import assert_with_pcc from models.utility_functions import ( @@ -211,7 +210,13 @@ def test_unet_2d_condition_model_512x512(device, batch_size, in_channels, input_ model = UNet2D(device, parameters, batch_size, input_height, input_width, reader_patterns_cache) first_iter = time.time() - signpost(header="start") + use_signpost = True + try: + from tracy import signpost + except ModuleNotFoundError: + use_signpost = False + if use_signpost: + signpost(header="start") ttnn_output = model( input, timestep=ttnn_timestep, @@ -222,8 +227,10 @@ def test_unet_2d_condition_model_512x512(device, batch_size, in_channels, input_ return_dict=return_dict, config=config, ) - signpost(header="stop") + if use_signpost: + signpost(header="stop") first_iter = time.time() - first_iter + ttnn_output = ttnn_to_torch(ttnn_output) print(f"First iteration took {first_iter} seconds") # times = [] # for i in range(50): @@ -238,6 +245,9 @@ def test_unet_2d_condition_model_512x512(device, batch_size, in_channels, input_ # return_dict=return_dict, # config=config, # ) + # ttnn_output = ttnn_to_torch(ttnn_output) + # passing, output = comp_pcc(torch_output, ttnn_output, pcc=0.99) + # print(output) # end = time.time() # times.append(end - start) # print(f"Current iteration took {end - start} seconds") @@ -247,7 +257,6 @@ def test_unet_2d_condition_model_512x512(device, batch_size, in_channels, input_ # print(iter) # print(f"Time taken for 50 iterations: {total_time}") # print(f"Samples per second: {50 / total_time}") - ttnn_output = ttnn_to_torch(ttnn_output) passing, output = comp_pcc(torch_output, ttnn_output, pcc=0.99) print(output) assert passing
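For reference, the workaround this patch introduces boils down to an environment-variable gate inside determine_blocking(): when SLOW_MATMULS=1, the matmul output subblocks are forced to 1x1, trading speed for stability until issue #7560 is fixed. The standalone sketch below restates that gate in isolation; the helper name maybe_slow_down_matmuls and the example subblock values are illustrative only, while the SLOW_MATMULS variable and the 1x1 fallback are taken from the change itself.

    import os


    def maybe_slow_down_matmuls(out_subblock_h: int, out_subblock_w: int) -> tuple:
        # Workaround for tt-metal issue #7560: larger matmul output subblocks can
        # trigger a non-deterministic (ND) hang on non-perf runs. When SLOW_MATMULS=1,
        # fall back to the slowest-but-safe 1x1 output subblock.
        if os.environ.get("SLOW_MATMULS", "0") == "1":
            return 1, 1
        return out_subblock_h, out_subblock_w


    if __name__ == "__main__":
        # Prints (1, 1) when run as `SLOW_MATMULS=1 python <script>.py`,
        # otherwise the original (4, 2).
        print(maybe_slow_down_matmuls(4, 2))

The nightly and model-test scripts opt in to the slow path the same way, e.g. SLOW_MATMULS=1 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/integration_tests/stable_diffusion, while the device-perf test in test_perf_stable_diffusion.py leaves the variable unset and keeps the fast subblock configuration.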