diff --git a/models/demos/llama3/demo/demo.py b/models/demos/llama3/demo/demo.py
index 975427aacc2..a2f443f3712 100644
--- a/models/demos/llama3/demo/demo.py
+++ b/models/demos/llama3/demo/demo.py
@@ -277,8 +277,7 @@ def run_llama3_demo(user_input, single_layer, mesh_device, instruct_mode, is_ci_
     profiler.end("loading_weights_to_device")
     logger.info("Finished loading weights to device.")
 
-    # TODO Change this back to 100
-    max_generated_tokens = 20  # Maximum number of tokens to generate per user
+    max_generated_tokens = 100  # Maximum number of tokens to generate per user
     num_tokens_generated_decode = []
 
     logger.info("Starting inference...")
@@ -422,7 +421,6 @@ def run_llama3_demo(user_input, single_layer, mesh_device, instruct_mode, is_ci_
     # Get cos/sin matrices for the current position of each user
     rot_mats = rope_setup.get_rot_mats(current_pos_padded)
     rot_mat_idxs = rope_setup.get_rot_idxs(current_pos_padded)
 
-    # Compile
     logger.info(f"Compiling model trace...")
     decode_input = ttnn.unsqueeze_to_4D(tt_embd(tt_out_tok))
diff --git a/models/demos/llama3/tests/test_llama_attention.py b/models/demos/llama3/tests/test_llama_attention.py
index f3c50eb40be..a7bb88dc2d5 100644
--- a/models/demos/llama3/tests/test_llama_attention.py
+++ b/models/demos/llama3/tests/test_llama_attention.py
@@ -34,8 +34,14 @@
 )
 @pytest.mark.parametrize(
     "paged_attention",
-    (True, False),
-    ids=("paged_attention", "non_paged_attention"),
+    (
+        True,
+        # False,
+    ),
+    ids=(
+        "paged_attention",
+        # "non_paged_attention",
+    ),
 )
 @pytest.mark.parametrize(
     "paged_attention_params",
@@ -43,7 +49,7 @@
 )
 @pytest.mark.parametrize(
     "batch_size",
-    (32,),  # TODO Miguel: should we include batch==1 in the unit tests as well?
+    (1,),
 )
 @pytest.mark.parametrize(
     "max_seq_len",
diff --git a/models/demos/llama3/tests/test_llama_perf.py b/models/demos/llama3/tests/test_llama_perf.py
index 55dd13f7aa3..24daaa38f18 100644
--- a/models/demos/llama3/tests/test_llama_perf.py
+++ b/models/demos/llama3/tests/test_llama_perf.py
@@ -36,15 +36,6 @@
         (1024, 30),
     ),
 )
-@pytest.mark.parametrize(
-    "mesh_device",
-    [
-        {"N150": (1, 1), "N300": (1, 2), "T3K": (1, 8), "TG": (8, 4)}.get(
-            os.environ.get("FAKE_DEVICE"), len(ttnn.get_device_ids())
-        )
-    ],
-    indirect=True,
-)
 @pytest.mark.parametrize(
     "paged_attention",
     (
diff --git a/models/demos/llama3/tt/llama_common.py b/models/demos/llama3/tt/llama_common.py
index 4ca08fbcc43..43ca95bbe74 100644
--- a/models/demos/llama3/tt/llama_common.py
+++ b/models/demos/llama3/tt/llama_common.py
@@ -18,7 +18,7 @@ def forward(self, x):
 
 # Default configuration for Paged Attention
 class PagedAttentionConfig:
-    def __init__(self, block_size=64, max_num_blocks=2048):
+    def __init__(self, block_size=32, max_num_blocks=1024):
         self.block_size = block_size
         self.max_num_blocks = max_num_blocks
 
diff --git a/models/demos/llama3/tt/model_config.py b/models/demos/llama3/tt/model_config.py
index 35c0a7bbf36..5ddb79f166c 100644
--- a/models/demos/llama3/tt/model_config.py
+++ b/models/demos/llama3/tt/model_config.py
@@ -25,8 +25,8 @@
 
 # TODO: Miguel: Remove from here. I've added this to llama common instead, and each test should define their own values
 class PagedAttentionConfig:
-    block_size = 64
-    max_num_blocks = 2048
+    block_size = 32
+    max_num_blocks = 1024
 
 
 class TtModelArgs:
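
The diff moves PagedAttentionConfig into llama_common.py and, per the TODO left in model_config.py, each test is expected to define its own values rather than rely on the shared defaults. A minimal sketch of what that could look like in a test, assuming only the constructor shown in this diff; the import path follows the usual repo-root module layout, and the max_cached_tokens derivation is illustrative, not part of the change:

from models.demos.llama3.tt.llama_common import PagedAttentionConfig

# Test-specific values instead of the new, smaller defaults
# (block_size=32, max_num_blocks=1024 after this change).
paged_attention_config = PagedAttentionConfig(block_size=64, max_num_blocks=2048)

# Total KV-cache capacity in tokens implied by this configuration (illustrative).
max_cached_tokens = paged_attention_config.block_size * paged_attention_config.max_num_blocks
assert max_cached_tokens == 131072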