#0: Fix N150-8B demo. Minor fixes after rebase.
mtairum committed Nov 26, 2024
1 parent 0ec756b commit 4ff627a
Showing 5 changed files with 13 additions and 18 deletions.
4 changes: 1 addition & 3 deletions models/demos/llama3/demo/demo.py
@@ -277,8 +277,7 @@ def run_llama3_demo(user_input, single_layer, mesh_device, instruct_mode, is_ci_
profiler.end("loading_weights_to_device")
logger.info("Finished loading weights to device.")

- # TODO Change this back to 100
- max_generated_tokens = 20  # Maximum number of tokens to generate per user
+ max_generated_tokens = 100  # Maximum number of tokens to generate per user
num_tokens_generated_decode = []

logger.info("Starting inference...")
@@ -422,7 +421,6 @@ def run_llama3_demo(user_input, single_layer, mesh_device, instruct_mode, is_ci_
# Get cos/sin matrices for the current position of each user
rot_mats = rope_setup.get_rot_mats(current_pos_padded)
rot_mat_idxs = rope_setup.get_rot_idxs(current_pos_padded)
-
# Compile
logger.info(f"Compiling model trace...")
decode_input = ttnn.unsqueeze_to_4D(tt_embd(tt_out_tok))
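For context, max_generated_tokens caps the demo's decode loop. A minimal sketch of how such a cap typically bounds generation; the loop structure and the generate_next_token / eos_token_id names are assumptions for illustration, not the demo's exact code:

max_generated_tokens = 100  # Maximum number of tokens to generate per user

def decode_loop(generate_next_token, eos_token_id):
    # Hypothetical helper: generate until EOS or the per-user token cap is hit.
    generated = []
    for _ in range(max_generated_tokens):
        token = generate_next_token()
        if token == eos_token_id:
            break
        generated.append(token)
    return generated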
12 changes: 9 additions & 3 deletions models/demos/llama3/tests/test_llama_attention.py
@@ -34,16 +34,22 @@
)
@pytest.mark.parametrize(
"paged_attention",
- (True, False),
- ids=("paged_attention", "non_paged_attention"),
+ (
+     True,
+     # False,
+ ),
+ ids=(
+     "paged_attention",
+     # "non_paged_attention",
+ ),
)
@pytest.mark.parametrize(
"paged_attention_params",
[{"page_block_size": 64, "page_max_num_blocks": 2048}],
)
@pytest.mark.parametrize(
"batch_size",
- (32,), # TODO Miguel: should we include batch==1 in the unit tests as well?
+ (1,),
)
@pytest.mark.parametrize(
"max_seq_len",
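The paged_attention and paged_attention_params parametrizations above are typically combined into a single config object inside the test. A minimal sketch, assuming the test builds it from the PagedAttentionConfig class in llama_common.py; the helper name is hypothetical:

from models.demos.llama3.tt.llama_common import PagedAttentionConfig

def make_paged_attention_config(paged_attention, paged_attention_params):
    # Hypothetical helper: return None when the non-paged path is exercised.
    if not paged_attention:
        return None
    return PagedAttentionConfig(
        block_size=paged_attention_params["page_block_size"],
        max_num_blocks=paged_attention_params["page_max_num_blocks"],
    )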
9 changes: 0 additions & 9 deletions models/demos/llama3/tests/test_llama_perf.py
@@ -36,15 +36,6 @@
(1024, 30),
),
)
- @pytest.mark.parametrize(
-     "mesh_device",
-     [
-         {"N150": (1, 1), "N300": (1, 2), "T3K": (1, 8), "TG": (8, 4)}.get(
-             os.environ.get("FAKE_DEVICE"), len(ttnn.get_device_ids())
-         )
-     ],
-     indirect=True,
- )
@pytest.mark.parametrize(
"paged_attention",
(
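The deleted parametrize above selected the mesh shape from the FAKE_DEVICE environment variable. A minimal sketch of that selection logic, should a shared fixture need to reproduce it; only the mapping itself comes from the removed code, the function name is an assumption:

import os
import ttnn

def default_mesh_shape():
    # Mapping taken from the removed parametrize; falls back to using every
    # available device when FAKE_DEVICE is unset or unrecognized.
    shapes = {"N150": (1, 1), "N300": (1, 2), "T3K": (1, 8), "TG": (8, 4)}
    return shapes.get(os.environ.get("FAKE_DEVICE"), len(ttnn.get_device_ids()))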
2 changes: 1 addition & 1 deletion models/demos/llama3/tt/llama_common.py
@@ -18,7 +18,7 @@ def forward(self, x):

# Default configuration for Paged Attention
class PagedAttentionConfig:
-     def __init__(self, block_size=64, max_num_blocks=2048):
+     def __init__(self, block_size=32, max_num_blocks=1024):
self.block_size = block_size
self.max_num_blocks = max_num_blocks

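As a usage note, the class above gives tests one place to pull paged-attention defaults from while still allowing explicit per-test values, which is the intent of the TODO in model_config.py below. A minimal sketch; the override values are illustrative only:

from models.demos.llama3.tt.llama_common import PagedAttentionConfig

default_cfg = PagedAttentionConfig()  # picks up the class defaults shown above
custom_cfg = PagedAttentionConfig(block_size=64, max_num_blocks=2048)  # per-test override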
4 changes: 2 additions & 2 deletions models/demos/llama3/tt/model_config.py
@@ -25,8 +25,8 @@

# TODO: Miguel: Remove from here. I've added this to llama common instead, and each test should define their own values
class PagedAttentionConfig:
-     block_size = 64
-     max_num_blocks = 2048
+     block_size = 32
+     max_num_blocks = 1024


class TtModelArgs:
