#0: Fix N150-8B demo. Minor fixes after rebase.
mtairum committed Nov 26, 2024
1 parent 0ec756b commit 4ff627a
Showing 5 changed files with 13 additions and 18 deletions.
4 changes: 1 addition & 3 deletions models/demos/llama3/demo/demo.py
@@ -277,8 +277,7 @@ def run_llama3_demo(user_input, single_layer, mesh_device, instruct_mode, is_ci_
profiler.end("loading_weights_to_device")
logger.info("Finished loading weights to device.")

- # TODO Change this back to 100
- max_generated_tokens = 20  # Maximum number of tokens to generate per user
+ max_generated_tokens = 100  # Maximum number of tokens to generate per user
num_tokens_generated_decode = []

logger.info("Starting inference...")
@@ -422,7 +421,6 @@ def run_llama3_demo(user_input, single_layer, mesh_device, instruct_mode, is_ci_
# Get cos/sin matrices for the current position of each user
rot_mats = rope_setup.get_rot_mats(current_pos_padded)
rot_mat_idxs = rope_setup.get_rot_idxs(current_pos_padded)
-
# Compile
logger.info(f"Compiling model trace...")
decode_input = ttnn.unsqueeze_to_4D(tt_embd(tt_out_tok))
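For context, max_generated_tokens caps the demo's decode loop. A minimal sketch of how such a cap typically bounds generation; the loop structure and the generate_next_token / eos_token_id names are assumptions for illustration, not the demo's exact code:

max_generated_tokens = 100  # Maximum number of tokens to generate per user

def decode_loop(generate_next_token, eos_token_id):
    # Hypothetical helper: generate until EOS or the per-user token cap is hit.
    generated = []
    for _ in range(max_generated_tokens):
        token = generate_next_token()
        if token == eos_token_id:
            break
        generated.append(token)
    return generated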
12 changes: 9 additions & 3 deletions models/demos/llama3/tests/test_llama_attention.py
@@ -34,16 +34,22 @@
)
@pytest.mark.parametrize(
"paged_attention",
- (True, False),
- ids=("paged_attention", "non_paged_attention"),
+ (
+     True,
+     # False,
+ ),
+ ids=(
+     "paged_attention",
+     # "non_paged_attention",
+ ),
)
@pytest.mark.parametrize(
"paged_attention_params",
[{"page_block_size": 64, "page_max_num_blocks": 2048}],
)
@pytest.mark.parametrize(
"batch_size",
- (32,), # TODO Miguel: should we include batch==1 in the unit tests as well?
+ (1,),
)
@pytest.mark.parametrize(
"max_seq_len",
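The paged_attention and paged_attention_params parametrizations above are typically combined into a single config object inside the test. A minimal sketch, assuming the test builds it from the PagedAttentionConfig class in llama_common.py; the helper name is hypothetical:

from models.demos.llama3.tt.llama_common import PagedAttentionConfig

def make_paged_attention_config(paged_attention, paged_attention_params):
    # Hypothetical helper: return None when the non-paged path is exercised.
    if not paged_attention:
        return None
    return PagedAttentionConfig(
        block_size=paged_attention_params["page_block_size"],
        max_num_blocks=paged_attention_params["page_max_num_blocks"],
    )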
9 changes: 0 additions & 9 deletions models/demos/llama3/tests/test_llama_perf.py
@@ -36,15 +36,6 @@
(1024, 30),
),
)
- @pytest.mark.parametrize(
-     "mesh_device",
-     [
-         {"N150": (1, 1), "N300": (1, 2), "T3K": (1, 8), "TG": (8, 4)}.get(
-             os.environ.get("FAKE_DEVICE"), len(ttnn.get_device_ids())
-         )
-     ],
-     indirect=True,
- )
@pytest.mark.parametrize(
"paged_attention",
(
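The deleted parametrize above selected the mesh shape from the FAKE_DEVICE environment variable. A minimal sketch of that selection logic, should a shared fixture need to reproduce it; only the mapping itself comes from the removed code, the function name is an assumption:

import os
import ttnn

def default_mesh_shape():
    # Mapping taken from the removed parametrize; falls back to using every
    # available device when FAKE_DEVICE is unset or unrecognized.
    shapes = {"N150": (1, 1), "N300": (1, 2), "T3K": (1, 8), "TG": (8, 4)}
    return shapes.get(os.environ.get("FAKE_DEVICE"), len(ttnn.get_device_ids()))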
2 changes: 1 addition & 1 deletion models/demos/llama3/tt/llama_common.py
@@ -18,7 +18,7 @@ def forward(self, x):

# Default configuration for Paged Attention
class PagedAttentionConfig:
-     def __init__(self, block_size=64, max_num_blocks=2048):
+     def __init__(self, block_size=32, max_num_blocks=1024):
self.block_size = block_size
self.max_num_blocks = max_num_blocks

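As a usage note, the class above gives tests one place to pull paged-attention defaults from while still allowing explicit per-test values, which is the intent of the TODO in model_config.py below. A minimal sketch; the override values are illustrative only:

from models.demos.llama3.tt.llama_common import PagedAttentionConfig

default_cfg = PagedAttentionConfig()  # picks up the class defaults shown above
custom_cfg = PagedAttentionConfig(block_size=64, max_num_blocks=2048)  # per-test override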
4 changes: 2 additions & 2 deletions models/demos/llama3/tt/model_config.py
@@ -25,8 +25,8 @@

# TODO: Miguel: Remove from here. I've added this to llama common instead, and each test should define their own values
class PagedAttentionConfig:
-     block_size = 64
-     max_num_blocks = 2048
+     block_size = 32
+     max_num_blocks = 1024


class TtModelArgs:
