Llama3.2-Vision: Add reference submodule and tests #14051

Merged: 21 commits from llama32-vision into main, Oct 29, 2024 (diff below shown from 16 of the 21 commits).

Commits:
bab4a1d — #13368: Move repeat interleave to xattn cache generation. (cglagovichTT, Oct 18, 2024)
ee48aaa — #0: Clean up demo, enable arbitrary padding for multimodal text sequence (cglagovichTT, Oct 21, 2024)
29ae070 — #13368: Add llama_models Meta reference for Llama3.2 as a submodule (cglagovichTT, Oct 21, 2024)
303be06 — #13368: Change reference imports to use new submodule (cglagovichTT, Oct 21, 2024)
ef329cd — #13368: Clean up comments after pushing repeat_interleave into xattn_… (cglagovichTT, Oct 21, 2024)
27fa6d5 — #13368: Clean up vision tests. Unify assertions and pcc checks. Fix L… (cglagovichTT, Oct 21, 2024)
f11162c — #13368: Fix LM head splits calculation (cglagovichTT, Oct 22, 2024)
f64f65a — #13368: For all vision tests, get model-specific parameters from mode… (cglagovichTT, Oct 23, 2024)
3a39037 — #13368: Fixup tests (cglagovichTT, Oct 23, 2024)
8689d40 — #13368: Add vision tests to unit, frequent, and demo (cglagovichTT, Oct 23, 2024)
80db0b8 — #13368: Relaxed 11B perf estimate to avoid error in CI (mtairum, Oct 22, 2024)
dd10a03 — #0: Added Llama-models python requirements (cglagovichTT, Oct 23, 2024)
2c5ff7f — #13368: Fixup mesh_device when not passed FAKE_DEVICE (cglagovichTT, Oct 23, 2024)
4e79091 — Merge branch 'main' into llama32-vision (cglagovichTT, Oct 24, 2024)
698ac12 — #0: Merge branch 'main' into llama32-vision (cglagovichTT, Oct 25, 2024)
7eef19b — #0: Merge branch 'main' into llama32-vision (cglagovichTT, Oct 28, 2024)
bc99440 — #13368: Remove llama-specific packages from requirements-dev.txt (cglagovichTT, Oct 28, 2024)
d59f132 — Merge branch 'main' into llama32-vision (cglagovichTT, Oct 28, 2024)
df3e545 — #13368: Remove llama_models as submodule. Move its install to llama3 … (cglagovichTT, Oct 29, 2024)
3807f76 — #13368: Fix resource path in multimodal demos. (cglagovichTT, Oct 29, 2024)
2a0eb87 — Merge branch 'main' into llama32-vision (cglagovichTT, Oct 29, 2024)
1 change: 1 addition & 0 deletions .github/workflows/t3000-demo-tests-impl.yaml
@@ -12,6 +12,7 @@ jobs:
   { name: "t3k_falcon40b_tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 50, owner_id: U053W15B6JF}, #Djordje Ivanovic
   { name: "t3k_llama3_70b_tests", arch: wormhole_b0, cmd: run_t3000_llama3_70b_tests, timeout: 30, owner_id: U03FJB5TM5Y}, #Colman Glagovich
   { name: "t3k_llama3_tests", arch: wormhole_b0, cmd: run_t3000_llama3_tests, timeout: 30, owner_id: U03PUAKE719}, # Miguel Tairum
+  { name: "t3k_llama3_vision_tests", arch: wormhole_b0, cmd: run_t3000_llama3_vision_tests, timeout: 30, owner_id: U03FJB5TM5Y}, #Colman Glagovich
   { name: "t3k_falcon7b_tests", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 90, owner_id: U05RWH3QUPM}, #Salar Hosseini
   { name: "t3k_mixtral_tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 50, owner_id: U03PUAKE719}, # Miguel Tairum
 ]
2 changes: 2 additions & 0 deletions .github/workflows/t3000-frequent-tests-impl.yaml
@@ -15,6 +15,8 @@ jobs:
   { name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 120, owner_id: U04S2UV6L8N}, #Sofija Jovic
   { name: "t3k llama2_70b tests", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 60, owner_id: U03FJB5TM5Y}, #Colman Glagovich
   { name: "t3k llama3 tests", arch: wormhole_b0, cmd: run_t3000_llama3_tests, timeout: 60, owner_id: U03PUAKE719}, #Miguel Tairum Cruz
+  { name: "t3k llama3.2-vision tests", arch: wormhole_b0, cmd: run_t3000_llama3.2-11b-vision_freq_tests, timeout: 60, owner_id: U03FJB5TM5Y}, #Colman Glagovich
+  { name: "t3k n300 mesh llama3.2-vision tests", arch: wormhole_b0, cmd: run_t3000_spoof_n300_llama3.2-11b-vision_freq_tests, timeout: 60, owner_id: U03FJB5TM5Y}, #Colman Glagovich
   { name: "t3k mixtral tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 60, owner_id: U03PUAKE719}, #Miguel Tairum Cruz
   { name: "t3k resnet tests", arch: wormhole_b0, cmd: run_t3000_resnet_tests, timeout: 30, owner_id: U013121KDH9}, #Austin Ho
 ]
2 changes: 2 additions & 0 deletions .github/workflows/t3000-unit-tests-impl.yaml
@@ -15,6 +15,8 @@ jobs:
   { name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 30, owner_id: U053W15B6JF}, #Djordje Ivanovic
   { name: "t3k llama3-small tests", arch: wormhole_b0, cmd: run_t3000_llama3-small_tests, timeout: 30, owner_id: U03PUAKE719}, #Miguel Tairum Cruz
   { name: "t3k llama3.2-11b tests", arch: wormhole_b0, cmd: run_t3000_llama3.2-11b_tests, timeout: 30, owner_id: U03PUAKE719}, #Miguel Tairum Cruz
+  { name: "t3k llama3.2-11b-vision tests", arch: wormhole_b0, cmd: run_t3000_llama3.2-11b-vision_unit_tests, timeout: 30, owner_id: U03FJB5TM5Y}, #Colman Glagovich
+  { name: "t3k n300 mesh llama3.2-11b-vision tests", arch: wormhole_b0, cmd: run_t3000_spoof_n300_llama3.2-11b-vision_unit_tests, timeout: 30, owner_id: U03FJB5TM5Y}, #Colman Glagovich
   { name: "t3k mixtral tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 30, owner_id: U03PUAKE719}, #Miguel Tairum Cruz
   { name: "t3k grok tests", arch: wormhole_b0, cmd: run_t3000_grok_tests, timeout: 30, owner_id: U03HY7MK4BT}, #Mark O'Connor
   { name: "t3k unet shallow tests", arch: wormhole_b0, cmd: run_t3000_unet_shallow_tests, timeout: 30, owner_id: U06ECNVR0EN}, #Evan Smal
3 changes: 3 additions & 0 deletions .gitmodules
@@ -28,3 +28,6 @@
 [submodule "tt_metal/third_party/tt_llk_blackhole"]
 	path = tt_metal/third_party/tt_llk_blackhole
 	url = https://github.com/tenstorrent/tt-llk-bh.git
+[submodule "models/demos/llama3/reference/llama_models"]
+	path = models/demos/llama3/reference/llama_models
+	url = https://github.com/tenstorrent/llama-models.git
108 changes: 108 additions & 0 deletions models/demos/llama3/demo/multimodal_demo_chat.py
@@ -0,0 +1,108 @@
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
# SPDX-License-Identifier: Apache-2.0

from pathlib import Path
from typing import Optional
from loguru import logger

from PIL import Image as PIL_Image
from termcolor import cprint

from models.demos.llama3.demo.multimodal_demo_text import create_multimodal_model
import models.demos.llama3.reference.llama_models.models.llama3.reference_impl.generation as llama_reference_generation

from models.demos.llama3.reference.llama_models.models.llama3.api.datatypes import ImageMedia, UserMessage

THIS_DIR = Path(__file__).parent.parent.resolve() / "reference/llama_models/models/scripts/"

import torch
import pytest
import os
import ttnn


@pytest.mark.parametrize(
    "mesh_device",
    [
        {"N150": (1, 1), "N300": (1, 2), "T3K": (1, 8), "TG": (8, 4)}.get(
            os.environ.get("FAKE_DEVICE"), len(ttnn.get_device_ids())
        )
    ],
    indirect=True,
)
@pytest.mark.parametrize(
    "target",
    ("tt", "cpu"),
)
@pytest.mark.parametrize(
    "warmup_iters",
    (0, 1),
)
def test_llama_multimodal_demo_chat(
    mesh_device,
    target,
    warmup_iters,
    temperature: float = 0.5,
    top_p: float = 0.9,
    max_seq_len: int = 512,
    max_batch_size: int = 4,
    max_gen_len: Optional[int] = 200,
    model_parallel_size: Optional[int] = None,
):
    mesh_device.enable_program_cache()
    mesh_device.enable_async(True)
    ckpt_dir = os.environ["LLAMA_DIR"]
    tokenizer_path = str(Path(ckpt_dir) / "tokenizer.model")

    logger.info(f"Creating reference model from checkpoint in '{ckpt_dir}'")
    generator = llama_reference_generation.Llama.build(
        ckpt_dir,
        tokenizer_path=tokenizer_path,
        max_seq_len=max_seq_len,
        max_batch_size=max_batch_size,
        model_parallel_size=model_parallel_size,
    )

    if target == "tt":
        logger.info(f"Creating TT model on {len(mesh_device.get_devices())} devices")
        model = create_multimodal_model(generator.args, mesh_device)
        generator.model = model

    # image understanding
    dialogs = []
    with open(THIS_DIR / "resources/dog.jpg", "rb") as f:
        img = PIL_Image.open(f).convert("RGB")

    dialogs = [
        [
            UserMessage(
                content=[
                    ImageMedia(image=img),
                    "Describe this image in two sentences",
                ],
            )
        ],
    ]
    # text only
    dialogs += [
        [UserMessage(content="what is the recipe of mayonnaise in two sentences?")],
    ]

    print(f"Running chat completion on {target}")
    for _ in range(warmup_iters + 1):
        for dialog in dialogs:
            result = generator.chat_completion(
                dialog,
                max_gen_len=max_gen_len,
                temperature=temperature,
                top_p=top_p,
            )

            for msg in dialog:
                print(f"{msg.role.capitalize()}: {msg.content}\n")

            out_message = result.generation
            print(f"> {out_message.role.capitalize()}: {out_message.content}")
            for t in out_message.tool_calls:
                print(f"  Tool call: {t.tool_name} ({t.arguments})")
            print("\n==================================\n")
66 changes: 30 additions & 36 deletions models/demos/llama3/demo/multimodal_demo_text.py
@@ -1,26 +1,18 @@
 # SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

 # SPDX-License-Identifier: Apache-2.0

 from pathlib import Path
 from typing import Optional
 from loguru import logger

 from PIL import Image as PIL_Image
 from termcolor import cprint

-import importlib
-
-llama_reference_generation = importlib.import_module(
-    "models.demos.t3000.llama2_70b.reference.llama-models.models.llama3.reference_impl.generation"
-)
-
-# Must import from reference for formatter to understand type of ImageMedia
-datatypes = importlib.import_module("models.demos.t3000.llama2_70b.reference.llama-models.models.llama3.api.datatypes")
-ImageMedia = datatypes.ImageMedia
-
-# THIS_DIR = Path(__file__).parent.resolve()
-# TODO: Generalize not to cglagovich home :)
-THIS_DIR = Path("/home/cglagovich/tt-metal/models/demos/t3000/llama2_70b/reference/llama-models/models/scripts/")
+import models.demos.llama3.reference.llama_models.models.llama3.reference_impl.generation as llama_reference_generation
+
+from models.demos.llama3.reference.llama_models.models.llama3.api.datatypes import ImageMedia
+
+THIS_DIR = Path(__file__).parent.parent.resolve() / "reference/llama_models/models/scripts/"

 import torch
 import pytest
@@ -59,14 +51,19 @@ def create_multimodal_model(model_args, mesh_device, dtype=ttnn.bfloat16):
     "target",
     ("tt", "cpu"),
 )
+@pytest.mark.parametrize(
+    "warmup_iters",
+    (0, 1),
+)
 def test_llama_multimodal_demo_text(
     mesh_device,
     target,
-    temperature: float = 0,
+    warmup_iters,
+    temperature: float = 0.5,
     top_p: float = 0.9,
     max_seq_len: int = 512,
     max_batch_size: int = 4,
-    max_gen_len: Optional[int] = None,
+    max_gen_len: Optional[int] = 200,
     model_parallel_size: Optional[int] = None,
 ):
     mesh_device.enable_program_cache()
@@ -96,33 +93,30 @@ def test_llama_multimodal_demo_text(

     with open(THIS_DIR / "resources/ocr_image.jpeg", "rb") as f:
         ocr_image = PIL_Image.open(f).convert("RGB")
-    # with open(THIS_DIR / "resources/clutter.jpeg", "rb") as f:
-    #     clutter = PIL_Image.open(f).convert("RGB")
+
+    with open(THIS_DIR / "resources/clutter.jpeg", "rb") as f:
+        clutter = PIL_Image.open(f).convert("RGB")

     interleaved_contents = [
         # text only
-        # "The color of the sky is blue but sometimes it can also be",
+        "The color of the sky is blue but sometimes it can also be",
         # image understanding
-        # [
-        #     ImageMedia(image=img),
-        #     "If I had to write a haiku for this one",
-        # ],
+        [ImageMedia(image=img), "If I had to write a haiku for this one"],
+        [ImageMedia(image=img2), "Counting the number of individual spaghetti strands in this image"],
         [ImageMedia(image=ocr_image), "The full text in this image is as follows"],
-        # [
-        #     ImageMedia(image=clutter),
-        #     "The count of vases, books, and miscellaneous items in this image is",
-        # ]
+        [ImageMedia(image=clutter), "The count of vases, books, and miscellaneous items in this image is"],
     ]

     print(f"Running text completion on {target}")
-    for content in interleaved_contents:
-        result = generator.text_completion(
-            content,
-            max_gen_len=max_gen_len,
-            temperature=temperature,
-            top_p=top_p,
-        )
-
-        cprint(f"{content}", end="")
-        cprint(f"{result.generation}", color="yellow")
-        print("\n==================================\n")
+    for _ in range(warmup_iters + 1):
+        for content in interleaved_contents:
+            result = generator.text_completion(
+                content,
+                max_gen_len=max_gen_len,
+                temperature=temperature,
+                top_p=top_p,
+            )
+
+            cprint(f"{content}", end="")
+            cprint(f"{result.generation}", color="yellow")
+            print("\n==================================\n")
25 changes: 23 additions & 2 deletions models/demos/llama3/lt
@@ -184,7 +184,14 @@ def main(stdscr):
     commands = parse_list(command_input, allow_space=False)

     # Generate combinations (reordered)
-    combinations = [(c, m, d) for c in commands for m in models for d in devices]
+    # combinations = [(c, m, d) for c in commands for m in models for d in devices]
+    combinations = [
+        (c, m, d)
+        for c in commands
+        for m in models
+        for d in devices
+        if not (m in ["11b", "11b-b"] and d == "n150")
+    ]

     # Create output entries
     for command, model, device in combinations:
@@ -230,7 +237,7 @@
             else:
                 # Ignore enter key when exiting
                 continue
-        elif c == curses.KEY_BACKSPACE or c == 127 or c == ord("x"):
+        elif c == curses.KEY_BACKSPACE or c == 127:
             if current_line < len(input_fields):
                 current_field = current_line
                 # Remove last character from current field
@@ -506,6 +513,19 @@ def run_entry_command(entry, screen_lock, output_entries, screen_needs_update):
     "model": "pytest models/demos/llama3/tests/test_llama_model.py::test_llama_model_inference[wormhole_b0-True-mesh_device0-full]",
     "model-prefill": "pytest models/demos/llama3/tests/test_llama_model_prefill.py::test_llama_model_inference[wormhole_b0-True-mesh_device0-4096]",
     "model-quick": "pytest models/demos/llama3/tests/test_llama_model.py -k quick",
+    "vision-mlp": "pytest models/demos/llama3/tests/multimodal/test_llama_image_mlp.py",
+    "vision-attn": "pytest models/demos/llama3/tests/multimodal/test_llama_image_attention.py",
+    "vision-block": "pytest models/demos/llama3/tests/multimodal/test_llama_image_block.py",
+    "vision-xfmr": "pytest models/demos/llama3/tests/multimodal/test_llama_image_transformer.py",
+    "vision-xattn": "pytest models/demos/llama3/tests/multimodal/test_llama_cross_attention.py",
+    "vision-xblock": "pytest models/demos/llama3/tests/multimodal/test_llama_cross_block.py",
+    "vision-conv": "pytest models/demos/llama3/tests/multimodal/test_llama_conv2d_patch.py",
+    "vision-class": "pytest models/demos/llama3/tests/multimodal/test_llama_class_embedding.py",
+    "vision-tile-pos": "pytest models/demos/llama3/tests/multimodal/test_llama_tile_position_embedding.py",
+    "vision-pos": "pytest models/demos/llama3/tests/multimodal/test_llama_positional_embedding.py",
+    "vision-encoder": "pytest models/demos/llama3/tests/multimodal/test_llama_vision_encoder.py",
+    "vision-text-xfmr": "pytest models/demos/llama3/tests/multimodal/test_llama_cross_attention_transformer_text.py",
+    "vision-vision-xfmr": "pytest models/demos/llama3/tests/multimodal/test_llama_cross_attention_transformer_vision.py",
 }

 # Check if the command is a shortcut and replace it if necessary
@@ -657,6 +677,7 @@ def get_llama_dir(model):
     "3b": os.environ.get("LLAMA_32_3B_DIR", "/proj_sw/user_dev/llama32-data/Llama3.2-3B-Instruct"),
     "8b": os.environ.get("LLAMA_31_8B_DIR", "/proj_sw/user_dev/llama31-8b-data/Meta-Llama-3.1-8B-Instruct"),
     "11b": os.environ.get("LLAMA_32_11B_DIR", "/proj_sw/user_dev/llama32-data/Llama3.2-11B-Vision-Instruct"),
+    "11b-b": os.environ.get("LLAMA_32_11B_BASE_DIR", "/proj_sw/user_dev/llama32-data/Llama3.2-11B-Vision"),
 }.get(model.lower(), "")

 if not llama_dir or not os.path.exists(llama_dir):
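Note: the weight-directory lookup in `lt` follows an env-override-with-default pattern: each model key maps to an environment variable that, when unset, falls back to a shared path under /proj_sw. A condensed sketch with just the two vision entries (the LLAMA_DIRS table shape and the error handling below are illustrative):

```python
import os

# Env override and default path per model key (subset of the table in `lt`).
LLAMA_DIRS = {
    "11b": ("LLAMA_32_11B_DIR", "/proj_sw/user_dev/llama32-data/Llama3.2-11B-Vision-Instruct"),
    "11b-b": ("LLAMA_32_11B_BASE_DIR", "/proj_sw/user_dev/llama32-data/Llama3.2-11B-Vision"),
}


def get_llama_dir(model: str) -> str:
    # Resolve the env override first, then fall back to the shared default.
    env_var, default = LLAMA_DIRS.get(model.lower(), ("", ""))
    llama_dir = os.environ.get(env_var, default) if env_var else ""
    if not llama_dir or not os.path.exists(llama_dir):
        raise FileNotFoundError(f"No weights found for '{model}'; set {env_var or 'the model dir env var'}")
    return llama_dir
```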
1 change: 1 addition & 0 deletions models/demos/llama3/reference/llama_models
Review comment (Collaborator):
Given that this submodule has a method to install the dependencies, can you dynamically install it whenever these tests are run? We should not be including everything under the sun in this repo.

Reply (Contributor, Author):
I resolved this by dynamically installing it in the t3k test scripts.
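For reference, a minimal sketch of the dynamic-install idea; the actual resolution lives in the t3k test shell scripts, so this Python helper and its name are hypothetical:

```python
import subprocess
import sys


def ensure_llama_models(spec: str = "git+https://github.com/tenstorrent/llama-models.git") -> None:
    """Hypothetical helper: install the llama-models reference package on
    demand before the vision tests run, instead of vendoring it in-repo."""
    try:
        import llama_models  # noqa: F401  # already installed, nothing to do
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", spec])
```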

Submodule llama_models added at c217d3
models/demos/llama3/tests/multimodal/test_llama_class_embedding.py
@@ -3,11 +3,9 @@
 # SPDX-License-Identifier: Apache-2.0

 ##### Python imports #####
-import math
 import pytest
 from loguru import logger
 import os
-import itertools

 ##### PyTorch imports #####
 import torch
@@ -56,18 +54,16 @@ def forward(self, x):
 @pytest.mark.parametrize(
     "mesh_device",
     [
-        {"N150": (1, 1), "N300": (1, 2), "T3K": (2, 4), "TG": (8, 4)}.get(
+        {"N150": (1, 1), "N300": (1, 2), "T3K": (1, 8), "TG": (8, 4)}.get(
             os.environ.get("FAKE_DEVICE"), len(ttnn.get_device_ids())
         )
     ],
     indirect=True,
 )
 @pytest.mark.parametrize(
-    "input_shape",
+    "bsz, num_concurrent_media, num_chunks",
     [
-        ((1, 4, 4, 1024, 1280)),
-        ((1, 4, 4, 1024 + 1, 1280)),
-        ((1, 4, 4, 1032, 1280)),
+        ((1, 4, 4)),
     ],
 )
 @pytest.mark.parametrize(
@@ -81,12 +77,14 @@ def test_llama_class_embedding_inference(
     use_program_cache,
     reset_seeds,
     # Input params
-    input_shape,
+    bsz,
+    num_concurrent_media,
+    num_chunks,
     layout,
     ensure_gc,
 ):
     dtype = ttnn.bfloat16
-    pcc = 0.9999
+    pcc_required = 0.9999

     mesh_device.enable_async(True)
@@ -97,13 +95,8 @@
         k[len(first_layer_prefix) :]: v for k, v in state_dict.items() if (k.startswith(first_layer_prefix))
     }

-    (
-        bsz,
-        num_concurrent_media,
-        num_chunks,
-        ntok,
-        dim,
-    ) = input_shape
+    ntok = nearest_32(model_args.vision_chunk_ntok)
+    dim = model_args.vision_dim

     ##### Prepare inputs #####
     input_tensor = torch.randn(bsz * num_concurrent_media * num_chunks, ntok, dim)
@@ -145,12 +138,8 @@
     # Only select output from one device
     tt_output_torch = tt_output_torch[..., :dim].view(reference_output.shape)

-    passing, pcc_message = comp_pcc(reference_output, tt_output_torch, pcc)
+    passing, pcc_message = comp_pcc(reference_output, tt_output_torch, pcc_required)

     logger.info(comp_allclose(reference_output, tt_output_torch))
     logger.info(f"PCC: {pcc_message}")
-    if passing:
-        logger.info(f"Llama_ClassEmbedding Passed!")
-    else:
-        logger.warning(f"Llama_ClassEmbedding Failed!")
-    assert passing, f"PCC value is lower than {pcc} for some of the outputs. Check Warnings!"
+    assert passing, f"PCC value is lower than {pcc_required} for some of the outputs. Check Warnings!"