
Commit

#0: remove test code
yugaoTT committed Nov 29, 2024
1 parent c5cfaa0 commit 0d77265
Showing 1 changed file with 0 additions and 142 deletions.
@@ -2,13 +2,6 @@

# SPDX-License-Identifier: Apache-2.0

import math
import torch
import ttnn
import pytest
from tqdm import tqdm


import pytest
from loguru import logger
from models.utility_functions import is_wormhole_b0, is_grayskull, is_blackhole, skip_for_wormhole_b0
@@ -27,141 +20,6 @@
)


@pytest.mark.timeout(300)
def test_dram_sharded_matmul(device, use_program_cache, reset_seeds):
    # Model configuration
    dim = 4096
    vocab_size = 128256
    split_size = vocab_size // 2

    # Create dummy input
    batch_size = 1
    seq_len = 1
    x = torch.randn(batch_size, seq_len, dim)

    # Create dummy weights and split them
    output_weight = torch.randn(vocab_size, dim)
    output_weight_1 = output_weight[:split_size]
    output_weight_2 = output_weight[split_size:]

    # Perform PyTorch matmul for comparison
    reference_output = torch.matmul(x, output_weight.t())

    # Configure memory layout for output_weight (for both parts)
    def create_output_mem_config(size):
        # Calculate padded size to ensure it's divisible by (32 * 12)
        padded_size = math.ceil(size / (32 * 12)) * (32 * 12)
        if padded_size != size:
            print(f"Original size: {size}, Padded size: {padded_size}")
        shard_spec = ttnn.ShardSpec(
            ttnn.CoreRangeSet(
                {
                    ttnn.CoreRange(
                        ttnn.CoreCoord(0, 0),
                        ttnn.CoreCoord(device.dram_grid_size().x - 1, device.dram_grid_size().y - 1),
                    )
                }
            ),
            (4096, padded_size // 12),
            ttnn.ShardOrientation.ROW_MAJOR,
            False,
        )
        return ttnn.MemoryConfig(ttnn.TensorMemoryLayout.WIDTH_SHARDED, ttnn.BufferType.DRAM, shard_spec)

    # Convert output_weight parts to ttnn tensors
    output_weight_ttnn_1 = ttnn.as_tensor(
        output_weight_1.permute(1, 0),
        device=device,
        memory_config=create_output_mem_config(split_size),
        layout=ttnn.TILE_LAYOUT,
        dtype=ttnn.bfloat8_b,
    )
    output_weight_ttnn_2 = ttnn.as_tensor(
        output_weight_2.permute(1, 0),
        device=device,
        memory_config=create_output_mem_config(split_size),
        layout=ttnn.TILE_LAYOUT,
        dtype=ttnn.bfloat8_b,
    )

    # Convert input to ttnn tensor
    x_ttnn = ttnn.from_torch(
        x,
        device=device,
        dtype=ttnn.bfloat16,
        layout=ttnn.TILE_LAYOUT,
        memory_config=ttnn.create_sharded_memory_config(
            (32, 4096 // 64),  # Shard shape: [32, 64] -> 1 shard per core
            ttnn.CoreGrid(y=8, x=8),
            ttnn.ShardStrategy.WIDTH,
            ttnn.ShardOrientation.ROW_MAJOR,
            use_height_and_width_as_shard_shape=True,
        ),
    )

    # Configure compute kernel
    compute_kernel_config = ttnn.WormholeComputeKernelConfig(
        math_fidelity=ttnn.MathFidelity.HiFi2,
        math_approx_mode=False,
        fp32_dest_acc_en=False,
        packer_l1_acc=True,
    )

    # Configure program
    program_config = ttnn.MatmulMultiCoreReuseMultiCastDRAMShardedProgramConfig(
        in0_block_w=1,
        per_core_M=1,
        per_core_N=32,  # 128256 / 2 / tile_size / core_count
        fused_activation=None,
    )

    for i in tqdm(range(100)):
        # Run the linear layers
        output_1 = ttnn.linear(
            x_ttnn,
            output_weight_ttnn_1,
            compute_kernel_config=compute_kernel_config,
            program_config=program_config,
            memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
            dtype=ttnn.bfloat8_b,
        )
        output_2 = ttnn.linear(
            x_ttnn,
            output_weight_ttnn_2,
            compute_kernel_config=compute_kernel_config,
            program_config=program_config,
            memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
            dtype=ttnn.bfloat8_b,
        )

    output_1 = ttnn.sharded_to_interleaved(output_1)
    output_2 = ttnn.sharded_to_interleaved(output_2)

    # Concatenate the outputs
    output = ttnn.concat([output_1, output_2], dim=-1)

    # Convert output back to PyTorch tensor
    ttnn_output_torch = ttnn.to_torch(output)

    # Assertions
    assert ttnn_output_torch.shape == (
        batch_size,
        seq_len,
        vocab_size,
    ), f"Expected output shape {(batch_size, seq_len, vocab_size)}, but got {ttnn_output_torch.shape}"
    assert not torch.isnan(ttnn_output_torch).any(), "Output contains NaN values"
    assert not torch.isinf(ttnn_output_torch).any(), "Output contains infinite values"

    print("Output shape:", ttnn_output_torch.shape)
    print("TTNN output sample:", ttnn_output_torch[0, 0, :10].tolist())  # Print first 10 elements of the TTNN output
    print(
        "Reference output sample:", reference_output[0, 0, :10].tolist()
    )  # Print first 10 elements of the reference output

    # Compare TTNN output with PyTorch matmul
    pcc = ttnn.pearson_correlation_coefficient(ttnn_output_torch.flatten(), reference_output.flatten())


def find_max_subblock(out_block_h, out_block_w):
max_product = 0
best_h = 1
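For reference, the shard-width arithmetic inside create_output_mem_config in the removed test works out as follows. This is a minimal sketch under the assumption that 32 is the tile width and 12 matches the number of DRAM banks spanned by the shard spec (the code divides the padded width by 12 and spans the core range returned by device.dram_grid_size()):

import math

size = 128256 // 2  # split_size from the removed test
padded_size = math.ceil(size / (32 * 12)) * (32 * 12)
assert padded_size == 64128  # 64128 is already a multiple of 384, so no padding is added
shard_width = padded_size // 12
assert shard_width == 5344  # 167 tiles of width 32 per DRAM shard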

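Note that the removed test computed a PCC against the PyTorch reference on its last line but never asserted on it. For context, a minimal standalone sketch of how such a comparison is typically closed out, in plain PyTorch; the helper name and the 0.99 threshold are illustrative assumptions, not taken from the original test:

import torch

def pearson_cc(a: torch.Tensor, b: torch.Tensor) -> float:
    # Pearson correlation coefficient between two flattened tensors.
    a = a.flatten().to(torch.float32)
    b = b.flatten().to(torch.float32)
    a = a - a.mean()
    b = b - b.mean()
    return float((a @ b) / (a.norm() * b.norm() + 1e-12))

# Hypothetical usage against the tensors from the removed test:
# assert pearson_cc(ttnn_output_torch, reference_output) > 0.99, "PCC below threshold"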