From 7f753407e5e7f3b33f8f51e72a554165a0879d7d Mon Sep 17 00:00:00 2001 From: Jiahan Xie Date: Sun, 24 Nov 2024 11:43:05 -0500 Subject: [PATCH 1/5] packed systolic matmul --- tests/dataflow/test_packed_systolic.py | 97 ++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 tests/dataflow/test_packed_systolic.py diff --git a/tests/dataflow/test_packed_systolic.py b/tests/dataflow/test_packed_systolic.py new file mode 100644 index 00000000..f4798835 --- /dev/null +++ b/tests/dataflow/test_packed_systolic.py @@ -0,0 +1,97 @@ +# test_packed_systolic.py +import allo +from allo.ir.types import int8, int16, int32 +import allo.dataflow as df +import allo.backend.hls as hls +import numpy as np + +L, D = 2, 2 +M, N, K = L, 1*D, D +PP = 2 +P0, P1 = M // PP + 2, N + 2 + +if PP == 2: + np_type = np.int16 + allo_type = int16 +else: + raise ValueError(f"Unsupported packing factor: {PP}") + +@df.region() +def top(): + fifo_A = df.array(df.pipe(dtype=allo_type, shape=(), depth=4), shape=(P0, P1)) + fifo_B = df.array(df.pipe(dtype=allo_type, shape=(), depth=4), shape=(P0, P1)) + + @df.kernel(mapping=[P0, P1]) + def gemm( + X_packed: allo_type[L // PP, D], + W_packed: allo_type[D, 1 * D // PP], + Z_packed: allo_type[L // PP, 1 * D] + ): + i, j = df.get_pid() + # Peripheral kernels + with allo.meta_if(i in {0, M + 1} and j in {0, N + 1}): + pass + with allo.meta_elif(j == 0): + # i > 0 + for k in range(K // PP): + fifo_A[i, j + 1].put(X_packed[i - 1, k]) + with allo.meta_elif(i == 0): + # j > 0 + for k in range(K // PP): + fifo_B[i + 1, j].put(W_packed[k, (j - 1)]) + # drain + with allo.meta_elif(i == M + 1 and j > 0): + for k in range(K // PP): + b: allo_type = fifo_B[i, j].get() + with allo.meta_elif(j == N + 1 and i > 0): + for k in range(K // PP): + a: allo_type = fifo_A[i, j].get() + # main body + with allo.meta_else(): + c: allo_type = 0 + for k in range(K): + a: allo_type = fifo_A[i, j].get() + b: allo_type = fifo_B[i, j].get() + for p in range(PP): + a_unpacked: int8 = a[p * 8 : (p + 1) * 8] + b_unpacked: int8 = b[p * 8 : (p + 1) * 8] + c[p * 8 : (p + 1) * 8] += a_unpacked * b_unpacked + fifo_A[i, j + 1].put(a) + fifo_B[i + 1, j].put(b) + Z_packed[i - 1, j - 1][k * 8 : (k + 1) * 8] += c + +def test_packed_systolic(): + X = np.random.randint(-4, 4, size=(L, D)).astype(np.int8) + print("X:") + print(X) + W_A_cst = np.random.randint(-4, 4, size=(D, 1 * D)).astype(np.int8) + print("W_A_cst:") + print(W_A_cst) + + packed_X = np.ascontiguousarray( + np.ascontiguousarray(X).view(np_type).transpose() + ) + print("packed_X:") + print(packed_X) + W_A_packed = np.ascontiguousarray( + np.ascontiguousarray(W_A_cst.transpose()).view(np_type).transpose() + ) + print("W_A_packed") + print(W_A_packed) + Z_packed = np.zeros((L // PP, 1 * D), dtype=np_type) + mod = df.build(top) + if hls.is_available("vitis_hls"): + mod(packed_X, W_A_packed, Z_packed) + np_C = X @ W_A_cst + print("np_C:") + print(np_C) + np_C_packed = np.ascontiguousarray( + np.ascontiguousarray(np_C).view(np_type).transpose() + ) + print("np_C_packed:") + print(np_C_packed) + np.testing.assert_allclose(Z_packed, np_C_packed, atol=1e-3) + print("Passed!") + +if __name__ == "__main__": + test_packed_systolic() From c6dbe16c1ab5465203c147b16395fba89601cfe9 Mon Sep 17 00:00:00 2001 From: Jiahan Xie Date: Sun, 24 Nov 2024 12:34:32 -0500 Subject: [PATCH 2/5] license header check --- tests/dataflow/test_packed_systolic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/dataflow/test_packed_systolic.py b/tests/dataflow/test_packed_systolic.py index f4798835..111cb04c 100644 --- a/tests/dataflow/test_packed_systolic.py +++ b/tests/dataflow/test_packed_systolic.py @@ -1,4 +1,6 @@ -# test_packed_systolic.py +# Copyright Allo authors. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + import allo from allo.ir.types import int8, int16, int32 import allo.dataflow as df From df1e41a28029305e6b2394b540fab9d831cc7baa Mon Sep 17 00:00:00 2001 From: Jiahan Xie Date: Sun, 24 Nov 2024 12:59:36 -0500 Subject: [PATCH 3/5] format to pep8 standard --- tests/dataflow/test_packed_systolic.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/dataflow/test_packed_systolic.py b/tests/dataflow/test_packed_systolic.py index 111cb04c..d6ddffc7 100644 --- a/tests/dataflow/test_packed_systolic.py +++ b/tests/dataflow/test_packed_systolic.py @@ -8,7 +8,7 @@ import numpy as np L, D = 2, 2 -M, N, K = L, 1*D, D +M, N, K = L, 1 * D, D PP = 2 P0, P1 = M // PP + 2, N + 2 @@ -18,6 +18,7 @@ else: raise ValueError(f"Unsupported packing factor: {PP}") + @df.region() def top(): fifo_A = df.array(df.pipe(dtype=allo_type, shape=(), depth=4), shape=(P0, P1)) @@ -27,7 +28,7 @@ def top(): def gemm( X_packed: allo_type[L // PP, D], W_packed: allo_type[D, 1 * D // PP], - Z_packed: allo_type[L // PP, 1 * D] + Z_packed: allo_type[L // PP, 1 * D], ): i, j = df.get_pid() # Peripheral kernels @@ -62,6 +63,7 @@ def gemm( fifo_B[i + 1, j].put(b) Z_packed[i - 1, j - 1][k * 8 : (k + 1) * 8] += c + def test_packed_systolic(): X = np.random.randint(-4, 4, size=(L, D)).astype(np.int8) print("X:") @@ -69,10 +71,8 @@ def test_packed_systolic(): W_A_cst = np.random.randint(-4, 4, size=(D, 1 * D)).astype(np.int8) print("W_A_cst:") print(W_A_cst) - - packed_X = np.ascontiguousarray( - np.ascontiguousarray(X).view(np_type).transpose() - ) + + packed_X = np.ascontiguousarray(np.ascontiguousarray(X).view(np_type).transpose()) print("packed_X:") print(packed_X) W_A_packed = np.ascontiguousarray( @@ -95,5 +95,6 @@ def test_packed_systolic(): np.testing.assert_allclose(Z_packed, np_C_packed, atol=1e-3) print("Passed!") + if __name__ == "__main__": test_packed_systolic() From f7c6363cc12cd5644452cf57ec6b6f430ec22fa7 Mon Sep 17 00:00:00 2001 From: Jiahan Xie Date: Sun, 24 Nov 2024 22:46:20 -0500 Subject: [PATCH 4/5] pass 2 by 2 case --- tests/dataflow/test_packed_systolic.py | 38 ++++++++++---------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/tests/dataflow/test_packed_systolic.py b/tests/dataflow/test_packed_systolic.py index d6ddffc7..ab4f5981 100644 --- a/tests/dataflow/test_packed_systolic.py +++ b/tests/dataflow/test_packed_systolic.py @@ -32,66 +32,58 @@ def gemm( ): i, j = df.get_pid() # Peripheral kernels - with allo.meta_if(i in {0, M + 1} and j in {0, N + 1}): + with allo.meta_if(i in {0, M // PP + 1} and j in {0, N + 1}): pass with allo.meta_elif(j == 0): # i > 0 - for k in range(K // PP): + for k in range(K): fifo_A[i, j + 1].put(X_packed[i - 1, k]) with allo.meta_elif(i == 0): # j > 0 - for k in range(K // PP): - fifo_B[i + 1, j].put(W_packed[k, (j - 1)]) + for k in range(K): + fifo_B[i + 1, j].put(W_packed[j // PP, 0]) + # drain - with allo.meta_elif(i == M + 1 and j > 0): - for k in range(K // PP): + with allo.meta_elif(i == M // PP + 1 and j > 0): + for k in range(K): b: allo_type = fifo_B[i, j].get() with allo.meta_elif(j == N + 1 and i > 0): - for k in range(K // PP): + for k in range(K): a: allo_type = fifo_A[i, j].get() # main body with allo.meta_else(): - c: allo_type = 0 + Z_elm: allo_type = Z_packed[i - 1, j - 1] for k in range(K): + c: allo_type = 0 a: allo_type = fifo_A[i, j].get() b: allo_type = fifo_B[i, j].get() for p in range(PP): a_unpacked: int8 = a[p * 8 : (p + 1) * 8] b_unpacked: int8 = b[p * 8 : (p + 1) * 8] - c[p * 8 : (p + 1) * 8] += a_unpacked * b_unpacked + c += a_unpacked * b_unpacked fifo_A[i, j + 1].put(a) fifo_B[i + 1, j].put(b) - Z_packed[i - 1, j - 1][k * 8 : (k + 1) * 8] += c + Z_elm[k * 8 : (k + 1) * 8] += c + Z_packed[i - 1, j - 1] = Z_elm def test_packed_systolic(): X = np.random.randint(-4, 4, size=(L, D)).astype(np.int8) - print("X:") - print(X) W_A_cst = np.random.randint(-4, 4, size=(D, 1 * D)).astype(np.int8) - print("W_A_cst:") - print(W_A_cst) packed_X = np.ascontiguousarray(np.ascontiguousarray(X).view(np_type).transpose()) - print("packed_X:") - print(packed_X) W_A_packed = np.ascontiguousarray( np.ascontiguousarray(W_A_cst.transpose()).view(np_type).transpose() ) - print("W_A_packed") - print(W_A_packed) Z_packed = np.zeros((L // PP, 1 * D), dtype=np_type) mod = df.build(top) if hls.is_available("vitis_hls"): mod(packed_X, W_A_packed, Z_packed) + np_C = X @ W_A_cst - print("np_C:") - print(np_C) np_C_packed = np.ascontiguousarray( - np.ascontiguousarray(np_C).view(np_type).transpose() + np.ascontiguousarray(np_C.transpose()).view(np_type).transpose() ) - print("np_C_packed:") - print(np_C_packed) np.testing.assert_allclose(Z_packed, np_C_packed, atol=1e-3) print("Passed!") From 680f36486791a868de4eb07cf35ae650a3bd4bf3 Mon Sep 17 00:00:00 2001 From: Jiahan Xie Date: Thu, 28 Nov 2024 14:24:50 -0500 Subject: [PATCH 5/5] increase M, N, K, and PP --- tests/dataflow/test_packed_systolic.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tests/dataflow/test_packed_systolic.py b/tests/dataflow/test_packed_systolic.py index ab4f5981..052e976d 100644 --- a/tests/dataflow/test_packed_systolic.py +++ b/tests/dataflow/test_packed_systolic.py @@ -7,14 +7,16 @@ import allo.backend.hls as hls import numpy as np -L, D = 2, 2 -M, N, K = L, 1 * D, D -PP = 2 +M, N, K = 8, 8, 4 +PP = 4 P0, P1 = M // PP + 2, N + 2 if PP == 2: np_type = np.int16 allo_type = int16 +elif PP == 4: + np_type = np.int32 + allo_type = int32 else: raise ValueError(f"Unsupported packing factor: {PP}") @@ -26,9 +28,9 @@ def top(): @df.kernel(mapping=[P0, P1]) def gemm( - X_packed: allo_type[L // PP, D], - W_packed: allo_type[D, 1 * D // PP], - Z_packed: allo_type[L // PP, 1 * D], + X_packed: allo_type[M, K // PP], + W_packed: allo_type[K // PP, N], + Z_packed: allo_type[M // PP, N], ): i, j = df.get_pid() # Peripheral kernels @@ -37,11 +39,11 @@ def gemm( with allo.meta_elif(j == 0): # i > 0 for k in range(K): - fifo_A[i, j + 1].put(X_packed[i - 1, k]) + fifo_A[i, j + 1].put(X_packed[(i - 1) * PP, k]) with allo.meta_elif(i == 0): # j > 0 for k in range(K): - fifo_B[i + 1, j].put(W_packed[j // PP, 0]) + fifo_B[i + 1, j].put(W_packed[k // PP, j - 1]) # drain with allo.meta_elif(i == M // PP + 1 and j > 0): @@ -68,14 +70,14 @@ def gemm( def test_packed_systolic(): - X = np.random.randint(-4, 4, size=(L, D)).astype(np.int8) - W_A_cst = np.random.randint(-4, 4, size=(D, 1 * D)).astype(np.int8) + X = np.random.randint(-4, 4, size=(M, K)).astype(np.int8) + W_A_cst = np.random.randint(-4, 4, size=(K, N)).astype(np.int8) - packed_X = np.ascontiguousarray(np.ascontiguousarray(X).view(np_type).transpose()) + packed_X = np.ascontiguousarray(np.ascontiguousarray(X).view(np_type)) W_A_packed = np.ascontiguousarray( np.ascontiguousarray(W_A_cst.transpose()).view(np_type).transpose() ) - Z_packed = np.zeros((L // PP, 1 * D), dtype=np_type) + Z_packed = np.zeros((M // PP, N), dtype=np_type) mod = df.build(top) if hls.is_available("vitis_hls"): mod(packed_X, W_A_packed, Z_packed)